10l
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
/*
 * Multimedia extension flags. Written by dsputil_init_mmx() from the
 * value returned by mm_support() and tested there against the MM_* bits.
 */
int mm_flags;
26
de6d9b64
FB
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
28int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
29int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
30int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
31int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
32
8def0299
FB
33/* external functions, from idct_mmx.c */
34void ff_mmx_idct(DCTELEM *block);
35void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 36
de6d9b64 37/* pixel operations */
a9b3f630
NK
/*
 * Packed 16-bit constants for the rounding terms: four words of 1 and four
 * words of 2. They are 8-byte aligned so a single movq can load them.
 * The ULL suffix makes the 64-bit width explicit instead of relying on the
 * compiler to widen an unsuffixed hex constant.
 */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 42
d6a4c0b1
ZK
/* Emit an 8-byte alignment directive at the point of use. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear MMX register regd (pxor of the register with itself). */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill regd with four 16-bit words of
 * 1 or 2 respectively.
 */
#ifndef PIC
/* Non-PIC build: load the constants straight from memory. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
/*
 * For a shared library it is better to build the constants in the register
 * than to access them in memory:
 *   pcmpeqd r,r -> every bit set (-1)
 *   psrlw $15   -> 0x0001 in every word
 *   psllw $1    -> 0x0002 in every word (MOVQ_WTWO only)
 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)
#endif
63
de6d9b64
FB
64/***********************************/
65/* 3Dnow specific */
66
67#define DEF(x) x ## _3dnow
68/* for Athlons PAVGUSB is prefered */
69#define PAVGB "pavgusb"
70
71#include "dsputil_mmx_avg.h"
72
73#undef DEF
74#undef PAVGB
75
76/***********************************/
77/* MMX2 specific */
78
79#define DEF(x) x ## _sse
80
81/* Introduced only in MMX2 set */
82#define PAVGB "pavgb"
83
84#include "dsputil_mmx_avg.h"
85
86#undef DEF
87#undef PAVGB
88
89/***********************************/
90/* standard MMX */
91
92static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
93{
94 DCTELEM *p;
95 const UINT8 *pix;
96 int i;
97
98 /* read the pixels */
99 p = block;
100 pix = pixels;
d6a4c0b1 101 MOVQ_ZERO(mm7);
de6d9b64
FB
102 for(i=0;i<4;i++) {
103 __asm __volatile(
104 "movq %1, %%mm0\n\t"
105 "movq %2, %%mm1\n\t"
106 "movq %%mm0, %%mm2\n\t"
107 "movq %%mm1, %%mm3\n\t"
108 "punpcklbw %%mm7, %%mm0\n\t"
109 "punpckhbw %%mm7, %%mm2\n\t"
110 "punpcklbw %%mm7, %%mm1\n\t"
111 "punpckhbw %%mm7, %%mm3\n\t"
112 "movq %%mm0, %0\n\t"
113 "movq %%mm2, 8%0\n\t"
114 "movq %%mm1, 16%0\n\t"
115 "movq %%mm3, 24%0\n\t"
116 :"=m"(*p)
117 :"m"(*pix), "m"(*(pix+line_size))
118 :"memory");
119 pix += line_size*2;
120 p += 16;
121 }
de6d9b64
FB
122}
123
124static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
125{
126 const DCTELEM *p;
127 UINT8 *pix;
de6d9b64
FB
128
129 /* read the pixels */
130 p = block;
131 pix = pixels;
d6a4c0b1 132 /* unrolled loop */
de6d9b64 133 __asm __volatile(
a822a479
NK
134 "movq %3, %%mm0\n\t"
135 "movq 8%3, %%mm1\n\t"
136 "movq 16%3, %%mm2\n\t"
137 "movq 24%3, %%mm3\n\t"
138 "movq 32%3, %%mm4\n\t"
139 "movq 40%3, %%mm5\n\t"
140 "movq 48%3, %%mm6\n\t"
141 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
142 "packuswb %%mm1, %%mm0\n\t"
143 "packuswb %%mm3, %%mm2\n\t"
144 "packuswb %%mm5, %%mm4\n\t"
145 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
146 "movq %%mm0, (%0)\n\t"
147 "movq %%mm2, (%0, %1)\n\t"
148 "movq %%mm4, (%0, %1, 2)\n\t"
149 "movq %%mm6, (%0, %2)\n\t"
150 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
151 :"memory");
152 pix += line_size*4;
153 p += 32;
d6a4c0b1
ZK
154
155 // if here would be an exact copy of the code above
156 // compiler would generate some very strange code
157 // thus using "r"
158 __asm __volatile(
159 "movq (%3), %%mm0\n\t"
160 "movq 8(%3), %%mm1\n\t"
161 "movq 16(%3), %%mm2\n\t"
162 "movq 24(%3), %%mm3\n\t"
163 "movq 32(%3), %%mm4\n\t"
164 "movq 40(%3), %%mm5\n\t"
165 "movq 48(%3), %%mm6\n\t"
166 "movq 56(%3), %%mm7\n\t"
167 "packuswb %%mm1, %%mm0\n\t"
168 "packuswb %%mm3, %%mm2\n\t"
169 "packuswb %%mm5, %%mm4\n\t"
170 "packuswb %%mm7, %%mm6\n\t"
171 "movq %%mm0, (%0)\n\t"
172 "movq %%mm2, (%0, %1)\n\t"
173 "movq %%mm4, (%0, %1, 2)\n\t"
174 "movq %%mm6, (%0, %2)\n\t"
175 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
176 :"memory");
de6d9b64
FB
177}
178
179static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
180{
181 const DCTELEM *p;
182 UINT8 *pix;
183 int i;
184
185 /* read the pixels */
186 p = block;
187 pix = pixels;
d6a4c0b1
ZK
188 MOVQ_ZERO(mm7);
189 i = 4;
190 while (i) {
de6d9b64
FB
191 __asm __volatile(
192 "movq %2, %%mm0\n\t"
193 "movq 8%2, %%mm1\n\t"
194 "movq 16%2, %%mm2\n\t"
195 "movq 24%2, %%mm3\n\t"
196 "movq %0, %%mm4\n\t"
197 "movq %1, %%mm6\n\t"
198 "movq %%mm4, %%mm5\n\t"
199 "punpcklbw %%mm7, %%mm4\n\t"
200 "punpckhbw %%mm7, %%mm5\n\t"
201 "paddsw %%mm4, %%mm0\n\t"
202 "paddsw %%mm5, %%mm1\n\t"
203 "movq %%mm6, %%mm5\n\t"
204 "punpcklbw %%mm7, %%mm6\n\t"
205 "punpckhbw %%mm7, %%mm5\n\t"
206 "paddsw %%mm6, %%mm2\n\t"
207 "paddsw %%mm5, %%mm3\n\t"
208 "packuswb %%mm1, %%mm0\n\t"
209 "packuswb %%mm3, %%mm2\n\t"
210 "movq %%mm0, %0\n\t"
211 "movq %%mm2, %1\n\t"
a822a479 212 :"+m"(*pix), "+m"(*(pix+line_size))
de6d9b64
FB
213 :"m"(*p)
214 :"memory");
215 pix += line_size*2;
216 p += 16;
d6a4c0b1
ZK
217 i--;
218 };
de6d9b64
FB
219}
220
221static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
222{
d6a4c0b1 223 int hh;
de6d9b64
FB
224 UINT8 *p;
225 const UINT8 *pix;
d6a4c0b1 226
de6d9b64 227 p = block;
d6a4c0b1
ZK
228 pix = pixels; // 2s
229#if 0
230 do {
231 __asm __volatile(
232 "movq %1, %%mm0\n\t"
233 "movq %%mm0, %0\n\t"
234 :"=m"(*p)
235 :"m"(*pix)
236 :"memory");
237 pix += line_size;
238 p += line_size;
239 } while (--h);
240#else
241 // this optimized code is not very usefull
242 // the above loop is definitely faster
243 // at least on Celeron 500MHz
244 hh = h & 3;
245 while (hh) {
246 __asm __volatile(
247 "movq %1, %%mm0\n\t"
248 "movq %%mm0, %0\n\t"
249 :"=m"(*p)
250 :"m"(*pix)
251 :"memory");
252 pix += line_size;
253 p += line_size;
254 hh--;
255 }
de6d9b64 256 hh=h>>2;
d6a4c0b1 257 while (hh) {
de6d9b64 258 __asm __volatile(
a822a479
NK
259 "movq (%1), %%mm0 \n\t"
260 "movq (%1, %2), %%mm1 \n\t"
261 "movq (%1, %2, 2), %%mm2 \n\t"
262 "movq (%1, %3), %%mm3 \n\t"
263 "movq %%mm0, (%0) \n\t"
264 "movq %%mm1, (%0, %2) \n\t"
265 "movq %%mm2, (%0, %2, 2) \n\t"
266 "movq %%mm3, (%0, %3) \n\t"
267 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
de6d9b64 268 :"memory");
d6a4c0b1
ZK
269 pix += line_size*4;
270 p += line_size*4;
271 hh--;
de6d9b64 272 }
d6a4c0b1 273#endif
de6d9b64
FB
274}
275
276static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
277{
278 UINT8 *p;
279 const UINT8 *pix;
280 p = block;
281 pix = pixels;
d6a4c0b1
ZK
282 MOVQ_ZERO(mm7);
283 MOVQ_WONE(mm4);
284 JUMPALIGN();
de6d9b64
FB
285 do {
286 __asm __volatile(
287 "movq %1, %%mm0\n\t"
288 "movq 1%1, %%mm1\n\t"
289 "movq %%mm0, %%mm2\n\t"
290 "movq %%mm1, %%mm3\n\t"
291 "punpcklbw %%mm7, %%mm0\n\t"
292 "punpcklbw %%mm7, %%mm1\n\t"
293 "punpckhbw %%mm7, %%mm2\n\t"
294 "punpckhbw %%mm7, %%mm3\n\t"
295 "paddusw %%mm1, %%mm0\n\t"
296 "paddusw %%mm3, %%mm2\n\t"
297 "paddusw %%mm4, %%mm0\n\t"
298 "paddusw %%mm4, %%mm2\n\t"
299 "psrlw $1, %%mm0\n\t"
300 "psrlw $1, %%mm2\n\t"
301 "packuswb %%mm2, %%mm0\n\t"
302 "movq %%mm0, %0\n\t"
303 :"=m"(*p)
304 :"m"(*pix)
305 :"memory");
306 pix += line_size; p += line_size;
307 } while (--h);
de6d9b64
FB
308}
309
310static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
311{
312 UINT8 *p;
313 const UINT8 *pix;
314 p = block;
315 pix = pixels;
d6a4c0b1
ZK
316 MOVQ_ZERO(mm7);
317 MOVQ_WONE(mm4);
318 JUMPALIGN();
de6d9b64
FB
319 do {
320 __asm __volatile(
321 "movq %1, %%mm0\n\t"
322 "movq %2, %%mm1\n\t"
323 "movq %%mm0, %%mm2\n\t"
324 "movq %%mm1, %%mm3\n\t"
325 "punpcklbw %%mm7, %%mm0\n\t"
326 "punpcklbw %%mm7, %%mm1\n\t"
327 "punpckhbw %%mm7, %%mm2\n\t"
328 "punpckhbw %%mm7, %%mm3\n\t"
329 "paddusw %%mm1, %%mm0\n\t"
330 "paddusw %%mm3, %%mm2\n\t"
331 "paddusw %%mm4, %%mm0\n\t"
332 "paddusw %%mm4, %%mm2\n\t"
333 "psrlw $1, %%mm0\n\t"
334 "psrlw $1, %%mm2\n\t"
335 "packuswb %%mm2, %%mm0\n\t"
336 "movq %%mm0, %0\n\t"
337 :"=m"(*p)
338 :"m"(*pix),
339 "m"(*(pix+line_size))
340 :"memory");
341 pix += line_size;
342 p += line_size;
343 } while (--h);
de6d9b64
FB
344}
345
346static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
347{
348 UINT8 *p;
349 const UINT8 *pix;
350 p = block;
d6a4c0b1
ZK
351 pix = pixels; // 1s
352 MOVQ_ZERO(mm7);
353 MOVQ_WTWO(mm6);
354 JUMPALIGN();
de6d9b64
FB
355 do {
356 __asm __volatile(
357 "movq %1, %%mm0\n\t"
358 "movq %2, %%mm1\n\t"
359 "movq 1%1, %%mm4\n\t"
360 "movq 1%2, %%mm5\n\t"
361 "movq %%mm0, %%mm2\n\t"
362 "movq %%mm1, %%mm3\n\t"
363 "punpcklbw %%mm7, %%mm0\n\t"
364 "punpcklbw %%mm7, %%mm1\n\t"
365 "punpckhbw %%mm7, %%mm2\n\t"
366 "punpckhbw %%mm7, %%mm3\n\t"
367 "paddusw %%mm1, %%mm0\n\t"
368 "paddusw %%mm3, %%mm2\n\t"
369 "movq %%mm4, %%mm1\n\t"
370 "movq %%mm5, %%mm3\n\t"
371 "punpcklbw %%mm7, %%mm4\n\t"
372 "punpcklbw %%mm7, %%mm5\n\t"
373 "punpckhbw %%mm7, %%mm1\n\t"
374 "punpckhbw %%mm7, %%mm3\n\t"
375 "paddusw %%mm5, %%mm4\n\t"
376 "paddusw %%mm3, %%mm1\n\t"
377 "paddusw %%mm6, %%mm4\n\t"
378 "paddusw %%mm6, %%mm1\n\t"
379 "paddusw %%mm4, %%mm0\n\t"
380 "paddusw %%mm1, %%mm2\n\t"
381 "psrlw $2, %%mm0\n\t"
382 "psrlw $2, %%mm2\n\t"
383 "packuswb %%mm2, %%mm0\n\t"
384 "movq %%mm0, %0\n\t"
385 :"=m"(*p)
386 :"m"(*pix),
387 "m"(*(pix+line_size))
388 :"memory");
389 pix += line_size;
390 p += line_size;
391 } while(--h);
de6d9b64
FB
392}
393
394static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
395{
396 UINT8 *p;
397 const UINT8 *pix;
398 p = block;
399 pix = pixels;
d6a4c0b1 400 MOVQ_ZERO(mm7);
de6d9b64
FB
401 do {
402 __asm __volatile(
403 "movq %1, %%mm0\n\t"
404 "movq 1%1, %%mm1\n\t"
405 "movq %%mm0, %%mm2\n\t"
406 "movq %%mm1, %%mm3\n\t"
407 "punpcklbw %%mm7, %%mm0\n\t"
408 "punpcklbw %%mm7, %%mm1\n\t"
409 "punpckhbw %%mm7, %%mm2\n\t"
410 "punpckhbw %%mm7, %%mm3\n\t"
411 "paddusw %%mm1, %%mm0\n\t"
412 "paddusw %%mm3, %%mm2\n\t"
413 "psrlw $1, %%mm0\n\t"
414 "psrlw $1, %%mm2\n\t"
415 "packuswb %%mm2, %%mm0\n\t"
416 "movq %%mm0, %0\n\t"
417 :"=m"(*p)
418 :"m"(*pix)
419 :"memory");
420 pix += line_size;
421 p += line_size;
422 } while (--h);
de6d9b64
FB
423}
424
425static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
426{
427 UINT8 *p;
428 const UINT8 *pix;
429 p = block;
430 pix = pixels;
d6a4c0b1
ZK
431 MOVQ_ZERO(mm7);
432 JUMPALIGN();
de6d9b64
FB
433 do {
434 __asm __volatile(
435 "movq %1, %%mm0\n\t"
436 "movq %2, %%mm1\n\t"
437 "movq %%mm0, %%mm2\n\t"
438 "movq %%mm1, %%mm3\n\t"
439 "punpcklbw %%mm7, %%mm0\n\t"
440 "punpcklbw %%mm7, %%mm1\n\t"
441 "punpckhbw %%mm7, %%mm2\n\t"
442 "punpckhbw %%mm7, %%mm3\n\t"
443 "paddusw %%mm1, %%mm0\n\t"
444 "paddusw %%mm3, %%mm2\n\t"
445 "psrlw $1, %%mm0\n\t"
446 "psrlw $1, %%mm2\n\t"
447 "packuswb %%mm2, %%mm0\n\t"
448 "movq %%mm0, %0\n\t"
449 :"=m"(*p)
450 :"m"(*pix),
451 "m"(*(pix+line_size))
452 :"memory");
453 pix += line_size;
454 p += line_size;
455 } while(--h);
de6d9b64
FB
456}
457
458static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
459{
460 UINT8 *p;
461 const UINT8 *pix;
462 p = block;
463 pix = pixels;
d6a4c0b1
ZK
464 MOVQ_ZERO(mm7);
465 MOVQ_WONE(mm6);
466 JUMPALIGN();
de6d9b64
FB
467 do {
468 __asm __volatile(
469 "movq %1, %%mm0\n\t"
470 "movq %2, %%mm1\n\t"
471 "movq 1%1, %%mm4\n\t"
472 "movq 1%2, %%mm5\n\t"
473 "movq %%mm0, %%mm2\n\t"
474 "movq %%mm1, %%mm3\n\t"
475 "punpcklbw %%mm7, %%mm0\n\t"
476 "punpcklbw %%mm7, %%mm1\n\t"
477 "punpckhbw %%mm7, %%mm2\n\t"
478 "punpckhbw %%mm7, %%mm3\n\t"
479 "paddusw %%mm1, %%mm0\n\t"
480 "paddusw %%mm3, %%mm2\n\t"
481 "movq %%mm4, %%mm1\n\t"
482 "movq %%mm5, %%mm3\n\t"
483 "punpcklbw %%mm7, %%mm4\n\t"
484 "punpcklbw %%mm7, %%mm5\n\t"
485 "punpckhbw %%mm7, %%mm1\n\t"
486 "punpckhbw %%mm7, %%mm3\n\t"
487 "paddusw %%mm5, %%mm4\n\t"
488 "paddusw %%mm3, %%mm1\n\t"
489 "paddusw %%mm6, %%mm4\n\t"
490 "paddusw %%mm6, %%mm1\n\t"
491 "paddusw %%mm4, %%mm0\n\t"
492 "paddusw %%mm1, %%mm2\n\t"
493 "psrlw $2, %%mm0\n\t"
494 "psrlw $2, %%mm2\n\t"
495 "packuswb %%mm2, %%mm0\n\t"
496 "movq %%mm0, %0\n\t"
497 :"=m"(*p)
498 :"m"(*pix),
499 "m"(*(pix+line_size))
500 :"memory");
501 pix += line_size;
502 p += line_size;
503 } while(--h);
de6d9b64
FB
504}
505
506static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
507{
508 UINT8 *p;
509 const UINT8 *pix;
510 p = block;
511 pix = pixels;
d6a4c0b1
ZK
512 MOVQ_ZERO(mm7);
513 MOVQ_WONE(mm6);
514 JUMPALIGN();
de6d9b64
FB
515 do {
516 __asm __volatile(
517 "movq %0, %%mm0\n\t"
518 "movq %1, %%mm1\n\t"
519 "movq %%mm0, %%mm2\n\t"
520 "movq %%mm1, %%mm3\n\t"
521 "punpcklbw %%mm7, %%mm0\n\t"
522 "punpcklbw %%mm7, %%mm1\n\t"
523 "punpckhbw %%mm7, %%mm2\n\t"
524 "punpckhbw %%mm7, %%mm3\n\t"
525 "paddusw %%mm1, %%mm0\n\t"
526 "paddusw %%mm3, %%mm2\n\t"
527 "paddusw %%mm6, %%mm0\n\t"
528 "paddusw %%mm6, %%mm2\n\t"
529 "psrlw $1, %%mm0\n\t"
530 "psrlw $1, %%mm2\n\t"
531 "packuswb %%mm2, %%mm0\n\t"
532 "movq %%mm0, %0\n\t"
a822a479 533 :"+m"(*p)
de6d9b64
FB
534 :"m"(*pix)
535 :"memory");
536 pix += line_size;
537 p += line_size;
538 }
539 while (--h);
de6d9b64
FB
540}
541
542static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
543{
544 UINT8 *p;
545 const UINT8 *pix;
546 p = block;
547 pix = pixels;
d6a4c0b1
ZK
548 MOVQ_ZERO(mm7);
549 MOVQ_WONE(mm6);
550 JUMPALIGN();
de6d9b64
FB
551 do {
552 __asm __volatile(
553 "movq %1, %%mm1\n\t"
554 "movq %0, %%mm0\n\t"
555 "movq 1%1, %%mm4\n\t"
556 "movq %%mm0, %%mm2\n\t"
557 "movq %%mm1, %%mm3\n\t"
558 "movq %%mm4, %%mm5\n\t"
559 "punpcklbw %%mm7, %%mm1\n\t"
560 "punpckhbw %%mm7, %%mm3\n\t"
561 "punpcklbw %%mm7, %%mm4\n\t"
562 "punpckhbw %%mm7, %%mm5\n\t"
563 "punpcklbw %%mm7, %%mm0\n\t"
564 "punpckhbw %%mm7, %%mm2\n\t"
565 "paddusw %%mm4, %%mm1\n\t"
566 "paddusw %%mm5, %%mm3\n\t"
567 "paddusw %%mm6, %%mm1\n\t"
568 "paddusw %%mm6, %%mm3\n\t"
569 "psrlw $1, %%mm1\n\t"
570 "psrlw $1, %%mm3\n\t"
571 "paddusw %%mm6, %%mm0\n\t"
572 "paddusw %%mm6, %%mm2\n\t"
573 "paddusw %%mm1, %%mm0\n\t"
574 "paddusw %%mm3, %%mm2\n\t"
575 "psrlw $1, %%mm0\n\t"
576 "psrlw $1, %%mm2\n\t"
577 "packuswb %%mm2, %%mm0\n\t"
578 "movq %%mm0, %0\n\t"
a822a479 579 :"+m"(*p)
de6d9b64
FB
580 :"m"(*pix)
581 :"memory");
582 pix += line_size;
583 p += line_size;
584 } while (--h);
de6d9b64
FB
585}
586
587static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
588{
589 UINT8 *p;
590 const UINT8 *pix;
591 p = block;
592 pix = pixels;
d6a4c0b1
ZK
593 MOVQ_ZERO(mm7);
594 MOVQ_WONE(mm6);
595 JUMPALIGN();
de6d9b64
FB
596 do {
597 __asm __volatile(
598 "movq %1, %%mm1\n\t"
599 "movq %0, %%mm0\n\t"
600 "movq %2, %%mm4\n\t"
601 "movq %%mm0, %%mm2\n\t"
602 "movq %%mm1, %%mm3\n\t"
603 "movq %%mm4, %%mm5\n\t"
604 "punpcklbw %%mm7, %%mm1\n\t"
605 "punpckhbw %%mm7, %%mm3\n\t"
606 "punpcklbw %%mm7, %%mm4\n\t"
607 "punpckhbw %%mm7, %%mm5\n\t"
608 "punpcklbw %%mm7, %%mm0\n\t"
609 "punpckhbw %%mm7, %%mm2\n\t"
610 "paddusw %%mm4, %%mm1\n\t"
611 "paddusw %%mm5, %%mm3\n\t"
612 "paddusw %%mm6, %%mm1\n\t"
613 "paddusw %%mm6, %%mm3\n\t"
614 "psrlw $1, %%mm1\n\t"
615 "psrlw $1, %%mm3\n\t"
616 "paddusw %%mm6, %%mm0\n\t"
617 "paddusw %%mm6, %%mm2\n\t"
618 "paddusw %%mm1, %%mm0\n\t"
619 "paddusw %%mm3, %%mm2\n\t"
620 "psrlw $1, %%mm0\n\t"
621 "psrlw $1, %%mm2\n\t"
622 "packuswb %%mm2, %%mm0\n\t"
623 "movq %%mm0, %0\n\t"
a822a479 624 :"+m"(*p)
de6d9b64
FB
625 :"m"(*pix), "m"(*(pix+line_size))
626 :"memory");
627 pix += line_size;
628 p += line_size ;
629 } while(--h);
de6d9b64
FB
630}
631
632static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
633{
634 UINT8 *p;
635 const UINT8 *pix;
636 p = block;
637 pix = pixels;
d6a4c0b1
ZK
638 MOVQ_ZERO(mm7);
639 // this doesn't seem to be used offten - so
640 // the inside usage of mm_wone is not optimized
641 MOVQ_WTWO(mm6);
de6d9b64
FB
642 do {
643 __asm __volatile(
644 "movq %1, %%mm0\n\t"
645 "movq %2, %%mm1\n\t"
646 "movq 1%1, %%mm4\n\t"
647 "movq 1%2, %%mm5\n\t"
648 "movq %%mm0, %%mm2\n\t"
649 "movq %%mm1, %%mm3\n\t"
650 "punpcklbw %%mm7, %%mm0\n\t"
651 "punpcklbw %%mm7, %%mm1\n\t"
652 "punpckhbw %%mm7, %%mm2\n\t"
653 "punpckhbw %%mm7, %%mm3\n\t"
654 "paddusw %%mm1, %%mm0\n\t"
655 "paddusw %%mm3, %%mm2\n\t"
656 "movq %%mm4, %%mm1\n\t"
657 "movq %%mm5, %%mm3\n\t"
658 "punpcklbw %%mm7, %%mm4\n\t"
659 "punpcklbw %%mm7, %%mm5\n\t"
660 "punpckhbw %%mm7, %%mm1\n\t"
661 "punpckhbw %%mm7, %%mm3\n\t"
662 "paddusw %%mm5, %%mm4\n\t"
663 "paddusw %%mm3, %%mm1\n\t"
664 "paddusw %%mm6, %%mm4\n\t"
665 "paddusw %%mm6, %%mm1\n\t"
666 "paddusw %%mm4, %%mm0\n\t"
667 "paddusw %%mm1, %%mm2\n\t"
668 "movq %3, %%mm5\n\t"
669 "psrlw $2, %%mm0\n\t"
670 "movq %0, %%mm1\n\t"
671 "psrlw $2, %%mm2\n\t"
672 "movq %%mm1, %%mm3\n\t"
673 "punpcklbw %%mm7, %%mm1\n\t"
674 "punpckhbw %%mm7, %%mm3\n\t"
675 "paddusw %%mm1, %%mm0\n\t"
676 "paddusw %%mm3, %%mm2\n\t"
677 "paddusw %%mm5, %%mm0\n\t"
678 "paddusw %%mm5, %%mm2\n\t"
679 "psrlw $1, %%mm0\n\t"
680 "psrlw $1, %%mm2\n\t"
681 "packuswb %%mm2, %%mm0\n\t"
682 "movq %%mm0, %0\n\t"
a822a479 683 :"+m"(*p)
de6d9b64 684 :"m"(*pix),
a9b3f630 685 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
686 :"memory");
687 pix += line_size;
688 p += line_size ;
689 } while(--h);
de6d9b64
FB
690}
691
692static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
693{
694 UINT8 *p;
695 const UINT8 *pix;
696 p = block;
697 pix = pixels;
d6a4c0b1 698 MOVQ_ZERO(mm7);
de6d9b64
FB
699 do {
700 __asm __volatile(
701 "movq %1, %%mm0\n\t"
702 "movq %0, %%mm1\n\t"
703 "movq %%mm0, %%mm2\n\t"
704 "movq %%mm1, %%mm3\n\t"
705 "punpcklbw %%mm7, %%mm0\n\t"
706 "punpcklbw %%mm7, %%mm1\n\t"
707 "punpckhbw %%mm7, %%mm2\n\t"
708 "punpckhbw %%mm7, %%mm3\n\t"
709 "paddusw %%mm1, %%mm0\n\t"
710 "paddusw %%mm3, %%mm2\n\t"
711 "psrlw $1, %%mm0\n\t"
712 "psrlw $1, %%mm2\n\t"
713 "packuswb %%mm2, %%mm0\n\t"
714 "movq %%mm0, %0\n\t"
a822a479 715 :"+m"(*p)
de6d9b64
FB
716 :"m"(*pix)
717 :"memory");
718 pix += line_size;
719 p += line_size ;
720 } while (--h);
de6d9b64
FB
721}
722
723static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
724{
725 UINT8 *p;
726 const UINT8 *pix;
727 p = block;
728 pix = pixels;
d6a4c0b1 729 MOVQ_ZERO(mm7);
de6d9b64
FB
730 do {
731 __asm __volatile(
732 "movq %1, %%mm0\n\t"
733 "movq 1%1, %%mm1\n\t"
734 "movq %0, %%mm4\n\t"
735 "movq %%mm0, %%mm2\n\t"
736 "movq %%mm1, %%mm3\n\t"
737 "movq %%mm4, %%mm5\n\t"
738 "punpcklbw %%mm7, %%mm0\n\t"
739 "punpcklbw %%mm7, %%mm1\n\t"
740 "punpckhbw %%mm7, %%mm2\n\t"
741 "punpckhbw %%mm7, %%mm3\n\t"
742 "punpcklbw %%mm7, %%mm4\n\t"
743 "punpckhbw %%mm7, %%mm5\n\t"
744 "paddusw %%mm1, %%mm0\n\t"
745 "paddusw %%mm3, %%mm2\n\t"
746 "psrlw $1, %%mm0\n\t"
747 "psrlw $1, %%mm2\n\t"
748 "paddusw %%mm4, %%mm0\n\t"
749 "paddusw %%mm5, %%mm2\n\t"
750 "psrlw $1, %%mm0\n\t"
751 "psrlw $1, %%mm2\n\t"
752 "packuswb %%mm2, %%mm0\n\t"
753 "movq %%mm0, %0\n\t"
a822a479 754 :"+m"(*p)
de6d9b64
FB
755 :"m"(*pix)
756 :"memory");
757 pix += line_size;
758 p += line_size;
759 } while (--h);
de6d9b64
FB
760}
761
762static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
763{
764 UINT8 *p;
765 const UINT8 *pix;
766 p = block;
767 pix = pixels;
d6a4c0b1 768 MOVQ_ZERO(mm7);
de6d9b64
FB
769 do {
770 __asm __volatile(
771 "movq %1, %%mm0\n\t"
772 "movq %2, %%mm1\n\t"
773 "movq %0, %%mm4\n\t"
774 "movq %%mm0, %%mm2\n\t"
775 "movq %%mm1, %%mm3\n\t"
776 "movq %%mm4, %%mm5\n\t"
777 "punpcklbw %%mm7, %%mm0\n\t"
778 "punpcklbw %%mm7, %%mm1\n\t"
779 "punpckhbw %%mm7, %%mm2\n\t"
780 "punpckhbw %%mm7, %%mm3\n\t"
781 "punpcklbw %%mm7, %%mm4\n\t"
782 "punpckhbw %%mm7, %%mm5\n\t"
783 "paddusw %%mm1, %%mm0\n\t"
784 "paddusw %%mm3, %%mm2\n\t"
785 "psrlw $1, %%mm0\n\t"
786 "psrlw $1, %%mm2\n\t"
787 "paddusw %%mm4, %%mm0\n\t"
788 "paddusw %%mm5, %%mm2\n\t"
789 "psrlw $1, %%mm0\n\t"
790 "psrlw $1, %%mm2\n\t"
791 "packuswb %%mm2, %%mm0\n\t"
792 "movq %%mm0, %0\n\t"
a822a479 793 :"+m"(*p)
de6d9b64
FB
794 :"m"(*pix), "m"(*(pix+line_size))
795 :"memory");
796 pix += line_size;
797 p += line_size ;
798 } while(--h);
de6d9b64
FB
799}
800
801static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
802{
803 UINT8 *p;
804 const UINT8 *pix;
805 p = block;
806 pix = pixels;
d6a4c0b1
ZK
807 MOVQ_ZERO(mm7);
808 MOVQ_WONE(mm6);
809 JUMPALIGN();
de6d9b64
FB
810 do {
811 __asm __volatile(
812 "movq %1, %%mm0\n\t"
813 "movq %2, %%mm1\n\t"
814 "movq 1%1, %%mm4\n\t"
815 "movq 1%2, %%mm5\n\t"
816 "movq %%mm0, %%mm2\n\t"
817 "movq %%mm1, %%mm3\n\t"
818 "punpcklbw %%mm7, %%mm0\n\t"
819 "punpcklbw %%mm7, %%mm1\n\t"
820 "punpckhbw %%mm7, %%mm2\n\t"
821 "punpckhbw %%mm7, %%mm3\n\t"
822 "paddusw %%mm1, %%mm0\n\t"
823 "paddusw %%mm3, %%mm2\n\t"
824 "movq %%mm4, %%mm1\n\t"
825 "movq %%mm5, %%mm3\n\t"
826 "punpcklbw %%mm7, %%mm4\n\t"
827 "punpcklbw %%mm7, %%mm5\n\t"
828 "punpckhbw %%mm7, %%mm1\n\t"
829 "punpckhbw %%mm7, %%mm3\n\t"
830 "paddusw %%mm5, %%mm4\n\t"
831 "paddusw %%mm3, %%mm1\n\t"
832 "paddusw %%mm6, %%mm4\n\t"
833 "paddusw %%mm6, %%mm1\n\t"
834 "paddusw %%mm4, %%mm0\n\t"
835 "paddusw %%mm1, %%mm2\n\t"
836 "movq %0, %%mm1\n\t"
837 "psrlw $2, %%mm0\n\t"
838 "movq %%mm1, %%mm3\n\t"
839 "psrlw $2, %%mm2\n\t"
840 "punpcklbw %%mm7, %%mm1\n\t"
841 "punpckhbw %%mm7, %%mm3\n\t"
842 "paddusw %%mm1, %%mm0\n\t"
843 "paddusw %%mm3, %%mm2\n\t"
844 "psrlw $1, %%mm0\n\t"
845 "psrlw $1, %%mm2\n\t"
846 "packuswb %%mm2, %%mm0\n\t"
847 "movq %%mm0, %0\n\t"
a822a479 848 :"+m"(*p)
de6d9b64
FB
849 :"m"(*pix),
850 "m"(*(pix+line_size))
851 :"memory");
852 pix += line_size;
853 p += line_size;
854 } while(--h);
de6d9b64
FB
855}
856
857static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
858{
859 DCTELEM *p;
860 const UINT8 *pix;
861 p = block;
862 pix = pixels;
d6a4c0b1 863 MOVQ_ZERO(mm7);
de6d9b64
FB
864 do {
865 __asm __volatile(
866 "movq %0, %%mm0\n\t"
867 "movq %1, %%mm2\n\t"
868 "movq 8%0, %%mm1\n\t"
869 "movq %%mm2, %%mm3\n\t"
870 "punpcklbw %%mm7, %%mm2\n\t"
871 "punpckhbw %%mm7, %%mm3\n\t"
872 "psubsw %%mm2, %%mm0\n\t"
873 "psubsw %%mm3, %%mm1\n\t"
874 "movq %%mm0, %0\n\t"
875 "movq %%mm1, 8%0\n\t"
a822a479 876 :"+m"(*p)
de6d9b64
FB
877 :"m"(*pix)
878 :"memory");
879 pix += line_size;
880 p += 8;
881 } while (--h);
de6d9b64
FB
882}
883
884static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
885{
886 DCTELEM *p;
887 const UINT8 *pix;
888 p = block;
889 pix = pixels;
d6a4c0b1
ZK
890 MOVQ_ZERO(mm7);
891 MOVQ_WONE(mm6);
892 JUMPALIGN();
de6d9b64
FB
893 do {
894 __asm __volatile(
895 "movq %0, %%mm0\n\t"
896 "movq %1, %%mm2\n\t"
897 "movq 8%0, %%mm1\n\t"
898 "movq 1%1, %%mm4\n\t"
899 "movq %%mm2, %%mm3\n\t"
900 "movq %%mm4, %%mm5\n\t"
901 "punpcklbw %%mm7, %%mm2\n\t"
902 "punpckhbw %%mm7, %%mm3\n\t"
903 "punpcklbw %%mm7, %%mm4\n\t"
904 "punpckhbw %%mm7, %%mm5\n\t"
905 "paddusw %%mm4, %%mm2\n\t"
906 "paddusw %%mm5, %%mm3\n\t"
907 "paddusw %%mm6, %%mm2\n\t"
908 "paddusw %%mm6, %%mm3\n\t"
909 "psrlw $1, %%mm2\n\t"
910 "psrlw $1, %%mm3\n\t"
911 "psubsw %%mm2, %%mm0\n\t"
912 "psubsw %%mm3, %%mm1\n\t"
913 "movq %%mm0, %0\n\t"
914 "movq %%mm1, 8%0\n\t"
a822a479 915 :"+m"(*p)
de6d9b64
FB
916 :"m"(*pix)
917 :"memory");
918 pix += line_size;
919 p += 8;
920 } while (--h);
de6d9b64
FB
921}
922
923static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
924{
925 DCTELEM *p;
926 const UINT8 *pix;
927 p = block;
928 pix = pixels;
d6a4c0b1
ZK
929 MOVQ_ZERO(mm7);
930 MOVQ_WONE(mm6);
de6d9b64
FB
931 do {
932 __asm __volatile(
933 "movq %0, %%mm0\n\t"
934 "movq %1, %%mm2\n\t"
935 "movq 8%0, %%mm1\n\t"
936 "movq %2, %%mm4\n\t"
937 "movq %%mm2, %%mm3\n\t"
938 "movq %%mm4, %%mm5\n\t"
939 "punpcklbw %%mm7, %%mm2\n\t"
940 "punpckhbw %%mm7, %%mm3\n\t"
941 "punpcklbw %%mm7, %%mm4\n\t"
942 "punpckhbw %%mm7, %%mm5\n\t"
943 "paddusw %%mm4, %%mm2\n\t"
944 "paddusw %%mm5, %%mm3\n\t"
945 "paddusw %%mm6, %%mm2\n\t"
946 "paddusw %%mm6, %%mm3\n\t"
947 "psrlw $1, %%mm2\n\t"
948 "psrlw $1, %%mm3\n\t"
949 "psubsw %%mm2, %%mm0\n\t"
950 "psubsw %%mm3, %%mm1\n\t"
951 "movq %%mm0, %0\n\t"
952 "movq %%mm1, 8%0\n\t"
a822a479 953 :"+m"(*p)
de6d9b64
FB
954 :"m"(*pix), "m"(*(pix+line_size))
955 :"memory");
956 pix += line_size;
957 p += 8;
958 } while (--h);
de6d9b64
FB
959}
960
961static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
962{
963 DCTELEM *p;
964 const UINT8 *pix;
965 p = block;
966 pix = pixels;
d6a4c0b1
ZK
967 MOVQ_ZERO(mm7);
968 MOVQ_WTWO(mm6);
969 JUMPALIGN();
de6d9b64
FB
970 do {
971 __asm __volatile(
972 "movq %1, %%mm0\n\t"
973 "movq %2, %%mm1\n\t"
974 "movq 1%1, %%mm4\n\t"
975 "movq 1%2, %%mm5\n\t"
976 "movq %%mm0, %%mm2\n\t"
977 "movq %%mm1, %%mm3\n\t"
978 "punpcklbw %%mm7, %%mm0\n\t"
979 "punpcklbw %%mm7, %%mm1\n\t"
980 "punpckhbw %%mm7, %%mm2\n\t"
981 "punpckhbw %%mm7, %%mm3\n\t"
982 "paddusw %%mm1, %%mm0\n\t"
983 "paddusw %%mm3, %%mm2\n\t"
984 "movq %%mm4, %%mm1\n\t"
985 "movq %%mm5, %%mm3\n\t"
986 "punpcklbw %%mm7, %%mm4\n\t"
987 "punpcklbw %%mm7, %%mm5\n\t"
988 "punpckhbw %%mm7, %%mm1\n\t"
989 "punpckhbw %%mm7, %%mm3\n\t"
990 "paddusw %%mm5, %%mm4\n\t"
991 "paddusw %%mm3, %%mm1\n\t"
992 "paddusw %%mm6, %%mm4\n\t"
993 "paddusw %%mm6, %%mm1\n\t"
994 "paddusw %%mm4, %%mm0\n\t"
995 "paddusw %%mm1, %%mm2\n\t"
996 "movq %0, %%mm1\n\t"
997 "movq 8%0, %%mm3\n\t"
998 "psrlw $2, %%mm0\n\t"
999 "psrlw $2, %%mm2\n\t"
1000 "psubsw %%mm0, %%mm1\n\t"
1001 "psubsw %%mm2, %%mm3\n\t"
1002 "movq %%mm1, %0\n\t"
1003 "movq %%mm3, 8%0\n\t"
a822a479 1004 :"+m"(*p)
de6d9b64
FB
1005 :"m"(*pix),
1006 "m"(*(pix+line_size))
1007 :"memory");
1008 pix += line_size;
1009 p += 8 ;
1010 } while(--h);
de6d9b64
FB
1011}
1012
d6a4c0b1
ZK
/*
 * Empty stub. It is referenced only from the disabled "#if 0" speed-test
 * section of dsputil_init_mmx(), where it replaces every DSP routine.
 * The parameter list is deliberately left unspecified so the one stub can
 * be assigned to function pointers with different signatures.
 */
static void just_return() { return; }
1014
de6d9b64
FB
1015void dsputil_init_mmx(void)
1016{
1017 mm_flags = mm_support();
f4470e09
MN
1018#if 1
1019 printf("libavcodec: CPU flags:");
de6d9b64
FB
1020 if (mm_flags & MM_MMX)
1021 printf(" mmx");
1022 if (mm_flags & MM_MMXEXT)
1023 printf(" mmxext");
1024 if (mm_flags & MM_3DNOW)
1025 printf(" 3dnow");
1026 if (mm_flags & MM_SSE)
1027 printf(" sse");
1028 if (mm_flags & MM_SSE2)
1029 printf(" sse2");
1030 printf("\n");
1031#endif
1032
1033 if (mm_flags & MM_MMX) {
1034 get_pixels = get_pixels_mmx;
1035 put_pixels_clamped = put_pixels_clamped_mmx;
1036 add_pixels_clamped = add_pixels_clamped_mmx;
1037
1038 pix_abs16x16 = pix_abs16x16_mmx;
1039 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1040 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1041 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1042 av_fdct = fdct_mmx;
1043
1044 put_pixels_tab[0] = put_pixels_mmx;
1045 put_pixels_tab[1] = put_pixels_x2_mmx;
1046 put_pixels_tab[2] = put_pixels_y2_mmx;
1047 put_pixels_tab[3] = put_pixels_xy2_mmx;
1048
1049 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
1050 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1051 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1052 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
1053
1054 avg_pixels_tab[0] = avg_pixels_mmx;
1055 avg_pixels_tab[1] = avg_pixels_x2_mmx;
1056 avg_pixels_tab[2] = avg_pixels_y2_mmx;
1057 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1058
1059 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
1060 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
1061 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
1062 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
1063
1064 sub_pixels_tab[0] = sub_pixels_mmx;
1065 sub_pixels_tab[1] = sub_pixels_x2_mmx;
1066 sub_pixels_tab[2] = sub_pixels_y2_mmx;
1067 sub_pixels_tab[3] = sub_pixels_xy2_mmx;
1068
1069 if (mm_flags & MM_MMXEXT) {
1070 pix_abs16x16 = pix_abs16x16_sse;
1071 }
1072
1073 if (mm_flags & MM_SSE) {
1074 put_pixels_tab[1] = put_pixels_x2_sse;
1075 put_pixels_tab[2] = put_pixels_y2_sse;
1076
1077 avg_pixels_tab[0] = avg_pixels_sse;
1078 avg_pixels_tab[1] = avg_pixels_x2_sse;
1079 avg_pixels_tab[2] = avg_pixels_y2_sse;
1080 avg_pixels_tab[3] = avg_pixels_xy2_sse;
1081
1082 sub_pixels_tab[1] = sub_pixels_x2_sse;
1083 sub_pixels_tab[2] = sub_pixels_y2_sse;
1084 } else if (mm_flags & MM_3DNOW) {
1085 put_pixels_tab[1] = put_pixels_x2_3dnow;
1086 put_pixels_tab[2] = put_pixels_y2_3dnow;
1087
1088 avg_pixels_tab[0] = avg_pixels_3dnow;
1089 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1090 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1091 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
1092
1093 sub_pixels_tab[1] = sub_pixels_x2_3dnow;
1094 sub_pixels_tab[2] = sub_pixels_y2_3dnow;
1095 }
4af7bcc1 1096
8def0299
FB
1097 /* idct */
1098 if (mm_flags & MM_MMXEXT) {
1099 ff_idct = ff_mmxext_idct;
1100 } else {
1101 ff_idct = ff_mmx_idct;
1102 }
d962f6fd
A
1103#ifdef SIMPLE_IDCT
1104// ff_idct = simple_idct;
1105 ff_idct = simple_idct_mmx;
1106#endif
de6d9b64 1107 }
d6a4c0b1
ZK
1108
1109#if 0
1110 // for speed testing
1111 get_pixels = just_return;
1112 put_pixels_clamped = just_return;
1113 add_pixels_clamped = just_return;
1114
1115 pix_abs16x16 = just_return;
1116 pix_abs16x16_x2 = just_return;
1117 pix_abs16x16_y2 = just_return;
1118 pix_abs16x16_xy2 = just_return;
1119
1120 put_pixels_tab[0] = just_return;
1121 put_pixels_tab[1] = just_return;
1122 put_pixels_tab[2] = just_return;
1123 put_pixels_tab[3] = just_return;
1124
1125 put_no_rnd_pixels_tab[0] = just_return;
1126 put_no_rnd_pixels_tab[1] = just_return;
1127 put_no_rnd_pixels_tab[2] = just_return;
1128 put_no_rnd_pixels_tab[3] = just_return;
1129
1130 avg_pixels_tab[0] = just_return;
1131 avg_pixels_tab[1] = just_return;
1132 avg_pixels_tab[2] = just_return;
1133 avg_pixels_tab[3] = just_return;
1134
1135 avg_no_rnd_pixels_tab[0] = just_return;
1136 avg_no_rnd_pixels_tab[1] = just_return;
1137 avg_no_rnd_pixels_tab[2] = just_return;
1138 avg_no_rnd_pixels_tab[3] = just_return;
1139
1140 sub_pixels_tab[0] = just_return;
1141 sub_pixels_tab[1] = just_return;
1142 sub_pixels_tab[2] = just_return;
1143 sub_pixels_tab[3] = just_return;
1144
1145 //av_fdct = just_return;
1146 //ff_idct = just_return;
1147#endif
de6d9b64 1148}