* solaris does not support -q
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
de6d9b64 47
8def0299
FB
48/* external functions, from idct_mmx.c */
49void ff_mmx_idct(DCTELEM *block);
50void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 51
de6d9b64 52/* pixel operations */
ba6802de
MN
/* four packed 16-bit words, each holding 1 (mm_wone) or 2 (mm_wtwo) */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };

/* emit an 8-byte alignment directive at the point of use */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* clear MMX register regd by xor-ing it with itself */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#ifndef PIC
/* load the packed-word constants 1 / 2 from memory into regd */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1 (all bits set); shifting every word right by 15 leaves 1
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

// same as MOVQ_WONE, then each word is shifted left by one to give 2
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)
#endif
78
de6d9b64
FB
79/***********************************/
80/* 3Dnow specific */
81
82#define DEF(x) x ## _3dnow
83/* for Athlons PAVGUSB is prefered */
84#define PAVGB "pavgusb"
85
86#include "dsputil_mmx_avg.h"
87
88#undef DEF
89#undef PAVGB
90
91/***********************************/
92/* MMX2 specific */
93
94#define DEF(x) x ## _sse
95
96/* Introduced only in MMX2 set */
97#define PAVGB "pavgb"
98
99#include "dsputil_mmx_avg.h"
100
101#undef DEF
102#undef PAVGB
103
104/***********************************/
105/* standard MMX */
106
107static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
108{
109 DCTELEM *p;
110 const UINT8 *pix;
111 int i;
112
113 /* read the pixels */
114 p = block;
115 pix = pixels;
d6a4c0b1 116 MOVQ_ZERO(mm7);
de6d9b64
FB
117 for(i=0;i<4;i++) {
118 __asm __volatile(
119 "movq %1, %%mm0\n\t"
120 "movq %2, %%mm1\n\t"
121 "movq %%mm0, %%mm2\n\t"
122 "movq %%mm1, %%mm3\n\t"
123 "punpcklbw %%mm7, %%mm0\n\t"
124 "punpckhbw %%mm7, %%mm2\n\t"
125 "punpcklbw %%mm7, %%mm1\n\t"
126 "punpckhbw %%mm7, %%mm3\n\t"
127 "movq %%mm0, %0\n\t"
128 "movq %%mm2, 8%0\n\t"
129 "movq %%mm1, 16%0\n\t"
130 "movq %%mm3, 24%0\n\t"
131 :"=m"(*p)
132 :"m"(*pix), "m"(*(pix+line_size))
133 :"memory");
134 pix += line_size*2;
135 p += 16;
136 }
de6d9b64
FB
137}
138
139static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
140{
141 const DCTELEM *p;
142 UINT8 *pix;
de6d9b64
FB
143
144 /* read the pixels */
145 p = block;
146 pix = pixels;
d6a4c0b1 147 /* unrolled loop */
de6d9b64 148 __asm __volatile(
a822a479
NK
149 "movq %3, %%mm0\n\t"
150 "movq 8%3, %%mm1\n\t"
151 "movq 16%3, %%mm2\n\t"
152 "movq 24%3, %%mm3\n\t"
153 "movq 32%3, %%mm4\n\t"
154 "movq 40%3, %%mm5\n\t"
155 "movq 48%3, %%mm6\n\t"
156 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
157 "packuswb %%mm1, %%mm0\n\t"
158 "packuswb %%mm3, %%mm2\n\t"
159 "packuswb %%mm5, %%mm4\n\t"
160 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
161 "movq %%mm0, (%0)\n\t"
162 "movq %%mm2, (%0, %1)\n\t"
163 "movq %%mm4, (%0, %1, 2)\n\t"
164 "movq %%mm6, (%0, %2)\n\t"
165 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
166 :"memory");
167 pix += line_size*4;
168 p += 32;
d6a4c0b1
ZK
169
170 // if here would be an exact copy of the code above
171 // compiler would generate some very strange code
172 // thus using "r"
173 __asm __volatile(
174 "movq (%3), %%mm0\n\t"
175 "movq 8(%3), %%mm1\n\t"
176 "movq 16(%3), %%mm2\n\t"
177 "movq 24(%3), %%mm3\n\t"
178 "movq 32(%3), %%mm4\n\t"
179 "movq 40(%3), %%mm5\n\t"
180 "movq 48(%3), %%mm6\n\t"
181 "movq 56(%3), %%mm7\n\t"
182 "packuswb %%mm1, %%mm0\n\t"
183 "packuswb %%mm3, %%mm2\n\t"
184 "packuswb %%mm5, %%mm4\n\t"
185 "packuswb %%mm7, %%mm6\n\t"
186 "movq %%mm0, (%0)\n\t"
187 "movq %%mm2, (%0, %1)\n\t"
188 "movq %%mm4, (%0, %1, 2)\n\t"
189 "movq %%mm6, (%0, %2)\n\t"
190 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
191 :"memory");
de6d9b64
FB
192}
193
194static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
195{
196 const DCTELEM *p;
197 UINT8 *pix;
198 int i;
199
200 /* read the pixels */
201 p = block;
202 pix = pixels;
d6a4c0b1
ZK
203 MOVQ_ZERO(mm7);
204 i = 4;
205 while (i) {
de6d9b64
FB
206 __asm __volatile(
207 "movq %2, %%mm0\n\t"
208 "movq 8%2, %%mm1\n\t"
209 "movq 16%2, %%mm2\n\t"
210 "movq 24%2, %%mm3\n\t"
211 "movq %0, %%mm4\n\t"
212 "movq %1, %%mm6\n\t"
213 "movq %%mm4, %%mm5\n\t"
214 "punpcklbw %%mm7, %%mm4\n\t"
215 "punpckhbw %%mm7, %%mm5\n\t"
216 "paddsw %%mm4, %%mm0\n\t"
217 "paddsw %%mm5, %%mm1\n\t"
218 "movq %%mm6, %%mm5\n\t"
219 "punpcklbw %%mm7, %%mm6\n\t"
220 "punpckhbw %%mm7, %%mm5\n\t"
221 "paddsw %%mm6, %%mm2\n\t"
222 "paddsw %%mm5, %%mm3\n\t"
223 "packuswb %%mm1, %%mm0\n\t"
224 "packuswb %%mm3, %%mm2\n\t"
225 "movq %%mm0, %0\n\t"
226 "movq %%mm2, %1\n\t"
a822a479 227 :"+m"(*pix), "+m"(*(pix+line_size))
de6d9b64
FB
228 :"m"(*p)
229 :"memory");
230 pix += line_size*2;
231 p += 16;
d6a4c0b1
ZK
232 i--;
233 };
de6d9b64
FB
234}
235
236static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
237{
d6a4c0b1 238 int hh;
de6d9b64
FB
239 UINT8 *p;
240 const UINT8 *pix;
d6a4c0b1 241
de6d9b64 242 p = block;
d6a4c0b1
ZK
243 pix = pixels; // 2s
244#if 0
245 do {
246 __asm __volatile(
247 "movq %1, %%mm0\n\t"
248 "movq %%mm0, %0\n\t"
249 :"=m"(*p)
250 :"m"(*pix)
251 :"memory");
252 pix += line_size;
253 p += line_size;
254 } while (--h);
255#else
256 // this optimized code is not very usefull
257 // the above loop is definitely faster
258 // at least on Celeron 500MHz
259 hh = h & 3;
260 while (hh) {
261 __asm __volatile(
262 "movq %1, %%mm0\n\t"
263 "movq %%mm0, %0\n\t"
264 :"=m"(*p)
265 :"m"(*pix)
266 :"memory");
267 pix += line_size;
268 p += line_size;
269 hh--;
270 }
de6d9b64 271 hh=h>>2;
d6a4c0b1 272 while (hh) {
de6d9b64 273 __asm __volatile(
a822a479
NK
274 "movq (%1), %%mm0 \n\t"
275 "movq (%1, %2), %%mm1 \n\t"
276 "movq (%1, %2, 2), %%mm2 \n\t"
277 "movq (%1, %3), %%mm3 \n\t"
278 "movq %%mm0, (%0) \n\t"
279 "movq %%mm1, (%0, %2) \n\t"
280 "movq %%mm2, (%0, %2, 2) \n\t"
281 "movq %%mm3, (%0, %3) \n\t"
282 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
de6d9b64 283 :"memory");
d6a4c0b1
ZK
284 pix += line_size*4;
285 p += line_size*4;
286 hh--;
de6d9b64 287 }
d6a4c0b1 288#endif
de6d9b64
FB
289}
290
291static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
292{
293 UINT8 *p;
294 const UINT8 *pix;
295 p = block;
296 pix = pixels;
d6a4c0b1
ZK
297 MOVQ_ZERO(mm7);
298 MOVQ_WONE(mm4);
299 JUMPALIGN();
de6d9b64
FB
300 do {
301 __asm __volatile(
302 "movq %1, %%mm0\n\t"
303 "movq 1%1, %%mm1\n\t"
304 "movq %%mm0, %%mm2\n\t"
305 "movq %%mm1, %%mm3\n\t"
306 "punpcklbw %%mm7, %%mm0\n\t"
307 "punpcklbw %%mm7, %%mm1\n\t"
308 "punpckhbw %%mm7, %%mm2\n\t"
309 "punpckhbw %%mm7, %%mm3\n\t"
310 "paddusw %%mm1, %%mm0\n\t"
311 "paddusw %%mm3, %%mm2\n\t"
312 "paddusw %%mm4, %%mm0\n\t"
313 "paddusw %%mm4, %%mm2\n\t"
314 "psrlw $1, %%mm0\n\t"
315 "psrlw $1, %%mm2\n\t"
316 "packuswb %%mm2, %%mm0\n\t"
317 "movq %%mm0, %0\n\t"
318 :"=m"(*p)
319 :"m"(*pix)
320 :"memory");
321 pix += line_size; p += line_size;
322 } while (--h);
de6d9b64
FB
323}
324
325static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
326{
327 UINT8 *p;
328 const UINT8 *pix;
329 p = block;
330 pix = pixels;
d6a4c0b1
ZK
331 MOVQ_ZERO(mm7);
332 MOVQ_WONE(mm4);
333 JUMPALIGN();
de6d9b64
FB
334 do {
335 __asm __volatile(
336 "movq %1, %%mm0\n\t"
337 "movq %2, %%mm1\n\t"
338 "movq %%mm0, %%mm2\n\t"
339 "movq %%mm1, %%mm3\n\t"
340 "punpcklbw %%mm7, %%mm0\n\t"
341 "punpcklbw %%mm7, %%mm1\n\t"
342 "punpckhbw %%mm7, %%mm2\n\t"
343 "punpckhbw %%mm7, %%mm3\n\t"
344 "paddusw %%mm1, %%mm0\n\t"
345 "paddusw %%mm3, %%mm2\n\t"
346 "paddusw %%mm4, %%mm0\n\t"
347 "paddusw %%mm4, %%mm2\n\t"
348 "psrlw $1, %%mm0\n\t"
349 "psrlw $1, %%mm2\n\t"
350 "packuswb %%mm2, %%mm0\n\t"
351 "movq %%mm0, %0\n\t"
352 :"=m"(*p)
353 :"m"(*pix),
354 "m"(*(pix+line_size))
355 :"memory");
356 pix += line_size;
357 p += line_size;
358 } while (--h);
de6d9b64
FB
359}
360
361static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
362{
363 UINT8 *p;
364 const UINT8 *pix;
365 p = block;
d6a4c0b1
ZK
366 pix = pixels; // 1s
367 MOVQ_ZERO(mm7);
368 MOVQ_WTWO(mm6);
369 JUMPALIGN();
de6d9b64
FB
370 do {
371 __asm __volatile(
372 "movq %1, %%mm0\n\t"
373 "movq %2, %%mm1\n\t"
374 "movq 1%1, %%mm4\n\t"
375 "movq 1%2, %%mm5\n\t"
376 "movq %%mm0, %%mm2\n\t"
377 "movq %%mm1, %%mm3\n\t"
378 "punpcklbw %%mm7, %%mm0\n\t"
379 "punpcklbw %%mm7, %%mm1\n\t"
380 "punpckhbw %%mm7, %%mm2\n\t"
381 "punpckhbw %%mm7, %%mm3\n\t"
382 "paddusw %%mm1, %%mm0\n\t"
383 "paddusw %%mm3, %%mm2\n\t"
384 "movq %%mm4, %%mm1\n\t"
385 "movq %%mm5, %%mm3\n\t"
386 "punpcklbw %%mm7, %%mm4\n\t"
387 "punpcklbw %%mm7, %%mm5\n\t"
388 "punpckhbw %%mm7, %%mm1\n\t"
389 "punpckhbw %%mm7, %%mm3\n\t"
390 "paddusw %%mm5, %%mm4\n\t"
391 "paddusw %%mm3, %%mm1\n\t"
392 "paddusw %%mm6, %%mm4\n\t"
393 "paddusw %%mm6, %%mm1\n\t"
394 "paddusw %%mm4, %%mm0\n\t"
395 "paddusw %%mm1, %%mm2\n\t"
396 "psrlw $2, %%mm0\n\t"
397 "psrlw $2, %%mm2\n\t"
398 "packuswb %%mm2, %%mm0\n\t"
399 "movq %%mm0, %0\n\t"
400 :"=m"(*p)
401 :"m"(*pix),
402 "m"(*(pix+line_size))
403 :"memory");
404 pix += line_size;
405 p += line_size;
406 } while(--h);
de6d9b64
FB
407}
408
409static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
410{
411 UINT8 *p;
412 const UINT8 *pix;
413 p = block;
414 pix = pixels;
d6a4c0b1 415 MOVQ_ZERO(mm7);
de6d9b64
FB
416 do {
417 __asm __volatile(
418 "movq %1, %%mm0\n\t"
419 "movq 1%1, %%mm1\n\t"
420 "movq %%mm0, %%mm2\n\t"
421 "movq %%mm1, %%mm3\n\t"
422 "punpcklbw %%mm7, %%mm0\n\t"
423 "punpcklbw %%mm7, %%mm1\n\t"
424 "punpckhbw %%mm7, %%mm2\n\t"
425 "punpckhbw %%mm7, %%mm3\n\t"
426 "paddusw %%mm1, %%mm0\n\t"
427 "paddusw %%mm3, %%mm2\n\t"
428 "psrlw $1, %%mm0\n\t"
429 "psrlw $1, %%mm2\n\t"
430 "packuswb %%mm2, %%mm0\n\t"
431 "movq %%mm0, %0\n\t"
432 :"=m"(*p)
433 :"m"(*pix)
434 :"memory");
435 pix += line_size;
436 p += line_size;
437 } while (--h);
de6d9b64
FB
438}
439
440static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
441{
442 UINT8 *p;
443 const UINT8 *pix;
444 p = block;
445 pix = pixels;
d6a4c0b1
ZK
446 MOVQ_ZERO(mm7);
447 JUMPALIGN();
de6d9b64
FB
448 do {
449 __asm __volatile(
450 "movq %1, %%mm0\n\t"
451 "movq %2, %%mm1\n\t"
452 "movq %%mm0, %%mm2\n\t"
453 "movq %%mm1, %%mm3\n\t"
454 "punpcklbw %%mm7, %%mm0\n\t"
455 "punpcklbw %%mm7, %%mm1\n\t"
456 "punpckhbw %%mm7, %%mm2\n\t"
457 "punpckhbw %%mm7, %%mm3\n\t"
458 "paddusw %%mm1, %%mm0\n\t"
459 "paddusw %%mm3, %%mm2\n\t"
460 "psrlw $1, %%mm0\n\t"
461 "psrlw $1, %%mm2\n\t"
462 "packuswb %%mm2, %%mm0\n\t"
463 "movq %%mm0, %0\n\t"
464 :"=m"(*p)
465 :"m"(*pix),
466 "m"(*(pix+line_size))
467 :"memory");
468 pix += line_size;
469 p += line_size;
470 } while(--h);
de6d9b64
FB
471}
472
473static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
474{
475 UINT8 *p;
476 const UINT8 *pix;
477 p = block;
478 pix = pixels;
d6a4c0b1
ZK
479 MOVQ_ZERO(mm7);
480 MOVQ_WONE(mm6);
481 JUMPALIGN();
de6d9b64
FB
482 do {
483 __asm __volatile(
484 "movq %1, %%mm0\n\t"
485 "movq %2, %%mm1\n\t"
486 "movq 1%1, %%mm4\n\t"
487 "movq 1%2, %%mm5\n\t"
488 "movq %%mm0, %%mm2\n\t"
489 "movq %%mm1, %%mm3\n\t"
490 "punpcklbw %%mm7, %%mm0\n\t"
491 "punpcklbw %%mm7, %%mm1\n\t"
492 "punpckhbw %%mm7, %%mm2\n\t"
493 "punpckhbw %%mm7, %%mm3\n\t"
494 "paddusw %%mm1, %%mm0\n\t"
495 "paddusw %%mm3, %%mm2\n\t"
496 "movq %%mm4, %%mm1\n\t"
497 "movq %%mm5, %%mm3\n\t"
498 "punpcklbw %%mm7, %%mm4\n\t"
499 "punpcklbw %%mm7, %%mm5\n\t"
500 "punpckhbw %%mm7, %%mm1\n\t"
501 "punpckhbw %%mm7, %%mm3\n\t"
502 "paddusw %%mm5, %%mm4\n\t"
503 "paddusw %%mm3, %%mm1\n\t"
504 "paddusw %%mm6, %%mm4\n\t"
505 "paddusw %%mm6, %%mm1\n\t"
506 "paddusw %%mm4, %%mm0\n\t"
507 "paddusw %%mm1, %%mm2\n\t"
508 "psrlw $2, %%mm0\n\t"
509 "psrlw $2, %%mm2\n\t"
510 "packuswb %%mm2, %%mm0\n\t"
511 "movq %%mm0, %0\n\t"
512 :"=m"(*p)
513 :"m"(*pix),
514 "m"(*(pix+line_size))
515 :"memory");
516 pix += line_size;
517 p += line_size;
518 } while(--h);
de6d9b64
FB
519}
520
521static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
522{
523 UINT8 *p;
524 const UINT8 *pix;
525 p = block;
526 pix = pixels;
d6a4c0b1
ZK
527 MOVQ_ZERO(mm7);
528 MOVQ_WONE(mm6);
529 JUMPALIGN();
de6d9b64
FB
530 do {
531 __asm __volatile(
532 "movq %0, %%mm0\n\t"
533 "movq %1, %%mm1\n\t"
534 "movq %%mm0, %%mm2\n\t"
535 "movq %%mm1, %%mm3\n\t"
536 "punpcklbw %%mm7, %%mm0\n\t"
537 "punpcklbw %%mm7, %%mm1\n\t"
538 "punpckhbw %%mm7, %%mm2\n\t"
539 "punpckhbw %%mm7, %%mm3\n\t"
540 "paddusw %%mm1, %%mm0\n\t"
541 "paddusw %%mm3, %%mm2\n\t"
542 "paddusw %%mm6, %%mm0\n\t"
543 "paddusw %%mm6, %%mm2\n\t"
544 "psrlw $1, %%mm0\n\t"
545 "psrlw $1, %%mm2\n\t"
546 "packuswb %%mm2, %%mm0\n\t"
547 "movq %%mm0, %0\n\t"
a822a479 548 :"+m"(*p)
de6d9b64
FB
549 :"m"(*pix)
550 :"memory");
551 pix += line_size;
552 p += line_size;
553 }
554 while (--h);
de6d9b64
FB
555}
556
557static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
558{
559 UINT8 *p;
560 const UINT8 *pix;
561 p = block;
562 pix = pixels;
d6a4c0b1
ZK
563 MOVQ_ZERO(mm7);
564 MOVQ_WONE(mm6);
565 JUMPALIGN();
de6d9b64
FB
566 do {
567 __asm __volatile(
568 "movq %1, %%mm1\n\t"
569 "movq %0, %%mm0\n\t"
570 "movq 1%1, %%mm4\n\t"
571 "movq %%mm0, %%mm2\n\t"
572 "movq %%mm1, %%mm3\n\t"
573 "movq %%mm4, %%mm5\n\t"
574 "punpcklbw %%mm7, %%mm1\n\t"
575 "punpckhbw %%mm7, %%mm3\n\t"
576 "punpcklbw %%mm7, %%mm4\n\t"
577 "punpckhbw %%mm7, %%mm5\n\t"
578 "punpcklbw %%mm7, %%mm0\n\t"
579 "punpckhbw %%mm7, %%mm2\n\t"
580 "paddusw %%mm4, %%mm1\n\t"
581 "paddusw %%mm5, %%mm3\n\t"
582 "paddusw %%mm6, %%mm1\n\t"
583 "paddusw %%mm6, %%mm3\n\t"
584 "psrlw $1, %%mm1\n\t"
585 "psrlw $1, %%mm3\n\t"
586 "paddusw %%mm6, %%mm0\n\t"
587 "paddusw %%mm6, %%mm2\n\t"
588 "paddusw %%mm1, %%mm0\n\t"
589 "paddusw %%mm3, %%mm2\n\t"
590 "psrlw $1, %%mm0\n\t"
591 "psrlw $1, %%mm2\n\t"
592 "packuswb %%mm2, %%mm0\n\t"
593 "movq %%mm0, %0\n\t"
a822a479 594 :"+m"(*p)
de6d9b64
FB
595 :"m"(*pix)
596 :"memory");
597 pix += line_size;
598 p += line_size;
599 } while (--h);
de6d9b64
FB
600}
601
602static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
603{
604 UINT8 *p;
605 const UINT8 *pix;
606 p = block;
607 pix = pixels;
d6a4c0b1
ZK
608 MOVQ_ZERO(mm7);
609 MOVQ_WONE(mm6);
610 JUMPALIGN();
de6d9b64
FB
611 do {
612 __asm __volatile(
613 "movq %1, %%mm1\n\t"
614 "movq %0, %%mm0\n\t"
615 "movq %2, %%mm4\n\t"
616 "movq %%mm0, %%mm2\n\t"
617 "movq %%mm1, %%mm3\n\t"
618 "movq %%mm4, %%mm5\n\t"
619 "punpcklbw %%mm7, %%mm1\n\t"
620 "punpckhbw %%mm7, %%mm3\n\t"
621 "punpcklbw %%mm7, %%mm4\n\t"
622 "punpckhbw %%mm7, %%mm5\n\t"
623 "punpcklbw %%mm7, %%mm0\n\t"
624 "punpckhbw %%mm7, %%mm2\n\t"
625 "paddusw %%mm4, %%mm1\n\t"
626 "paddusw %%mm5, %%mm3\n\t"
627 "paddusw %%mm6, %%mm1\n\t"
628 "paddusw %%mm6, %%mm3\n\t"
629 "psrlw $1, %%mm1\n\t"
630 "psrlw $1, %%mm3\n\t"
631 "paddusw %%mm6, %%mm0\n\t"
632 "paddusw %%mm6, %%mm2\n\t"
633 "paddusw %%mm1, %%mm0\n\t"
634 "paddusw %%mm3, %%mm2\n\t"
635 "psrlw $1, %%mm0\n\t"
636 "psrlw $1, %%mm2\n\t"
637 "packuswb %%mm2, %%mm0\n\t"
638 "movq %%mm0, %0\n\t"
a822a479 639 :"+m"(*p)
de6d9b64
FB
640 :"m"(*pix), "m"(*(pix+line_size))
641 :"memory");
642 pix += line_size;
643 p += line_size ;
644 } while(--h);
de6d9b64
FB
645}
646
647static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
648{
649 UINT8 *p;
650 const UINT8 *pix;
651 p = block;
652 pix = pixels;
d6a4c0b1
ZK
653 MOVQ_ZERO(mm7);
654 // this doesn't seem to be used offten - so
655 // the inside usage of mm_wone is not optimized
656 MOVQ_WTWO(mm6);
de6d9b64
FB
657 do {
658 __asm __volatile(
659 "movq %1, %%mm0\n\t"
660 "movq %2, %%mm1\n\t"
661 "movq 1%1, %%mm4\n\t"
662 "movq 1%2, %%mm5\n\t"
663 "movq %%mm0, %%mm2\n\t"
664 "movq %%mm1, %%mm3\n\t"
665 "punpcklbw %%mm7, %%mm0\n\t"
666 "punpcklbw %%mm7, %%mm1\n\t"
667 "punpckhbw %%mm7, %%mm2\n\t"
668 "punpckhbw %%mm7, %%mm3\n\t"
669 "paddusw %%mm1, %%mm0\n\t"
670 "paddusw %%mm3, %%mm2\n\t"
671 "movq %%mm4, %%mm1\n\t"
672 "movq %%mm5, %%mm3\n\t"
673 "punpcklbw %%mm7, %%mm4\n\t"
674 "punpcklbw %%mm7, %%mm5\n\t"
675 "punpckhbw %%mm7, %%mm1\n\t"
676 "punpckhbw %%mm7, %%mm3\n\t"
677 "paddusw %%mm5, %%mm4\n\t"
678 "paddusw %%mm3, %%mm1\n\t"
679 "paddusw %%mm6, %%mm4\n\t"
680 "paddusw %%mm6, %%mm1\n\t"
681 "paddusw %%mm4, %%mm0\n\t"
682 "paddusw %%mm1, %%mm2\n\t"
683 "movq %3, %%mm5\n\t"
684 "psrlw $2, %%mm0\n\t"
685 "movq %0, %%mm1\n\t"
686 "psrlw $2, %%mm2\n\t"
687 "movq %%mm1, %%mm3\n\t"
688 "punpcklbw %%mm7, %%mm1\n\t"
689 "punpckhbw %%mm7, %%mm3\n\t"
690 "paddusw %%mm1, %%mm0\n\t"
691 "paddusw %%mm3, %%mm2\n\t"
692 "paddusw %%mm5, %%mm0\n\t"
693 "paddusw %%mm5, %%mm2\n\t"
694 "psrlw $1, %%mm0\n\t"
695 "psrlw $1, %%mm2\n\t"
696 "packuswb %%mm2, %%mm0\n\t"
697 "movq %%mm0, %0\n\t"
a822a479 698 :"+m"(*p)
de6d9b64 699 :"m"(*pix),
a9b3f630 700 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
701 :"memory");
702 pix += line_size;
703 p += line_size ;
704 } while(--h);
de6d9b64
FB
705}
706
707static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
708{
709 UINT8 *p;
710 const UINT8 *pix;
711 p = block;
712 pix = pixels;
d6a4c0b1 713 MOVQ_ZERO(mm7);
de6d9b64
FB
714 do {
715 __asm __volatile(
716 "movq %1, %%mm0\n\t"
717 "movq %0, %%mm1\n\t"
718 "movq %%mm0, %%mm2\n\t"
719 "movq %%mm1, %%mm3\n\t"
720 "punpcklbw %%mm7, %%mm0\n\t"
721 "punpcklbw %%mm7, %%mm1\n\t"
722 "punpckhbw %%mm7, %%mm2\n\t"
723 "punpckhbw %%mm7, %%mm3\n\t"
724 "paddusw %%mm1, %%mm0\n\t"
725 "paddusw %%mm3, %%mm2\n\t"
726 "psrlw $1, %%mm0\n\t"
727 "psrlw $1, %%mm2\n\t"
728 "packuswb %%mm2, %%mm0\n\t"
729 "movq %%mm0, %0\n\t"
a822a479 730 :"+m"(*p)
de6d9b64
FB
731 :"m"(*pix)
732 :"memory");
733 pix += line_size;
734 p += line_size ;
735 } while (--h);
de6d9b64
FB
736}
737
738static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
739{
740 UINT8 *p;
741 const UINT8 *pix;
742 p = block;
743 pix = pixels;
d6a4c0b1 744 MOVQ_ZERO(mm7);
de6d9b64
FB
745 do {
746 __asm __volatile(
747 "movq %1, %%mm0\n\t"
748 "movq 1%1, %%mm1\n\t"
749 "movq %0, %%mm4\n\t"
750 "movq %%mm0, %%mm2\n\t"
751 "movq %%mm1, %%mm3\n\t"
752 "movq %%mm4, %%mm5\n\t"
753 "punpcklbw %%mm7, %%mm0\n\t"
754 "punpcklbw %%mm7, %%mm1\n\t"
755 "punpckhbw %%mm7, %%mm2\n\t"
756 "punpckhbw %%mm7, %%mm3\n\t"
757 "punpcklbw %%mm7, %%mm4\n\t"
758 "punpckhbw %%mm7, %%mm5\n\t"
759 "paddusw %%mm1, %%mm0\n\t"
760 "paddusw %%mm3, %%mm2\n\t"
761 "psrlw $1, %%mm0\n\t"
762 "psrlw $1, %%mm2\n\t"
763 "paddusw %%mm4, %%mm0\n\t"
764 "paddusw %%mm5, %%mm2\n\t"
765 "psrlw $1, %%mm0\n\t"
766 "psrlw $1, %%mm2\n\t"
767 "packuswb %%mm2, %%mm0\n\t"
768 "movq %%mm0, %0\n\t"
a822a479 769 :"+m"(*p)
de6d9b64
FB
770 :"m"(*pix)
771 :"memory");
772 pix += line_size;
773 p += line_size;
774 } while (--h);
de6d9b64
FB
775}
776
777static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
778{
779 UINT8 *p;
780 const UINT8 *pix;
781 p = block;
782 pix = pixels;
d6a4c0b1 783 MOVQ_ZERO(mm7);
de6d9b64
FB
784 do {
785 __asm __volatile(
786 "movq %1, %%mm0\n\t"
787 "movq %2, %%mm1\n\t"
788 "movq %0, %%mm4\n\t"
789 "movq %%mm0, %%mm2\n\t"
790 "movq %%mm1, %%mm3\n\t"
791 "movq %%mm4, %%mm5\n\t"
792 "punpcklbw %%mm7, %%mm0\n\t"
793 "punpcklbw %%mm7, %%mm1\n\t"
794 "punpckhbw %%mm7, %%mm2\n\t"
795 "punpckhbw %%mm7, %%mm3\n\t"
796 "punpcklbw %%mm7, %%mm4\n\t"
797 "punpckhbw %%mm7, %%mm5\n\t"
798 "paddusw %%mm1, %%mm0\n\t"
799 "paddusw %%mm3, %%mm2\n\t"
800 "psrlw $1, %%mm0\n\t"
801 "psrlw $1, %%mm2\n\t"
802 "paddusw %%mm4, %%mm0\n\t"
803 "paddusw %%mm5, %%mm2\n\t"
804 "psrlw $1, %%mm0\n\t"
805 "psrlw $1, %%mm2\n\t"
806 "packuswb %%mm2, %%mm0\n\t"
807 "movq %%mm0, %0\n\t"
a822a479 808 :"+m"(*p)
de6d9b64
FB
809 :"m"(*pix), "m"(*(pix+line_size))
810 :"memory");
811 pix += line_size;
812 p += line_size ;
813 } while(--h);
de6d9b64
FB
814}
815
816static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
817{
818 UINT8 *p;
819 const UINT8 *pix;
820 p = block;
821 pix = pixels;
d6a4c0b1
ZK
822 MOVQ_ZERO(mm7);
823 MOVQ_WONE(mm6);
824 JUMPALIGN();
de6d9b64
FB
825 do {
826 __asm __volatile(
827 "movq %1, %%mm0\n\t"
828 "movq %2, %%mm1\n\t"
829 "movq 1%1, %%mm4\n\t"
830 "movq 1%2, %%mm5\n\t"
831 "movq %%mm0, %%mm2\n\t"
832 "movq %%mm1, %%mm3\n\t"
833 "punpcklbw %%mm7, %%mm0\n\t"
834 "punpcklbw %%mm7, %%mm1\n\t"
835 "punpckhbw %%mm7, %%mm2\n\t"
836 "punpckhbw %%mm7, %%mm3\n\t"
837 "paddusw %%mm1, %%mm0\n\t"
838 "paddusw %%mm3, %%mm2\n\t"
839 "movq %%mm4, %%mm1\n\t"
840 "movq %%mm5, %%mm3\n\t"
841 "punpcklbw %%mm7, %%mm4\n\t"
842 "punpcklbw %%mm7, %%mm5\n\t"
843 "punpckhbw %%mm7, %%mm1\n\t"
844 "punpckhbw %%mm7, %%mm3\n\t"
845 "paddusw %%mm5, %%mm4\n\t"
846 "paddusw %%mm3, %%mm1\n\t"
847 "paddusw %%mm6, %%mm4\n\t"
848 "paddusw %%mm6, %%mm1\n\t"
849 "paddusw %%mm4, %%mm0\n\t"
850 "paddusw %%mm1, %%mm2\n\t"
851 "movq %0, %%mm1\n\t"
852 "psrlw $2, %%mm0\n\t"
853 "movq %%mm1, %%mm3\n\t"
854 "psrlw $2, %%mm2\n\t"
855 "punpcklbw %%mm7, %%mm1\n\t"
856 "punpckhbw %%mm7, %%mm3\n\t"
857 "paddusw %%mm1, %%mm0\n\t"
858 "paddusw %%mm3, %%mm2\n\t"
859 "psrlw $1, %%mm0\n\t"
860 "psrlw $1, %%mm2\n\t"
861 "packuswb %%mm2, %%mm0\n\t"
862 "movq %%mm0, %0\n\t"
a822a479 863 :"+m"(*p)
de6d9b64
FB
864 :"m"(*pix),
865 "m"(*(pix+line_size))
866 :"memory");
867 pix += line_size;
868 p += line_size;
869 } while(--h);
de6d9b64
FB
870}
871
872static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
873{
874 DCTELEM *p;
875 const UINT8 *pix;
876 p = block;
877 pix = pixels;
d6a4c0b1 878 MOVQ_ZERO(mm7);
de6d9b64
FB
879 do {
880 __asm __volatile(
881 "movq %0, %%mm0\n\t"
882 "movq %1, %%mm2\n\t"
883 "movq 8%0, %%mm1\n\t"
884 "movq %%mm2, %%mm3\n\t"
885 "punpcklbw %%mm7, %%mm2\n\t"
886 "punpckhbw %%mm7, %%mm3\n\t"
887 "psubsw %%mm2, %%mm0\n\t"
888 "psubsw %%mm3, %%mm1\n\t"
889 "movq %%mm0, %0\n\t"
890 "movq %%mm1, 8%0\n\t"
a822a479 891 :"+m"(*p)
de6d9b64
FB
892 :"m"(*pix)
893 :"memory");
894 pix += line_size;
895 p += 8;
896 } while (--h);
de6d9b64
FB
897}
898
899static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
900{
901 DCTELEM *p;
902 const UINT8 *pix;
903 p = block;
904 pix = pixels;
d6a4c0b1
ZK
905 MOVQ_ZERO(mm7);
906 MOVQ_WONE(mm6);
907 JUMPALIGN();
de6d9b64
FB
908 do {
909 __asm __volatile(
910 "movq %0, %%mm0\n\t"
911 "movq %1, %%mm2\n\t"
912 "movq 8%0, %%mm1\n\t"
913 "movq 1%1, %%mm4\n\t"
914 "movq %%mm2, %%mm3\n\t"
915 "movq %%mm4, %%mm5\n\t"
916 "punpcklbw %%mm7, %%mm2\n\t"
917 "punpckhbw %%mm7, %%mm3\n\t"
918 "punpcklbw %%mm7, %%mm4\n\t"
919 "punpckhbw %%mm7, %%mm5\n\t"
920 "paddusw %%mm4, %%mm2\n\t"
921 "paddusw %%mm5, %%mm3\n\t"
922 "paddusw %%mm6, %%mm2\n\t"
923 "paddusw %%mm6, %%mm3\n\t"
924 "psrlw $1, %%mm2\n\t"
925 "psrlw $1, %%mm3\n\t"
926 "psubsw %%mm2, %%mm0\n\t"
927 "psubsw %%mm3, %%mm1\n\t"
928 "movq %%mm0, %0\n\t"
929 "movq %%mm1, 8%0\n\t"
a822a479 930 :"+m"(*p)
de6d9b64
FB
931 :"m"(*pix)
932 :"memory");
933 pix += line_size;
934 p += 8;
935 } while (--h);
de6d9b64
FB
936}
937
938static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
939{
940 DCTELEM *p;
941 const UINT8 *pix;
942 p = block;
943 pix = pixels;
d6a4c0b1
ZK
944 MOVQ_ZERO(mm7);
945 MOVQ_WONE(mm6);
de6d9b64
FB
946 do {
947 __asm __volatile(
948 "movq %0, %%mm0\n\t"
949 "movq %1, %%mm2\n\t"
950 "movq 8%0, %%mm1\n\t"
951 "movq %2, %%mm4\n\t"
952 "movq %%mm2, %%mm3\n\t"
953 "movq %%mm4, %%mm5\n\t"
954 "punpcklbw %%mm7, %%mm2\n\t"
955 "punpckhbw %%mm7, %%mm3\n\t"
956 "punpcklbw %%mm7, %%mm4\n\t"
957 "punpckhbw %%mm7, %%mm5\n\t"
958 "paddusw %%mm4, %%mm2\n\t"
959 "paddusw %%mm5, %%mm3\n\t"
960 "paddusw %%mm6, %%mm2\n\t"
961 "paddusw %%mm6, %%mm3\n\t"
962 "psrlw $1, %%mm2\n\t"
963 "psrlw $1, %%mm3\n\t"
964 "psubsw %%mm2, %%mm0\n\t"
965 "psubsw %%mm3, %%mm1\n\t"
966 "movq %%mm0, %0\n\t"
967 "movq %%mm1, 8%0\n\t"
a822a479 968 :"+m"(*p)
de6d9b64
FB
969 :"m"(*pix), "m"(*(pix+line_size))
970 :"memory");
971 pix += line_size;
972 p += 8;
973 } while (--h);
de6d9b64
FB
974}
975
976static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
977{
978 DCTELEM *p;
979 const UINT8 *pix;
980 p = block;
981 pix = pixels;
d6a4c0b1
ZK
982 MOVQ_ZERO(mm7);
983 MOVQ_WTWO(mm6);
984 JUMPALIGN();
de6d9b64
FB
985 do {
986 __asm __volatile(
987 "movq %1, %%mm0\n\t"
988 "movq %2, %%mm1\n\t"
989 "movq 1%1, %%mm4\n\t"
990 "movq 1%2, %%mm5\n\t"
991 "movq %%mm0, %%mm2\n\t"
992 "movq %%mm1, %%mm3\n\t"
993 "punpcklbw %%mm7, %%mm0\n\t"
994 "punpcklbw %%mm7, %%mm1\n\t"
995 "punpckhbw %%mm7, %%mm2\n\t"
996 "punpckhbw %%mm7, %%mm3\n\t"
997 "paddusw %%mm1, %%mm0\n\t"
998 "paddusw %%mm3, %%mm2\n\t"
999 "movq %%mm4, %%mm1\n\t"
1000 "movq %%mm5, %%mm3\n\t"
1001 "punpcklbw %%mm7, %%mm4\n\t"
1002 "punpcklbw %%mm7, %%mm5\n\t"
1003 "punpckhbw %%mm7, %%mm1\n\t"
1004 "punpckhbw %%mm7, %%mm3\n\t"
1005 "paddusw %%mm5, %%mm4\n\t"
1006 "paddusw %%mm3, %%mm1\n\t"
1007 "paddusw %%mm6, %%mm4\n\t"
1008 "paddusw %%mm6, %%mm1\n\t"
1009 "paddusw %%mm4, %%mm0\n\t"
1010 "paddusw %%mm1, %%mm2\n\t"
1011 "movq %0, %%mm1\n\t"
1012 "movq 8%0, %%mm3\n\t"
1013 "psrlw $2, %%mm0\n\t"
1014 "psrlw $2, %%mm2\n\t"
1015 "psubsw %%mm0, %%mm1\n\t"
1016 "psubsw %%mm2, %%mm3\n\t"
1017 "movq %%mm1, %0\n\t"
1018 "movq %%mm3, 8%0\n\t"
a822a479 1019 :"+m"(*p)
de6d9b64
FB
1020 :"m"(*pix),
1021 "m"(*(pix+line_size))
1022 :"memory");
1023 pix += line_size;
1024 p += 8 ;
1025 } while(--h);
de6d9b64
FB
1026}
1027
649c00c9
MN
1028static void clear_blocks_mmx(DCTELEM *blocks)
1029{
1030 asm volatile(
1031 "pxor %%mm7, %%mm7 \n\t"
1032 "movl $-128*6, %%eax \n\t"
1033 "1: \n\t"
1034 "movq %%mm7, (%0, %%eax) \n\t"
1035 "movq %%mm7, 8(%0, %%eax) \n\t"
1036 "movq %%mm7, 16(%0, %%eax) \n\t"
1037 "movq %%mm7, 24(%0, %%eax) \n\t"
1038 "addl $32, %%eax \n\t"
1039 " js 1b \n\t"
1040 : : "r" (((int)blocks)+128*6)
1041 : "%eax"
1042 );
1043}
1044
d6a4c0b1
ZK
/*
 * Empty stub used by the disabled "speed testing" section of
 * dsputil_init_mmx(), where it is assigned to function pointers of several
 * different signatures; the parameter list is left unspecified for that.
 */
static void just_return() { return; }
1046
de6d9b64
FB
1047void dsputil_init_mmx(void)
1048{
1049 mm_flags = mm_support();
f4470e09
MN
1050#if 1
1051 printf("libavcodec: CPU flags:");
de6d9b64
FB
1052 if (mm_flags & MM_MMX)
1053 printf(" mmx");
1054 if (mm_flags & MM_MMXEXT)
1055 printf(" mmxext");
1056 if (mm_flags & MM_3DNOW)
1057 printf(" 3dnow");
1058 if (mm_flags & MM_SSE)
1059 printf(" sse");
1060 if (mm_flags & MM_SSE2)
1061 printf(" sse2");
1062 printf("\n");
1063#endif
1064
1065 if (mm_flags & MM_MMX) {
1066 get_pixels = get_pixels_mmx;
1067 put_pixels_clamped = put_pixels_clamped_mmx;
1068 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9
MN
1069 clear_blocks= clear_blocks_mmx;
1070
ba6802de
MN
1071 pix_abs16x16 = pix_abs16x16_mmx;
1072 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1073 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 1074 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
1075 pix_abs8x8 = pix_abs8x8_mmx;
1076 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1077 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1078 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
1079 av_fdct = fdct_mmx;
1080
1081 put_pixels_tab[0] = put_pixels_mmx;
1082 put_pixels_tab[1] = put_pixels_x2_mmx;
1083 put_pixels_tab[2] = put_pixels_y2_mmx;
1084 put_pixels_tab[3] = put_pixels_xy2_mmx;
1085
1086 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
1087 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1088 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1089 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
1090
1091 avg_pixels_tab[0] = avg_pixels_mmx;
1092 avg_pixels_tab[1] = avg_pixels_x2_mmx;
1093 avg_pixels_tab[2] = avg_pixels_y2_mmx;
1094 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1095
1096 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
1097 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
1098 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
1099 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
1100
1101 sub_pixels_tab[0] = sub_pixels_mmx;
1102 sub_pixels_tab[1] = sub_pixels_x2_mmx;
1103 sub_pixels_tab[2] = sub_pixels_y2_mmx;
1104 sub_pixels_tab[3] = sub_pixels_xy2_mmx;
1105
1106 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
1107 pix_abs16x16 = pix_abs16x16_mmx2;
1108 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1109 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1110 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
1111
1112 pix_abs8x8 = pix_abs8x8_mmx2;
1113 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1114 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1115 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
1116
de6d9b64
FB
1117 put_pixels_tab[1] = put_pixels_x2_sse;
1118 put_pixels_tab[2] = put_pixels_y2_sse;
1119
1120 avg_pixels_tab[0] = avg_pixels_sse;
1121 avg_pixels_tab[1] = avg_pixels_x2_sse;
1122 avg_pixels_tab[2] = avg_pixels_y2_sse;
1123 avg_pixels_tab[3] = avg_pixels_xy2_sse;
1124
1125 sub_pixels_tab[1] = sub_pixels_x2_sse;
1126 sub_pixels_tab[2] = sub_pixels_y2_sse;
1127 } else if (mm_flags & MM_3DNOW) {
1128 put_pixels_tab[1] = put_pixels_x2_3dnow;
1129 put_pixels_tab[2] = put_pixels_y2_3dnow;
1130
1131 avg_pixels_tab[0] = avg_pixels_3dnow;
1132 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1133 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1134 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
1135
1136 sub_pixels_tab[1] = sub_pixels_x2_3dnow;
1137 sub_pixels_tab[2] = sub_pixels_y2_3dnow;
1138 }
4af7bcc1 1139
8def0299
FB
1140 /* idct */
1141 if (mm_flags & MM_MMXEXT) {
1142 ff_idct = ff_mmxext_idct;
1143 } else {
1144 ff_idct = ff_mmx_idct;
1145 }
d962f6fd
A
1146#ifdef SIMPLE_IDCT
1147// ff_idct = simple_idct;
1148 ff_idct = simple_idct_mmx;
1149#endif
de6d9b64 1150 }
d6a4c0b1
ZK
1151
1152#if 0
1153 // for speed testing
1154 get_pixels = just_return;
1155 put_pixels_clamped = just_return;
1156 add_pixels_clamped = just_return;
1157
1158 pix_abs16x16 = just_return;
1159 pix_abs16x16_x2 = just_return;
1160 pix_abs16x16_y2 = just_return;
1161 pix_abs16x16_xy2 = just_return;
1162
1163 put_pixels_tab[0] = just_return;
1164 put_pixels_tab[1] = just_return;
1165 put_pixels_tab[2] = just_return;
1166 put_pixels_tab[3] = just_return;
1167
1168 put_no_rnd_pixels_tab[0] = just_return;
1169 put_no_rnd_pixels_tab[1] = just_return;
1170 put_no_rnd_pixels_tab[2] = just_return;
1171 put_no_rnd_pixels_tab[3] = just_return;
1172
1173 avg_pixels_tab[0] = just_return;
1174 avg_pixels_tab[1] = just_return;
1175 avg_pixels_tab[2] = just_return;
1176 avg_pixels_tab[3] = just_return;
1177
1178 avg_no_rnd_pixels_tab[0] = just_return;
1179 avg_no_rnd_pixels_tab[1] = just_return;
1180 avg_no_rnd_pixels_tab[2] = just_return;
1181 avg_no_rnd_pixels_tab[3] = just_return;
1182
1183 sub_pixels_tab[0] = just_return;
1184 sub_pixels_tab[1] = just_return;
1185 sub_pixels_tab[2] = just_return;
1186 sub_pixels_tab[3] = just_return;
1187
1188 //av_fdct = just_return;
1189 //ff_idct = just_return;
1190#endif
de6d9b64 1191}