new hopefully faster MC
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
de6d9b64 51/* pixel operations */
ba6802de
MN
/*
 * 64-bit constants holding four packed 16-bit words, each equal to 1
 * (mm_wone) or 2 (mm_wtwo).  They are 8-byte aligned so they can be used
 * directly as movq memory operands.
 */
static const unsigned long long mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
static const unsigned long long mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
/* Alternative array form of the same constants, kept disabled: */
//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
de6d9b64 56
d6a4c0b1
ZK
/* Emit an 8-byte alignment directive at the current code position. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear MMX register `regd` by xor-ing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill `regd` with four 16-bit words
 * equal to 1 / 2.
 */
#ifdef PIC
/*
 * For a shared library the constants are synthesised in registers rather
 * than loaded from memory: pcmpeqd sets every bit (-1), a logical right
 * shift of each word by 15 leaves 1, and a further left shift by 1 gives 2.
 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)
#else
/* Non-PIC build: load the constants straight from mm_wone / mm_wtwo. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#endif
77
de6d9b64
FB
78/***********************************/
79/* 3Dnow specific */
80
81#define DEF(x) x ## _3dnow
82/* for Athlons PAVGUSB is prefered */
83#define PAVGB "pavgusb"
84
85#include "dsputil_mmx_avg.h"
86
87#undef DEF
88#undef PAVGB
89
90/***********************************/
91/* MMX2 specific */
92
93#define DEF(x) x ## _sse
94
95/* Introduced only in MMX2 set */
96#define PAVGB "pavgb"
97
98#include "dsputil_mmx_avg.h"
99
100#undef DEF
101#undef PAVGB
102
103/***********************************/
104/* standard MMX */
105
106static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
107{
108 DCTELEM *p;
109 const UINT8 *pix;
110 int i;
111
112 /* read the pixels */
113 p = block;
114 pix = pixels;
d6a4c0b1 115 MOVQ_ZERO(mm7);
de6d9b64
FB
116 for(i=0;i<4;i++) {
117 __asm __volatile(
118 "movq %1, %%mm0\n\t"
119 "movq %2, %%mm1\n\t"
120 "movq %%mm0, %%mm2\n\t"
121 "movq %%mm1, %%mm3\n\t"
122 "punpcklbw %%mm7, %%mm0\n\t"
123 "punpckhbw %%mm7, %%mm2\n\t"
124 "punpcklbw %%mm7, %%mm1\n\t"
125 "punpckhbw %%mm7, %%mm3\n\t"
126 "movq %%mm0, %0\n\t"
127 "movq %%mm2, 8%0\n\t"
128 "movq %%mm1, 16%0\n\t"
129 "movq %%mm3, 24%0\n\t"
130 :"=m"(*p)
131 :"m"(*pix), "m"(*(pix+line_size))
132 :"memory");
133 pix += line_size*2;
134 p += 16;
135 }
de6d9b64
FB
136}
137
9dbcbd92
MN
138static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
139{
140 asm volatile(
141 ".balign 16 \n\t"
142 "movl $-128, %%eax \n\t"
143 "1: \n\t"
144 "movq (%0), %%mm0 \n\t"
145 "movq (%1), %%mm2 \n\t"
146 "movq %%mm0, %%mm1 \n\t"
147 "movq %%mm2, %%mm3 \n\t"
148 "punpcklbw %%mm7, %%mm0 \n\t"
149 "punpckhbw %%mm7, %%mm1 \n\t"
150 "punpcklbw %%mm7, %%mm2 \n\t"
151 "punpckhbw %%mm7, %%mm3 \n\t"
152 "psubw %%mm2, %%mm0 \n\t"
153 "psubw %%mm3, %%mm1 \n\t"
154 "movq %%mm0, (%2, %%eax)\n\t"
155 "movq %%mm1, 8(%2, %%eax)\n\t"
156 "addl %3, %0 \n\t"
157 "addl %3, %1 \n\t"
158 "addl $16, %%eax \n\t"
159 "jnz 1b \n\t"
160 : "+r" (s1), "+r" (s2)
161 : "r" (block+64), "r" (stride)
162 : "%eax"
163 );
164}
165
de6d9b64
FB
166static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
167{
168 const DCTELEM *p;
169 UINT8 *pix;
de6d9b64
FB
170
171 /* read the pixels */
172 p = block;
173 pix = pixels;
d6a4c0b1 174 /* unrolled loop */
de6d9b64 175 __asm __volatile(
a822a479
NK
176 "movq %3, %%mm0\n\t"
177 "movq 8%3, %%mm1\n\t"
178 "movq 16%3, %%mm2\n\t"
179 "movq 24%3, %%mm3\n\t"
180 "movq 32%3, %%mm4\n\t"
181 "movq 40%3, %%mm5\n\t"
182 "movq 48%3, %%mm6\n\t"
183 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
184 "packuswb %%mm1, %%mm0\n\t"
185 "packuswb %%mm3, %%mm2\n\t"
186 "packuswb %%mm5, %%mm4\n\t"
187 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
188 "movq %%mm0, (%0)\n\t"
189 "movq %%mm2, (%0, %1)\n\t"
190 "movq %%mm4, (%0, %1, 2)\n\t"
191 "movq %%mm6, (%0, %2)\n\t"
192 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
193 :"memory");
194 pix += line_size*4;
195 p += 32;
d6a4c0b1
ZK
196
197 // if here would be an exact copy of the code above
198 // compiler would generate some very strange code
199 // thus using "r"
200 __asm __volatile(
201 "movq (%3), %%mm0\n\t"
202 "movq 8(%3), %%mm1\n\t"
203 "movq 16(%3), %%mm2\n\t"
204 "movq 24(%3), %%mm3\n\t"
205 "movq 32(%3), %%mm4\n\t"
206 "movq 40(%3), %%mm5\n\t"
207 "movq 48(%3), %%mm6\n\t"
208 "movq 56(%3), %%mm7\n\t"
209 "packuswb %%mm1, %%mm0\n\t"
210 "packuswb %%mm3, %%mm2\n\t"
211 "packuswb %%mm5, %%mm4\n\t"
212 "packuswb %%mm7, %%mm6\n\t"
213 "movq %%mm0, (%0)\n\t"
214 "movq %%mm2, (%0, %1)\n\t"
215 "movq %%mm4, (%0, %1, 2)\n\t"
216 "movq %%mm6, (%0, %2)\n\t"
217 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
218 :"memory");
de6d9b64
FB
219}
220
221static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
222{
223 const DCTELEM *p;
224 UINT8 *pix;
225 int i;
226
227 /* read the pixels */
228 p = block;
229 pix = pixels;
d6a4c0b1
ZK
230 MOVQ_ZERO(mm7);
231 i = 4;
cd8e5f96 232 do {
de6d9b64 233 __asm __volatile(
cd8e5f96
ZK
234 "movq (%2), %%mm0\n\t"
235 "movq 8(%2), %%mm1\n\t"
236 "movq 16(%2), %%mm2\n\t"
237 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
238 "movq %0, %%mm4\n\t"
239 "movq %1, %%mm6\n\t"
240 "movq %%mm4, %%mm5\n\t"
241 "punpcklbw %%mm7, %%mm4\n\t"
242 "punpckhbw %%mm7, %%mm5\n\t"
243 "paddsw %%mm4, %%mm0\n\t"
244 "paddsw %%mm5, %%mm1\n\t"
245 "movq %%mm6, %%mm5\n\t"
246 "punpcklbw %%mm7, %%mm6\n\t"
247 "punpckhbw %%mm7, %%mm5\n\t"
248 "paddsw %%mm6, %%mm2\n\t"
249 "paddsw %%mm5, %%mm3\n\t"
250 "packuswb %%mm1, %%mm0\n\t"
251 "packuswb %%mm3, %%mm2\n\t"
252 "movq %%mm0, %0\n\t"
253 "movq %%mm2, %1\n\t"
a822a479 254 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 255 :"r"(p)
de6d9b64
FB
256 :"memory");
257 pix += line_size*2;
258 p += 16;
cd8e5f96 259 } while (--i);
de6d9b64
FB
260}
261
262static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
263{
d6a4c0b1 264 int hh;
de6d9b64
FB
265 UINT8 *p;
266 const UINT8 *pix;
d6a4c0b1 267
de6d9b64 268 p = block;
d6a4c0b1
ZK
269 pix = pixels; // 2s
270#if 0
271 do {
272 __asm __volatile(
273 "movq %1, %%mm0\n\t"
274 "movq %%mm0, %0\n\t"
275 :"=m"(*p)
276 :"m"(*pix)
277 :"memory");
278 pix += line_size;
279 p += line_size;
280 } while (--h);
281#else
282 // this optimized code is not very usefull
283 // the above loop is definitely faster
284 // at least on Celeron 500MHz
285 hh = h & 3;
286 while (hh) {
287 __asm __volatile(
288 "movq %1, %%mm0\n\t"
289 "movq %%mm0, %0\n\t"
290 :"=m"(*p)
291 :"m"(*pix)
292 :"memory");
293 pix += line_size;
294 p += line_size;
295 hh--;
296 }
de6d9b64 297 hh=h>>2;
d6a4c0b1 298 while (hh) {
de6d9b64 299 __asm __volatile(
a822a479
NK
300 "movq (%1), %%mm0 \n\t"
301 "movq (%1, %2), %%mm1 \n\t"
302 "movq (%1, %2, 2), %%mm2 \n\t"
303 "movq (%1, %3), %%mm3 \n\t"
304 "movq %%mm0, (%0) \n\t"
305 "movq %%mm1, (%0, %2) \n\t"
306 "movq %%mm2, (%0, %2, 2) \n\t"
307 "movq %%mm3, (%0, %3) \n\t"
308 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
de6d9b64 309 :"memory");
d6a4c0b1
ZK
310 pix += line_size*4;
311 p += line_size*4;
312 hh--;
de6d9b64 313 }
d6a4c0b1 314#endif
de6d9b64
FB
315}
316
317static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
318{
319 UINT8 *p;
320 const UINT8 *pix;
321 p = block;
322 pix = pixels;
d6a4c0b1
ZK
323 MOVQ_ZERO(mm7);
324 MOVQ_WONE(mm4);
325 JUMPALIGN();
de6d9b64
FB
326 do {
327 __asm __volatile(
328 "movq %1, %%mm0\n\t"
329 "movq 1%1, %%mm1\n\t"
330 "movq %%mm0, %%mm2\n\t"
331 "movq %%mm1, %%mm3\n\t"
332 "punpcklbw %%mm7, %%mm0\n\t"
333 "punpcklbw %%mm7, %%mm1\n\t"
334 "punpckhbw %%mm7, %%mm2\n\t"
335 "punpckhbw %%mm7, %%mm3\n\t"
336 "paddusw %%mm1, %%mm0\n\t"
337 "paddusw %%mm3, %%mm2\n\t"
338 "paddusw %%mm4, %%mm0\n\t"
339 "paddusw %%mm4, %%mm2\n\t"
340 "psrlw $1, %%mm0\n\t"
341 "psrlw $1, %%mm2\n\t"
342 "packuswb %%mm2, %%mm0\n\t"
343 "movq %%mm0, %0\n\t"
344 :"=m"(*p)
345 :"m"(*pix)
346 :"memory");
347 pix += line_size; p += line_size;
348 } while (--h);
de6d9b64
FB
349}
350
351static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
352{
353 UINT8 *p;
354 const UINT8 *pix;
355 p = block;
356 pix = pixels;
d6a4c0b1
ZK
357 MOVQ_ZERO(mm7);
358 MOVQ_WONE(mm4);
359 JUMPALIGN();
de6d9b64
FB
360 do {
361 __asm __volatile(
362 "movq %1, %%mm0\n\t"
363 "movq %2, %%mm1\n\t"
364 "movq %%mm0, %%mm2\n\t"
365 "movq %%mm1, %%mm3\n\t"
366 "punpcklbw %%mm7, %%mm0\n\t"
367 "punpcklbw %%mm7, %%mm1\n\t"
368 "punpckhbw %%mm7, %%mm2\n\t"
369 "punpckhbw %%mm7, %%mm3\n\t"
370 "paddusw %%mm1, %%mm0\n\t"
371 "paddusw %%mm3, %%mm2\n\t"
372 "paddusw %%mm4, %%mm0\n\t"
373 "paddusw %%mm4, %%mm2\n\t"
374 "psrlw $1, %%mm0\n\t"
375 "psrlw $1, %%mm2\n\t"
376 "packuswb %%mm2, %%mm0\n\t"
377 "movq %%mm0, %0\n\t"
378 :"=m"(*p)
379 :"m"(*pix),
380 "m"(*(pix+line_size))
381 :"memory");
382 pix += line_size;
383 p += line_size;
384 } while (--h);
de6d9b64
FB
385}
386
387static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
388{
389 UINT8 *p;
390 const UINT8 *pix;
391 p = block;
d6a4c0b1
ZK
392 pix = pixels; // 1s
393 MOVQ_ZERO(mm7);
394 MOVQ_WTWO(mm6);
395 JUMPALIGN();
de6d9b64
FB
396 do {
397 __asm __volatile(
398 "movq %1, %%mm0\n\t"
399 "movq %2, %%mm1\n\t"
400 "movq 1%1, %%mm4\n\t"
401 "movq 1%2, %%mm5\n\t"
402 "movq %%mm0, %%mm2\n\t"
403 "movq %%mm1, %%mm3\n\t"
404 "punpcklbw %%mm7, %%mm0\n\t"
405 "punpcklbw %%mm7, %%mm1\n\t"
406 "punpckhbw %%mm7, %%mm2\n\t"
407 "punpckhbw %%mm7, %%mm3\n\t"
408 "paddusw %%mm1, %%mm0\n\t"
409 "paddusw %%mm3, %%mm2\n\t"
410 "movq %%mm4, %%mm1\n\t"
411 "movq %%mm5, %%mm3\n\t"
412 "punpcklbw %%mm7, %%mm4\n\t"
413 "punpcklbw %%mm7, %%mm5\n\t"
414 "punpckhbw %%mm7, %%mm1\n\t"
415 "punpckhbw %%mm7, %%mm3\n\t"
416 "paddusw %%mm5, %%mm4\n\t"
417 "paddusw %%mm3, %%mm1\n\t"
418 "paddusw %%mm6, %%mm4\n\t"
419 "paddusw %%mm6, %%mm1\n\t"
420 "paddusw %%mm4, %%mm0\n\t"
421 "paddusw %%mm1, %%mm2\n\t"
422 "psrlw $2, %%mm0\n\t"
423 "psrlw $2, %%mm2\n\t"
424 "packuswb %%mm2, %%mm0\n\t"
425 "movq %%mm0, %0\n\t"
426 :"=m"(*p)
427 :"m"(*pix),
428 "m"(*(pix+line_size))
429 :"memory");
430 pix += line_size;
431 p += line_size;
432 } while(--h);
de6d9b64
FB
433}
434
435static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
436{
437 UINT8 *p;
438 const UINT8 *pix;
439 p = block;
440 pix = pixels;
d6a4c0b1 441 MOVQ_ZERO(mm7);
de6d9b64
FB
442 do {
443 __asm __volatile(
444 "movq %1, %%mm0\n\t"
445 "movq 1%1, %%mm1\n\t"
446 "movq %%mm0, %%mm2\n\t"
447 "movq %%mm1, %%mm3\n\t"
448 "punpcklbw %%mm7, %%mm0\n\t"
449 "punpcklbw %%mm7, %%mm1\n\t"
450 "punpckhbw %%mm7, %%mm2\n\t"
451 "punpckhbw %%mm7, %%mm3\n\t"
452 "paddusw %%mm1, %%mm0\n\t"
453 "paddusw %%mm3, %%mm2\n\t"
454 "psrlw $1, %%mm0\n\t"
455 "psrlw $1, %%mm2\n\t"
456 "packuswb %%mm2, %%mm0\n\t"
457 "movq %%mm0, %0\n\t"
458 :"=m"(*p)
459 :"m"(*pix)
460 :"memory");
461 pix += line_size;
462 p += line_size;
463 } while (--h);
de6d9b64
FB
464}
465
466static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
467{
468 UINT8 *p;
469 const UINT8 *pix;
470 p = block;
471 pix = pixels;
d6a4c0b1
ZK
472 MOVQ_ZERO(mm7);
473 JUMPALIGN();
de6d9b64
FB
474 do {
475 __asm __volatile(
476 "movq %1, %%mm0\n\t"
477 "movq %2, %%mm1\n\t"
478 "movq %%mm0, %%mm2\n\t"
479 "movq %%mm1, %%mm3\n\t"
480 "punpcklbw %%mm7, %%mm0\n\t"
481 "punpcklbw %%mm7, %%mm1\n\t"
482 "punpckhbw %%mm7, %%mm2\n\t"
483 "punpckhbw %%mm7, %%mm3\n\t"
484 "paddusw %%mm1, %%mm0\n\t"
485 "paddusw %%mm3, %%mm2\n\t"
486 "psrlw $1, %%mm0\n\t"
487 "psrlw $1, %%mm2\n\t"
488 "packuswb %%mm2, %%mm0\n\t"
489 "movq %%mm0, %0\n\t"
490 :"=m"(*p)
491 :"m"(*pix),
492 "m"(*(pix+line_size))
493 :"memory");
494 pix += line_size;
495 p += line_size;
496 } while(--h);
de6d9b64
FB
497}
498
499static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
500{
501 UINT8 *p;
502 const UINT8 *pix;
503 p = block;
504 pix = pixels;
d6a4c0b1
ZK
505 MOVQ_ZERO(mm7);
506 MOVQ_WONE(mm6);
507 JUMPALIGN();
de6d9b64
FB
508 do {
509 __asm __volatile(
510 "movq %1, %%mm0\n\t"
511 "movq %2, %%mm1\n\t"
512 "movq 1%1, %%mm4\n\t"
513 "movq 1%2, %%mm5\n\t"
514 "movq %%mm0, %%mm2\n\t"
515 "movq %%mm1, %%mm3\n\t"
516 "punpcklbw %%mm7, %%mm0\n\t"
517 "punpcklbw %%mm7, %%mm1\n\t"
518 "punpckhbw %%mm7, %%mm2\n\t"
519 "punpckhbw %%mm7, %%mm3\n\t"
520 "paddusw %%mm1, %%mm0\n\t"
521 "paddusw %%mm3, %%mm2\n\t"
522 "movq %%mm4, %%mm1\n\t"
523 "movq %%mm5, %%mm3\n\t"
524 "punpcklbw %%mm7, %%mm4\n\t"
525 "punpcklbw %%mm7, %%mm5\n\t"
526 "punpckhbw %%mm7, %%mm1\n\t"
527 "punpckhbw %%mm7, %%mm3\n\t"
528 "paddusw %%mm5, %%mm4\n\t"
529 "paddusw %%mm3, %%mm1\n\t"
530 "paddusw %%mm6, %%mm4\n\t"
531 "paddusw %%mm6, %%mm1\n\t"
532 "paddusw %%mm4, %%mm0\n\t"
533 "paddusw %%mm1, %%mm2\n\t"
534 "psrlw $2, %%mm0\n\t"
535 "psrlw $2, %%mm2\n\t"
536 "packuswb %%mm2, %%mm0\n\t"
537 "movq %%mm0, %0\n\t"
538 :"=m"(*p)
539 :"m"(*pix),
540 "m"(*(pix+line_size))
541 :"memory");
542 pix += line_size;
543 p += line_size;
544 } while(--h);
de6d9b64
FB
545}
546
547static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
548{
549 UINT8 *p;
550 const UINT8 *pix;
551 p = block;
552 pix = pixels;
d6a4c0b1
ZK
553 MOVQ_ZERO(mm7);
554 MOVQ_WONE(mm6);
555 JUMPALIGN();
de6d9b64
FB
556 do {
557 __asm __volatile(
558 "movq %0, %%mm0\n\t"
559 "movq %1, %%mm1\n\t"
560 "movq %%mm0, %%mm2\n\t"
561 "movq %%mm1, %%mm3\n\t"
562 "punpcklbw %%mm7, %%mm0\n\t"
563 "punpcklbw %%mm7, %%mm1\n\t"
564 "punpckhbw %%mm7, %%mm2\n\t"
565 "punpckhbw %%mm7, %%mm3\n\t"
566 "paddusw %%mm1, %%mm0\n\t"
567 "paddusw %%mm3, %%mm2\n\t"
568 "paddusw %%mm6, %%mm0\n\t"
569 "paddusw %%mm6, %%mm2\n\t"
570 "psrlw $1, %%mm0\n\t"
571 "psrlw $1, %%mm2\n\t"
572 "packuswb %%mm2, %%mm0\n\t"
573 "movq %%mm0, %0\n\t"
a822a479 574 :"+m"(*p)
de6d9b64
FB
575 :"m"(*pix)
576 :"memory");
577 pix += line_size;
578 p += line_size;
579 }
580 while (--h);
de6d9b64
FB
581}
582
583static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
584{
585 UINT8 *p;
586 const UINT8 *pix;
587 p = block;
588 pix = pixels;
d6a4c0b1
ZK
589 MOVQ_ZERO(mm7);
590 MOVQ_WONE(mm6);
591 JUMPALIGN();
de6d9b64
FB
592 do {
593 __asm __volatile(
594 "movq %1, %%mm1\n\t"
595 "movq %0, %%mm0\n\t"
596 "movq 1%1, %%mm4\n\t"
597 "movq %%mm0, %%mm2\n\t"
598 "movq %%mm1, %%mm3\n\t"
599 "movq %%mm4, %%mm5\n\t"
600 "punpcklbw %%mm7, %%mm1\n\t"
601 "punpckhbw %%mm7, %%mm3\n\t"
602 "punpcklbw %%mm7, %%mm4\n\t"
603 "punpckhbw %%mm7, %%mm5\n\t"
604 "punpcklbw %%mm7, %%mm0\n\t"
605 "punpckhbw %%mm7, %%mm2\n\t"
606 "paddusw %%mm4, %%mm1\n\t"
607 "paddusw %%mm5, %%mm3\n\t"
608 "paddusw %%mm6, %%mm1\n\t"
609 "paddusw %%mm6, %%mm3\n\t"
610 "psrlw $1, %%mm1\n\t"
611 "psrlw $1, %%mm3\n\t"
612 "paddusw %%mm6, %%mm0\n\t"
613 "paddusw %%mm6, %%mm2\n\t"
614 "paddusw %%mm1, %%mm0\n\t"
615 "paddusw %%mm3, %%mm2\n\t"
616 "psrlw $1, %%mm0\n\t"
617 "psrlw $1, %%mm2\n\t"
618 "packuswb %%mm2, %%mm0\n\t"
619 "movq %%mm0, %0\n\t"
a822a479 620 :"+m"(*p)
de6d9b64
FB
621 :"m"(*pix)
622 :"memory");
623 pix += line_size;
624 p += line_size;
625 } while (--h);
de6d9b64
FB
626}
627
628static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
629{
630 UINT8 *p;
631 const UINT8 *pix;
632 p = block;
633 pix = pixels;
d6a4c0b1
ZK
634 MOVQ_ZERO(mm7);
635 MOVQ_WONE(mm6);
636 JUMPALIGN();
de6d9b64
FB
637 do {
638 __asm __volatile(
639 "movq %1, %%mm1\n\t"
640 "movq %0, %%mm0\n\t"
641 "movq %2, %%mm4\n\t"
642 "movq %%mm0, %%mm2\n\t"
643 "movq %%mm1, %%mm3\n\t"
644 "movq %%mm4, %%mm5\n\t"
645 "punpcklbw %%mm7, %%mm1\n\t"
646 "punpckhbw %%mm7, %%mm3\n\t"
647 "punpcklbw %%mm7, %%mm4\n\t"
648 "punpckhbw %%mm7, %%mm5\n\t"
649 "punpcklbw %%mm7, %%mm0\n\t"
650 "punpckhbw %%mm7, %%mm2\n\t"
651 "paddusw %%mm4, %%mm1\n\t"
652 "paddusw %%mm5, %%mm3\n\t"
653 "paddusw %%mm6, %%mm1\n\t"
654 "paddusw %%mm6, %%mm3\n\t"
655 "psrlw $1, %%mm1\n\t"
656 "psrlw $1, %%mm3\n\t"
657 "paddusw %%mm6, %%mm0\n\t"
658 "paddusw %%mm6, %%mm2\n\t"
659 "paddusw %%mm1, %%mm0\n\t"
660 "paddusw %%mm3, %%mm2\n\t"
661 "psrlw $1, %%mm0\n\t"
662 "psrlw $1, %%mm2\n\t"
663 "packuswb %%mm2, %%mm0\n\t"
664 "movq %%mm0, %0\n\t"
a822a479 665 :"+m"(*p)
de6d9b64
FB
666 :"m"(*pix), "m"(*(pix+line_size))
667 :"memory");
668 pix += line_size;
669 p += line_size ;
670 } while(--h);
de6d9b64
FB
671}
672
673static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
674{
675 UINT8 *p;
676 const UINT8 *pix;
677 p = block;
678 pix = pixels;
d6a4c0b1
ZK
679 MOVQ_ZERO(mm7);
680 // this doesn't seem to be used offten - so
681 // the inside usage of mm_wone is not optimized
682 MOVQ_WTWO(mm6);
de6d9b64
FB
683 do {
684 __asm __volatile(
685 "movq %1, %%mm0\n\t"
686 "movq %2, %%mm1\n\t"
687 "movq 1%1, %%mm4\n\t"
688 "movq 1%2, %%mm5\n\t"
689 "movq %%mm0, %%mm2\n\t"
690 "movq %%mm1, %%mm3\n\t"
691 "punpcklbw %%mm7, %%mm0\n\t"
692 "punpcklbw %%mm7, %%mm1\n\t"
693 "punpckhbw %%mm7, %%mm2\n\t"
694 "punpckhbw %%mm7, %%mm3\n\t"
695 "paddusw %%mm1, %%mm0\n\t"
696 "paddusw %%mm3, %%mm2\n\t"
697 "movq %%mm4, %%mm1\n\t"
698 "movq %%mm5, %%mm3\n\t"
699 "punpcklbw %%mm7, %%mm4\n\t"
700 "punpcklbw %%mm7, %%mm5\n\t"
701 "punpckhbw %%mm7, %%mm1\n\t"
702 "punpckhbw %%mm7, %%mm3\n\t"
703 "paddusw %%mm5, %%mm4\n\t"
704 "paddusw %%mm3, %%mm1\n\t"
705 "paddusw %%mm6, %%mm4\n\t"
706 "paddusw %%mm6, %%mm1\n\t"
707 "paddusw %%mm4, %%mm0\n\t"
708 "paddusw %%mm1, %%mm2\n\t"
709 "movq %3, %%mm5\n\t"
710 "psrlw $2, %%mm0\n\t"
711 "movq %0, %%mm1\n\t"
712 "psrlw $2, %%mm2\n\t"
713 "movq %%mm1, %%mm3\n\t"
714 "punpcklbw %%mm7, %%mm1\n\t"
715 "punpckhbw %%mm7, %%mm3\n\t"
716 "paddusw %%mm1, %%mm0\n\t"
717 "paddusw %%mm3, %%mm2\n\t"
718 "paddusw %%mm5, %%mm0\n\t"
719 "paddusw %%mm5, %%mm2\n\t"
720 "psrlw $1, %%mm0\n\t"
721 "psrlw $1, %%mm2\n\t"
722 "packuswb %%mm2, %%mm0\n\t"
723 "movq %%mm0, %0\n\t"
a822a479 724 :"+m"(*p)
de6d9b64 725 :"m"(*pix),
a9b3f630 726 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
727 :"memory");
728 pix += line_size;
729 p += line_size ;
730 } while(--h);
de6d9b64
FB
731}
732
733static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
734{
735 UINT8 *p;
736 const UINT8 *pix;
737 p = block;
738 pix = pixels;
d6a4c0b1 739 MOVQ_ZERO(mm7);
de6d9b64
FB
740 do {
741 __asm __volatile(
742 "movq %1, %%mm0\n\t"
743 "movq %0, %%mm1\n\t"
744 "movq %%mm0, %%mm2\n\t"
745 "movq %%mm1, %%mm3\n\t"
746 "punpcklbw %%mm7, %%mm0\n\t"
747 "punpcklbw %%mm7, %%mm1\n\t"
748 "punpckhbw %%mm7, %%mm2\n\t"
749 "punpckhbw %%mm7, %%mm3\n\t"
750 "paddusw %%mm1, %%mm0\n\t"
751 "paddusw %%mm3, %%mm2\n\t"
752 "psrlw $1, %%mm0\n\t"
753 "psrlw $1, %%mm2\n\t"
754 "packuswb %%mm2, %%mm0\n\t"
755 "movq %%mm0, %0\n\t"
a822a479 756 :"+m"(*p)
de6d9b64
FB
757 :"m"(*pix)
758 :"memory");
759 pix += line_size;
760 p += line_size ;
761 } while (--h);
de6d9b64
FB
762}
763
764static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
765{
766 UINT8 *p;
767 const UINT8 *pix;
768 p = block;
769 pix = pixels;
d6a4c0b1 770 MOVQ_ZERO(mm7);
de6d9b64
FB
771 do {
772 __asm __volatile(
773 "movq %1, %%mm0\n\t"
774 "movq 1%1, %%mm1\n\t"
775 "movq %0, %%mm4\n\t"
776 "movq %%mm0, %%mm2\n\t"
777 "movq %%mm1, %%mm3\n\t"
778 "movq %%mm4, %%mm5\n\t"
779 "punpcklbw %%mm7, %%mm0\n\t"
780 "punpcklbw %%mm7, %%mm1\n\t"
781 "punpckhbw %%mm7, %%mm2\n\t"
782 "punpckhbw %%mm7, %%mm3\n\t"
783 "punpcklbw %%mm7, %%mm4\n\t"
784 "punpckhbw %%mm7, %%mm5\n\t"
785 "paddusw %%mm1, %%mm0\n\t"
786 "paddusw %%mm3, %%mm2\n\t"
787 "psrlw $1, %%mm0\n\t"
788 "psrlw $1, %%mm2\n\t"
789 "paddusw %%mm4, %%mm0\n\t"
790 "paddusw %%mm5, %%mm2\n\t"
791 "psrlw $1, %%mm0\n\t"
792 "psrlw $1, %%mm2\n\t"
793 "packuswb %%mm2, %%mm0\n\t"
794 "movq %%mm0, %0\n\t"
a822a479 795 :"+m"(*p)
de6d9b64
FB
796 :"m"(*pix)
797 :"memory");
798 pix += line_size;
799 p += line_size;
800 } while (--h);
de6d9b64
FB
801}
802
803static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
804{
805 UINT8 *p;
806 const UINT8 *pix;
807 p = block;
808 pix = pixels;
d6a4c0b1 809 MOVQ_ZERO(mm7);
de6d9b64
FB
810 do {
811 __asm __volatile(
812 "movq %1, %%mm0\n\t"
813 "movq %2, %%mm1\n\t"
814 "movq %0, %%mm4\n\t"
815 "movq %%mm0, %%mm2\n\t"
816 "movq %%mm1, %%mm3\n\t"
817 "movq %%mm4, %%mm5\n\t"
818 "punpcklbw %%mm7, %%mm0\n\t"
819 "punpcklbw %%mm7, %%mm1\n\t"
820 "punpckhbw %%mm7, %%mm2\n\t"
821 "punpckhbw %%mm7, %%mm3\n\t"
822 "punpcklbw %%mm7, %%mm4\n\t"
823 "punpckhbw %%mm7, %%mm5\n\t"
824 "paddusw %%mm1, %%mm0\n\t"
825 "paddusw %%mm3, %%mm2\n\t"
826 "psrlw $1, %%mm0\n\t"
827 "psrlw $1, %%mm2\n\t"
828 "paddusw %%mm4, %%mm0\n\t"
829 "paddusw %%mm5, %%mm2\n\t"
830 "psrlw $1, %%mm0\n\t"
831 "psrlw $1, %%mm2\n\t"
832 "packuswb %%mm2, %%mm0\n\t"
833 "movq %%mm0, %0\n\t"
a822a479 834 :"+m"(*p)
de6d9b64
FB
835 :"m"(*pix), "m"(*(pix+line_size))
836 :"memory");
837 pix += line_size;
838 p += line_size ;
839 } while(--h);
de6d9b64
FB
840}
841
842static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
843{
844 UINT8 *p;
845 const UINT8 *pix;
846 p = block;
847 pix = pixels;
d6a4c0b1
ZK
848 MOVQ_ZERO(mm7);
849 MOVQ_WONE(mm6);
850 JUMPALIGN();
de6d9b64
FB
851 do {
852 __asm __volatile(
853 "movq %1, %%mm0\n\t"
854 "movq %2, %%mm1\n\t"
855 "movq 1%1, %%mm4\n\t"
856 "movq 1%2, %%mm5\n\t"
857 "movq %%mm0, %%mm2\n\t"
858 "movq %%mm1, %%mm3\n\t"
859 "punpcklbw %%mm7, %%mm0\n\t"
860 "punpcklbw %%mm7, %%mm1\n\t"
861 "punpckhbw %%mm7, %%mm2\n\t"
862 "punpckhbw %%mm7, %%mm3\n\t"
863 "paddusw %%mm1, %%mm0\n\t"
864 "paddusw %%mm3, %%mm2\n\t"
865 "movq %%mm4, %%mm1\n\t"
866 "movq %%mm5, %%mm3\n\t"
867 "punpcklbw %%mm7, %%mm4\n\t"
868 "punpcklbw %%mm7, %%mm5\n\t"
869 "punpckhbw %%mm7, %%mm1\n\t"
870 "punpckhbw %%mm7, %%mm3\n\t"
871 "paddusw %%mm5, %%mm4\n\t"
872 "paddusw %%mm3, %%mm1\n\t"
873 "paddusw %%mm6, %%mm4\n\t"
874 "paddusw %%mm6, %%mm1\n\t"
875 "paddusw %%mm4, %%mm0\n\t"
876 "paddusw %%mm1, %%mm2\n\t"
877 "movq %0, %%mm1\n\t"
878 "psrlw $2, %%mm0\n\t"
879 "movq %%mm1, %%mm3\n\t"
880 "psrlw $2, %%mm2\n\t"
881 "punpcklbw %%mm7, %%mm1\n\t"
882 "punpckhbw %%mm7, %%mm3\n\t"
883 "paddusw %%mm1, %%mm0\n\t"
884 "paddusw %%mm3, %%mm2\n\t"
885 "psrlw $1, %%mm0\n\t"
886 "psrlw $1, %%mm2\n\t"
887 "packuswb %%mm2, %%mm0\n\t"
888 "movq %%mm0, %0\n\t"
a822a479 889 :"+m"(*p)
de6d9b64
FB
890 :"m"(*pix),
891 "m"(*(pix+line_size))
892 :"memory");
893 pix += line_size;
894 p += line_size;
895 } while(--h);
de6d9b64
FB
896}
897
898static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
899{
900 DCTELEM *p;
901 const UINT8 *pix;
902 p = block;
903 pix = pixels;
d6a4c0b1 904 MOVQ_ZERO(mm7);
de6d9b64
FB
905 do {
906 __asm __volatile(
907 "movq %0, %%mm0\n\t"
908 "movq %1, %%mm2\n\t"
909 "movq 8%0, %%mm1\n\t"
910 "movq %%mm2, %%mm3\n\t"
911 "punpcklbw %%mm7, %%mm2\n\t"
912 "punpckhbw %%mm7, %%mm3\n\t"
913 "psubsw %%mm2, %%mm0\n\t"
914 "psubsw %%mm3, %%mm1\n\t"
915 "movq %%mm0, %0\n\t"
916 "movq %%mm1, 8%0\n\t"
a822a479 917 :"+m"(*p)
de6d9b64
FB
918 :"m"(*pix)
919 :"memory");
920 pix += line_size;
921 p += 8;
922 } while (--h);
de6d9b64
FB
923}
924
925static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
926{
927 DCTELEM *p;
928 const UINT8 *pix;
929 p = block;
930 pix = pixels;
d6a4c0b1
ZK
931 MOVQ_ZERO(mm7);
932 MOVQ_WONE(mm6);
933 JUMPALIGN();
de6d9b64
FB
934 do {
935 __asm __volatile(
936 "movq %0, %%mm0\n\t"
937 "movq %1, %%mm2\n\t"
938 "movq 8%0, %%mm1\n\t"
939 "movq 1%1, %%mm4\n\t"
940 "movq %%mm2, %%mm3\n\t"
941 "movq %%mm4, %%mm5\n\t"
942 "punpcklbw %%mm7, %%mm2\n\t"
943 "punpckhbw %%mm7, %%mm3\n\t"
944 "punpcklbw %%mm7, %%mm4\n\t"
945 "punpckhbw %%mm7, %%mm5\n\t"
946 "paddusw %%mm4, %%mm2\n\t"
947 "paddusw %%mm5, %%mm3\n\t"
948 "paddusw %%mm6, %%mm2\n\t"
949 "paddusw %%mm6, %%mm3\n\t"
950 "psrlw $1, %%mm2\n\t"
951 "psrlw $1, %%mm3\n\t"
952 "psubsw %%mm2, %%mm0\n\t"
953 "psubsw %%mm3, %%mm1\n\t"
954 "movq %%mm0, %0\n\t"
955 "movq %%mm1, 8%0\n\t"
a822a479 956 :"+m"(*p)
de6d9b64
FB
957 :"m"(*pix)
958 :"memory");
959 pix += line_size;
960 p += 8;
961 } while (--h);
de6d9b64
FB
962}
963
964static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
965{
966 DCTELEM *p;
967 const UINT8 *pix;
968 p = block;
969 pix = pixels;
d6a4c0b1
ZK
970 MOVQ_ZERO(mm7);
971 MOVQ_WONE(mm6);
de6d9b64
FB
972 do {
973 __asm __volatile(
974 "movq %0, %%mm0\n\t"
975 "movq %1, %%mm2\n\t"
976 "movq 8%0, %%mm1\n\t"
977 "movq %2, %%mm4\n\t"
978 "movq %%mm2, %%mm3\n\t"
979 "movq %%mm4, %%mm5\n\t"
980 "punpcklbw %%mm7, %%mm2\n\t"
981 "punpckhbw %%mm7, %%mm3\n\t"
982 "punpcklbw %%mm7, %%mm4\n\t"
983 "punpckhbw %%mm7, %%mm5\n\t"
984 "paddusw %%mm4, %%mm2\n\t"
985 "paddusw %%mm5, %%mm3\n\t"
986 "paddusw %%mm6, %%mm2\n\t"
987 "paddusw %%mm6, %%mm3\n\t"
988 "psrlw $1, %%mm2\n\t"
989 "psrlw $1, %%mm3\n\t"
990 "psubsw %%mm2, %%mm0\n\t"
991 "psubsw %%mm3, %%mm1\n\t"
992 "movq %%mm0, %0\n\t"
993 "movq %%mm1, 8%0\n\t"
a822a479 994 :"+m"(*p)
de6d9b64
FB
995 :"m"(*pix), "m"(*(pix+line_size))
996 :"memory");
997 pix += line_size;
998 p += 8;
999 } while (--h);
de6d9b64
FB
1000}
1001
1002static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
1003{
1004 DCTELEM *p;
1005 const UINT8 *pix;
1006 p = block;
1007 pix = pixels;
d6a4c0b1
ZK
1008 MOVQ_ZERO(mm7);
1009 MOVQ_WTWO(mm6);
1010 JUMPALIGN();
de6d9b64
FB
1011 do {
1012 __asm __volatile(
1013 "movq %1, %%mm0\n\t"
1014 "movq %2, %%mm1\n\t"
1015 "movq 1%1, %%mm4\n\t"
1016 "movq 1%2, %%mm5\n\t"
1017 "movq %%mm0, %%mm2\n\t"
1018 "movq %%mm1, %%mm3\n\t"
1019 "punpcklbw %%mm7, %%mm0\n\t"
1020 "punpcklbw %%mm7, %%mm1\n\t"
1021 "punpckhbw %%mm7, %%mm2\n\t"
1022 "punpckhbw %%mm7, %%mm3\n\t"
1023 "paddusw %%mm1, %%mm0\n\t"
1024 "paddusw %%mm3, %%mm2\n\t"
1025 "movq %%mm4, %%mm1\n\t"
1026 "movq %%mm5, %%mm3\n\t"
1027 "punpcklbw %%mm7, %%mm4\n\t"
1028 "punpcklbw %%mm7, %%mm5\n\t"
1029 "punpckhbw %%mm7, %%mm1\n\t"
1030 "punpckhbw %%mm7, %%mm3\n\t"
1031 "paddusw %%mm5, %%mm4\n\t"
1032 "paddusw %%mm3, %%mm1\n\t"
1033 "paddusw %%mm6, %%mm4\n\t"
1034 "paddusw %%mm6, %%mm1\n\t"
1035 "paddusw %%mm4, %%mm0\n\t"
1036 "paddusw %%mm1, %%mm2\n\t"
1037 "movq %0, %%mm1\n\t"
1038 "movq 8%0, %%mm3\n\t"
1039 "psrlw $2, %%mm0\n\t"
1040 "psrlw $2, %%mm2\n\t"
1041 "psubsw %%mm0, %%mm1\n\t"
1042 "psubsw %%mm2, %%mm3\n\t"
1043 "movq %%mm1, %0\n\t"
1044 "movq %%mm3, 8%0\n\t"
a822a479 1045 :"+m"(*p)
de6d9b64
FB
1046 :"m"(*pix),
1047 "m"(*(pix+line_size))
1048 :"memory");
1049 pix += line_size;
1050 p += 8 ;
1051 } while(--h);
de6d9b64
FB
1052}
1053
649c00c9
MN
1054static void clear_blocks_mmx(DCTELEM *blocks)
1055{
1056 asm volatile(
1057 "pxor %%mm7, %%mm7 \n\t"
1058 "movl $-128*6, %%eax \n\t"
1059 "1: \n\t"
1060 "movq %%mm7, (%0, %%eax) \n\t"
1061 "movq %%mm7, 8(%0, %%eax) \n\t"
1062 "movq %%mm7, 16(%0, %%eax) \n\t"
1063 "movq %%mm7, 24(%0, %%eax) \n\t"
1064 "addl $32, %%eax \n\t"
1065 " js 1b \n\t"
1066 : : "r" (((int)blocks)+128*6)
1067 : "%eax"
1068 );
1069}
1070
d6a4c0b1
ZK
/*
 * Empty stub.  It is only referenced from the disabled "speed testing"
 * section of dsputil_init_mmx(), where it replaces the real routines.
 */
static void just_return()
{
}
1072
de6d9b64
FB
1073void dsputil_init_mmx(void)
1074{
1075 mm_flags = mm_support();
f4470e09
MN
1076#if 1
1077 printf("libavcodec: CPU flags:");
de6d9b64
FB
1078 if (mm_flags & MM_MMX)
1079 printf(" mmx");
1080 if (mm_flags & MM_MMXEXT)
1081 printf(" mmxext");
1082 if (mm_flags & MM_3DNOW)
1083 printf(" 3dnow");
1084 if (mm_flags & MM_SSE)
1085 printf(" sse");
1086 if (mm_flags & MM_SSE2)
1087 printf(" sse2");
1088 printf("\n");
1089#endif
1090
1091 if (mm_flags & MM_MMX) {
1092 get_pixels = get_pixels_mmx;
9dbcbd92 1093 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
1094 put_pixels_clamped = put_pixels_clamped_mmx;
1095 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9
MN
1096 clear_blocks= clear_blocks_mmx;
1097
ba6802de
MN
1098 pix_abs16x16 = pix_abs16x16_mmx;
1099 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1100 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 1101 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
1102 pix_abs8x8 = pix_abs8x8_mmx;
1103 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1104 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1105 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
1106 av_fdct = fdct_mmx;
1107
1108 put_pixels_tab[0] = put_pixels_mmx;
1109 put_pixels_tab[1] = put_pixels_x2_mmx;
1110 put_pixels_tab[2] = put_pixels_y2_mmx;
1111 put_pixels_tab[3] = put_pixels_xy2_mmx;
1112
1113 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
1114 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1115 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1116 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
1117
1118 avg_pixels_tab[0] = avg_pixels_mmx;
1119 avg_pixels_tab[1] = avg_pixels_x2_mmx;
1120 avg_pixels_tab[2] = avg_pixels_y2_mmx;
1121 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1122
1123 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
1124 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
1125 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
1126 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
1127
1128 sub_pixels_tab[0] = sub_pixels_mmx;
1129 sub_pixels_tab[1] = sub_pixels_x2_mmx;
1130 sub_pixels_tab[2] = sub_pixels_y2_mmx;
1131 sub_pixels_tab[3] = sub_pixels_xy2_mmx;
1132
1133 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
1134 pix_abs16x16 = pix_abs16x16_mmx2;
1135 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1136 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1137 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
1138
1139 pix_abs8x8 = pix_abs8x8_mmx2;
1140 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1141 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1142 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
1143
de6d9b64
FB
1144 put_pixels_tab[1] = put_pixels_x2_sse;
1145 put_pixels_tab[2] = put_pixels_y2_sse;
1146
1147 avg_pixels_tab[0] = avg_pixels_sse;
1148 avg_pixels_tab[1] = avg_pixels_x2_sse;
1149 avg_pixels_tab[2] = avg_pixels_y2_sse;
1150 avg_pixels_tab[3] = avg_pixels_xy2_sse;
1151
1152 sub_pixels_tab[1] = sub_pixels_x2_sse;
1153 sub_pixels_tab[2] = sub_pixels_y2_sse;
1154 } else if (mm_flags & MM_3DNOW) {
1155 put_pixels_tab[1] = put_pixels_x2_3dnow;
1156 put_pixels_tab[2] = put_pixels_y2_3dnow;
1157
1158 avg_pixels_tab[0] = avg_pixels_3dnow;
1159 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1160 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1161 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
1162
1163 sub_pixels_tab[1] = sub_pixels_x2_3dnow;
1164 sub_pixels_tab[2] = sub_pixels_y2_3dnow;
1165 }
4af7bcc1 1166
8def0299
FB
1167 /* idct */
1168 if (mm_flags & MM_MMXEXT) {
1169 ff_idct = ff_mmxext_idct;
1170 } else {
1171 ff_idct = ff_mmx_idct;
1172 }
d962f6fd
A
1173#ifdef SIMPLE_IDCT
1174// ff_idct = simple_idct;
1175 ff_idct = simple_idct_mmx;
1176#endif
de6d9b64 1177 }
d6a4c0b1
ZK
1178
1179#if 0
1180 // for speed testing
1181 get_pixels = just_return;
1182 put_pixels_clamped = just_return;
1183 add_pixels_clamped = just_return;
1184
1185 pix_abs16x16 = just_return;
1186 pix_abs16x16_x2 = just_return;
1187 pix_abs16x16_y2 = just_return;
1188 pix_abs16x16_xy2 = just_return;
1189
1190 put_pixels_tab[0] = just_return;
1191 put_pixels_tab[1] = just_return;
1192 put_pixels_tab[2] = just_return;
1193 put_pixels_tab[3] = just_return;
1194
1195 put_no_rnd_pixels_tab[0] = just_return;
1196 put_no_rnd_pixels_tab[1] = just_return;
1197 put_no_rnd_pixels_tab[2] = just_return;
1198 put_no_rnd_pixels_tab[3] = just_return;
1199
1200 avg_pixels_tab[0] = just_return;
1201 avg_pixels_tab[1] = just_return;
1202 avg_pixels_tab[2] = just_return;
1203 avg_pixels_tab[3] = just_return;
1204
1205 avg_no_rnd_pixels_tab[0] = just_return;
1206 avg_no_rnd_pixels_tab[1] = just_return;
1207 avg_no_rnd_pixels_tab[2] = just_return;
1208 avg_no_rnd_pixels_tab[3] = just_return;
1209
1210 sub_pixels_tab[0] = just_return;
1211 sub_pixels_tab[1] = just_return;
1212 sub_pixels_tab[2] = just_return;
1213 sub_pixels_tab[3] = just_return;
1214
1215 //av_fdct = just_return;
1216 //ff_idct = just_return;
1217#endif
de6d9b64 1218}