/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
21
22 #include "../dsputil.h"
23
24 int mm_flags; /* multimedia extension flags */
25 /* FIXME use them in static form */
26 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
27 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
31 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
32 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
36 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
37 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
41 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
42 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
46 /* pixel operations */
47 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
48 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
49 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
50
51 #define JUMPALIGN() __asm __volatile (".balign 8"::)
52 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
53
54 #define MOVQ_WONE(regd) \
55 __asm __volatile ( \
56 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
57 "psrlw $15, %%" #regd ::)
58
59 #define MOVQ_BFE(regd) \
60 __asm __volatile ( \
61 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
62 "paddb %%" #regd ", %%" #regd " \n\t" ::)
63
64 #ifndef PIC
65 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
66 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
67 #else
68 // for shared library it's better to use this way for accessing constants
69 // pcmpeqd -> -1
70 #define MOVQ_BONE(regd) \
71 __asm __volatile ( \
72 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
73 "psrlw $15, %%" #regd " \n\t" \
74 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
75
76 #define MOVQ_WTWO(regd) \
77 __asm __volatile ( \
78 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
79 "psrlw $15, %%" #regd " \n\t" \
80 "psllw $1, %%" #regd " \n\t"::)
81
82 #endif
83
84 // using regr as temporary and for the output result
85 // first argument is unmodifed and second is trashed
86 // regfe is supposed to contain 0xfefefefefefefefe
87 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
88 "movq " #rega ", " #regr " \n\t"\
89 "pand " #regb ", " #regr " \n\t"\
90 "pxor " #rega ", " #regb " \n\t"\
91 "pand " #regfe "," #regb " \n\t"\
92 "psrlq $1, " #regb " \n\t"\
93 "paddb " #regb ", " #regr " \n\t"
94
95 #define PAVGB_MMX(rega, regb, regr, regfe) \
96 "movq " #rega ", " #regr " \n\t"\
97 "por " #regb ", " #regr " \n\t"\
98 "pxor " #rega ", " #regb " \n\t"\
99 "pand " #regfe "," #regb " \n\t"\
100 "psrlq $1, " #regb " \n\t"\
101 "psubb " #regb ", " #regr " \n\t"
102
103 // mm6 is supposed to contain 0xfefefefefefefefe
104 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
105 "movq " #rega ", " #regr " \n\t"\
106 "movq " #regc ", " #regp " \n\t"\
107 "pand " #regb ", " #regr " \n\t"\
108 "pand " #regd ", " #regp " \n\t"\
109 "pxor " #rega ", " #regb " \n\t"\
110 "pxor " #regc ", " #regd " \n\t"\
111 "pand %%mm6, " #regb " \n\t"\
112 "pand %%mm6, " #regd " \n\t"\
113 "psrlq $1, " #regb " \n\t"\
114 "psrlq $1, " #regd " \n\t"\
115 "paddb " #regb ", " #regr " \n\t"\
116 "paddb " #regd ", " #regp " \n\t"
117
118 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
119 "movq " #rega ", " #regr " \n\t"\
120 "movq " #regc ", " #regp " \n\t"\
121 "por " #regb ", " #regr " \n\t"\
122 "por " #regd ", " #regp " \n\t"\
123 "pxor " #rega ", " #regb " \n\t"\
124 "pxor " #regc ", " #regd " \n\t"\
125 "pand %%mm6, " #regb " \n\t"\
126 "pand %%mm6, " #regd " \n\t"\
127 "psrlq $1, " #regd " \n\t"\
128 "psrlq $1, " #regb " \n\t"\
129 "psubb " #regb ", " #regr " \n\t"\
130 "psubb " #regd ", " #regp " \n\t"
131
132 /***********************************/
133 /* MMX no rounding */
134 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
135 #define SET_RND MOVQ_WONE
136 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
137 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
138
139 #include "dsputil_mmx_rnd.h"
140
141 #undef DEF
142 #undef SET_RND
143 #undef PAVGBP
144 #undef PAVGB
145 /***********************************/
146 /* MMX rounding */
147
148 #define DEF(x, y) x ## _ ## y ##_mmx
149 #define SET_RND MOVQ_WTWO
150 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
151 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
152
153 #include "dsputil_mmx_rnd.h"
154
155 #undef DEF
156 #undef SET_RND
157 #undef PAVGBP
158 #undef PAVGB
159
160 /***********************************/
161 /* 3Dnow specific */
162
163 #define DEF(x) x ## _3dnow
164 /* for Athlons PAVGUSB is prefered */
165 #define PAVGB "pavgusb"
166
167 #include "dsputil_mmx_avg.h"
168
169 #undef DEF
170 #undef PAVGB
171
172 /***********************************/
173 /* MMX2 specific */
174
175 #define DEF(x) x ## _mmx2
176
177 /* Introduced only in MMX2 set */
178 #define PAVGB "pavgb"
179
180 #include "dsputil_mmx_avg.h"
181
182 #undef DEF
183 #undef PAVGB
184
185 /***********************************/
186 /* standard MMX */
187
188 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
189 {
190 asm volatile(
191 "movl $-128, %%eax \n\t"
192 "pxor %%mm7, %%mm7 \n\t"
193 ".balign 16 \n\t"
194 "1: \n\t"
195 "movq (%0), %%mm0 \n\t"
196 "movq (%0, %2), %%mm2 \n\t"
197 "movq %%mm0, %%mm1 \n\t"
198 "movq %%mm2, %%mm3 \n\t"
199 "punpcklbw %%mm7, %%mm0 \n\t"
200 "punpckhbw %%mm7, %%mm1 \n\t"
201 "punpcklbw %%mm7, %%mm2 \n\t"
202 "punpckhbw %%mm7, %%mm3 \n\t"
203 "movq %%mm0, (%1, %%eax)\n\t"
204 "movq %%mm1, 8(%1, %%eax)\n\t"
205 "movq %%mm2, 16(%1, %%eax)\n\t"
206 "movq %%mm3, 24(%1, %%eax)\n\t"
207 "addl %3, %0 \n\t"
208 "addl $32, %%eax \n\t"
209 "js 1b \n\t"
210 : "+r" (pixels)
211 : "r" (block+64), "r" (line_size), "r" (line_size*2)
212 : "%eax"
213 );
214 }
215
216 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
217 {
218 asm volatile(
219 "pxor %%mm7, %%mm7 \n\t"
220 "movl $-128, %%eax \n\t"
221 ".balign 16 \n\t"
222 "1: \n\t"
223 "movq (%0), %%mm0 \n\t"
224 "movq (%1), %%mm2 \n\t"
225 "movq %%mm0, %%mm1 \n\t"
226 "movq %%mm2, %%mm3 \n\t"
227 "punpcklbw %%mm7, %%mm0 \n\t"
228 "punpckhbw %%mm7, %%mm1 \n\t"
229 "punpcklbw %%mm7, %%mm2 \n\t"
230 "punpckhbw %%mm7, %%mm3 \n\t"
231 "psubw %%mm2, %%mm0 \n\t"
232 "psubw %%mm3, %%mm1 \n\t"
233 "movq %%mm0, (%2, %%eax)\n\t"
234 "movq %%mm1, 8(%2, %%eax)\n\t"
235 "addl %3, %0 \n\t"
236 "addl %3, %1 \n\t"
237 "addl $16, %%eax \n\t"
238 "jnz 1b \n\t"
239 : "+r" (s1), "+r" (s2)
240 : "r" (block+64), "r" (stride)
241 : "%eax"
242 );
243 }
244
245 void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
246 {
247 const DCTELEM *p;
248 UINT8 *pix;
249
250 /* read the pixels */
251 p = block;
252 pix = pixels;
253 /* unrolled loop */
254 __asm __volatile(
255 "movq %3, %%mm0\n\t"
256 "movq 8%3, %%mm1\n\t"
257 "movq 16%3, %%mm2\n\t"
258 "movq 24%3, %%mm3\n\t"
259 "movq 32%3, %%mm4\n\t"
260 "movq 40%3, %%mm5\n\t"
261 "movq 48%3, %%mm6\n\t"
262 "movq 56%3, %%mm7\n\t"
263 "packuswb %%mm1, %%mm0\n\t"
264 "packuswb %%mm3, %%mm2\n\t"
265 "packuswb %%mm5, %%mm4\n\t"
266 "packuswb %%mm7, %%mm6\n\t"
267 "movq %%mm0, (%0)\n\t"
268 "movq %%mm2, (%0, %1)\n\t"
269 "movq %%mm4, (%0, %1, 2)\n\t"
270 "movq %%mm6, (%0, %2)\n\t"
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
272 :"memory");
273 pix += line_size*4;
274 p += 32;
275
276 // if here would be an exact copy of the code above
277 // compiler would generate some very strange code
278 // thus using "r"
279 __asm __volatile(
280 "movq (%3), %%mm0\n\t"
281 "movq 8(%3), %%mm1\n\t"
282 "movq 16(%3), %%mm2\n\t"
283 "movq 24(%3), %%mm3\n\t"
284 "movq 32(%3), %%mm4\n\t"
285 "movq 40(%3), %%mm5\n\t"
286 "movq 48(%3), %%mm6\n\t"
287 "movq 56(%3), %%mm7\n\t"
288 "packuswb %%mm1, %%mm0\n\t"
289 "packuswb %%mm3, %%mm2\n\t"
290 "packuswb %%mm5, %%mm4\n\t"
291 "packuswb %%mm7, %%mm6\n\t"
292 "movq %%mm0, (%0)\n\t"
293 "movq %%mm2, (%0, %1)\n\t"
294 "movq %%mm4, (%0, %1, 2)\n\t"
295 "movq %%mm6, (%0, %2)\n\t"
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
297 :"memory");
298 }
299
300 void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
301 {
302 const DCTELEM *p;
303 UINT8 *pix;
304 int i;
305
306 /* read the pixels */
307 p = block;
308 pix = pixels;
309 MOVQ_ZERO(mm7);
310 i = 4;
311 do {
312 __asm __volatile(
313 "movq (%2), %%mm0\n\t"
314 "movq 8(%2), %%mm1\n\t"
315 "movq 16(%2), %%mm2\n\t"
316 "movq 24(%2), %%mm3\n\t"
317 "movq %0, %%mm4\n\t"
318 "movq %1, %%mm6\n\t"
319 "movq %%mm4, %%mm5\n\t"
320 "punpcklbw %%mm7, %%mm4\n\t"
321 "punpckhbw %%mm7, %%mm5\n\t"
322 "paddsw %%mm4, %%mm0\n\t"
323 "paddsw %%mm5, %%mm1\n\t"
324 "movq %%mm6, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm6\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm6, %%mm2\n\t"
328 "paddsw %%mm5, %%mm3\n\t"
329 "packuswb %%mm1, %%mm0\n\t"
330 "packuswb %%mm3, %%mm2\n\t"
331 "movq %%mm0, %0\n\t"
332 "movq %%mm2, %1\n\t"
333 :"+m"(*pix), "+m"(*(pix+line_size))
334 :"r"(p)
335 :"memory");
336 pix += line_size*2;
337 p += 16;
338 } while (--i);
339 }
340
341 static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
342 {
343 __asm __volatile(
344 "lea (%3, %3), %%eax \n\t"
345 ".balign 8 \n\t"
346 "1: \n\t"
347 "movq (%1), %%mm0 \n\t"
348 "movq (%1, %3), %%mm1 \n\t"
349 "movq %%mm0, (%2) \n\t"
350 "movq %%mm1, (%2, %3) \n\t"
351 "addl %%eax, %1 \n\t"
352 "addl %%eax, %2 \n\t"
353 "movq (%1), %%mm0 \n\t"
354 "movq (%1, %3), %%mm1 \n\t"
355 "movq %%mm0, (%2) \n\t"
356 "movq %%mm1, (%2, %3) \n\t"
357 "addl %%eax, %1 \n\t"
358 "addl %%eax, %2 \n\t"
359 "subl $4, %0 \n\t"
360 "jnz 1b \n\t"
361 : "+g"(h), "+r" (pixels), "+r" (block)
362 : "r"(line_size)
363 : "%eax", "memory"
364 );
365 }
366
367 static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
368 {
369 __asm __volatile(
370 "lea (%3, %3), %%eax \n\t"
371 ".balign 8 \n\t"
372 "1: \n\t"
373 "movq (%1), %%mm0 \n\t"
374 "movq 8(%1), %%mm4 \n\t"
375 "movq (%1, %3), %%mm1 \n\t"
376 "movq 8(%1, %3), %%mm5 \n\t"
377 "movq %%mm0, (%2) \n\t"
378 "movq %%mm4, 8(%2) \n\t"
379 "movq %%mm1, (%2, %3) \n\t"
380 "movq %%mm5, 8(%2, %3) \n\t"
381 "addl %%eax, %1 \n\t"
382 "addl %%eax, %2 \n\t"
383 "movq (%1), %%mm0 \n\t"
384 "movq 8(%1), %%mm4 \n\t"
385 "movq (%1, %3), %%mm1 \n\t"
386 "movq 8(%1, %3), %%mm5 \n\t"
387 "movq %%mm0, (%2) \n\t"
388 "movq %%mm4, 8(%2) \n\t"
389 "movq %%mm1, (%2, %3) \n\t"
390 "movq %%mm5, 8(%2, %3) \n\t"
391 "addl %%eax, %1 \n\t"
392 "addl %%eax, %2 \n\t"
393 "subl $4, %0 \n\t"
394 "jnz 1b \n\t"
395 : "+g"(h), "+r" (pixels), "+r" (block)
396 : "r"(line_size)
397 : "%eax", "memory"
398 );
399 }
400
401 static void clear_blocks_mmx(DCTELEM *blocks)
402 {
403 __asm __volatile(
404 "pxor %%mm7, %%mm7 \n\t"
405 "movl $-128*6, %%eax \n\t"
406 "1: \n\t"
407 "movq %%mm7, (%0, %%eax) \n\t"
408 "movq %%mm7, 8(%0, %%eax) \n\t"
409 "movq %%mm7, 16(%0, %%eax) \n\t"
410 "movq %%mm7, 24(%0, %%eax) \n\t"
411 "addl $32, %%eax \n\t"
412 " js 1b \n\t"
413 : : "r" (((int)blocks)+128*6)
414 : "%eax"
415 );
416 }
417
418 static int pix_sum16_mmx(UINT8 * pix, int line_size){
419 const int h=16;
420 int sum;
421 int index= -line_size*h;
422
423 __asm __volatile(
424 "pxor %%mm7, %%mm7 \n\t"
425 "pxor %%mm6, %%mm6 \n\t"
426 "1: \n\t"
427 "movq (%2, %1), %%mm0 \n\t"
428 "movq (%2, %1), %%mm1 \n\t"
429 "movq 8(%2, %1), %%mm2 \n\t"
430 "movq 8(%2, %1), %%mm3 \n\t"
431 "punpcklbw %%mm7, %%mm0 \n\t"
432 "punpckhbw %%mm7, %%mm1 \n\t"
433 "punpcklbw %%mm7, %%mm2 \n\t"
434 "punpckhbw %%mm7, %%mm3 \n\t"
435 "paddw %%mm0, %%mm1 \n\t"
436 "paddw %%mm2, %%mm3 \n\t"
437 "paddw %%mm1, %%mm3 \n\t"
438 "paddw %%mm3, %%mm6 \n\t"
439 "addl %3, %1 \n\t"
440 " js 1b \n\t"
441 "movq %%mm6, %%mm5 \n\t"
442 "psrlq $32, %%mm6 \n\t"
443 "paddw %%mm5, %%mm6 \n\t"
444 "movq %%mm6, %%mm5 \n\t"
445 "psrlq $16, %%mm6 \n\t"
446 "paddw %%mm5, %%mm6 \n\t"
447 "movd %%mm6, %0 \n\t"
448 "andl $0xFFFF, %0 \n\t"
449 : "=&r" (sum), "+r" (index)
450 : "r" (pix - index), "r" (line_size)
451 );
452
453 return sum;
454 }
455
456 #if 0
457 static void just_return() { return; }
458 #endif
459
460 void dsputil_init_mmx(DSPContext* c, unsigned mask)
461 {
462 mm_flags = mm_support();
463 #if 0
464 fprintf(stderr, "libavcodec: CPU flags:");
465 if (mm_flags & MM_MMX)
466 fprintf(stderr, " mmx");
467 if (mm_flags & MM_MMXEXT)
468 fprintf(stderr, " mmxext");
469 if (mm_flags & MM_3DNOW)
470 fprintf(stderr, " 3dnow");
471 if (mm_flags & MM_SSE)
472 fprintf(stderr, " sse");
473 if (mm_flags & MM_SSE2)
474 fprintf(stderr, " sse2");
475 fprintf(stderr, "\n");
476 #endif
477
478 if (mm_flags & MM_MMX) {
479 c->get_pixels = get_pixels_mmx;
480 c->diff_pixels = diff_pixels_mmx;
481 c->put_pixels_clamped = put_pixels_clamped_mmx;
482 c->add_pixels_clamped = add_pixels_clamped_mmx;
483 c->clear_blocks = clear_blocks_mmx;
484 c->pix_sum = pix_sum16_mmx;
485
486 c->pix_abs16x16 = pix_abs16x16_mmx;
487 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
488 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
489 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
490 c->pix_abs8x8 = pix_abs8x8_mmx;
491 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
492 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
493 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
494
495 c->put_pixels_tab[0][0] = put_pixels16_mmx;
496 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
497 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
498 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
499
500 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
501 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
502 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
503 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
504
505 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
506 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
507 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
508 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
509
510 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
511 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
512 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
513 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
514
515 c->put_pixels_tab[1][0] = put_pixels8_mmx;
516 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
517 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
518 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
519
520 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
521 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
522 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
523 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
524
525 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
526 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
527 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
528 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
529
530 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
531 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
532 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
533 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
534
535 if (mm_flags & MM_MMXEXT) {
536 c->pix_abs16x16 = pix_abs16x16_mmx2;
537 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
538 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
539 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
540
541 c->pix_abs8x8 = pix_abs8x8_mmx2;
542 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
543 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
544 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
545
546 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
547 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
548 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
549 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
550
551 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
552 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
553 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
554 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
555
556 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
557 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
558 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
559 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
560
561 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
562 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
563 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
564 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
565 } else if (mm_flags & MM_3DNOW) {
566 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
567 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
568 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
569 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
570
571 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
572 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
573 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
574 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
575
576 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
577 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
578 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
579 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
580
581 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
582 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
583 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
584 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
585 }
586 }
587
588 #if 0
589 // for speed testing
590 get_pixels = just_return;
591 put_pixels_clamped = just_return;
592 add_pixels_clamped = just_return;
593
594 pix_abs16x16 = just_return;
595 pix_abs16x16_x2 = just_return;
596 pix_abs16x16_y2 = just_return;
597 pix_abs16x16_xy2 = just_return;
598
599 put_pixels_tab[0] = just_return;
600 put_pixels_tab[1] = just_return;
601 put_pixels_tab[2] = just_return;
602 put_pixels_tab[3] = just_return;
603
604 put_no_rnd_pixels_tab[0] = just_return;
605 put_no_rnd_pixels_tab[1] = just_return;
606 put_no_rnd_pixels_tab[2] = just_return;
607 put_no_rnd_pixels_tab[3] = just_return;
608
609 avg_pixels_tab[0] = just_return;
610 avg_pixels_tab[1] = just_return;
611 avg_pixels_tab[2] = just_return;
612 avg_pixels_tab[3] = just_return;
613
614 avg_no_rnd_pixels_tab[0] = just_return;
615 avg_no_rnd_pixels_tab[1] = just_return;
616 avg_no_rnd_pixels_tab[2] = just_return;
617 avg_no_rnd_pixels_tab[3] = just_return;
618
619 //av_fdct = just_return;
620 //ff_idct = just_return;
621 #endif
622 }
623
624 /* remove any non bit exact operation (testing purpose). NOTE that
625 this function should be kept as small as possible because it is
626 always difficult to test automatically non bit exact cases. */
627 void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
628 {
629 if (mm_flags & MM_MMX) {
630 /* MMX2 & 3DNOW */
631 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
632 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
633 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
634 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
635 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
636 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
637
638 if (mm_flags & MM_MMXEXT) {
639 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
640 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
641 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
642 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
643 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
644 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
645 }
646 }
647 }