* minor update
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
607dce96 24#include "../mangle.h"
de6d9b64 25
7d650cb5
FB
26int mm_flags; /* multimedia extension flags */
27
ba6802de
MN
28int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
32
33int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
37
38int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
42
43int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
47
8def0299
FB
48/* external functions, from idct_mmx.c */
49void ff_mmx_idct(DCTELEM *block);
50void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 51
de6d9b64 52/* pixel operations */
a7bd8797
MN
/* 8-byte-aligned constants that the kernels load into MMX registers. */

/* 0x01 in each of the eight bytes */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
/* 0x0001 in each of the four 16-bit words */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
/* 0x0002 in each of the four 16-bit words */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 58
d6a4c0b1
ZK
/* Emit an 8-byte alignment directive at the current code position. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear the MMX register named by regd (regd is written without '%'). */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE / MOVQ_WTWO are complete statements that set regd to four words
 * of 1 / 2.  MOVQ_BONE is different: it expands to a bare string fragment
 * meant to be pasted into a larger asm template, and regd is inserted
 * verbatim, so the caller supplies the full register text.  MOVQ_BONE is not
 * used in the part of the file visible here.
 */
#ifdef PIC
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* all bits set, then shift each word right by 15 -> 0x0001 per word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* as MOVQ_WONE, then shift each word left by one -> 0x0002 per word */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)

/* words of 0x0001 packed down to bytes -> 0x01 per byte */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"
#else
/* non-PIC build: load the constants directly from memory */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
#endif
85
a7bd8797 86
de6d9b64
FB
87/***********************************/
88/* 3Dnow specific */
89
90#define DEF(x) x ## _3dnow
91/* for Athlons PAVGUSB is preferred */
92#define PAVGB "pavgusb"
93
94#include "dsputil_mmx_avg.h"
95
96#undef DEF
97#undef PAVGB
98
99/***********************************/
100/* MMX2 specific */
101
607dce96 102#define DEF(x) x ## _mmx2
de6d9b64
FB
103
104/* Introduced only in MMX2 set */
105#define PAVGB "pavgb"
106
107#include "dsputil_mmx_avg.h"
108
109#undef DEF
110#undef PAVGB
111
112/***********************************/
113/* standard MMX */
114
115static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
116{
607dce96
MN
117 asm volatile(
118 "movl $-128, %%eax \n\t"
119 "pxor %%mm7, %%mm7 \n\t"
120 ".balign 16 \n\t"
121 "1: \n\t"
122 "movq (%0), %%mm0 \n\t"
123 "movq (%0, %2), %%mm2 \n\t"
124 "movq %%mm0, %%mm1 \n\t"
125 "movq %%mm2, %%mm3 \n\t"
126 "punpcklbw %%mm7, %%mm0 \n\t"
127 "punpckhbw %%mm7, %%mm1 \n\t"
128 "punpcklbw %%mm7, %%mm2 \n\t"
129 "punpckhbw %%mm7, %%mm3 \n\t"
130 "movq %%mm0, (%1, %%eax)\n\t"
131 "movq %%mm1, 8(%1, %%eax)\n\t"
132 "movq %%mm2, 16(%1, %%eax)\n\t"
133 "movq %%mm3, 24(%1, %%eax)\n\t"
134 "addl %3, %0 \n\t"
135 "addl $32, %%eax \n\t"
136 "js 1b \n\t"
137 : "+r" (pixels)
138 : "r" (block+64), "r" (line_size), "r" (line_size*2)
139 : "%eax"
140 );
de6d9b64
FB
141}
142
9dbcbd92
MN
143static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
144{
145 asm volatile(
607dce96 146 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 147 "movl $-128, %%eax \n\t"
607dce96 148 ".balign 16 \n\t"
9dbcbd92
MN
149 "1: \n\t"
150 "movq (%0), %%mm0 \n\t"
151 "movq (%1), %%mm2 \n\t"
152 "movq %%mm0, %%mm1 \n\t"
153 "movq %%mm2, %%mm3 \n\t"
154 "punpcklbw %%mm7, %%mm0 \n\t"
155 "punpckhbw %%mm7, %%mm1 \n\t"
156 "punpcklbw %%mm7, %%mm2 \n\t"
157 "punpckhbw %%mm7, %%mm3 \n\t"
158 "psubw %%mm2, %%mm0 \n\t"
159 "psubw %%mm3, %%mm1 \n\t"
160 "movq %%mm0, (%2, %%eax)\n\t"
161 "movq %%mm1, 8(%2, %%eax)\n\t"
162 "addl %3, %0 \n\t"
163 "addl %3, %1 \n\t"
164 "addl $16, %%eax \n\t"
165 "jnz 1b \n\t"
166 : "+r" (s1), "+r" (s2)
167 : "r" (block+64), "r" (stride)
168 : "%eax"
169 );
170}
171
de6d9b64
FB
172static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
173{
174 const DCTELEM *p;
175 UINT8 *pix;
de6d9b64
FB
176
177 /* read the pixels */
178 p = block;
179 pix = pixels;
d6a4c0b1 180 /* unrolled loop */
de6d9b64 181 __asm __volatile(
a822a479
NK
182 "movq %3, %%mm0\n\t"
183 "movq 8%3, %%mm1\n\t"
184 "movq 16%3, %%mm2\n\t"
185 "movq 24%3, %%mm3\n\t"
186 "movq 32%3, %%mm4\n\t"
187 "movq 40%3, %%mm5\n\t"
188 "movq 48%3, %%mm6\n\t"
189 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
190 "packuswb %%mm1, %%mm0\n\t"
191 "packuswb %%mm3, %%mm2\n\t"
192 "packuswb %%mm5, %%mm4\n\t"
193 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
194 "movq %%mm0, (%0)\n\t"
195 "movq %%mm2, (%0, %1)\n\t"
196 "movq %%mm4, (%0, %1, 2)\n\t"
197 "movq %%mm6, (%0, %2)\n\t"
198 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
199 :"memory");
200 pix += line_size*4;
201 p += 32;
d6a4c0b1
ZK
202
203 // if here would be an exact copy of the code above
204 // compiler would generate some very strange code
205 // thus using "r"
206 __asm __volatile(
207 "movq (%3), %%mm0\n\t"
208 "movq 8(%3), %%mm1\n\t"
209 "movq 16(%3), %%mm2\n\t"
210 "movq 24(%3), %%mm3\n\t"
211 "movq 32(%3), %%mm4\n\t"
212 "movq 40(%3), %%mm5\n\t"
213 "movq 48(%3), %%mm6\n\t"
214 "movq 56(%3), %%mm7\n\t"
215 "packuswb %%mm1, %%mm0\n\t"
216 "packuswb %%mm3, %%mm2\n\t"
217 "packuswb %%mm5, %%mm4\n\t"
218 "packuswb %%mm7, %%mm6\n\t"
219 "movq %%mm0, (%0)\n\t"
220 "movq %%mm2, (%0, %1)\n\t"
221 "movq %%mm4, (%0, %1, 2)\n\t"
222 "movq %%mm6, (%0, %2)\n\t"
223 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
224 :"memory");
de6d9b64
FB
225}
226
227static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
228{
229 const DCTELEM *p;
230 UINT8 *pix;
231 int i;
232
233 /* read the pixels */
234 p = block;
235 pix = pixels;
d6a4c0b1
ZK
236 MOVQ_ZERO(mm7);
237 i = 4;
cd8e5f96 238 do {
de6d9b64 239 __asm __volatile(
cd8e5f96
ZK
240 "movq (%2), %%mm0\n\t"
241 "movq 8(%2), %%mm1\n\t"
242 "movq 16(%2), %%mm2\n\t"
243 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
244 "movq %0, %%mm4\n\t"
245 "movq %1, %%mm6\n\t"
246 "movq %%mm4, %%mm5\n\t"
247 "punpcklbw %%mm7, %%mm4\n\t"
248 "punpckhbw %%mm7, %%mm5\n\t"
249 "paddsw %%mm4, %%mm0\n\t"
250 "paddsw %%mm5, %%mm1\n\t"
251 "movq %%mm6, %%mm5\n\t"
252 "punpcklbw %%mm7, %%mm6\n\t"
253 "punpckhbw %%mm7, %%mm5\n\t"
254 "paddsw %%mm6, %%mm2\n\t"
255 "paddsw %%mm5, %%mm3\n\t"
256 "packuswb %%mm1, %%mm0\n\t"
257 "packuswb %%mm3, %%mm2\n\t"
258 "movq %%mm0, %0\n\t"
259 "movq %%mm2, %1\n\t"
a822a479 260 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 261 :"r"(p)
de6d9b64
FB
262 :"memory");
263 pix += line_size*2;
264 p += 16;
cd8e5f96 265 } while (--i);
de6d9b64
FB
266}
267
268static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
269{
607dce96
MN
270#if 0 //FIXME h==4 case
271 asm volatile(
272 "xorl %%eax, %%eax \n\t"
273 "movl %3, %%esi \n\t"
274 "1: \n\t"
275 "movq (%1, %%eax), %%mm0 \n\t"
276 "movq %%mm0, (%0, %%eax) \n\t"
277 "addl %2, %%eax \n\t"
278 "movq (%1, %%eax), %%mm0 \n\t"
279 "movq %%mm0, (%0, %%eax) \n\t"
280 "addl %2, %%eax \n\t"
281 "movq (%1, %%eax), %%mm0 \n\t"
282 "movq %%mm0, (%0, %%eax) \n\t"
283 "addl %2, %%eax \n\t"
284 "movq (%1, %%eax), %%mm0 \n\t"
285 "movq %%mm0, (%0, %%eax) \n\t"
286 "addl %2, %%eax \n\t"
287 "movq (%1, %%eax), %%mm0 \n\t"
288 "movq %%mm0, (%0, %%eax) \n\t"
289 "addl %2, %%eax \n\t"
290 "movq (%1, %%eax), %%mm0 \n\t"
291 "movq %%mm0, (%0, %%eax) \n\t"
292 "addl %2, %%eax \n\t"
293 "movq (%1, %%eax), %%mm0 \n\t"
294 "movq %%mm0, (%0, %%eax) \n\t"
295 "addl %2, %%eax \n\t"
296 "movq (%1, %%eax), %%mm0 \n\t"
297 "movq %%mm0, (%0, %%eax) \n\t"
298 "addl %2, %%eax \n\t"
299 "subl $8, %%esi \n\t"
300 " jnz 1b \n\t"
301 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
302 : "%eax", "%esi", "memory"
303 );
d6a4c0b1 304#else
607dce96
MN
305 asm volatile(
306 "xorl %%eax, %%eax \n\t"
307 "movl %3, %%esi \n\t"
308 "1: \n\t"
309 "movq (%1, %%eax), %%mm0 \n\t"
310 "movq %%mm0, (%0, %%eax) \n\t"
311 "addl %2, %%eax \n\t"
312 "movq (%1, %%eax), %%mm0 \n\t"
313 "movq %%mm0, (%0, %%eax) \n\t"
314 "addl %2, %%eax \n\t"
315 "movq (%1, %%eax), %%mm0 \n\t"
316 "movq %%mm0, (%0, %%eax) \n\t"
317 "addl %2, %%eax \n\t"
318 "movq (%1, %%eax), %%mm0 \n\t"
319 "movq %%mm0, (%0, %%eax) \n\t"
320 "addl %2, %%eax \n\t"
321 "subl $4, %%esi \n\t"
322 " jnz 1b \n\t"
323 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
324 : "%eax", "%esi", "memory"
325 );
d6a4c0b1 326#endif
de6d9b64
FB
327}
328
329static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
330{
331 UINT8 *p;
332 const UINT8 *pix;
333 p = block;
334 pix = pixels;
d6a4c0b1
ZK
335 MOVQ_ZERO(mm7);
336 MOVQ_WONE(mm4);
337 JUMPALIGN();
de6d9b64
FB
338 do {
339 __asm __volatile(
340 "movq %1, %%mm0\n\t"
341 "movq 1%1, %%mm1\n\t"
342 "movq %%mm0, %%mm2\n\t"
343 "movq %%mm1, %%mm3\n\t"
344 "punpcklbw %%mm7, %%mm0\n\t"
345 "punpcklbw %%mm7, %%mm1\n\t"
346 "punpckhbw %%mm7, %%mm2\n\t"
347 "punpckhbw %%mm7, %%mm3\n\t"
348 "paddusw %%mm1, %%mm0\n\t"
349 "paddusw %%mm3, %%mm2\n\t"
350 "paddusw %%mm4, %%mm0\n\t"
351 "paddusw %%mm4, %%mm2\n\t"
352 "psrlw $1, %%mm0\n\t"
353 "psrlw $1, %%mm2\n\t"
354 "packuswb %%mm2, %%mm0\n\t"
355 "movq %%mm0, %0\n\t"
356 :"=m"(*p)
357 :"m"(*pix)
358 :"memory");
359 pix += line_size; p += line_size;
360 } while (--h);
de6d9b64
FB
361}
362
363static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
364{
365 UINT8 *p;
366 const UINT8 *pix;
367 p = block;
368 pix = pixels;
d6a4c0b1
ZK
369 MOVQ_ZERO(mm7);
370 MOVQ_WONE(mm4);
371 JUMPALIGN();
de6d9b64
FB
372 do {
373 __asm __volatile(
374 "movq %1, %%mm0\n\t"
375 "movq %2, %%mm1\n\t"
376 "movq %%mm0, %%mm2\n\t"
377 "movq %%mm1, %%mm3\n\t"
378 "punpcklbw %%mm7, %%mm0\n\t"
379 "punpcklbw %%mm7, %%mm1\n\t"
380 "punpckhbw %%mm7, %%mm2\n\t"
381 "punpckhbw %%mm7, %%mm3\n\t"
382 "paddusw %%mm1, %%mm0\n\t"
383 "paddusw %%mm3, %%mm2\n\t"
384 "paddusw %%mm4, %%mm0\n\t"
385 "paddusw %%mm4, %%mm2\n\t"
386 "psrlw $1, %%mm0\n\t"
387 "psrlw $1, %%mm2\n\t"
388 "packuswb %%mm2, %%mm0\n\t"
389 "movq %%mm0, %0\n\t"
390 :"=m"(*p)
391 :"m"(*pix),
392 "m"(*(pix+line_size))
393 :"memory");
394 pix += line_size;
395 p += line_size;
396 } while (--h);
de6d9b64
FB
397}
398
399static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
400{
401 UINT8 *p;
402 const UINT8 *pix;
403 p = block;
d6a4c0b1
ZK
404 pix = pixels; // 1s
405 MOVQ_ZERO(mm7);
406 MOVQ_WTWO(mm6);
407 JUMPALIGN();
de6d9b64
FB
408 do {
409 __asm __volatile(
410 "movq %1, %%mm0\n\t"
411 "movq %2, %%mm1\n\t"
412 "movq 1%1, %%mm4\n\t"
413 "movq 1%2, %%mm5\n\t"
414 "movq %%mm0, %%mm2\n\t"
415 "movq %%mm1, %%mm3\n\t"
416 "punpcklbw %%mm7, %%mm0\n\t"
417 "punpcklbw %%mm7, %%mm1\n\t"
418 "punpckhbw %%mm7, %%mm2\n\t"
419 "punpckhbw %%mm7, %%mm3\n\t"
420 "paddusw %%mm1, %%mm0\n\t"
421 "paddusw %%mm3, %%mm2\n\t"
422 "movq %%mm4, %%mm1\n\t"
423 "movq %%mm5, %%mm3\n\t"
424 "punpcklbw %%mm7, %%mm4\n\t"
425 "punpcklbw %%mm7, %%mm5\n\t"
426 "punpckhbw %%mm7, %%mm1\n\t"
427 "punpckhbw %%mm7, %%mm3\n\t"
428 "paddusw %%mm5, %%mm4\n\t"
429 "paddusw %%mm3, %%mm1\n\t"
430 "paddusw %%mm6, %%mm4\n\t"
431 "paddusw %%mm6, %%mm1\n\t"
432 "paddusw %%mm4, %%mm0\n\t"
433 "paddusw %%mm1, %%mm2\n\t"
434 "psrlw $2, %%mm0\n\t"
435 "psrlw $2, %%mm2\n\t"
436 "packuswb %%mm2, %%mm0\n\t"
437 "movq %%mm0, %0\n\t"
438 :"=m"(*p)
439 :"m"(*pix),
440 "m"(*(pix+line_size))
441 :"memory");
442 pix += line_size;
443 p += line_size;
444 } while(--h);
de6d9b64
FB
445}
446
447static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
448{
449 UINT8 *p;
450 const UINT8 *pix;
451 p = block;
452 pix = pixels;
d6a4c0b1 453 MOVQ_ZERO(mm7);
de6d9b64
FB
454 do {
455 __asm __volatile(
456 "movq %1, %%mm0\n\t"
457 "movq 1%1, %%mm1\n\t"
458 "movq %%mm0, %%mm2\n\t"
459 "movq %%mm1, %%mm3\n\t"
460 "punpcklbw %%mm7, %%mm0\n\t"
461 "punpcklbw %%mm7, %%mm1\n\t"
462 "punpckhbw %%mm7, %%mm2\n\t"
463 "punpckhbw %%mm7, %%mm3\n\t"
464 "paddusw %%mm1, %%mm0\n\t"
465 "paddusw %%mm3, %%mm2\n\t"
466 "psrlw $1, %%mm0\n\t"
467 "psrlw $1, %%mm2\n\t"
468 "packuswb %%mm2, %%mm0\n\t"
469 "movq %%mm0, %0\n\t"
470 :"=m"(*p)
471 :"m"(*pix)
472 :"memory");
473 pix += line_size;
474 p += line_size;
475 } while (--h);
de6d9b64
FB
476}
477
478static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
479{
480 UINT8 *p;
481 const UINT8 *pix;
482 p = block;
483 pix = pixels;
d6a4c0b1
ZK
484 MOVQ_ZERO(mm7);
485 JUMPALIGN();
de6d9b64
FB
486 do {
487 __asm __volatile(
488 "movq %1, %%mm0\n\t"
489 "movq %2, %%mm1\n\t"
490 "movq %%mm0, %%mm2\n\t"
491 "movq %%mm1, %%mm3\n\t"
492 "punpcklbw %%mm7, %%mm0\n\t"
493 "punpcklbw %%mm7, %%mm1\n\t"
494 "punpckhbw %%mm7, %%mm2\n\t"
495 "punpckhbw %%mm7, %%mm3\n\t"
496 "paddusw %%mm1, %%mm0\n\t"
497 "paddusw %%mm3, %%mm2\n\t"
498 "psrlw $1, %%mm0\n\t"
499 "psrlw $1, %%mm2\n\t"
500 "packuswb %%mm2, %%mm0\n\t"
501 "movq %%mm0, %0\n\t"
502 :"=m"(*p)
503 :"m"(*pix),
504 "m"(*(pix+line_size))
505 :"memory");
506 pix += line_size;
507 p += line_size;
508 } while(--h);
de6d9b64
FB
509}
510
511static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
512{
513 UINT8 *p;
514 const UINT8 *pix;
515 p = block;
516 pix = pixels;
d6a4c0b1
ZK
517 MOVQ_ZERO(mm7);
518 MOVQ_WONE(mm6);
519 JUMPALIGN();
de6d9b64
FB
520 do {
521 __asm __volatile(
522 "movq %1, %%mm0\n\t"
523 "movq %2, %%mm1\n\t"
524 "movq 1%1, %%mm4\n\t"
525 "movq 1%2, %%mm5\n\t"
526 "movq %%mm0, %%mm2\n\t"
527 "movq %%mm1, %%mm3\n\t"
528 "punpcklbw %%mm7, %%mm0\n\t"
529 "punpcklbw %%mm7, %%mm1\n\t"
530 "punpckhbw %%mm7, %%mm2\n\t"
531 "punpckhbw %%mm7, %%mm3\n\t"
532 "paddusw %%mm1, %%mm0\n\t"
533 "paddusw %%mm3, %%mm2\n\t"
534 "movq %%mm4, %%mm1\n\t"
535 "movq %%mm5, %%mm3\n\t"
536 "punpcklbw %%mm7, %%mm4\n\t"
537 "punpcklbw %%mm7, %%mm5\n\t"
538 "punpckhbw %%mm7, %%mm1\n\t"
539 "punpckhbw %%mm7, %%mm3\n\t"
540 "paddusw %%mm5, %%mm4\n\t"
541 "paddusw %%mm3, %%mm1\n\t"
542 "paddusw %%mm6, %%mm4\n\t"
543 "paddusw %%mm6, %%mm1\n\t"
544 "paddusw %%mm4, %%mm0\n\t"
545 "paddusw %%mm1, %%mm2\n\t"
546 "psrlw $2, %%mm0\n\t"
547 "psrlw $2, %%mm2\n\t"
548 "packuswb %%mm2, %%mm0\n\t"
549 "movq %%mm0, %0\n\t"
550 :"=m"(*p)
551 :"m"(*pix),
552 "m"(*(pix+line_size))
553 :"memory");
554 pix += line_size;
555 p += line_size;
556 } while(--h);
de6d9b64
FB
557}
558
559static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
560{
561 UINT8 *p;
562 const UINT8 *pix;
563 p = block;
564 pix = pixels;
d6a4c0b1
ZK
565 MOVQ_ZERO(mm7);
566 MOVQ_WONE(mm6);
567 JUMPALIGN();
de6d9b64
FB
568 do {
569 __asm __volatile(
570 "movq %0, %%mm0\n\t"
571 "movq %1, %%mm1\n\t"
572 "movq %%mm0, %%mm2\n\t"
573 "movq %%mm1, %%mm3\n\t"
574 "punpcklbw %%mm7, %%mm0\n\t"
575 "punpcklbw %%mm7, %%mm1\n\t"
576 "punpckhbw %%mm7, %%mm2\n\t"
577 "punpckhbw %%mm7, %%mm3\n\t"
578 "paddusw %%mm1, %%mm0\n\t"
579 "paddusw %%mm3, %%mm2\n\t"
580 "paddusw %%mm6, %%mm0\n\t"
581 "paddusw %%mm6, %%mm2\n\t"
582 "psrlw $1, %%mm0\n\t"
583 "psrlw $1, %%mm2\n\t"
584 "packuswb %%mm2, %%mm0\n\t"
585 "movq %%mm0, %0\n\t"
a822a479 586 :"+m"(*p)
de6d9b64
FB
587 :"m"(*pix)
588 :"memory");
589 pix += line_size;
590 p += line_size;
591 }
592 while (--h);
de6d9b64
FB
593}
594
595static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
596{
597 UINT8 *p;
598 const UINT8 *pix;
599 p = block;
600 pix = pixels;
d6a4c0b1
ZK
601 MOVQ_ZERO(mm7);
602 MOVQ_WONE(mm6);
603 JUMPALIGN();
de6d9b64
FB
604 do {
605 __asm __volatile(
606 "movq %1, %%mm1\n\t"
607 "movq %0, %%mm0\n\t"
608 "movq 1%1, %%mm4\n\t"
609 "movq %%mm0, %%mm2\n\t"
610 "movq %%mm1, %%mm3\n\t"
611 "movq %%mm4, %%mm5\n\t"
612 "punpcklbw %%mm7, %%mm1\n\t"
613 "punpckhbw %%mm7, %%mm3\n\t"
614 "punpcklbw %%mm7, %%mm4\n\t"
615 "punpckhbw %%mm7, %%mm5\n\t"
616 "punpcklbw %%mm7, %%mm0\n\t"
617 "punpckhbw %%mm7, %%mm2\n\t"
618 "paddusw %%mm4, %%mm1\n\t"
619 "paddusw %%mm5, %%mm3\n\t"
620 "paddusw %%mm6, %%mm1\n\t"
621 "paddusw %%mm6, %%mm3\n\t"
622 "psrlw $1, %%mm1\n\t"
623 "psrlw $1, %%mm3\n\t"
624 "paddusw %%mm6, %%mm0\n\t"
625 "paddusw %%mm6, %%mm2\n\t"
626 "paddusw %%mm1, %%mm0\n\t"
627 "paddusw %%mm3, %%mm2\n\t"
628 "psrlw $1, %%mm0\n\t"
629 "psrlw $1, %%mm2\n\t"
630 "packuswb %%mm2, %%mm0\n\t"
631 "movq %%mm0, %0\n\t"
a822a479 632 :"+m"(*p)
de6d9b64
FB
633 :"m"(*pix)
634 :"memory");
635 pix += line_size;
636 p += line_size;
637 } while (--h);
de6d9b64
FB
638}
639
640static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
641{
642 UINT8 *p;
643 const UINT8 *pix;
644 p = block;
645 pix = pixels;
d6a4c0b1
ZK
646 MOVQ_ZERO(mm7);
647 MOVQ_WONE(mm6);
648 JUMPALIGN();
de6d9b64
FB
649 do {
650 __asm __volatile(
651 "movq %1, %%mm1\n\t"
652 "movq %0, %%mm0\n\t"
653 "movq %2, %%mm4\n\t"
654 "movq %%mm0, %%mm2\n\t"
655 "movq %%mm1, %%mm3\n\t"
656 "movq %%mm4, %%mm5\n\t"
657 "punpcklbw %%mm7, %%mm1\n\t"
658 "punpckhbw %%mm7, %%mm3\n\t"
659 "punpcklbw %%mm7, %%mm4\n\t"
660 "punpckhbw %%mm7, %%mm5\n\t"
661 "punpcklbw %%mm7, %%mm0\n\t"
662 "punpckhbw %%mm7, %%mm2\n\t"
663 "paddusw %%mm4, %%mm1\n\t"
664 "paddusw %%mm5, %%mm3\n\t"
665 "paddusw %%mm6, %%mm1\n\t"
666 "paddusw %%mm6, %%mm3\n\t"
667 "psrlw $1, %%mm1\n\t"
668 "psrlw $1, %%mm3\n\t"
669 "paddusw %%mm6, %%mm0\n\t"
670 "paddusw %%mm6, %%mm2\n\t"
671 "paddusw %%mm1, %%mm0\n\t"
672 "paddusw %%mm3, %%mm2\n\t"
673 "psrlw $1, %%mm0\n\t"
674 "psrlw $1, %%mm2\n\t"
675 "packuswb %%mm2, %%mm0\n\t"
676 "movq %%mm0, %0\n\t"
a822a479 677 :"+m"(*p)
de6d9b64
FB
678 :"m"(*pix), "m"(*(pix+line_size))
679 :"memory");
680 pix += line_size;
681 p += line_size ;
682 } while(--h);
de6d9b64
FB
683}
684
685static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
686{
687 UINT8 *p;
688 const UINT8 *pix;
689 p = block;
690 pix = pixels;
d6a4c0b1
ZK
691 MOVQ_ZERO(mm7);
692 // this doesn't seem to be used offten - so
693 // the inside usage of mm_wone is not optimized
694 MOVQ_WTWO(mm6);
de6d9b64
FB
695 do {
696 __asm __volatile(
697 "movq %1, %%mm0\n\t"
698 "movq %2, %%mm1\n\t"
699 "movq 1%1, %%mm4\n\t"
700 "movq 1%2, %%mm5\n\t"
701 "movq %%mm0, %%mm2\n\t"
702 "movq %%mm1, %%mm3\n\t"
703 "punpcklbw %%mm7, %%mm0\n\t"
704 "punpcklbw %%mm7, %%mm1\n\t"
705 "punpckhbw %%mm7, %%mm2\n\t"
706 "punpckhbw %%mm7, %%mm3\n\t"
707 "paddusw %%mm1, %%mm0\n\t"
708 "paddusw %%mm3, %%mm2\n\t"
709 "movq %%mm4, %%mm1\n\t"
710 "movq %%mm5, %%mm3\n\t"
711 "punpcklbw %%mm7, %%mm4\n\t"
712 "punpcklbw %%mm7, %%mm5\n\t"
713 "punpckhbw %%mm7, %%mm1\n\t"
714 "punpckhbw %%mm7, %%mm3\n\t"
715 "paddusw %%mm5, %%mm4\n\t"
716 "paddusw %%mm3, %%mm1\n\t"
717 "paddusw %%mm6, %%mm4\n\t"
718 "paddusw %%mm6, %%mm1\n\t"
719 "paddusw %%mm4, %%mm0\n\t"
720 "paddusw %%mm1, %%mm2\n\t"
721 "movq %3, %%mm5\n\t"
722 "psrlw $2, %%mm0\n\t"
723 "movq %0, %%mm1\n\t"
724 "psrlw $2, %%mm2\n\t"
725 "movq %%mm1, %%mm3\n\t"
726 "punpcklbw %%mm7, %%mm1\n\t"
727 "punpckhbw %%mm7, %%mm3\n\t"
728 "paddusw %%mm1, %%mm0\n\t"
729 "paddusw %%mm3, %%mm2\n\t"
730 "paddusw %%mm5, %%mm0\n\t"
731 "paddusw %%mm5, %%mm2\n\t"
732 "psrlw $1, %%mm0\n\t"
733 "psrlw $1, %%mm2\n\t"
734 "packuswb %%mm2, %%mm0\n\t"
735 "movq %%mm0, %0\n\t"
a822a479 736 :"+m"(*p)
de6d9b64 737 :"m"(*pix),
a9b3f630 738 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
739 :"memory");
740 pix += line_size;
741 p += line_size ;
742 } while(--h);
de6d9b64
FB
743}
744
745static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
746{
747 UINT8 *p;
748 const UINT8 *pix;
749 p = block;
750 pix = pixels;
d6a4c0b1 751 MOVQ_ZERO(mm7);
de6d9b64
FB
752 do {
753 __asm __volatile(
754 "movq %1, %%mm0\n\t"
755 "movq %0, %%mm1\n\t"
756 "movq %%mm0, %%mm2\n\t"
757 "movq %%mm1, %%mm3\n\t"
758 "punpcklbw %%mm7, %%mm0\n\t"
759 "punpcklbw %%mm7, %%mm1\n\t"
760 "punpckhbw %%mm7, %%mm2\n\t"
761 "punpckhbw %%mm7, %%mm3\n\t"
762 "paddusw %%mm1, %%mm0\n\t"
763 "paddusw %%mm3, %%mm2\n\t"
764 "psrlw $1, %%mm0\n\t"
765 "psrlw $1, %%mm2\n\t"
766 "packuswb %%mm2, %%mm0\n\t"
767 "movq %%mm0, %0\n\t"
a822a479 768 :"+m"(*p)
de6d9b64
FB
769 :"m"(*pix)
770 :"memory");
771 pix += line_size;
772 p += line_size ;
773 } while (--h);
de6d9b64
FB
774}
775
776static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
777{
778 UINT8 *p;
779 const UINT8 *pix;
780 p = block;
781 pix = pixels;
d6a4c0b1 782 MOVQ_ZERO(mm7);
de6d9b64
FB
783 do {
784 __asm __volatile(
785 "movq %1, %%mm0\n\t"
786 "movq 1%1, %%mm1\n\t"
787 "movq %0, %%mm4\n\t"
788 "movq %%mm0, %%mm2\n\t"
789 "movq %%mm1, %%mm3\n\t"
790 "movq %%mm4, %%mm5\n\t"
791 "punpcklbw %%mm7, %%mm0\n\t"
792 "punpcklbw %%mm7, %%mm1\n\t"
793 "punpckhbw %%mm7, %%mm2\n\t"
794 "punpckhbw %%mm7, %%mm3\n\t"
795 "punpcklbw %%mm7, %%mm4\n\t"
796 "punpckhbw %%mm7, %%mm5\n\t"
797 "paddusw %%mm1, %%mm0\n\t"
798 "paddusw %%mm3, %%mm2\n\t"
799 "psrlw $1, %%mm0\n\t"
800 "psrlw $1, %%mm2\n\t"
801 "paddusw %%mm4, %%mm0\n\t"
802 "paddusw %%mm5, %%mm2\n\t"
803 "psrlw $1, %%mm0\n\t"
804 "psrlw $1, %%mm2\n\t"
805 "packuswb %%mm2, %%mm0\n\t"
806 "movq %%mm0, %0\n\t"
a822a479 807 :"+m"(*p)
de6d9b64
FB
808 :"m"(*pix)
809 :"memory");
810 pix += line_size;
811 p += line_size;
812 } while (--h);
de6d9b64
FB
813}
814
815static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
816{
817 UINT8 *p;
818 const UINT8 *pix;
819 p = block;
820 pix = pixels;
d6a4c0b1 821 MOVQ_ZERO(mm7);
de6d9b64
FB
822 do {
823 __asm __volatile(
824 "movq %1, %%mm0\n\t"
825 "movq %2, %%mm1\n\t"
826 "movq %0, %%mm4\n\t"
827 "movq %%mm0, %%mm2\n\t"
828 "movq %%mm1, %%mm3\n\t"
829 "movq %%mm4, %%mm5\n\t"
830 "punpcklbw %%mm7, %%mm0\n\t"
831 "punpcklbw %%mm7, %%mm1\n\t"
832 "punpckhbw %%mm7, %%mm2\n\t"
833 "punpckhbw %%mm7, %%mm3\n\t"
834 "punpcklbw %%mm7, %%mm4\n\t"
835 "punpckhbw %%mm7, %%mm5\n\t"
836 "paddusw %%mm1, %%mm0\n\t"
837 "paddusw %%mm3, %%mm2\n\t"
838 "psrlw $1, %%mm0\n\t"
839 "psrlw $1, %%mm2\n\t"
840 "paddusw %%mm4, %%mm0\n\t"
841 "paddusw %%mm5, %%mm2\n\t"
842 "psrlw $1, %%mm0\n\t"
843 "psrlw $1, %%mm2\n\t"
844 "packuswb %%mm2, %%mm0\n\t"
845 "movq %%mm0, %0\n\t"
a822a479 846 :"+m"(*p)
de6d9b64
FB
847 :"m"(*pix), "m"(*(pix+line_size))
848 :"memory");
849 pix += line_size;
850 p += line_size ;
851 } while(--h);
de6d9b64
FB
852}
853
854static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
855{
856 UINT8 *p;
857 const UINT8 *pix;
858 p = block;
859 pix = pixels;
d6a4c0b1
ZK
860 MOVQ_ZERO(mm7);
861 MOVQ_WONE(mm6);
862 JUMPALIGN();
de6d9b64
FB
863 do {
864 __asm __volatile(
865 "movq %1, %%mm0\n\t"
866 "movq %2, %%mm1\n\t"
867 "movq 1%1, %%mm4\n\t"
868 "movq 1%2, %%mm5\n\t"
869 "movq %%mm0, %%mm2\n\t"
870 "movq %%mm1, %%mm3\n\t"
871 "punpcklbw %%mm7, %%mm0\n\t"
872 "punpcklbw %%mm7, %%mm1\n\t"
873 "punpckhbw %%mm7, %%mm2\n\t"
874 "punpckhbw %%mm7, %%mm3\n\t"
875 "paddusw %%mm1, %%mm0\n\t"
876 "paddusw %%mm3, %%mm2\n\t"
877 "movq %%mm4, %%mm1\n\t"
878 "movq %%mm5, %%mm3\n\t"
879 "punpcklbw %%mm7, %%mm4\n\t"
880 "punpcklbw %%mm7, %%mm5\n\t"
881 "punpckhbw %%mm7, %%mm1\n\t"
882 "punpckhbw %%mm7, %%mm3\n\t"
883 "paddusw %%mm5, %%mm4\n\t"
884 "paddusw %%mm3, %%mm1\n\t"
885 "paddusw %%mm6, %%mm4\n\t"
886 "paddusw %%mm6, %%mm1\n\t"
887 "paddusw %%mm4, %%mm0\n\t"
888 "paddusw %%mm1, %%mm2\n\t"
889 "movq %0, %%mm1\n\t"
890 "psrlw $2, %%mm0\n\t"
891 "movq %%mm1, %%mm3\n\t"
892 "psrlw $2, %%mm2\n\t"
893 "punpcklbw %%mm7, %%mm1\n\t"
894 "punpckhbw %%mm7, %%mm3\n\t"
895 "paddusw %%mm1, %%mm0\n\t"
896 "paddusw %%mm3, %%mm2\n\t"
897 "psrlw $1, %%mm0\n\t"
898 "psrlw $1, %%mm2\n\t"
899 "packuswb %%mm2, %%mm0\n\t"
900 "movq %%mm0, %0\n\t"
a822a479 901 :"+m"(*p)
de6d9b64
FB
902 :"m"(*pix),
903 "m"(*(pix+line_size))
904 :"memory");
905 pix += line_size;
906 p += line_size;
907 } while(--h);
de6d9b64
FB
908}
909
649c00c9
MN
910static void clear_blocks_mmx(DCTELEM *blocks)
911{
912 asm volatile(
913 "pxor %%mm7, %%mm7 \n\t"
914 "movl $-128*6, %%eax \n\t"
915 "1: \n\t"
916 "movq %%mm7, (%0, %%eax) \n\t"
917 "movq %%mm7, 8(%0, %%eax) \n\t"
918 "movq %%mm7, 16(%0, %%eax) \n\t"
919 "movq %%mm7, 24(%0, %%eax) \n\t"
920 "addl $32, %%eax \n\t"
921 " js 1b \n\t"
922 : : "r" (((int)blocks)+128*6)
923 : "%eax"
924 );
925}
926
#if 0
/* Empty stand-in used by the disabled speed-test section of
 * dsputil_init_mmx(). */
static void just_return() { }
#endif
d6a4c0b1 930
dcb9cd4b 931#ifndef TESTCPU_MAIN
de6d9b64
FB
932void dsputil_init_mmx(void)
933{
934 mm_flags = mm_support();
f4470e09
MN
935#if 1
936 printf("libavcodec: CPU flags:");
de6d9b64
FB
937 if (mm_flags & MM_MMX)
938 printf(" mmx");
939 if (mm_flags & MM_MMXEXT)
940 printf(" mmxext");
941 if (mm_flags & MM_3DNOW)
942 printf(" 3dnow");
943 if (mm_flags & MM_SSE)
944 printf(" sse");
945 if (mm_flags & MM_SSE2)
946 printf(" sse2");
947 printf("\n");
948#endif
949
950 if (mm_flags & MM_MMX) {
951 get_pixels = get_pixels_mmx;
9dbcbd92 952 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
953 put_pixels_clamped = put_pixels_clamped_mmx;
954 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 955 clear_blocks= clear_blocks_mmx;
dcb9cd4b 956
ba6802de
MN
957 pix_abs16x16 = pix_abs16x16_mmx;
958 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
959 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 960 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
961 pix_abs8x8 = pix_abs8x8_mmx;
962 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
963 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
964 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
965 av_fdct = fdct_mmx;
966
967 put_pixels_tab[0] = put_pixels_mmx;
968 put_pixels_tab[1] = put_pixels_x2_mmx;
969 put_pixels_tab[2] = put_pixels_y2_mmx;
970 put_pixels_tab[3] = put_pixels_xy2_mmx;
971
972 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
973 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
974 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
975 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
dcb9cd4b 976
de6d9b64
FB
977 avg_pixels_tab[0] = avg_pixels_mmx;
978 avg_pixels_tab[1] = avg_pixels_x2_mmx;
979 avg_pixels_tab[2] = avg_pixels_y2_mmx;
980 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
981
982 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
983 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
984 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
985 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
607dce96 986
de6d9b64 987 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
988 pix_abs16x16 = pix_abs16x16_mmx2;
989 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
990 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
991 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 992
ba6802de
MN
993 pix_abs8x8 = pix_abs8x8_mmx2;
994 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
995 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
996 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96
MN
997
998 put_pixels_tab[1] = put_pixels_x2_mmx2;
999 put_pixels_tab[2] = put_pixels_y2_mmx2;
1000 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
1001 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
dcb9cd4b 1002
607dce96
MN
1003 avg_pixels_tab[0] = avg_pixels_mmx2;
1004 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
1005 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
1006 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
de6d9b64
FB
1007 } else if (mm_flags & MM_3DNOW) {
1008 put_pixels_tab[1] = put_pixels_x2_3dnow;
1009 put_pixels_tab[2] = put_pixels_y2_3dnow;
607dce96
MN
1010 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
1011 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
61a4e8ae 1012
de6d9b64
FB
1013 avg_pixels_tab[0] = avg_pixels_3dnow;
1014 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1015 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1016 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
de6d9b64 1017 }
4af7bcc1 1018
8def0299
FB
1019 /* idct */
1020 if (mm_flags & MM_MMXEXT) {
1021 ff_idct = ff_mmxext_idct;
1022 } else {
1023 ff_idct = ff_mmx_idct;
1024 }
d962f6fd
A
1025#ifdef SIMPLE_IDCT
1026// ff_idct = simple_idct;
1027 ff_idct = simple_idct_mmx;
1028#endif
de6d9b64 1029 }
d6a4c0b1
ZK
1030
1031#if 0
1032 // for speed testing
1033 get_pixels = just_return;
1034 put_pixels_clamped = just_return;
1035 add_pixels_clamped = just_return;
1036
1037 pix_abs16x16 = just_return;
1038 pix_abs16x16_x2 = just_return;
1039 pix_abs16x16_y2 = just_return;
1040 pix_abs16x16_xy2 = just_return;
1041
1042 put_pixels_tab[0] = just_return;
1043 put_pixels_tab[1] = just_return;
1044 put_pixels_tab[2] = just_return;
1045 put_pixels_tab[3] = just_return;
1046
1047 put_no_rnd_pixels_tab[0] = just_return;
1048 put_no_rnd_pixels_tab[1] = just_return;
1049 put_no_rnd_pixels_tab[2] = just_return;
1050 put_no_rnd_pixels_tab[3] = just_return;
1051
1052 avg_pixels_tab[0] = just_return;
1053 avg_pixels_tab[1] = just_return;
1054 avg_pixels_tab[2] = just_return;
1055 avg_pixels_tab[3] = just_return;
1056
1057 avg_no_rnd_pixels_tab[0] = just_return;
1058 avg_no_rnd_pixels_tab[1] = just_return;
1059 avg_no_rnd_pixels_tab[2] = just_return;
1060 avg_no_rnd_pixels_tab[3] = just_return;
1061
d6a4c0b1
ZK
1062 //av_fdct = just_return;
1063 //ff_idct = just_return;
1064#endif
de6d9b64 1065}
4f12a497
FB
1066
1067/* remove any non bit exact operation (testing purpose). NOTE that
1068 this function should be kept as small as possible because it is
1069 always difficult to test automatically non bit exact cases. */
1070void dsputil_set_bit_exact_mmx(void)
1071{
1072 if (mm_flags & MM_MMX) {
1073 if (mm_flags & MM_MMXEXT) {
1074 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1075 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1076 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1077 } else if (mm_flags & MM_3DNOW) {
1078 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1079 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1080 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1081 }
1082 }
1083}
dcb9cd4b 1084
dcb9cd4b 1085#endif