* new mmx code - based upon http://aggregate.org/MAGIC
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
de6d9b64 51/* pixel operations */
def60345 52static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL;
a7bd8797
MN
53static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
54static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
55static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
a9b3f630
NK
56//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
57//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
de6d9b64 58
d6a4c0b1
ZK
59#define JUMPALIGN() __asm __volatile (".balign 8"::)
60#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
61
62#ifndef PIC
63#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
64#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
a7bd8797 65#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
def60345 66#define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t"
d6a4c0b1
ZK
67#else
68// for shared library it's better to use this way for accessing constants
69// pcmpeqd -> -1
70#define MOVQ_WONE(regd) \
71 __asm __volatile ( \
72 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
73 "psrlw $15, %%" #regd ::)
74
75#define MOVQ_WTWO(regd) \
76 __asm __volatile ( \
77 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
78 "psrlw $15, %%" #regd " \n\t" \
79 "psllw $1, %%" #regd ::)
a7bd8797
MN
80
81#define MOVQ_BONE(regd) \
82 "pcmpeqd " #regd ", " #regd " \n\t" \
83 "psrlw $15, " #regd " \n\t"\
84 "packuswb " #regd ", " #regd " \n\t"
def60345
ZK
85
86#define MOVQ_BFE(regd) \
87 "pcmpeqd " #regd ", " #regd " \n\t"\
88 "paddb " #regd ", " #regd " \n\t"
d6a4c0b1
ZK
89#endif
90
def60345
ZK
91// using mm6 as temporary and for the output result
92// first argument is unmodified and second is trashed
93// mm7 is supposed to contain 0xfefefefefefefefe
94#define PAVG_MMX_NO_RND(rega, regb) \
95 "movq " #rega ", %%mm6 \n\t"\
96 "pand " #regb ", %%mm6 \n\t"\
97 "pxor " #rega ", " #regb " \n\t"\
98 "pand %%mm7, " #regb " \n\t"\
99 "psrlq $1, " #regb " \n\t"\
100 "paddb " #regb ", %%mm6 \n\t"
101
102#define PAVG_MMX(rega, regb) \
103 "movq " #rega ", %%mm6 \n\t"\
104 "por " #regb ", %%mm6 \n\t"\
105 "pxor " #rega ", " #regb " \n\t"\
106 "pand %%mm7, " #regb " \n\t"\
107 "psrlq $1, " #regb " \n\t"\
108 "psubb " #regb ", %%mm6 \n\t"
109
a7bd8797 110
de6d9b64
FB
111/***********************************/
112/* 3Dnow specific */
113
114#define DEF(x) x ## _3dnow
115/* for Athlons PAVGUSB is preferred */
116#define PAVGB "pavgusb"
117
118#include "dsputil_mmx_avg.h"
119
120#undef DEF
121#undef PAVGB
122
123/***********************************/
124/* MMX2 specific */
125
607dce96 126#define DEF(x) x ## _mmx2
de6d9b64
FB
127
128/* Introduced only in MMX2 set */
129#define PAVGB "pavgb"
130
131#include "dsputil_mmx_avg.h"
132
133#undef DEF
134#undef PAVGB
135
136/***********************************/
137/* standard MMX */
138
139static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
140{
607dce96
MN
141 asm volatile(
142 "movl $-128, %%eax \n\t"
143 "pxor %%mm7, %%mm7 \n\t"
144 ".balign 16 \n\t"
145 "1: \n\t"
146 "movq (%0), %%mm0 \n\t"
147 "movq (%0, %2), %%mm2 \n\t"
148 "movq %%mm0, %%mm1 \n\t"
149 "movq %%mm2, %%mm3 \n\t"
150 "punpcklbw %%mm7, %%mm0 \n\t"
151 "punpckhbw %%mm7, %%mm1 \n\t"
152 "punpcklbw %%mm7, %%mm2 \n\t"
153 "punpckhbw %%mm7, %%mm3 \n\t"
154 "movq %%mm0, (%1, %%eax)\n\t"
155 "movq %%mm1, 8(%1, %%eax)\n\t"
156 "movq %%mm2, 16(%1, %%eax)\n\t"
157 "movq %%mm3, 24(%1, %%eax)\n\t"
158 "addl %3, %0 \n\t"
159 "addl $32, %%eax \n\t"
160 "js 1b \n\t"
161 : "+r" (pixels)
162 : "r" (block+64), "r" (line_size), "r" (line_size*2)
163 : "%eax"
164 );
de6d9b64
FB
165}
166
9dbcbd92
MN
167static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
168{
169 asm volatile(
607dce96 170 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 171 "movl $-128, %%eax \n\t"
607dce96 172 ".balign 16 \n\t"
9dbcbd92
MN
173 "1: \n\t"
174 "movq (%0), %%mm0 \n\t"
175 "movq (%1), %%mm2 \n\t"
176 "movq %%mm0, %%mm1 \n\t"
177 "movq %%mm2, %%mm3 \n\t"
178 "punpcklbw %%mm7, %%mm0 \n\t"
179 "punpckhbw %%mm7, %%mm1 \n\t"
180 "punpcklbw %%mm7, %%mm2 \n\t"
181 "punpckhbw %%mm7, %%mm3 \n\t"
182 "psubw %%mm2, %%mm0 \n\t"
183 "psubw %%mm3, %%mm1 \n\t"
184 "movq %%mm0, (%2, %%eax)\n\t"
185 "movq %%mm1, 8(%2, %%eax)\n\t"
186 "addl %3, %0 \n\t"
187 "addl %3, %1 \n\t"
188 "addl $16, %%eax \n\t"
189 "jnz 1b \n\t"
190 : "+r" (s1), "+r" (s2)
191 : "r" (block+64), "r" (stride)
192 : "%eax"
193 );
194}
195
de6d9b64
FB
196static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
197{
198 const DCTELEM *p;
199 UINT8 *pix;
de6d9b64
FB
200
201 /* read the pixels */
202 p = block;
203 pix = pixels;
d6a4c0b1 204 /* unrolled loop */
de6d9b64 205 __asm __volatile(
a822a479
NK
206 "movq %3, %%mm0\n\t"
207 "movq 8%3, %%mm1\n\t"
208 "movq 16%3, %%mm2\n\t"
209 "movq 24%3, %%mm3\n\t"
210 "movq 32%3, %%mm4\n\t"
211 "movq 40%3, %%mm5\n\t"
212 "movq 48%3, %%mm6\n\t"
213 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
214 "packuswb %%mm1, %%mm0\n\t"
215 "packuswb %%mm3, %%mm2\n\t"
216 "packuswb %%mm5, %%mm4\n\t"
217 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
218 "movq %%mm0, (%0)\n\t"
219 "movq %%mm2, (%0, %1)\n\t"
220 "movq %%mm4, (%0, %1, 2)\n\t"
221 "movq %%mm6, (%0, %2)\n\t"
222 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
223 :"memory");
224 pix += line_size*4;
225 p += 32;
d6a4c0b1
ZK
226
227 // if here would be an exact copy of the code above
228 // compiler would generate some very strange code
229 // thus using "r"
230 __asm __volatile(
231 "movq (%3), %%mm0\n\t"
232 "movq 8(%3), %%mm1\n\t"
233 "movq 16(%3), %%mm2\n\t"
234 "movq 24(%3), %%mm3\n\t"
235 "movq 32(%3), %%mm4\n\t"
236 "movq 40(%3), %%mm5\n\t"
237 "movq 48(%3), %%mm6\n\t"
238 "movq 56(%3), %%mm7\n\t"
239 "packuswb %%mm1, %%mm0\n\t"
240 "packuswb %%mm3, %%mm2\n\t"
241 "packuswb %%mm5, %%mm4\n\t"
242 "packuswb %%mm7, %%mm6\n\t"
243 "movq %%mm0, (%0)\n\t"
244 "movq %%mm2, (%0, %1)\n\t"
245 "movq %%mm4, (%0, %1, 2)\n\t"
246 "movq %%mm6, (%0, %2)\n\t"
247 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
248 :"memory");
de6d9b64
FB
249}
250
251static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
252{
253 const DCTELEM *p;
254 UINT8 *pix;
255 int i;
256
257 /* read the pixels */
258 p = block;
259 pix = pixels;
d6a4c0b1
ZK
260 MOVQ_ZERO(mm7);
261 i = 4;
cd8e5f96 262 do {
de6d9b64 263 __asm __volatile(
cd8e5f96
ZK
264 "movq (%2), %%mm0\n\t"
265 "movq 8(%2), %%mm1\n\t"
266 "movq 16(%2), %%mm2\n\t"
267 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
268 "movq %0, %%mm4\n\t"
269 "movq %1, %%mm6\n\t"
270 "movq %%mm4, %%mm5\n\t"
271 "punpcklbw %%mm7, %%mm4\n\t"
272 "punpckhbw %%mm7, %%mm5\n\t"
273 "paddsw %%mm4, %%mm0\n\t"
274 "paddsw %%mm5, %%mm1\n\t"
275 "movq %%mm6, %%mm5\n\t"
276 "punpcklbw %%mm7, %%mm6\n\t"
277 "punpckhbw %%mm7, %%mm5\n\t"
278 "paddsw %%mm6, %%mm2\n\t"
279 "paddsw %%mm5, %%mm3\n\t"
280 "packuswb %%mm1, %%mm0\n\t"
281 "packuswb %%mm3, %%mm2\n\t"
282 "movq %%mm0, %0\n\t"
283 "movq %%mm2, %1\n\t"
a822a479 284 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 285 :"r"(p)
de6d9b64
FB
286 :"memory");
287 pix += line_size*2;
288 p += 16;
cd8e5f96 289 } while (--i);
de6d9b64
FB
290}
291
/*
 * Copy h rows of 8 bytes each from pixels to block; both use line_size as
 * the byte distance between rows.
 * Each loop iteration copies four rows and subtracts 4 from h, leaving the
 * loop only when h reaches exactly zero.
 */
292static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
293{
31ddcf98
ZK
294 asm volatile
295 (
/* eax = 2 * line_size, the step after each pair of rows */
296 "lea (%3, %3), %%eax \n\t"
52af45ad 297 ".balign 8 \n\t"
31ddcf98
ZK
298 "1: \n\t"
/* rows 0 and 1 */
299 "movq (%1), %%mm0 \n\t"
300 "movq (%1, %3), %%mm1 \n\t"
301 "movq %%mm0, (%2) \n\t"
302 "movq %%mm1, (%2, %3) \n\t"
303 "addl %%eax, %1 \n\t"
304 "addl %%eax, %2 \n\t"
/* rows 2 and 3 */
305 "movq (%1), %%mm0 \n\t"
306 "movq (%1, %3), %%mm1 \n\t"
307 "movq %%mm0, (%2) \n\t"
308 "movq %%mm1, (%2, %3) \n\t"
309 "addl %%eax, %1 \n\t"
310 "addl %%eax, %2 \n\t"
311 "subl $4, %0 \n\t"
312 "jnz 1b \n\t"
/* %0 = h, %1 = pixels (source), %2 = block (destination), %3 = line_size */
313 : "+g"(h), "+r" (pixels), "+r" (block)
314 : "r"(line_size)
315 : "%eax", "memory"
316 );
de6d9b64
FB
317}
318
def60345
ZK
319// it will have to be checked whether it's better to have bigger
320// unrolled code also on Celerons - for now yes
321#define LONG_UNROLL 1
de6d9b64
FB
322static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
323{
def60345 324#if 0
de6d9b64
FB
325 UINT8 *p;
326 const UINT8 *pix;
327 p = block;
328 pix = pixels;
d6a4c0b1
ZK
329 MOVQ_ZERO(mm7);
330 MOVQ_WONE(mm4);
331 JUMPALIGN();
de6d9b64 332 do {
def60345 333 __asm __volatile(
de6d9b64
FB
334 "movq %1, %%mm0\n\t"
335 "movq 1%1, %%mm1\n\t"
336 "movq %%mm0, %%mm2\n\t"
337 "movq %%mm1, %%mm3\n\t"
338 "punpcklbw %%mm7, %%mm0\n\t"
339 "punpcklbw %%mm7, %%mm1\n\t"
340 "punpckhbw %%mm7, %%mm2\n\t"
341 "punpckhbw %%mm7, %%mm3\n\t"
342 "paddusw %%mm1, %%mm0\n\t"
343 "paddusw %%mm3, %%mm2\n\t"
344 "paddusw %%mm4, %%mm0\n\t"
345 "paddusw %%mm4, %%mm2\n\t"
346 "psrlw $1, %%mm0\n\t"
347 "psrlw $1, %%mm2\n\t"
348 "packuswb %%mm2, %%mm0\n\t"
349 "movq %%mm0, %0\n\t"
350 :"=m"(*p)
351 :"m"(*pix)
def60345 352 :"memory");
de6d9b64
FB
353 pix += line_size; p += line_size;
354 } while (--h);
def60345
ZK
355#else
356 __asm __volatile(
357 MOVQ_BFE(%%mm7)
358 "lea (%3, %3), %%eax \n\t"
359 ".balign 8 \n\t"
360 "1: \n\t"
361 "movq (%1), %%mm0 \n\t"
362 "movq (%1, %3), %%mm2 \n\t"
363 "movq 1(%1), %%mm1 \n\t"
364 "movq 1(%1, %3), %%mm3 \n\t"
365 PAVG_MMX(%%mm0, %%mm1)
366 "movq %%mm6, (%2) \n\t"
367 PAVG_MMX(%%mm2, %%mm3)
368 "movq %%mm6, (%2, %3) \n\t"
369 "addl %%eax, %1 \n\t"
370 "addl %%eax, %2 \n\t"
371#if LONG_UNROLL
372 "movq (%1), %%mm0 \n\t"
373 "movq (%1, %3), %%mm2 \n\t"
374 "movq 1(%1), %%mm1 \n\t"
375 "movq 1(%1, %3), %%mm3 \n\t"
376 PAVG_MMX(%%mm0, %%mm1)
377 "movq %%mm6, (%2) \n\t"
378 PAVG_MMX(%%mm2, %%mm3)
379 "movq %%mm6, (%2, %3) \n\t"
380 "addl %%eax, %1 \n\t"
381 "addl %%eax, %2 \n\t"
382 "subl $4, %0 \n\t"
383#else
384 "subl $2, %0 \n\t"
385#endif
386 "jnz 1b \n\t"
387 :"+g"(h), "+S"(pixels), "+D"(block)
388 :"r"(line_size)
389 :"eax", "memory");
390#endif
de6d9b64
FB
391}
392
393static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
394{
def60345 395#if 0
de6d9b64
FB
396 UINT8 *p;
397 const UINT8 *pix;
398 p = block;
399 pix = pixels;
d6a4c0b1
ZK
400 MOVQ_ZERO(mm7);
401 MOVQ_WONE(mm4);
402 JUMPALIGN();
de6d9b64
FB
403 do {
404 __asm __volatile(
405 "movq %1, %%mm0\n\t"
406 "movq %2, %%mm1\n\t"
407 "movq %%mm0, %%mm2\n\t"
408 "movq %%mm1, %%mm3\n\t"
409 "punpcklbw %%mm7, %%mm0\n\t"
410 "punpcklbw %%mm7, %%mm1\n\t"
411 "punpckhbw %%mm7, %%mm2\n\t"
412 "punpckhbw %%mm7, %%mm3\n\t"
413 "paddusw %%mm1, %%mm0\n\t"
414 "paddusw %%mm3, %%mm2\n\t"
415 "paddusw %%mm4, %%mm0\n\t"
416 "paddusw %%mm4, %%mm2\n\t"
417 "psrlw $1, %%mm0\n\t"
418 "psrlw $1, %%mm2\n\t"
419 "packuswb %%mm2, %%mm0\n\t"
420 "movq %%mm0, %0\n\t"
421 :"=m"(*p)
422 :"m"(*pix),
423 "m"(*(pix+line_size))
424 :"memory");
425 pix += line_size;
426 p += line_size;
427 } while (--h);
def60345
ZK
428#else
429 __asm __volatile(
430 MOVQ_BFE(%%mm7)
431 "lea (%3, %3), %%eax \n\t"
432 "movq (%1), %%mm0 \n\t"
433 ".balign 8 \n\t"
434 "1: \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq (%1, %%eax),%%mm2 \n\t"
437 PAVG_MMX(%%mm1, %%mm0)
438 "movq %%mm6, (%2) \n\t"
439 PAVG_MMX(%%mm2, %%mm1)
440 "movq %%mm6, (%2, %3) \n\t"
441 "addl %%eax, %1 \n\t"
442 "addl %%eax, %2 \n\t"
443#ifdef LONG_UNROLL
444 "movq (%1, %3), %%mm1 \n\t"
445 "movq (%1, %%eax),%%mm0 \n\t"
446 PAVG_MMX(%%mm1, %%mm2)
447 "movq %%mm6, (%2) \n\t"
448 PAVG_MMX(%%mm0, %%mm1)
449 "movq %%mm6, (%2, %3) \n\t"
450 "addl %%eax, %1 \n\t"
451 "addl %%eax, %2 \n\t"
452 "subl $4, %0 \n\t"
453#else
454 "subl $2, %0 \n\t"
455#endif
456 "jnz 1b \n\t"
457 :"+g"(h), "+S"(pixels), "+D"(block)
458 :"r"(line_size)
459 :"eax", "memory");
460#endif
461
462
de6d9b64
FB
463}
464
465static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
466{
467 UINT8 *p;
468 const UINT8 *pix;
469 p = block;
d6a4c0b1
ZK
470 pix = pixels; // 1s
471 MOVQ_ZERO(mm7);
472 MOVQ_WTWO(mm6);
473 JUMPALIGN();
de6d9b64
FB
474 do {
475 __asm __volatile(
476 "movq %1, %%mm0\n\t"
477 "movq %2, %%mm1\n\t"
478 "movq 1%1, %%mm4\n\t"
479 "movq 1%2, %%mm5\n\t"
480 "movq %%mm0, %%mm2\n\t"
481 "movq %%mm1, %%mm3\n\t"
482 "punpcklbw %%mm7, %%mm0\n\t"
483 "punpcklbw %%mm7, %%mm1\n\t"
484 "punpckhbw %%mm7, %%mm2\n\t"
485 "punpckhbw %%mm7, %%mm3\n\t"
486 "paddusw %%mm1, %%mm0\n\t"
487 "paddusw %%mm3, %%mm2\n\t"
488 "movq %%mm4, %%mm1\n\t"
489 "movq %%mm5, %%mm3\n\t"
490 "punpcklbw %%mm7, %%mm4\n\t"
491 "punpcklbw %%mm7, %%mm5\n\t"
492 "punpckhbw %%mm7, %%mm1\n\t"
493 "punpckhbw %%mm7, %%mm3\n\t"
494 "paddusw %%mm5, %%mm4\n\t"
495 "paddusw %%mm3, %%mm1\n\t"
496 "paddusw %%mm6, %%mm4\n\t"
497 "paddusw %%mm6, %%mm1\n\t"
498 "paddusw %%mm4, %%mm0\n\t"
499 "paddusw %%mm1, %%mm2\n\t"
500 "psrlw $2, %%mm0\n\t"
501 "psrlw $2, %%mm2\n\t"
502 "packuswb %%mm2, %%mm0\n\t"
503 "movq %%mm0, %0\n\t"
504 :"=m"(*p)
505 :"m"(*pix),
506 "m"(*(pix+line_size))
507 :"memory");
508 pix += line_size;
509 p += line_size;
510 } while(--h);
de6d9b64
FB
511}
512
513static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
514{
515 UINT8 *p;
516 const UINT8 *pix;
517 p = block;
518 pix = pixels;
d6a4c0b1 519 MOVQ_ZERO(mm7);
de6d9b64
FB
520 do {
521 __asm __volatile(
522 "movq %1, %%mm0\n\t"
523 "movq 1%1, %%mm1\n\t"
524 "movq %%mm0, %%mm2\n\t"
525 "movq %%mm1, %%mm3\n\t"
526 "punpcklbw %%mm7, %%mm0\n\t"
527 "punpcklbw %%mm7, %%mm1\n\t"
528 "punpckhbw %%mm7, %%mm2\n\t"
529 "punpckhbw %%mm7, %%mm3\n\t"
530 "paddusw %%mm1, %%mm0\n\t"
531 "paddusw %%mm3, %%mm2\n\t"
532 "psrlw $1, %%mm0\n\t"
533 "psrlw $1, %%mm2\n\t"
534 "packuswb %%mm2, %%mm0\n\t"
535 "movq %%mm0, %0\n\t"
536 :"=m"(*p)
537 :"m"(*pix)
538 :"memory");
539 pix += line_size;
540 p += line_size;
541 } while (--h);
de6d9b64
FB
542}
543
544static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
545{
546 UINT8 *p;
547 const UINT8 *pix;
548 p = block;
549 pix = pixels;
d6a4c0b1
ZK
550 MOVQ_ZERO(mm7);
551 JUMPALIGN();
de6d9b64
FB
552 do {
553 __asm __volatile(
554 "movq %1, %%mm0\n\t"
555 "movq %2, %%mm1\n\t"
556 "movq %%mm0, %%mm2\n\t"
557 "movq %%mm1, %%mm3\n\t"
558 "punpcklbw %%mm7, %%mm0\n\t"
559 "punpcklbw %%mm7, %%mm1\n\t"
560 "punpckhbw %%mm7, %%mm2\n\t"
561 "punpckhbw %%mm7, %%mm3\n\t"
562 "paddusw %%mm1, %%mm0\n\t"
563 "paddusw %%mm3, %%mm2\n\t"
564 "psrlw $1, %%mm0\n\t"
565 "psrlw $1, %%mm2\n\t"
566 "packuswb %%mm2, %%mm0\n\t"
567 "movq %%mm0, %0\n\t"
568 :"=m"(*p)
569 :"m"(*pix),
570 "m"(*(pix+line_size))
571 :"memory");
572 pix += line_size;
573 p += line_size;
574 } while(--h);
de6d9b64
FB
575}
576
577static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
578{
579 UINT8 *p;
580 const UINT8 *pix;
581 p = block;
582 pix = pixels;
d6a4c0b1
ZK
583 MOVQ_ZERO(mm7);
584 MOVQ_WONE(mm6);
585 JUMPALIGN();
de6d9b64
FB
586 do {
587 __asm __volatile(
588 "movq %1, %%mm0\n\t"
589 "movq %2, %%mm1\n\t"
590 "movq 1%1, %%mm4\n\t"
591 "movq 1%2, %%mm5\n\t"
592 "movq %%mm0, %%mm2\n\t"
593 "movq %%mm1, %%mm3\n\t"
594 "punpcklbw %%mm7, %%mm0\n\t"
595 "punpcklbw %%mm7, %%mm1\n\t"
596 "punpckhbw %%mm7, %%mm2\n\t"
597 "punpckhbw %%mm7, %%mm3\n\t"
598 "paddusw %%mm1, %%mm0\n\t"
599 "paddusw %%mm3, %%mm2\n\t"
600 "movq %%mm4, %%mm1\n\t"
601 "movq %%mm5, %%mm3\n\t"
602 "punpcklbw %%mm7, %%mm4\n\t"
603 "punpcklbw %%mm7, %%mm5\n\t"
604 "punpckhbw %%mm7, %%mm1\n\t"
605 "punpckhbw %%mm7, %%mm3\n\t"
606 "paddusw %%mm5, %%mm4\n\t"
607 "paddusw %%mm3, %%mm1\n\t"
608 "paddusw %%mm6, %%mm4\n\t"
609 "paddusw %%mm6, %%mm1\n\t"
610 "paddusw %%mm4, %%mm0\n\t"
611 "paddusw %%mm1, %%mm2\n\t"
612 "psrlw $2, %%mm0\n\t"
613 "psrlw $2, %%mm2\n\t"
614 "packuswb %%mm2, %%mm0\n\t"
615 "movq %%mm0, %0\n\t"
616 :"=m"(*p)
617 :"m"(*pix),
618 "m"(*(pix+line_size))
619 :"memory");
620 pix += line_size;
621 p += line_size;
622 } while(--h);
de6d9b64
FB
623}
624
625static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
626{
627 UINT8 *p;
628 const UINT8 *pix;
629 p = block;
630 pix = pixels;
d6a4c0b1
ZK
631 MOVQ_ZERO(mm7);
632 MOVQ_WONE(mm6);
633 JUMPALIGN();
de6d9b64
FB
634 do {
635 __asm __volatile(
636 "movq %0, %%mm0\n\t"
637 "movq %1, %%mm1\n\t"
638 "movq %%mm0, %%mm2\n\t"
639 "movq %%mm1, %%mm3\n\t"
640 "punpcklbw %%mm7, %%mm0\n\t"
641 "punpcklbw %%mm7, %%mm1\n\t"
642 "punpckhbw %%mm7, %%mm2\n\t"
643 "punpckhbw %%mm7, %%mm3\n\t"
644 "paddusw %%mm1, %%mm0\n\t"
645 "paddusw %%mm3, %%mm2\n\t"
646 "paddusw %%mm6, %%mm0\n\t"
647 "paddusw %%mm6, %%mm2\n\t"
648 "psrlw $1, %%mm0\n\t"
649 "psrlw $1, %%mm2\n\t"
650 "packuswb %%mm2, %%mm0\n\t"
651 "movq %%mm0, %0\n\t"
a822a479 652 :"+m"(*p)
de6d9b64
FB
653 :"m"(*pix)
654 :"memory");
655 pix += line_size;
656 p += line_size;
657 }
658 while (--h);
de6d9b64
FB
659}
660
661static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
662{
663 UINT8 *p;
664 const UINT8 *pix;
665 p = block;
666 pix = pixels;
d6a4c0b1
ZK
667 MOVQ_ZERO(mm7);
668 MOVQ_WONE(mm6);
669 JUMPALIGN();
de6d9b64
FB
670 do {
671 __asm __volatile(
672 "movq %1, %%mm1\n\t"
673 "movq %0, %%mm0\n\t"
674 "movq 1%1, %%mm4\n\t"
675 "movq %%mm0, %%mm2\n\t"
676 "movq %%mm1, %%mm3\n\t"
677 "movq %%mm4, %%mm5\n\t"
678 "punpcklbw %%mm7, %%mm1\n\t"
679 "punpckhbw %%mm7, %%mm3\n\t"
680 "punpcklbw %%mm7, %%mm4\n\t"
681 "punpckhbw %%mm7, %%mm5\n\t"
682 "punpcklbw %%mm7, %%mm0\n\t"
683 "punpckhbw %%mm7, %%mm2\n\t"
684 "paddusw %%mm4, %%mm1\n\t"
685 "paddusw %%mm5, %%mm3\n\t"
686 "paddusw %%mm6, %%mm1\n\t"
687 "paddusw %%mm6, %%mm3\n\t"
688 "psrlw $1, %%mm1\n\t"
689 "psrlw $1, %%mm3\n\t"
690 "paddusw %%mm6, %%mm0\n\t"
691 "paddusw %%mm6, %%mm2\n\t"
692 "paddusw %%mm1, %%mm0\n\t"
693 "paddusw %%mm3, %%mm2\n\t"
694 "psrlw $1, %%mm0\n\t"
695 "psrlw $1, %%mm2\n\t"
696 "packuswb %%mm2, %%mm0\n\t"
697 "movq %%mm0, %0\n\t"
a822a479 698 :"+m"(*p)
de6d9b64
FB
699 :"m"(*pix)
700 :"memory");
701 pix += line_size;
702 p += line_size;
703 } while (--h);
de6d9b64
FB
704}
705
706static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
707{
708 UINT8 *p;
709 const UINT8 *pix;
710 p = block;
711 pix = pixels;
d6a4c0b1
ZK
712 MOVQ_ZERO(mm7);
713 MOVQ_WONE(mm6);
714 JUMPALIGN();
de6d9b64
FB
715 do {
716 __asm __volatile(
717 "movq %1, %%mm1\n\t"
718 "movq %0, %%mm0\n\t"
719 "movq %2, %%mm4\n\t"
720 "movq %%mm0, %%mm2\n\t"
721 "movq %%mm1, %%mm3\n\t"
722 "movq %%mm4, %%mm5\n\t"
723 "punpcklbw %%mm7, %%mm1\n\t"
724 "punpckhbw %%mm7, %%mm3\n\t"
725 "punpcklbw %%mm7, %%mm4\n\t"
726 "punpckhbw %%mm7, %%mm5\n\t"
727 "punpcklbw %%mm7, %%mm0\n\t"
728 "punpckhbw %%mm7, %%mm2\n\t"
729 "paddusw %%mm4, %%mm1\n\t"
730 "paddusw %%mm5, %%mm3\n\t"
731 "paddusw %%mm6, %%mm1\n\t"
732 "paddusw %%mm6, %%mm3\n\t"
733 "psrlw $1, %%mm1\n\t"
734 "psrlw $1, %%mm3\n\t"
735 "paddusw %%mm6, %%mm0\n\t"
736 "paddusw %%mm6, %%mm2\n\t"
737 "paddusw %%mm1, %%mm0\n\t"
738 "paddusw %%mm3, %%mm2\n\t"
739 "psrlw $1, %%mm0\n\t"
740 "psrlw $1, %%mm2\n\t"
741 "packuswb %%mm2, %%mm0\n\t"
742 "movq %%mm0, %0\n\t"
a822a479 743 :"+m"(*p)
de6d9b64
FB
744 :"m"(*pix), "m"(*(pix+line_size))
745 :"memory");
746 pix += line_size;
747 p += line_size ;
748 } while(--h);
de6d9b64
FB
749}
750
751static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
752{
753 UINT8 *p;
754 const UINT8 *pix;
755 p = block;
756 pix = pixels;
d6a4c0b1
ZK
757 MOVQ_ZERO(mm7);
758 // this doesn't seem to be used often - so
759 // the inside usage of mm_wone is not optimized
760 MOVQ_WTWO(mm6);
de6d9b64
FB
761 do {
762 __asm __volatile(
763 "movq %1, %%mm0\n\t"
764 "movq %2, %%mm1\n\t"
765 "movq 1%1, %%mm4\n\t"
766 "movq 1%2, %%mm5\n\t"
767 "movq %%mm0, %%mm2\n\t"
768 "movq %%mm1, %%mm3\n\t"
769 "punpcklbw %%mm7, %%mm0\n\t"
770 "punpcklbw %%mm7, %%mm1\n\t"
771 "punpckhbw %%mm7, %%mm2\n\t"
772 "punpckhbw %%mm7, %%mm3\n\t"
773 "paddusw %%mm1, %%mm0\n\t"
774 "paddusw %%mm3, %%mm2\n\t"
775 "movq %%mm4, %%mm1\n\t"
776 "movq %%mm5, %%mm3\n\t"
777 "punpcklbw %%mm7, %%mm4\n\t"
778 "punpcklbw %%mm7, %%mm5\n\t"
779 "punpckhbw %%mm7, %%mm1\n\t"
780 "punpckhbw %%mm7, %%mm3\n\t"
781 "paddusw %%mm5, %%mm4\n\t"
782 "paddusw %%mm3, %%mm1\n\t"
783 "paddusw %%mm6, %%mm4\n\t"
784 "paddusw %%mm6, %%mm1\n\t"
785 "paddusw %%mm4, %%mm0\n\t"
786 "paddusw %%mm1, %%mm2\n\t"
787 "movq %3, %%mm5\n\t"
788 "psrlw $2, %%mm0\n\t"
789 "movq %0, %%mm1\n\t"
790 "psrlw $2, %%mm2\n\t"
791 "movq %%mm1, %%mm3\n\t"
792 "punpcklbw %%mm7, %%mm1\n\t"
793 "punpckhbw %%mm7, %%mm3\n\t"
794 "paddusw %%mm1, %%mm0\n\t"
795 "paddusw %%mm3, %%mm2\n\t"
796 "paddusw %%mm5, %%mm0\n\t"
797 "paddusw %%mm5, %%mm2\n\t"
798 "psrlw $1, %%mm0\n\t"
799 "psrlw $1, %%mm2\n\t"
800 "packuswb %%mm2, %%mm0\n\t"
801 "movq %%mm0, %0\n\t"
a822a479 802 :"+m"(*p)
de6d9b64 803 :"m"(*pix),
a9b3f630 804 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
805 :"memory");
806 pix += line_size;
807 p += line_size ;
808 } while(--h);
de6d9b64
FB
809}
810
811static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
812{
813 UINT8 *p;
814 const UINT8 *pix;
815 p = block;
816 pix = pixels;
d6a4c0b1 817 MOVQ_ZERO(mm7);
de6d9b64
FB
818 do {
819 __asm __volatile(
820 "movq %1, %%mm0\n\t"
821 "movq %0, %%mm1\n\t"
822 "movq %%mm0, %%mm2\n\t"
823 "movq %%mm1, %%mm3\n\t"
824 "punpcklbw %%mm7, %%mm0\n\t"
825 "punpcklbw %%mm7, %%mm1\n\t"
826 "punpckhbw %%mm7, %%mm2\n\t"
827 "punpckhbw %%mm7, %%mm3\n\t"
828 "paddusw %%mm1, %%mm0\n\t"
829 "paddusw %%mm3, %%mm2\n\t"
830 "psrlw $1, %%mm0\n\t"
831 "psrlw $1, %%mm2\n\t"
832 "packuswb %%mm2, %%mm0\n\t"
833 "movq %%mm0, %0\n\t"
a822a479 834 :"+m"(*p)
de6d9b64
FB
835 :"m"(*pix)
836 :"memory");
837 pix += line_size;
838 p += line_size ;
839 } while (--h);
de6d9b64
FB
840}
841
842static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
843{
844 UINT8 *p;
845 const UINT8 *pix;
846 p = block;
847 pix = pixels;
d6a4c0b1 848 MOVQ_ZERO(mm7);
de6d9b64
FB
849 do {
850 __asm __volatile(
851 "movq %1, %%mm0\n\t"
852 "movq 1%1, %%mm1\n\t"
853 "movq %0, %%mm4\n\t"
854 "movq %%mm0, %%mm2\n\t"
855 "movq %%mm1, %%mm3\n\t"
856 "movq %%mm4, %%mm5\n\t"
857 "punpcklbw %%mm7, %%mm0\n\t"
858 "punpcklbw %%mm7, %%mm1\n\t"
859 "punpckhbw %%mm7, %%mm2\n\t"
860 "punpckhbw %%mm7, %%mm3\n\t"
861 "punpcklbw %%mm7, %%mm4\n\t"
862 "punpckhbw %%mm7, %%mm5\n\t"
863 "paddusw %%mm1, %%mm0\n\t"
864 "paddusw %%mm3, %%mm2\n\t"
865 "psrlw $1, %%mm0\n\t"
866 "psrlw $1, %%mm2\n\t"
867 "paddusw %%mm4, %%mm0\n\t"
868 "paddusw %%mm5, %%mm2\n\t"
869 "psrlw $1, %%mm0\n\t"
870 "psrlw $1, %%mm2\n\t"
871 "packuswb %%mm2, %%mm0\n\t"
872 "movq %%mm0, %0\n\t"
a822a479 873 :"+m"(*p)
de6d9b64
FB
874 :"m"(*pix)
875 :"memory");
876 pix += line_size;
877 p += line_size;
878 } while (--h);
de6d9b64
FB
879}
880
881static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
882{
883 UINT8 *p;
884 const UINT8 *pix;
885 p = block;
886 pix = pixels;
d6a4c0b1 887 MOVQ_ZERO(mm7);
de6d9b64
FB
888 do {
889 __asm __volatile(
890 "movq %1, %%mm0\n\t"
891 "movq %2, %%mm1\n\t"
892 "movq %0, %%mm4\n\t"
893 "movq %%mm0, %%mm2\n\t"
894 "movq %%mm1, %%mm3\n\t"
895 "movq %%mm4, %%mm5\n\t"
896 "punpcklbw %%mm7, %%mm0\n\t"
897 "punpcklbw %%mm7, %%mm1\n\t"
898 "punpckhbw %%mm7, %%mm2\n\t"
899 "punpckhbw %%mm7, %%mm3\n\t"
900 "punpcklbw %%mm7, %%mm4\n\t"
901 "punpckhbw %%mm7, %%mm5\n\t"
902 "paddusw %%mm1, %%mm0\n\t"
903 "paddusw %%mm3, %%mm2\n\t"
904 "psrlw $1, %%mm0\n\t"
905 "psrlw $1, %%mm2\n\t"
906 "paddusw %%mm4, %%mm0\n\t"
907 "paddusw %%mm5, %%mm2\n\t"
908 "psrlw $1, %%mm0\n\t"
909 "psrlw $1, %%mm2\n\t"
910 "packuswb %%mm2, %%mm0\n\t"
911 "movq %%mm0, %0\n\t"
a822a479 912 :"+m"(*p)
de6d9b64
FB
913 :"m"(*pix), "m"(*(pix+line_size))
914 :"memory");
915 pix += line_size;
916 p += line_size ;
917 } while(--h);
de6d9b64
FB
918}
919
920static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
921{
922 UINT8 *p;
923 const UINT8 *pix;
924 p = block;
925 pix = pixels;
d6a4c0b1
ZK
926 MOVQ_ZERO(mm7);
927 MOVQ_WONE(mm6);
928 JUMPALIGN();
de6d9b64
FB
929 do {
930 __asm __volatile(
931 "movq %1, %%mm0\n\t"
932 "movq %2, %%mm1\n\t"
933 "movq 1%1, %%mm4\n\t"
934 "movq 1%2, %%mm5\n\t"
935 "movq %%mm0, %%mm2\n\t"
936 "movq %%mm1, %%mm3\n\t"
937 "punpcklbw %%mm7, %%mm0\n\t"
938 "punpcklbw %%mm7, %%mm1\n\t"
939 "punpckhbw %%mm7, %%mm2\n\t"
940 "punpckhbw %%mm7, %%mm3\n\t"
941 "paddusw %%mm1, %%mm0\n\t"
942 "paddusw %%mm3, %%mm2\n\t"
943 "movq %%mm4, %%mm1\n\t"
944 "movq %%mm5, %%mm3\n\t"
945 "punpcklbw %%mm7, %%mm4\n\t"
946 "punpcklbw %%mm7, %%mm5\n\t"
947 "punpckhbw %%mm7, %%mm1\n\t"
948 "punpckhbw %%mm7, %%mm3\n\t"
949 "paddusw %%mm5, %%mm4\n\t"
950 "paddusw %%mm3, %%mm1\n\t"
951 "paddusw %%mm6, %%mm4\n\t"
952 "paddusw %%mm6, %%mm1\n\t"
953 "paddusw %%mm4, %%mm0\n\t"
954 "paddusw %%mm1, %%mm2\n\t"
955 "movq %0, %%mm1\n\t"
956 "psrlw $2, %%mm0\n\t"
957 "movq %%mm1, %%mm3\n\t"
958 "psrlw $2, %%mm2\n\t"
959 "punpcklbw %%mm7, %%mm1\n\t"
960 "punpckhbw %%mm7, %%mm3\n\t"
961 "paddusw %%mm1, %%mm0\n\t"
962 "paddusw %%mm3, %%mm2\n\t"
963 "psrlw $1, %%mm0\n\t"
964 "psrlw $1, %%mm2\n\t"
965 "packuswb %%mm2, %%mm0\n\t"
966 "movq %%mm0, %0\n\t"
a822a479 967 :"+m"(*p)
de6d9b64
FB
968 :"m"(*pix),
969 "m"(*(pix+line_size))
970 :"memory");
971 pix += line_size;
972 p += line_size;
973 } while(--h);
de6d9b64
FB
974}
975
649c00c9
MN
976static void clear_blocks_mmx(DCTELEM *blocks)
977{
978 asm volatile(
979 "pxor %%mm7, %%mm7 \n\t"
980 "movl $-128*6, %%eax \n\t"
981 "1: \n\t"
982 "movq %%mm7, (%0, %%eax) \n\t"
983 "movq %%mm7, 8(%0, %%eax) \n\t"
984 "movq %%mm7, 16(%0, %%eax) \n\t"
985 "movq %%mm7, 24(%0, %%eax) \n\t"
986 "addl $32, %%eax \n\t"
987 " js 1b \n\t"
988 : : "r" (((int)blocks)+128*6)
989 : "%eax"
990 );
991}
992
61a4e8ae 993#if 0
d6a4c0b1 994static void just_return() { return; }
61a4e8ae 995#endif
d6a4c0b1 996
de6d9b64
FB
997void dsputil_init_mmx(void)
998{
999 mm_flags = mm_support();
f4470e09
MN
1000#if 1
1001 printf("libavcodec: CPU flags:");
de6d9b64
FB
1002 if (mm_flags & MM_MMX)
1003 printf(" mmx");
1004 if (mm_flags & MM_MMXEXT)
1005 printf(" mmxext");
1006 if (mm_flags & MM_3DNOW)
1007 printf(" 3dnow");
1008 if (mm_flags & MM_SSE)
1009 printf(" sse");
1010 if (mm_flags & MM_SSE2)
1011 printf(" sse2");
1012 printf("\n");
1013#endif
1014
1015 if (mm_flags & MM_MMX) {
1016 get_pixels = get_pixels_mmx;
9dbcbd92 1017 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
1018 put_pixels_clamped = put_pixels_clamped_mmx;
1019 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 1020 clear_blocks= clear_blocks_mmx;
dcb9cd4b 1021
ba6802de
MN
1022 pix_abs16x16 = pix_abs16x16_mmx;
1023 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1024 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 1025 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
1026 pix_abs8x8 = pix_abs8x8_mmx;
1027 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1028 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1029 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
1030 av_fdct = fdct_mmx;
1031
1032 put_pixels_tab[0] = put_pixels_mmx;
1033 put_pixels_tab[1] = put_pixels_x2_mmx;
1034 put_pixels_tab[2] = put_pixels_y2_mmx;
1035 put_pixels_tab[3] = put_pixels_xy2_mmx;
1036
1037 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
1038 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1039 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1040 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
dcb9cd4b 1041
de6d9b64
FB
1042 avg_pixels_tab[0] = avg_pixels_mmx;
1043 avg_pixels_tab[1] = avg_pixels_x2_mmx;
1044 avg_pixels_tab[2] = avg_pixels_y2_mmx;
1045 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1046
1047 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
1048 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
1049 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
1050 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
607dce96 1051
de6d9b64 1052 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
1053 pix_abs16x16 = pix_abs16x16_mmx2;
1054 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1055 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1056 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 1057
ba6802de
MN
1058 pix_abs8x8 = pix_abs8x8_mmx2;
1059 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1060 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1061 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96
MN
1062
1063 put_pixels_tab[1] = put_pixels_x2_mmx2;
1064 put_pixels_tab[2] = put_pixels_y2_mmx2;
1065 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
1066 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
dcb9cd4b 1067
607dce96
MN
1068 avg_pixels_tab[0] = avg_pixels_mmx2;
1069 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
1070 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
1071 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
de6d9b64
FB
1072 } else if (mm_flags & MM_3DNOW) {
1073 put_pixels_tab[1] = put_pixels_x2_3dnow;
1074 put_pixels_tab[2] = put_pixels_y2_3dnow;
607dce96
MN
1075 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
1076 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
61a4e8ae 1077
de6d9b64
FB
1078 avg_pixels_tab[0] = avg_pixels_3dnow;
1079 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1080 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1081 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
de6d9b64 1082 }
4af7bcc1 1083
8def0299
FB
1084 /* idct */
1085 if (mm_flags & MM_MMXEXT) {
1086 ff_idct = ff_mmxext_idct;
1087 } else {
1088 ff_idct = ff_mmx_idct;
1089 }
d962f6fd
A
1090#ifdef SIMPLE_IDCT
1091// ff_idct = simple_idct;
1092 ff_idct = simple_idct_mmx;
1093#endif
de6d9b64 1094 }
d6a4c0b1
ZK
1095
1096#if 0
1097 // for speed testing
1098 get_pixels = just_return;
1099 put_pixels_clamped = just_return;
1100 add_pixels_clamped = just_return;
1101
1102 pix_abs16x16 = just_return;
1103 pix_abs16x16_x2 = just_return;
1104 pix_abs16x16_y2 = just_return;
1105 pix_abs16x16_xy2 = just_return;
1106
1107 put_pixels_tab[0] = just_return;
1108 put_pixels_tab[1] = just_return;
1109 put_pixels_tab[2] = just_return;
1110 put_pixels_tab[3] = just_return;
1111
1112 put_no_rnd_pixels_tab[0] = just_return;
1113 put_no_rnd_pixels_tab[1] = just_return;
1114 put_no_rnd_pixels_tab[2] = just_return;
1115 put_no_rnd_pixels_tab[3] = just_return;
1116
1117 avg_pixels_tab[0] = just_return;
1118 avg_pixels_tab[1] = just_return;
1119 avg_pixels_tab[2] = just_return;
1120 avg_pixels_tab[3] = just_return;
1121
1122 avg_no_rnd_pixels_tab[0] = just_return;
1123 avg_no_rnd_pixels_tab[1] = just_return;
1124 avg_no_rnd_pixels_tab[2] = just_return;
1125 avg_no_rnd_pixels_tab[3] = just_return;
1126
d6a4c0b1
ZK
1127 //av_fdct = just_return;
1128 //ff_idct = just_return;
1129#endif
de6d9b64 1130}
4f12a497
FB
1131
1132/* remove any non bit exact operation (testing purpose). NOTE that
1133 this function should be kept as small as possible because it is
1134 always difficult to test automatically non bit exact cases. */
1135void dsputil_set_bit_exact_mmx(void)
1136{
1137 if (mm_flags & MM_MMX) {
1138 if (mm_flags & MM_MMXEXT) {
1139 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1140 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1141 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1142 } else if (mm_flags & MM_3DNOW) {
1143 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1144 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1145 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1146 }
1147 }
1148}