fdct_mmx -> ff_fdct_mmx (renamed to avoid namespace conflict with xvid)
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
/* pixel operations */

/* 64-bit constants: 0x01 in every byte, 0x0001 / 0x0002 in every word */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* emit an 8-byte alignment directive at the current code position */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* regd = 0 */
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* regd = 0x0001 in every word: all-ones, then each word shifted right by 15 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* regd = 0xfe in every byte: all-ones, then each byte added to itself */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* load the constants straight from memory */
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1

/* regd = 0x01 in every byte: build 0x0001 words, then pack them to bytes */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* regd = 0x0002 in every word: build 0x0001 words, then shift left by one */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// Byte-wise average of rega and regb, rounding down:
//   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

// Byte-wise average of rega and regb, rounding up:
//   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
// same register conventions as PAVGB_MMX_NO_RND
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por  " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// Paired variants: average (rega, regb) into regr and (regc, regd) into regp
// with the two instruction streams interleaved; regb and regd are trashed.
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por  " #regb ", " #regr " \n\t"\
    "por  " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

91abb473
ZK
137/***********************************/
138/* MMX no rounding */
139#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 140#define SET_RND MOVQ_WONE
6aa6ea8e 141#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 142#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 143
91abb473
ZK
144#include "dsputil_mmx_rnd.h"
145
146#undef DEF
fca0f0e5 147#undef SET_RND
6aa6ea8e 148#undef PAVGBP
39825f31 149#undef PAVGB
91abb473
ZK
150/***********************************/
151/* MMX rounding */
152
153#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 154#define SET_RND MOVQ_WTWO
6aa6ea8e 155#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 156#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 157
91abb473
ZK
158#include "dsputil_mmx_rnd.h"
159
160#undef DEF
fca0f0e5 161#undef SET_RND
6aa6ea8e 162#undef PAVGBP
39825f31 163#undef PAVGB
a7bd8797 164
de6d9b64
FB
165/***********************************/
166/* 3Dnow specific */
167
168#define DEF(x) x ## _3dnow
169/* for Athlons PAVGUSB is prefered */
170#define PAVGB "pavgusb"
171
172#include "dsputil_mmx_avg.h"
173
174#undef DEF
175#undef PAVGB
176
177/***********************************/
178/* MMX2 specific */
179
607dce96 180#define DEF(x) x ## _mmx2
de6d9b64
FB
181
182/* Introduced only in MMX2 set */
183#define PAVGB "pavgb"
184
185#include "dsputil_mmx_avg.h"
186
187#undef DEF
188#undef PAVGB
189
190/***********************************/
191/* standard MMX */
192
193static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
194{
607dce96
MN
195 asm volatile(
196 "movl $-128, %%eax \n\t"
197 "pxor %%mm7, %%mm7 \n\t"
198 ".balign 16 \n\t"
199 "1: \n\t"
200 "movq (%0), %%mm0 \n\t"
201 "movq (%0, %2), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t"
204 "punpcklbw %%mm7, %%mm0 \n\t"
205 "punpckhbw %%mm7, %%mm1 \n\t"
206 "punpcklbw %%mm7, %%mm2 \n\t"
207 "punpckhbw %%mm7, %%mm3 \n\t"
208 "movq %%mm0, (%1, %%eax)\n\t"
209 "movq %%mm1, 8(%1, %%eax)\n\t"
210 "movq %%mm2, 16(%1, %%eax)\n\t"
211 "movq %%mm3, 24(%1, %%eax)\n\t"
212 "addl %3, %0 \n\t"
213 "addl $32, %%eax \n\t"
214 "js 1b \n\t"
215 : "+r" (pixels)
216 : "r" (block+64), "r" (line_size), "r" (line_size*2)
217 : "%eax"
218 );
de6d9b64
FB
219}
220
9dbcbd92
MN
221static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
222{
223 asm volatile(
607dce96 224 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 225 "movl $-128, %%eax \n\t"
607dce96 226 ".balign 16 \n\t"
9dbcbd92
MN
227 "1: \n\t"
228 "movq (%0), %%mm0 \n\t"
229 "movq (%1), %%mm2 \n\t"
230 "movq %%mm0, %%mm1 \n\t"
231 "movq %%mm2, %%mm3 \n\t"
232 "punpcklbw %%mm7, %%mm0 \n\t"
233 "punpckhbw %%mm7, %%mm1 \n\t"
234 "punpcklbw %%mm7, %%mm2 \n\t"
235 "punpckhbw %%mm7, %%mm3 \n\t"
236 "psubw %%mm2, %%mm0 \n\t"
237 "psubw %%mm3, %%mm1 \n\t"
238 "movq %%mm0, (%2, %%eax)\n\t"
239 "movq %%mm1, 8(%2, %%eax)\n\t"
240 "addl %3, %0 \n\t"
241 "addl %3, %1 \n\t"
242 "addl $16, %%eax \n\t"
243 "jnz 1b \n\t"
244 : "+r" (s1), "+r" (s2)
245 : "r" (block+64), "r" (stride)
246 : "%eax"
247 );
248}
249
de6d9b64
FB
250static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
251{
252 const DCTELEM *p;
253 UINT8 *pix;
de6d9b64
FB
254
255 /* read the pixels */
256 p = block;
257 pix = pixels;
d6a4c0b1 258 /* unrolled loop */
de6d9b64 259 __asm __volatile(
a822a479
NK
260 "movq %3, %%mm0\n\t"
261 "movq 8%3, %%mm1\n\t"
262 "movq 16%3, %%mm2\n\t"
263 "movq 24%3, %%mm3\n\t"
264 "movq 32%3, %%mm4\n\t"
265 "movq 40%3, %%mm5\n\t"
266 "movq 48%3, %%mm6\n\t"
267 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
268 "packuswb %%mm1, %%mm0\n\t"
269 "packuswb %%mm3, %%mm2\n\t"
270 "packuswb %%mm5, %%mm4\n\t"
271 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
272 "movq %%mm0, (%0)\n\t"
273 "movq %%mm2, (%0, %1)\n\t"
274 "movq %%mm4, (%0, %1, 2)\n\t"
275 "movq %%mm6, (%0, %2)\n\t"
276 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
277 :"memory");
278 pix += line_size*4;
279 p += 32;
d6a4c0b1
ZK
280
281 // if here would be an exact copy of the code above
282 // compiler would generate some very strange code
283 // thus using "r"
284 __asm __volatile(
285 "movq (%3), %%mm0\n\t"
286 "movq 8(%3), %%mm1\n\t"
287 "movq 16(%3), %%mm2\n\t"
288 "movq 24(%3), %%mm3\n\t"
289 "movq 32(%3), %%mm4\n\t"
290 "movq 40(%3), %%mm5\n\t"
291 "movq 48(%3), %%mm6\n\t"
292 "movq 56(%3), %%mm7\n\t"
293 "packuswb %%mm1, %%mm0\n\t"
294 "packuswb %%mm3, %%mm2\n\t"
295 "packuswb %%mm5, %%mm4\n\t"
296 "packuswb %%mm7, %%mm6\n\t"
297 "movq %%mm0, (%0)\n\t"
298 "movq %%mm2, (%0, %1)\n\t"
299 "movq %%mm4, (%0, %1, 2)\n\t"
300 "movq %%mm6, (%0, %2)\n\t"
301 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
302 :"memory");
de6d9b64
FB
303}
304
305static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
306{
307 const DCTELEM *p;
308 UINT8 *pix;
309 int i;
310
311 /* read the pixels */
312 p = block;
313 pix = pixels;
d6a4c0b1
ZK
314 MOVQ_ZERO(mm7);
315 i = 4;
cd8e5f96 316 do {
de6d9b64 317 __asm __volatile(
cd8e5f96
ZK
318 "movq (%2), %%mm0\n\t"
319 "movq 8(%2), %%mm1\n\t"
320 "movq 16(%2), %%mm2\n\t"
321 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
322 "movq %0, %%mm4\n\t"
323 "movq %1, %%mm6\n\t"
324 "movq %%mm4, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm4\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm4, %%mm0\n\t"
328 "paddsw %%mm5, %%mm1\n\t"
329 "movq %%mm6, %%mm5\n\t"
330 "punpcklbw %%mm7, %%mm6\n\t"
331 "punpckhbw %%mm7, %%mm5\n\t"
332 "paddsw %%mm6, %%mm2\n\t"
333 "paddsw %%mm5, %%mm3\n\t"
334 "packuswb %%mm1, %%mm0\n\t"
335 "packuswb %%mm3, %%mm2\n\t"
336 "movq %%mm0, %0\n\t"
337 "movq %%mm2, %1\n\t"
a822a479 338 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 339 :"r"(p)
de6d9b64
FB
340 :"memory");
341 pix += line_size*2;
342 p += 16;
cd8e5f96 343 } while (--i);
de6d9b64
FB
344}
345
b3184779 346static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
de6d9b64 347{
39825f31 348 __asm __volatile(
31ddcf98 349 "lea (%3, %3), %%eax \n\t"
52af45ad 350 ".balign 8 \n\t"
31ddcf98
ZK
351 "1: \n\t"
352 "movq (%1), %%mm0 \n\t"
353 "movq (%1, %3), %%mm1 \n\t"
354 "movq %%mm0, (%2) \n\t"
355 "movq %%mm1, (%2, %3) \n\t"
356 "addl %%eax, %1 \n\t"
357 "addl %%eax, %2 \n\t"
358 "movq (%1), %%mm0 \n\t"
359 "movq (%1, %3), %%mm1 \n\t"
360 "movq %%mm0, (%2) \n\t"
361 "movq %%mm1, (%2, %3) \n\t"
362 "addl %%eax, %1 \n\t"
363 "addl %%eax, %2 \n\t"
364 "subl $4, %0 \n\t"
365 "jnz 1b \n\t"
366 : "+g"(h), "+r" (pixels), "+r" (block)
367 : "r"(line_size)
368 : "%eax", "memory"
369 );
de6d9b64
FB
370}
371
b3184779
MN
372static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
373{
374 __asm __volatile(
375 "lea (%3, %3), %%eax \n\t"
376 ".balign 8 \n\t"
377 "1: \n\t"
378 "movq (%1), %%mm0 \n\t"
379 "movq 8(%1), %%mm4 \n\t"
380 "movq (%1, %3), %%mm1 \n\t"
381 "movq 8(%1, %3), %%mm5 \n\t"
382 "movq %%mm0, (%2) \n\t"
383 "movq %%mm4, 8(%2) \n\t"
384 "movq %%mm1, (%2, %3) \n\t"
385 "movq %%mm5, 8(%2, %3) \n\t"
386 "addl %%eax, %1 \n\t"
387 "addl %%eax, %2 \n\t"
388 "movq (%1), %%mm0 \n\t"
389 "movq 8(%1), %%mm4 \n\t"
390 "movq (%1, %3), %%mm1 \n\t"
391 "movq 8(%1, %3), %%mm5 \n\t"
392 "movq %%mm0, (%2) \n\t"
393 "movq %%mm4, 8(%2) \n\t"
394 "movq %%mm1, (%2, %3) \n\t"
395 "movq %%mm5, 8(%2, %3) \n\t"
396 "addl %%eax, %1 \n\t"
397 "addl %%eax, %2 \n\t"
398 "subl $4, %0 \n\t"
399 "jnz 1b \n\t"
400 : "+g"(h), "+r" (pixels), "+r" (block)
401 : "r"(line_size)
402 : "%eax", "memory"
403 );
404}
405
649c00c9
MN
406static void clear_blocks_mmx(DCTELEM *blocks)
407{
39825f31 408 __asm __volatile(
649c00c9
MN
409 "pxor %%mm7, %%mm7 \n\t"
410 "movl $-128*6, %%eax \n\t"
411 "1: \n\t"
412 "movq %%mm7, (%0, %%eax) \n\t"
413 "movq %%mm7, 8(%0, %%eax) \n\t"
414 "movq %%mm7, 16(%0, %%eax) \n\t"
415 "movq %%mm7, 24(%0, %%eax) \n\t"
416 "addl $32, %%eax \n\t"
417 " js 1b \n\t"
418 : : "r" (((int)blocks)+128*6)
419 : "%eax"
420 );
421}
422
61a4e8ae 423#if 0
d6a4c0b1 424static void just_return() { return; }
61a4e8ae 425#endif
d6a4c0b1 426
de6d9b64
FB
427void dsputil_init_mmx(void)
428{
429 mm_flags = mm_support();
1565dabc
LB
430#if 0
431 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 432 if (mm_flags & MM_MMX)
1565dabc 433 fprintf(stderr, " mmx");
de6d9b64 434 if (mm_flags & MM_MMXEXT)
1565dabc 435 fprintf(stderr, " mmxext");
de6d9b64 436 if (mm_flags & MM_3DNOW)
1565dabc 437 fprintf(stderr, " 3dnow");
de6d9b64 438 if (mm_flags & MM_SSE)
1565dabc 439 fprintf(stderr, " sse");
de6d9b64 440 if (mm_flags & MM_SSE2)
1565dabc
LB
441 fprintf(stderr, " sse2");
442 fprintf(stderr, "\n");
de6d9b64
FB
443#endif
444
445 if (mm_flags & MM_MMX) {
446 get_pixels = get_pixels_mmx;
9dbcbd92 447 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
448 put_pixels_clamped = put_pixels_clamped_mmx;
449 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 450 clear_blocks= clear_blocks_mmx;
dcb9cd4b 451
ba6802de
MN
452 pix_abs16x16 = pix_abs16x16_mmx;
453 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
454 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 455 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
456 pix_abs8x8 = pix_abs8x8_mmx;
457 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
458 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
459 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4afeaec9 460
b3184779
MN
461 put_pixels_tab[0][0] = put_pixels16_mmx;
462 put_pixels_tab[0][1] = put_pixels16_x2_mmx;
463 put_pixels_tab[0][2] = put_pixels16_y2_mmx;
464 put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
465
466 put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
467 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
468 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
469 put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
470
471 avg_pixels_tab[0][0] = avg_pixels16_mmx;
472 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
473 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
474 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
475
476 avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
477 avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
478 avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
479 avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
480
481 put_pixels_tab[1][0] = put_pixels8_mmx;
482 put_pixels_tab[1][1] = put_pixels8_x2_mmx;
483 put_pixels_tab[1][2] = put_pixels8_y2_mmx;
484 put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
485
486 put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
487 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
488 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
489 put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
490
491 avg_pixels_tab[1][0] = avg_pixels8_mmx;
492 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
493 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
494 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
495
496 avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
497 avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
498 avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
499 avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
607dce96 500
de6d9b64 501 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
502 pix_abs16x16 = pix_abs16x16_mmx2;
503 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
504 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
505 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 506
ba6802de
MN
507 pix_abs8x8 = pix_abs8x8_mmx2;
508 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
509 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
510 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96 511
b3184779
MN
512 put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
513 put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
514 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
515 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
516
517 avg_pixels_tab[0][0] = avg_pixels16_mmx2;
518 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
519 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
520 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
521
522 put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
523 put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
524 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
525 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
526
527 avg_pixels_tab[1][0] = avg_pixels8_mmx2;
528 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
529 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
530 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
de6d9b64 531 } else if (mm_flags & MM_3DNOW) {
b3184779
MN
532 put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
533 put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
534 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
535 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
536
537 avg_pixels_tab[0][0] = avg_pixels16_3dnow;
538 avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
539 avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
540 avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
541
542 put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
543 put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
544 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
545 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
546
547 avg_pixels_tab[1][0] = avg_pixels8_3dnow;
548 avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
549 avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
550 avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
de6d9b64 551 }
4af7bcc1 552
8def0299
FB
553 /* idct */
554 if (mm_flags & MM_MMXEXT) {
555 ff_idct = ff_mmxext_idct;
556 } else {
557 ff_idct = ff_mmx_idct;
558 }
d962f6fd
A
559#ifdef SIMPLE_IDCT
560// ff_idct = simple_idct;
561 ff_idct = simple_idct_mmx;
562#endif
de6d9b64 563 }
d6a4c0b1
ZK
564
565#if 0
566 // for speed testing
567 get_pixels = just_return;
568 put_pixels_clamped = just_return;
569 add_pixels_clamped = just_return;
570
571 pix_abs16x16 = just_return;
572 pix_abs16x16_x2 = just_return;
573 pix_abs16x16_y2 = just_return;
574 pix_abs16x16_xy2 = just_return;
575
576 put_pixels_tab[0] = just_return;
577 put_pixels_tab[1] = just_return;
578 put_pixels_tab[2] = just_return;
579 put_pixels_tab[3] = just_return;
580
581 put_no_rnd_pixels_tab[0] = just_return;
582 put_no_rnd_pixels_tab[1] = just_return;
583 put_no_rnd_pixels_tab[2] = just_return;
584 put_no_rnd_pixels_tab[3] = just_return;
585
586 avg_pixels_tab[0] = just_return;
587 avg_pixels_tab[1] = just_return;
588 avg_pixels_tab[2] = just_return;
589 avg_pixels_tab[3] = just_return;
590
591 avg_no_rnd_pixels_tab[0] = just_return;
592 avg_no_rnd_pixels_tab[1] = just_return;
593 avg_no_rnd_pixels_tab[2] = just_return;
594 avg_no_rnd_pixels_tab[3] = just_return;
595
d6a4c0b1
ZK
596 //av_fdct = just_return;
597 //ff_idct = just_return;
598#endif
de6d9b64 599}
4f12a497 600
e7fce5e9
MN
601void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
602
603/**
604 * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT
605 */
606void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){
607 if( block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0
608 && block[8]==0 && block[9]==0 && block[12]==0 && block[13]==0){
609 int16_t tmp[64];
610 int i;
611
612 for(i=0; i<64; i++)
613 tmp[i]= block[i];
614 for(i=0; i<64; i++)
615 block[i]= tmp[block_permute_op(i)];
616
617 simple_idct_put(dest, line_size, block);
618 }
619 else
620 gen_idct_put(dest, line_size, block);
621}
622
4f12a497
FB
623/* remove any non bit exact operation (testing purpose). NOTE that
624 this function should be kept as small as possible because it is
625 always difficult to test automatically non bit exact cases. */
626void dsputil_set_bit_exact_mmx(void)
627{
628 if (mm_flags & MM_MMX) {
b3184779
MN
629
630 /* MMX2 & 3DNOW */
631 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
632 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
633 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
634 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
635 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
636 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 637
b3184779 638 if (mm_flags & MM_MMXEXT) {
4afeaec9
MN
639 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
640 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
641 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
642 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
643 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
644 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497 645 }
e7fce5e9
MN
646#ifdef SIMPLE_IDCT
647 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
648 ff_idct_put= bit_exact_idct_put;
649#endif
4f12a497
FB
650 }
651}