reducing sizeof MpegEncContext to avoid stack overflow on crap M$ windo$
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
de6d9b64 51/* pixel operations */
a7bd8797
MN
/* Packed 64-bit constants, 8-byte aligned so that a movq with a memory
 * operand can load them (see MOVQ_BONE / MOVQ_WTWO in the non-PIC build):
 *   mm_bone - 0x01 in each of the eight bytes
 *   mm_wone - 0x0001 in each of the four 16-bit words
 *   mm_wtwo - 0x0002 in each of the four 16-bit words */
static const uint64_t mm_bone __attribute__((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 55
d6a4c0b1
ZK
/* Emits a ".balign 8" assembler directive. */
#define JUMPALIGN() __asm__ __volatile__ (".balign 8"::)
/* Clears MMX register regd by xoring it with itself. */
#define MOVQ_ZERO(regd) __asm__ __volatile__ ("pxor %%" #regd ", %%" #regd ::)

/* Sets every 16-bit word of regd to 0x0001: pcmpeqd of a register with
 * itself yields all ones, then each word is shifted right by 15. */
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Sets every byte of regd to 0xfe: all ones, then each byte is added to
 * itself (0xff + 0xff wraps to 0xfe). */
#define MOVQ_BFE(regd) \
    __asm__ __volatile__ ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: load the constants straight from memory. */
#define MOVQ_BONE(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* 0x0001 per word (as in MOVQ_WONE), then packed with unsigned saturation
 * so that every byte becomes 0x01. */
#define MOVQ_BONE(regd) \
    __asm__ __volatile__ ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* 0x0001 per word, then shifted left by one to give 0x0002 per word. */
#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
88
/*
 * Byte-wise averaging helpers for plain MMX (no pavgb instruction).
 * Each macro expands to a run of asm template strings, to be pasted inside
 * an asm statement.
 *
 * PAVGB_MMX_NO_RND: regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * PAVGB_MMX:        regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 *
 * regr is used as temporary and receives the result; the first argument
 * (rega) is unmodified and the second (regb) is trashed.
 * regfe is supposed to contain 0xfefefefefefefefe; the mask clears the low
 * bit of every byte so the 64-bit psrlq cannot shift bits across byte lanes.
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

/*
 * Paired versions: the same two formulas applied to two register pairs at
 * once, (rega, regb) -> regr and (regc, regd) -> regp, with the
 * instructions of both computations interleaved. regb and regd are trashed.
 * mm6 is supposed to contain 0xfefefefefefefefe.
 */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
136
91abb473
ZK
137/***********************************/
138/* MMX no rounding */
139#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 140#define SET_RND MOVQ_WONE
6aa6ea8e 141#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 142#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 143
91abb473
ZK
144#include "dsputil_mmx_rnd.h"
145
146#undef DEF
fca0f0e5 147#undef SET_RND
6aa6ea8e 148#undef PAVGBP
39825f31 149#undef PAVGB
91abb473
ZK
150/***********************************/
151/* MMX rounding */
152
153#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 154#define SET_RND MOVQ_WTWO
6aa6ea8e 155#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 156#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 157
91abb473
ZK
158#include "dsputil_mmx_rnd.h"
159
160#undef DEF
fca0f0e5 161#undef SET_RND
6aa6ea8e 162#undef PAVGBP
39825f31 163#undef PAVGB
a7bd8797 164
de6d9b64
FB
165/***********************************/
166/* 3Dnow specific */
167
168#define DEF(x) x ## _3dnow
169/* for Athlons PAVGUSB is prefered */
170#define PAVGB "pavgusb"
171
172#include "dsputil_mmx_avg.h"
173
174#undef DEF
175#undef PAVGB
176
177/***********************************/
178/* MMX2 specific */
179
607dce96 180#define DEF(x) x ## _mmx2
de6d9b64
FB
181
182/* Introduced only in MMX2 set */
183#define PAVGB "pavgb"
184
185#include "dsputil_mmx_avg.h"
186
187#undef DEF
188#undef PAVGB
189
190/***********************************/
191/* standard MMX */
192
193static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
194{
607dce96
MN
195 asm volatile(
196 "movl $-128, %%eax \n\t"
197 "pxor %%mm7, %%mm7 \n\t"
198 ".balign 16 \n\t"
199 "1: \n\t"
200 "movq (%0), %%mm0 \n\t"
201 "movq (%0, %2), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t"
204 "punpcklbw %%mm7, %%mm0 \n\t"
205 "punpckhbw %%mm7, %%mm1 \n\t"
206 "punpcklbw %%mm7, %%mm2 \n\t"
207 "punpckhbw %%mm7, %%mm3 \n\t"
208 "movq %%mm0, (%1, %%eax)\n\t"
209 "movq %%mm1, 8(%1, %%eax)\n\t"
210 "movq %%mm2, 16(%1, %%eax)\n\t"
211 "movq %%mm3, 24(%1, %%eax)\n\t"
212 "addl %3, %0 \n\t"
213 "addl $32, %%eax \n\t"
214 "js 1b \n\t"
215 : "+r" (pixels)
216 : "r" (block+64), "r" (line_size), "r" (line_size*2)
217 : "%eax"
218 );
de6d9b64
FB
219}
220
9dbcbd92
MN
221static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
222{
223 asm volatile(
607dce96 224 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 225 "movl $-128, %%eax \n\t"
607dce96 226 ".balign 16 \n\t"
9dbcbd92
MN
227 "1: \n\t"
228 "movq (%0), %%mm0 \n\t"
229 "movq (%1), %%mm2 \n\t"
230 "movq %%mm0, %%mm1 \n\t"
231 "movq %%mm2, %%mm3 \n\t"
232 "punpcklbw %%mm7, %%mm0 \n\t"
233 "punpckhbw %%mm7, %%mm1 \n\t"
234 "punpcklbw %%mm7, %%mm2 \n\t"
235 "punpckhbw %%mm7, %%mm3 \n\t"
236 "psubw %%mm2, %%mm0 \n\t"
237 "psubw %%mm3, %%mm1 \n\t"
238 "movq %%mm0, (%2, %%eax)\n\t"
239 "movq %%mm1, 8(%2, %%eax)\n\t"
240 "addl %3, %0 \n\t"
241 "addl %3, %1 \n\t"
242 "addl $16, %%eax \n\t"
243 "jnz 1b \n\t"
244 : "+r" (s1), "+r" (s2)
245 : "r" (block+64), "r" (stride)
246 : "%eax"
247 );
248}
249
de6d9b64
FB
250static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
251{
252 const DCTELEM *p;
253 UINT8 *pix;
de6d9b64
FB
254
255 /* read the pixels */
256 p = block;
257 pix = pixels;
d6a4c0b1 258 /* unrolled loop */
de6d9b64 259 __asm __volatile(
a822a479
NK
260 "movq %3, %%mm0\n\t"
261 "movq 8%3, %%mm1\n\t"
262 "movq 16%3, %%mm2\n\t"
263 "movq 24%3, %%mm3\n\t"
264 "movq 32%3, %%mm4\n\t"
265 "movq 40%3, %%mm5\n\t"
266 "movq 48%3, %%mm6\n\t"
267 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
268 "packuswb %%mm1, %%mm0\n\t"
269 "packuswb %%mm3, %%mm2\n\t"
270 "packuswb %%mm5, %%mm4\n\t"
271 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
272 "movq %%mm0, (%0)\n\t"
273 "movq %%mm2, (%0, %1)\n\t"
274 "movq %%mm4, (%0, %1, 2)\n\t"
275 "movq %%mm6, (%0, %2)\n\t"
276 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
277 :"memory");
278 pix += line_size*4;
279 p += 32;
d6a4c0b1
ZK
280
281 // if here would be an exact copy of the code above
282 // compiler would generate some very strange code
283 // thus using "r"
284 __asm __volatile(
285 "movq (%3), %%mm0\n\t"
286 "movq 8(%3), %%mm1\n\t"
287 "movq 16(%3), %%mm2\n\t"
288 "movq 24(%3), %%mm3\n\t"
289 "movq 32(%3), %%mm4\n\t"
290 "movq 40(%3), %%mm5\n\t"
291 "movq 48(%3), %%mm6\n\t"
292 "movq 56(%3), %%mm7\n\t"
293 "packuswb %%mm1, %%mm0\n\t"
294 "packuswb %%mm3, %%mm2\n\t"
295 "packuswb %%mm5, %%mm4\n\t"
296 "packuswb %%mm7, %%mm6\n\t"
297 "movq %%mm0, (%0)\n\t"
298 "movq %%mm2, (%0, %1)\n\t"
299 "movq %%mm4, (%0, %1, 2)\n\t"
300 "movq %%mm6, (%0, %2)\n\t"
301 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
302 :"memory");
de6d9b64
FB
303}
304
305static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
306{
307 const DCTELEM *p;
308 UINT8 *pix;
309 int i;
310
311 /* read the pixels */
312 p = block;
313 pix = pixels;
d6a4c0b1
ZK
314 MOVQ_ZERO(mm7);
315 i = 4;
cd8e5f96 316 do {
de6d9b64 317 __asm __volatile(
cd8e5f96
ZK
318 "movq (%2), %%mm0\n\t"
319 "movq 8(%2), %%mm1\n\t"
320 "movq 16(%2), %%mm2\n\t"
321 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
322 "movq %0, %%mm4\n\t"
323 "movq %1, %%mm6\n\t"
324 "movq %%mm4, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm4\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm4, %%mm0\n\t"
328 "paddsw %%mm5, %%mm1\n\t"
329 "movq %%mm6, %%mm5\n\t"
330 "punpcklbw %%mm7, %%mm6\n\t"
331 "punpckhbw %%mm7, %%mm5\n\t"
332 "paddsw %%mm6, %%mm2\n\t"
333 "paddsw %%mm5, %%mm3\n\t"
334 "packuswb %%mm1, %%mm0\n\t"
335 "packuswb %%mm3, %%mm2\n\t"
336 "movq %%mm0, %0\n\t"
337 "movq %%mm2, %1\n\t"
a822a479 338 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 339 :"r"(p)
de6d9b64
FB
340 :"memory");
341 pix += line_size*2;
342 p += 16;
cd8e5f96 343 } while (--i);
de6d9b64
FB
344}
345
346static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
347{
39825f31 348 __asm __volatile(
31ddcf98 349 "lea (%3, %3), %%eax \n\t"
52af45ad 350 ".balign 8 \n\t"
31ddcf98
ZK
351 "1: \n\t"
352 "movq (%1), %%mm0 \n\t"
353 "movq (%1, %3), %%mm1 \n\t"
354 "movq %%mm0, (%2) \n\t"
355 "movq %%mm1, (%2, %3) \n\t"
356 "addl %%eax, %1 \n\t"
357 "addl %%eax, %2 \n\t"
358 "movq (%1), %%mm0 \n\t"
359 "movq (%1, %3), %%mm1 \n\t"
360 "movq %%mm0, (%2) \n\t"
361 "movq %%mm1, (%2, %3) \n\t"
362 "addl %%eax, %1 \n\t"
363 "addl %%eax, %2 \n\t"
364 "subl $4, %0 \n\t"
365 "jnz 1b \n\t"
366 : "+g"(h), "+r" (pixels), "+r" (block)
367 : "r"(line_size)
368 : "%eax", "memory"
369 );
de6d9b64
FB
370}
371
649c00c9
MN
372static void clear_blocks_mmx(DCTELEM *blocks)
373{
39825f31 374 __asm __volatile(
649c00c9
MN
375 "pxor %%mm7, %%mm7 \n\t"
376 "movl $-128*6, %%eax \n\t"
377 "1: \n\t"
378 "movq %%mm7, (%0, %%eax) \n\t"
379 "movq %%mm7, 8(%0, %%eax) \n\t"
380 "movq %%mm7, 16(%0, %%eax) \n\t"
381 "movq %%mm7, 24(%0, %%eax) \n\t"
382 "addl $32, %%eax \n\t"
383 " js 1b \n\t"
384 : : "r" (((int)blocks)+128*6)
385 : "%eax"
386 );
387}
388
#if 0
/* Empty function; the disabled "for speed testing" code in
 * dsputil_init_mmx() assigns it to the DSP function pointers. */
static void just_return() { }
#endif
d6a4c0b1 392
de6d9b64
FB
393void dsputil_init_mmx(void)
394{
395 mm_flags = mm_support();
1565dabc
LB
396#if 0
397 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 398 if (mm_flags & MM_MMX)
1565dabc 399 fprintf(stderr, " mmx");
de6d9b64 400 if (mm_flags & MM_MMXEXT)
1565dabc 401 fprintf(stderr, " mmxext");
de6d9b64 402 if (mm_flags & MM_3DNOW)
1565dabc 403 fprintf(stderr, " 3dnow");
de6d9b64 404 if (mm_flags & MM_SSE)
1565dabc 405 fprintf(stderr, " sse");
de6d9b64 406 if (mm_flags & MM_SSE2)
1565dabc
LB
407 fprintf(stderr, " sse2");
408 fprintf(stderr, "\n");
de6d9b64
FB
409#endif
410
411 if (mm_flags & MM_MMX) {
412 get_pixels = get_pixels_mmx;
9dbcbd92 413 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
414 put_pixels_clamped = put_pixels_clamped_mmx;
415 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 416 clear_blocks= clear_blocks_mmx;
dcb9cd4b 417
ba6802de
MN
418 pix_abs16x16 = pix_abs16x16_mmx;
419 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
420 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 421 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
422 pix_abs8x8 = pix_abs8x8_mmx;
423 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
424 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
425 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4afeaec9 426
de6d9b64
FB
427 put_pixels_tab[0] = put_pixels_mmx;
428 put_pixels_tab[1] = put_pixels_x2_mmx;
429 put_pixels_tab[2] = put_pixels_y2_mmx;
430 put_pixels_tab[3] = put_pixels_xy2_mmx;
431
432 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
433 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
434 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
435 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
dcb9cd4b 436
de6d9b64
FB
437 avg_pixels_tab[0] = avg_pixels_mmx;
438 avg_pixels_tab[1] = avg_pixels_x2_mmx;
439 avg_pixels_tab[2] = avg_pixels_y2_mmx;
440 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
441
442 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
443 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
444 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
445 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
607dce96 446
de6d9b64 447 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
448 pix_abs16x16 = pix_abs16x16_mmx2;
449 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
450 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
451 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 452
ba6802de
MN
453 pix_abs8x8 = pix_abs8x8_mmx2;
454 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
455 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
456 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96
MN
457
458 put_pixels_tab[1] = put_pixels_x2_mmx2;
459 put_pixels_tab[2] = put_pixels_y2_mmx2;
460 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
461 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
dcb9cd4b 462
607dce96
MN
463 avg_pixels_tab[0] = avg_pixels_mmx2;
464 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
465 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
466 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
de6d9b64
FB
467 } else if (mm_flags & MM_3DNOW) {
468 put_pixels_tab[1] = put_pixels_x2_3dnow;
469 put_pixels_tab[2] = put_pixels_y2_3dnow;
607dce96
MN
470 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
471 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
61a4e8ae 472
de6d9b64
FB
473 avg_pixels_tab[0] = avg_pixels_3dnow;
474 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
475 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
476 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
de6d9b64 477 }
4af7bcc1 478
8def0299
FB
479 /* idct */
480 if (mm_flags & MM_MMXEXT) {
481 ff_idct = ff_mmxext_idct;
482 } else {
483 ff_idct = ff_mmx_idct;
484 }
d962f6fd
A
485#ifdef SIMPLE_IDCT
486// ff_idct = simple_idct;
487 ff_idct = simple_idct_mmx;
488#endif
de6d9b64 489 }
d6a4c0b1
ZK
490
491#if 0
492 // for speed testing
493 get_pixels = just_return;
494 put_pixels_clamped = just_return;
495 add_pixels_clamped = just_return;
496
497 pix_abs16x16 = just_return;
498 pix_abs16x16_x2 = just_return;
499 pix_abs16x16_y2 = just_return;
500 pix_abs16x16_xy2 = just_return;
501
502 put_pixels_tab[0] = just_return;
503 put_pixels_tab[1] = just_return;
504 put_pixels_tab[2] = just_return;
505 put_pixels_tab[3] = just_return;
506
507 put_no_rnd_pixels_tab[0] = just_return;
508 put_no_rnd_pixels_tab[1] = just_return;
509 put_no_rnd_pixels_tab[2] = just_return;
510 put_no_rnd_pixels_tab[3] = just_return;
511
512 avg_pixels_tab[0] = just_return;
513 avg_pixels_tab[1] = just_return;
514 avg_pixels_tab[2] = just_return;
515 avg_pixels_tab[3] = just_return;
516
517 avg_no_rnd_pixels_tab[0] = just_return;
518 avg_no_rnd_pixels_tab[1] = just_return;
519 avg_no_rnd_pixels_tab[2] = just_return;
520 avg_no_rnd_pixels_tab[3] = just_return;
521
d6a4c0b1
ZK
522 //av_fdct = just_return;
523 //ff_idct = just_return;
524#endif
de6d9b64 525}
4f12a497 526
e7fce5e9
MN
527void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
528
529/**
530 * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT
531 */
532void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){
533 if( block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0
534 && block[8]==0 && block[9]==0 && block[12]==0 && block[13]==0){
535 int16_t tmp[64];
536 int i;
537
538 for(i=0; i<64; i++)
539 tmp[i]= block[i];
540 for(i=0; i<64; i++)
541 block[i]= tmp[block_permute_op(i)];
542
543 simple_idct_put(dest, line_size, block);
544 }
545 else
546 gen_idct_put(dest, line_size, block);
547}
548
4f12a497
FB
549/* remove any non bit exact operation (testing purpose). NOTE that
550 this function should be kept as small as possible because it is
551 always difficult to test automatically non bit exact cases. */
552void dsputil_set_bit_exact_mmx(void)
553{
554 if (mm_flags & MM_MMX) {
555 if (mm_flags & MM_MMXEXT) {
556 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
557 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
558 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
4afeaec9
MN
559
560 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
561 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
562 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
563 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
564 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
565 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
566 } else if (mm_flags & MM_3DNOW) {
567 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
568 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
569 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
570 }
e7fce5e9
MN
571#ifdef SIMPLE_IDCT
572 if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
573 ff_idct_put= bit_exact_idct_put;
574#endif
4f12a497
FB
575 }
576}