* useless commit - ignore
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"

7d650cb5
FB
24int mm_flags; /* multimedia extension flags */
25
ba6802de
MN
26int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
27int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
31int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
32int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
36int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
37int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
41int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
42int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
/* pixel operations */

/* 8-byte aligned 64-bit constants that can be loaded with a single movq */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 1 in every byte */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 1 in every 16-bit word */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 2 in every 16-bit word */

/* emit an 8-byte alignment directive at the current code position */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* regd = 0 */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* regd = 0x0001 in every word: all-ones, then each word shifted right by 15 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* regd = 0xfe in every byte: all-ones, then each byte added to itself */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* non-PIC build: load the constants straight from memory */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1

/* regd = 0x01 in every byte: words of 1 packed down to bytes */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* regd = 0x0002 in every word: words of 1 shifted left by one */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// Byte-wise averages built from plain MMX instructions.
// These macros expand to asm template text (string literals), to be pasted
// inside an asm statement.
//
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe

// regr = (rega & regb) + (((rega ^ regb) & 0xfe) >> 1), i.e. average rounded down
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

// regr = (rega | regb) - (((rega ^ regb) & 0xfe) >> 1), i.e. average rounded up
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// Paired variants: the same computation on two register pairs, interleaved.
// (rega, regb) -> regr and (regc, regd) -> regp; regb and regd are trashed.
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

91abb473
ZK
132/***********************************/
133/* MMX no rounding */
134#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 135#define SET_RND MOVQ_WONE
6aa6ea8e 136#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 137#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 138
91abb473
ZK
139#include "dsputil_mmx_rnd.h"
140
141#undef DEF
fca0f0e5 142#undef SET_RND
6aa6ea8e 143#undef PAVGBP
39825f31 144#undef PAVGB
91abb473
ZK
145/***********************************/
146/* MMX rounding */
147
148#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 149#define SET_RND MOVQ_WTWO
6aa6ea8e 150#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 151#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 152
91abb473
ZK
153#include "dsputil_mmx_rnd.h"
154
155#undef DEF
fca0f0e5 156#undef SET_RND
6aa6ea8e 157#undef PAVGBP
39825f31 158#undef PAVGB
a7bd8797 159
de6d9b64
FB
160/***********************************/
161/* 3Dnow specific */
162
163#define DEF(x) x ## _3dnow
164/* for Athlons PAVGUSB is prefered */
165#define PAVGB "pavgusb"
166
167#include "dsputil_mmx_avg.h"
168
169#undef DEF
170#undef PAVGB
171
172/***********************************/
173/* MMX2 specific */
174
607dce96 175#define DEF(x) x ## _mmx2
de6d9b64
FB
176
177/* Introduced only in MMX2 set */
178#define PAVGB "pavgb"
179
180#include "dsputil_mmx_avg.h"
181
182#undef DEF
183#undef PAVGB
184
185/***********************************/
186/* standard MMX */
187
188static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
189{
607dce96
MN
190 asm volatile(
191 "movl $-128, %%eax \n\t"
192 "pxor %%mm7, %%mm7 \n\t"
193 ".balign 16 \n\t"
194 "1: \n\t"
195 "movq (%0), %%mm0 \n\t"
196 "movq (%0, %2), %%mm2 \n\t"
197 "movq %%mm0, %%mm1 \n\t"
198 "movq %%mm2, %%mm3 \n\t"
199 "punpcklbw %%mm7, %%mm0 \n\t"
200 "punpckhbw %%mm7, %%mm1 \n\t"
201 "punpcklbw %%mm7, %%mm2 \n\t"
202 "punpckhbw %%mm7, %%mm3 \n\t"
203 "movq %%mm0, (%1, %%eax)\n\t"
204 "movq %%mm1, 8(%1, %%eax)\n\t"
205 "movq %%mm2, 16(%1, %%eax)\n\t"
206 "movq %%mm3, 24(%1, %%eax)\n\t"
207 "addl %3, %0 \n\t"
208 "addl $32, %%eax \n\t"
209 "js 1b \n\t"
210 : "+r" (pixels)
211 : "r" (block+64), "r" (line_size), "r" (line_size*2)
212 : "%eax"
213 );
de6d9b64
FB
214}
215
9dbcbd92
MN
216static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
217{
218 asm volatile(
607dce96 219 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 220 "movl $-128, %%eax \n\t"
607dce96 221 ".balign 16 \n\t"
9dbcbd92
MN
222 "1: \n\t"
223 "movq (%0), %%mm0 \n\t"
224 "movq (%1), %%mm2 \n\t"
225 "movq %%mm0, %%mm1 \n\t"
226 "movq %%mm2, %%mm3 \n\t"
227 "punpcklbw %%mm7, %%mm0 \n\t"
228 "punpckhbw %%mm7, %%mm1 \n\t"
229 "punpcklbw %%mm7, %%mm2 \n\t"
230 "punpckhbw %%mm7, %%mm3 \n\t"
231 "psubw %%mm2, %%mm0 \n\t"
232 "psubw %%mm3, %%mm1 \n\t"
233 "movq %%mm0, (%2, %%eax)\n\t"
234 "movq %%mm1, 8(%2, %%eax)\n\t"
235 "addl %3, %0 \n\t"
236 "addl %3, %1 \n\t"
237 "addl $16, %%eax \n\t"
238 "jnz 1b \n\t"
239 : "+r" (s1), "+r" (s2)
240 : "r" (block+64), "r" (stride)
241 : "%eax"
242 );
243}
244
de6d9b64
FB
245static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
246{
247 const DCTELEM *p;
248 UINT8 *pix;
de6d9b64
FB
249
250 /* read the pixels */
251 p = block;
252 pix = pixels;
d6a4c0b1 253 /* unrolled loop */
de6d9b64 254 __asm __volatile(
a822a479
NK
255 "movq %3, %%mm0\n\t"
256 "movq 8%3, %%mm1\n\t"
257 "movq 16%3, %%mm2\n\t"
258 "movq 24%3, %%mm3\n\t"
259 "movq 32%3, %%mm4\n\t"
260 "movq 40%3, %%mm5\n\t"
261 "movq 48%3, %%mm6\n\t"
262 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
263 "packuswb %%mm1, %%mm0\n\t"
264 "packuswb %%mm3, %%mm2\n\t"
265 "packuswb %%mm5, %%mm4\n\t"
266 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
267 "movq %%mm0, (%0)\n\t"
268 "movq %%mm2, (%0, %1)\n\t"
269 "movq %%mm4, (%0, %1, 2)\n\t"
270 "movq %%mm6, (%0, %2)\n\t"
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
272 :"memory");
273 pix += line_size*4;
274 p += 32;
d6a4c0b1
ZK
275
276 // if here would be an exact copy of the code above
277 // compiler would generate some very strange code
278 // thus using "r"
279 __asm __volatile(
280 "movq (%3), %%mm0\n\t"
281 "movq 8(%3), %%mm1\n\t"
282 "movq 16(%3), %%mm2\n\t"
283 "movq 24(%3), %%mm3\n\t"
284 "movq 32(%3), %%mm4\n\t"
285 "movq 40(%3), %%mm5\n\t"
286 "movq 48(%3), %%mm6\n\t"
287 "movq 56(%3), %%mm7\n\t"
288 "packuswb %%mm1, %%mm0\n\t"
289 "packuswb %%mm3, %%mm2\n\t"
290 "packuswb %%mm5, %%mm4\n\t"
291 "packuswb %%mm7, %%mm6\n\t"
292 "movq %%mm0, (%0)\n\t"
293 "movq %%mm2, (%0, %1)\n\t"
294 "movq %%mm4, (%0, %1, 2)\n\t"
295 "movq %%mm6, (%0, %2)\n\t"
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
297 :"memory");
de6d9b64
FB
298}
299
300static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
301{
302 const DCTELEM *p;
303 UINT8 *pix;
304 int i;
305
306 /* read the pixels */
307 p = block;
308 pix = pixels;
d6a4c0b1
ZK
309 MOVQ_ZERO(mm7);
310 i = 4;
cd8e5f96 311 do {
de6d9b64 312 __asm __volatile(
cd8e5f96
ZK
313 "movq (%2), %%mm0\n\t"
314 "movq 8(%2), %%mm1\n\t"
315 "movq 16(%2), %%mm2\n\t"
316 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
317 "movq %0, %%mm4\n\t"
318 "movq %1, %%mm6\n\t"
319 "movq %%mm4, %%mm5\n\t"
320 "punpcklbw %%mm7, %%mm4\n\t"
321 "punpckhbw %%mm7, %%mm5\n\t"
322 "paddsw %%mm4, %%mm0\n\t"
323 "paddsw %%mm5, %%mm1\n\t"
324 "movq %%mm6, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm6\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm6, %%mm2\n\t"
328 "paddsw %%mm5, %%mm3\n\t"
329 "packuswb %%mm1, %%mm0\n\t"
330 "packuswb %%mm3, %%mm2\n\t"
331 "movq %%mm0, %0\n\t"
332 "movq %%mm2, %1\n\t"
a822a479 333 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 334 :"r"(p)
de6d9b64
FB
335 :"memory");
336 pix += line_size*2;
337 p += 16;
cd8e5f96 338 } while (--i);
de6d9b64
FB
339}
340
b3184779 341static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
de6d9b64 342{
39825f31 343 __asm __volatile(
31ddcf98 344 "lea (%3, %3), %%eax \n\t"
52af45ad 345 ".balign 8 \n\t"
31ddcf98
ZK
346 "1: \n\t"
347 "movq (%1), %%mm0 \n\t"
348 "movq (%1, %3), %%mm1 \n\t"
349 "movq %%mm0, (%2) \n\t"
350 "movq %%mm1, (%2, %3) \n\t"
351 "addl %%eax, %1 \n\t"
352 "addl %%eax, %2 \n\t"
353 "movq (%1), %%mm0 \n\t"
354 "movq (%1, %3), %%mm1 \n\t"
355 "movq %%mm0, (%2) \n\t"
356 "movq %%mm1, (%2, %3) \n\t"
357 "addl %%eax, %1 \n\t"
358 "addl %%eax, %2 \n\t"
359 "subl $4, %0 \n\t"
360 "jnz 1b \n\t"
361 : "+g"(h), "+r" (pixels), "+r" (block)
362 : "r"(line_size)
363 : "%eax", "memory"
364 );
de6d9b64
FB
365}
366
b3184779
MN
367static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
368{
369 __asm __volatile(
370 "lea (%3, %3), %%eax \n\t"
371 ".balign 8 \n\t"
372 "1: \n\t"
373 "movq (%1), %%mm0 \n\t"
374 "movq 8(%1), %%mm4 \n\t"
375 "movq (%1, %3), %%mm1 \n\t"
376 "movq 8(%1, %3), %%mm5 \n\t"
377 "movq %%mm0, (%2) \n\t"
378 "movq %%mm4, 8(%2) \n\t"
379 "movq %%mm1, (%2, %3) \n\t"
380 "movq %%mm5, 8(%2, %3) \n\t"
381 "addl %%eax, %1 \n\t"
382 "addl %%eax, %2 \n\t"
383 "movq (%1), %%mm0 \n\t"
384 "movq 8(%1), %%mm4 \n\t"
385 "movq (%1, %3), %%mm1 \n\t"
386 "movq 8(%1, %3), %%mm5 \n\t"
387 "movq %%mm0, (%2) \n\t"
388 "movq %%mm4, 8(%2) \n\t"
389 "movq %%mm1, (%2, %3) \n\t"
390 "movq %%mm5, 8(%2, %3) \n\t"
391 "addl %%eax, %1 \n\t"
392 "addl %%eax, %2 \n\t"
393 "subl $4, %0 \n\t"
394 "jnz 1b \n\t"
395 : "+g"(h), "+r" (pixels), "+r" (block)
396 : "r"(line_size)
397 : "%eax", "memory"
398 );
399}
400
649c00c9
MN
401static void clear_blocks_mmx(DCTELEM *blocks)
402{
39825f31 403 __asm __volatile(
649c00c9
MN
404 "pxor %%mm7, %%mm7 \n\t"
405 "movl $-128*6, %%eax \n\t"
406 "1: \n\t"
407 "movq %%mm7, (%0, %%eax) \n\t"
408 "movq %%mm7, 8(%0, %%eax) \n\t"
409 "movq %%mm7, 16(%0, %%eax) \n\t"
410 "movq %%mm7, 24(%0, %%eax) \n\t"
411 "addl $32, %%eax \n\t"
412 " js 1b \n\t"
413 : : "r" (((int)blocks)+128*6)
414 : "%eax"
415 );
416}
417
084c726b
MN
418static int pix_sum16_mmx(UINT8 * pix, int line_size){
419 const int h=16;
420 int sum;
421 int index= -line_size*h;
422
423 __asm __volatile(
424 "pxor %%mm7, %%mm7 \n\t"
425 "pxor %%mm6, %%mm6 \n\t"
426 "1: \n\t"
427 "movq (%2, %1), %%mm0 \n\t"
428 "movq (%2, %1), %%mm1 \n\t"
429 "movq 8(%2, %1), %%mm2 \n\t"
430 "movq 8(%2, %1), %%mm3 \n\t"
431 "punpcklbw %%mm7, %%mm0 \n\t"
432 "punpckhbw %%mm7, %%mm1 \n\t"
433 "punpcklbw %%mm7, %%mm2 \n\t"
434 "punpckhbw %%mm7, %%mm3 \n\t"
435 "paddw %%mm0, %%mm1 \n\t"
436 "paddw %%mm2, %%mm3 \n\t"
437 "paddw %%mm1, %%mm3 \n\t"
438 "paddw %%mm3, %%mm6 \n\t"
439 "addl %3, %1 \n\t"
440 " js 1b \n\t"
441 "movq %%mm6, %%mm5 \n\t"
442 "psrlq $32, %%mm6 \n\t"
443 "paddw %%mm5, %%mm6 \n\t"
444 "movq %%mm6, %%mm5 \n\t"
445 "psrlq $16, %%mm6 \n\t"
446 "paddw %%mm5, %%mm6 \n\t"
447 "movd %%mm6, %0 \n\t"
448 "andl $0xFFFF, %0 \n\t"
449 : "=&r" (sum), "+r" (index)
450 : "r" (pix - index), "r" (line_size)
451 );
452
453 return sum;
454}
455
#if 0
/* do-nothing stub for the (disabled) speed-testing code in dsputil_init_mmx() */
static void just_return() { return; }
#endif

de6d9b64
FB
460void dsputil_init_mmx(void)
461{
462 mm_flags = mm_support();
1565dabc
LB
463#if 0
464 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 465 if (mm_flags & MM_MMX)
1565dabc 466 fprintf(stderr, " mmx");
de6d9b64 467 if (mm_flags & MM_MMXEXT)
1565dabc 468 fprintf(stderr, " mmxext");
de6d9b64 469 if (mm_flags & MM_3DNOW)
1565dabc 470 fprintf(stderr, " 3dnow");
de6d9b64 471 if (mm_flags & MM_SSE)
1565dabc 472 fprintf(stderr, " sse");
de6d9b64 473 if (mm_flags & MM_SSE2)
1565dabc
LB
474 fprintf(stderr, " sse2");
475 fprintf(stderr, "\n");
de6d9b64
FB
476#endif
477
478 if (mm_flags & MM_MMX) {
479 get_pixels = get_pixels_mmx;
9dbcbd92 480 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
481 put_pixels_clamped = put_pixels_clamped_mmx;
482 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 483 clear_blocks= clear_blocks_mmx;
084c726b 484 pix_sum= pix_sum16_mmx;
dcb9cd4b 485
ba6802de
MN
486 pix_abs16x16 = pix_abs16x16_mmx;
487 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
488 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 489 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
490 pix_abs8x8 = pix_abs8x8_mmx;
491 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
492 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
493 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4afeaec9 494
b3184779
MN
495 put_pixels_tab[0][0] = put_pixels16_mmx;
496 put_pixels_tab[0][1] = put_pixels16_x2_mmx;
497 put_pixels_tab[0][2] = put_pixels16_y2_mmx;
498 put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
499
500 put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
501 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
502 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
503 put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
504
505 avg_pixels_tab[0][0] = avg_pixels16_mmx;
506 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
507 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
508 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
509
510 avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
511 avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
512 avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
513 avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
514
515 put_pixels_tab[1][0] = put_pixels8_mmx;
516 put_pixels_tab[1][1] = put_pixels8_x2_mmx;
517 put_pixels_tab[1][2] = put_pixels8_y2_mmx;
518 put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
519
520 put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
521 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
522 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
523 put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
524
525 avg_pixels_tab[1][0] = avg_pixels8_mmx;
526 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
527 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
528 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
529
530 avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
531 avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
532 avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
533 avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
607dce96 534
de6d9b64 535 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
536 pix_abs16x16 = pix_abs16x16_mmx2;
537 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
538 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
539 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 540
ba6802de
MN
541 pix_abs8x8 = pix_abs8x8_mmx2;
542 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
543 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
544 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96 545
b3184779
MN
546 put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
547 put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
548 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
549 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
550
551 avg_pixels_tab[0][0] = avg_pixels16_mmx2;
552 avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
553 avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
554 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
555
556 put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
557 put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
558 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
559 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
560
561 avg_pixels_tab[1][0] = avg_pixels8_mmx2;
562 avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
563 avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
564 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
de6d9b64 565 } else if (mm_flags & MM_3DNOW) {
b3184779
MN
566 put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
567 put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
568 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
569 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
570
571 avg_pixels_tab[0][0] = avg_pixels16_3dnow;
572 avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
573 avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
574 avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
575
576 put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
577 put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
578 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
579 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
580
581 avg_pixels_tab[1][0] = avg_pixels8_3dnow;
582 avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
583 avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
584 avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
de6d9b64
FB
585 }
586 }
d6a4c0b1
ZK
587
588#if 0
589 // for speed testing
590 get_pixels = just_return;
591 put_pixels_clamped = just_return;
592 add_pixels_clamped = just_return;
593
594 pix_abs16x16 = just_return;
595 pix_abs16x16_x2 = just_return;
596 pix_abs16x16_y2 = just_return;
597 pix_abs16x16_xy2 = just_return;
598
599 put_pixels_tab[0] = just_return;
600 put_pixels_tab[1] = just_return;
601 put_pixels_tab[2] = just_return;
602 put_pixels_tab[3] = just_return;
603
604 put_no_rnd_pixels_tab[0] = just_return;
605 put_no_rnd_pixels_tab[1] = just_return;
606 put_no_rnd_pixels_tab[2] = just_return;
607 put_no_rnd_pixels_tab[3] = just_return;
608
609 avg_pixels_tab[0] = just_return;
610 avg_pixels_tab[1] = just_return;
611 avg_pixels_tab[2] = just_return;
612 avg_pixels_tab[3] = just_return;
613
614 avg_no_rnd_pixels_tab[0] = just_return;
615 avg_no_rnd_pixels_tab[1] = just_return;
616 avg_no_rnd_pixels_tab[2] = just_return;
617 avg_no_rnd_pixels_tab[3] = just_return;
618
d6a4c0b1
ZK
619 //av_fdct = just_return;
620 //ff_idct = just_return;
621#endif
de6d9b64 622}
4f12a497
FB
623
624/* remove any non bit exact operation (testing purpose). NOTE that
625 this function should be kept as small as possible because it is
626 always difficult to test automatically non bit exact cases. */
627void dsputil_set_bit_exact_mmx(void)
628{
629 if (mm_flags & MM_MMX) {
b3184779
MN
630
631 /* MMX2 & 3DNOW */
632 put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
633 put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
634 avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
635 put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
636 put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
637 avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 638
b3184779 639 if (mm_flags & MM_MMXEXT) {
4afeaec9
MN
640 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
641 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
642 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
643 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
644 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
645 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
646 }
647 }
648}