fixed wmv2 slices
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
23
int mm_flags; /* multimedia extension flags */

/* FIXME use them in static form */
/* 16x16 pixel-difference metrics (asm implementations elsewhere).
 * NOTE(review): presumably sums of absolute differences, with _x2/_y2/_xy2
 * half-pel interpolated variants -- confirm against their definitions.
 * lx is the line stride in bytes. */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

/* MMX2 versions of the above */
int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

/* 8x8 block variants, plain MMX */
int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

/* 8x8 block variants, MMX2 */
int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

/* SAD entry points that additionally take an opaque context pointer s */
int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
50
/* pixel operations */
/* 8-byte-aligned MMX constants; the value is replicated across all lanes. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 8 x byte 1 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 x word 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 x word 2 */

/* packed-word filter coefficients/rounders used by the qpel code below */
static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; /* 4 x word 20 */
static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;  /* 4 x word 3 */
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; /* 4 x word 16 */
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; /* 4 x word 15 */
60
d6a4c0b1
ZK
/* align the next instruction on an 8-byte boundary */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* clear an MMX register */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* load 0x0001000100010001 (word 1 in each lane) without touching memory:
 * pcmpeqd sets all bits, psrlw leaves the lowest bit of each word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* load 0xfefefefefefefefe (all bits except each byte's LSB):
 * all-ones doubled byte-wise gives 0xfe per byte */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* non-PIC build: the in-memory constants can be referenced directly */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* byte 1 in each lane: words of 1 packed with unsigned saturation */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* word 2 in each lane: word 1 shifted left by one */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
93
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* byte-wise average, truncating: (a & b) + (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* byte-wise average, rounding up: (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* two truncating byte averages at once; results land in regr and regp */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* two rounding byte averages at once; results land in regr and regp */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
141
91abb473
ZK
/***********************************/
/* MMX no rounding */
/* Instantiate the put/avg pixel templates with truncating averages;
 * generated names carry a _no_rnd_ infix. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

/* Second instantiation of the same template: rounding averages,
 * plain _mmx names. */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
a7bd8797 169
de6d9b64
FB
/***********************************/
/* 3Dnow specific */

/* Instantiate the averaging templates with the hardware byte-average
 * instruction of each ISA extension. */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
194
195/***********************************/
196/* standard MMX */
197
/* Convert an 8x8 block of 8-bit pixels into 64 16-bit DCTELEMs
 * (zero-extension via punpck against mm7==0, two rows per iteration). */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax	\n\t"  // eax: -128..0, indexes off block+64
        "pxor %%mm7, %%mm7	\n\t"  // zero register for unpacking
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"  // row 0 (8 bytes)
        "movq (%0, %2), %%mm2	\n\t"  // row 1
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"  // low 4 bytes -> words
        "punpckhbw %%mm7, %%mm1	\n\t"  // high 4 bytes -> words
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0		\n\t"  // advance source by two rows
        "addl $32, %%eax	\n\t"
        "js 1b			\n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
225
/* block[i] = s1[i] - s2[i] for an 8x8 block, widened to 16-bit DCTELEMs. */
static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7	\n\t"  // zero register for unpacking
        "movl $-128, %%eax	\n\t"  // eax: -128..0, indexes off block+64
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"  // one row of s1
        "movq (%1), %%mm2	\n\t"  // one row of s2
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "psubw %%mm2, %%mm0	\n\t"  // word-wise difference
        "psubw %%mm3, %%mm1	\n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl %3, %1		\n\t"
        "addl $16, %%eax	\n\t"
        "jnz 1b			\n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
254
/* Pack 64 16-bit coefficients to bytes with unsigned saturation and store
 * them as an 8x8 pixel block (4 rows per asm statement). */
void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
	"movq	%3, %%mm0\n\t"
	"movq	8%3, %%mm1\n\t"
	"movq	16%3, %%mm2\n\t"
	"movq	24%3, %%mm3\n\t"
	"movq	32%3, %%mm4\n\t"
	"movq	40%3, %%mm5\n\t"
	"movq	48%3, %%mm6\n\t"
	"movq	56%3, %%mm7\n\t"
	"packuswb %%mm1, %%mm0\n\t"  // saturating word->byte pack
	"packuswb %%mm3, %%mm2\n\t"
	"packuswb %%mm5, %%mm4\n\t"
	"packuswb %%mm7, %%mm6\n\t"
	"movq	%%mm0, (%0)\n\t"
	"movq	%%mm2, (%0, %1)\n\t"
	"movq	%%mm4, (%0, %1, 2)\n\t"
	"movq	%%mm6, (%0, %2)\n\t"
	::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
	:"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
	"movq	(%3), %%mm0\n\t"
	"movq	8(%3), %%mm1\n\t"
	"movq	16(%3), %%mm2\n\t"
	"movq	24(%3), %%mm3\n\t"
	"movq	32(%3), %%mm4\n\t"
	"movq	40(%3), %%mm5\n\t"
	"movq	48(%3), %%mm6\n\t"
	"movq	56(%3), %%mm7\n\t"
	"packuswb %%mm1, %%mm0\n\t"
	"packuswb %%mm3, %%mm2\n\t"
	"packuswb %%mm5, %%mm4\n\t"
	"packuswb %%mm7, %%mm6\n\t"
	"movq	%%mm0, (%0)\n\t"
	"movq	%%mm2, (%0, %1)\n\t"
	"movq	%%mm4, (%0, %1, 2)\n\t"
	"movq	%%mm6, (%0, %2)\n\t"
	::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
	:"memory");
}
309
/* Add 64 16-bit coefficients to an 8x8 pixel block with saturation:
 * pixels are unpacked to words, added with signed saturation (paddsw),
 * then repacked with unsigned saturation.  Two rows per iteration. */
void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);             // mm7 = 0 for unpacking, kept across iterations
    i = 4;
    do {
        __asm __volatile(
		"movq	(%2), %%mm0\n\t"    // 4 coefficients, row 0 low
		"movq	8(%2), %%mm1\n\t"   // row 0 high
		"movq	16(%2), %%mm2\n\t"  // row 1 low
		"movq	24(%2), %%mm3\n\t"  // row 1 high
		"movq	%0, %%mm4\n\t"      // pixel row 0
		"movq	%1, %%mm6\n\t"      // pixel row 1
		"movq	%%mm4, %%mm5\n\t"
		"punpcklbw %%mm7, %%mm4\n\t"
		"punpckhbw %%mm7, %%mm5\n\t"
		"paddsw	%%mm4, %%mm0\n\t"
		"paddsw	%%mm5, %%mm1\n\t"
		"movq	%%mm6, %%mm5\n\t"
		"punpcklbw %%mm7, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm5\n\t"
		"paddsw	%%mm6, %%mm2\n\t"
		"paddsw	%%mm5, %%mm3\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"packuswb %%mm3, %%mm2\n\t"
		"movq	%%mm0, %0\n\t"
		"movq	%%mm2, %1\n\t"
		:"+m"(*pix), "+m"(*(pix+line_size))
		:"r"(p)
		:"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
350
/* Copy an 8-pixel-wide block of h rows from pixels to block.
 * The loop handles 4 rows per iteration, so h must be a multiple of 4. */
static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
	"lea (%3, %3), %%eax		\n\t"  // eax = 2*line_size
	".balign 8			\n\t"
	"1:				\n\t"
	"movq (%1), %%mm0		\n\t"
	"movq (%1, %3), %%mm1		\n\t"
	"movq %%mm0, (%2)		\n\t"
	"movq %%mm1, (%2, %3)		\n\t"
	"addl %%eax, %1			\n\t"
	"addl %%eax, %2			\n\t"
	"movq (%1), %%mm0		\n\t"
	"movq (%1, %3), %%mm1		\n\t"
	"movq %%mm0, (%2)		\n\t"
	"movq %%mm1, (%2, %3)		\n\t"
	"addl %%eax, %1			\n\t"
	"addl %%eax, %2			\n\t"
	"subl $4, %0			\n\t"
	"jnz 1b				\n\t"
	: "+g"(h), "+r" (pixels),  "+r" (block)
	: "r"(line_size)
	: "%eax", "memory"
	);
}
376
b3184779
MN
/* Copy a 16-pixel-wide block of h rows (two movq per row).
 * Handles 4 rows per iteration, so h must be a multiple of 4. */
static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
	"lea (%3, %3), %%eax		\n\t"  // eax = 2*line_size
	".balign 8			\n\t"
	"1:				\n\t"
	"movq (%1), %%mm0		\n\t"
	"movq 8(%1), %%mm4		\n\t"
	"movq (%1, %3), %%mm1		\n\t"
	"movq 8(%1, %3), %%mm5		\n\t"
	"movq %%mm0, (%2)		\n\t"
	"movq %%mm4, 8(%2)		\n\t"
	"movq %%mm1, (%2, %3)		\n\t"
	"movq %%mm5, 8(%2, %3)		\n\t"
	"addl %%eax, %1			\n\t"
	"addl %%eax, %2			\n\t"
	"movq (%1), %%mm0		\n\t"
	"movq 8(%1), %%mm4		\n\t"
	"movq (%1, %3), %%mm1		\n\t"
	"movq 8(%1, %3), %%mm5		\n\t"
	"movq %%mm0, (%2)		\n\t"
	"movq %%mm4, 8(%2)		\n\t"
	"movq %%mm1, (%2, %3)		\n\t"
	"movq %%mm5, 8(%2, %3)		\n\t"
	"addl %%eax, %1			\n\t"
	"addl %%eax, %2			\n\t"
	"subl $4, %0			\n\t"
	"jnz 1b				\n\t"
	: "+g"(h), "+r" (pixels),  "+r" (block)
	: "r"(line_size)
	: "%eax", "memory"
	);
}
410
649c00c9
MN
/* Zero six consecutive 64-coefficient DCT blocks (6*128 bytes),
 * writing 32 bytes per iteration from the end backwards via a
 * negative index. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
	"pxor %%mm7, %%mm7		\n\t"
	"movl $-128*6, %%eax		\n\t"
	"1:				\n\t"
	"movq %%mm7, (%0, %%eax)	\n\t"
	"movq %%mm7, 8(%0, %%eax)	\n\t"
	"movq %%mm7, 16(%0, %%eax)	\n\t"
	"movq %%mm7, 24(%0, %%eax)	\n\t"
	"addl $32, %%eax		\n\t"
	" js 1b				\n\t"
	: : "r" (((int)blocks)+128*6)
	: "%eax"
    );
}
427
084c726b
MN
/* Sum of all 256 pixels of a 16x16 block (fits in 16 bits, hence the
 * final mask).  Accumulates 16-bit partial sums in mm6, then reduces
 * horizontally. */
static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
		"pxor %%mm7, %%mm7		\n\t"  // zero for unpacking
		"pxor %%mm6, %%mm6		\n\t"  // word accumulator
		"1:				\n\t"
		"movq (%2, %1), %%mm0		\n\t"  // left 8 pixels (twice,
		"movq (%2, %1), %%mm1		\n\t"  //  for low/high unpack)
		"movq 8(%2, %1), %%mm2		\n\t"  // right 8 pixels
		"movq 8(%2, %1), %%mm3		\n\t"
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpckhbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpckhbw %%mm7, %%mm3		\n\t"
		"paddw %%mm0, %%mm1		\n\t"
		"paddw %%mm2, %%mm3		\n\t"
		"paddw %%mm1, %%mm3		\n\t"
		"paddw %%mm3, %%mm6		\n\t"
		"addl %3, %1			\n\t"
		" js 1b				\n\t"
		"movq %%mm6, %%mm5		\n\t"  // horizontal add of 4 words
		"psrlq $32, %%mm6		\n\t"
		"paddw %%mm5, %%mm6		\n\t"
		"movq %%mm6, %%mm5		\n\t"
		"psrlq $16, %%mm6		\n\t"
		"paddw %%mm5, %%mm6		\n\t"
		"movd %%mm6, %0			\n\t"
		"andl $0xFFFF, %0		\n\t"
		: "=&r" (sum), "+r" (index)
		: "r" (pix - index), "r" (line_size)
	);

	return sum;
}
465
11f18faf
MN
/* dst[i] += src[i] byte-wise: MMX handles 16 bytes per iteration while
 * i < w-15, the scalar loop finishes the tail.
 * NOTE(review): the asm loop appears to assume w >= 16 -- the unsigned
 * "jb" against w-15 would wrap for smaller w; verify callers. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%1, %0), %%mm0		\n\t"
        "movq  (%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%2, %0)		\n\t"
        "movq 8(%1, %0), %%mm0		\n\t"
        "movq 8(%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%2, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %3, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
487
/* dst[i] = src1[i] - src2[i] byte-wise (modular, psubb): MMX for 16 bytes
 * per iteration while i < w-15, scalar tail for the rest.
 * NOTE(review): same w >= 16 assumption as add_bytes_mmx above. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%2, %0), %%mm0		\n\t"
        "movq  (%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%3, %0)		\n\t"
        "movq 8(%2, %0), %%mm0		\n\t"
        "movq 8(%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%3, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1457ab52
MN
/* word-wise butterfly: a' = a + b, b' = a - b (b computed as 2b - (a+b)) */
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a "		\n\t"\
    "paddw " #b ", " #b "		\n\t"\
    "psubw " #a ", " #b "		\n\t"

/* all three butterfly stages of an 8-point Hadamard transform
 * across mm0..mm7 (4 words per register, i.e. 4 transforms at once) */
#define HADAMARD48\
    LBUTTERFLY(%%mm0, %%mm1)\
    LBUTTERFLY(%%mm2, %%mm3)\
    LBUTTERFLY(%%mm4, %%mm5)\
    LBUTTERFLY(%%mm6, %%mm7)\
    \
    LBUTTERFLY(%%mm0, %%mm2)\
    LBUTTERFLY(%%mm1, %%mm3)\
    LBUTTERFLY(%%mm4, %%mm6)\
    LBUTTERFLY(%%mm5, %%mm7)\
    \
    LBUTTERFLY(%%mm0, %%mm4)\
    LBUTTERFLY(%%mm1, %%mm5)\
    LBUTTERFLY(%%mm2, %%mm6)\
    LBUTTERFLY(%%mm3, %%mm7)

/* a = |a| word-wise, using z as scratch (sign mask trick) */
#define MMABS(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"

/* sum += |a| word-wise with unsigned saturation; z is scratch */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "		\n\t"


/* interleave low/high halves of a and b into a and t */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "		\n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\

/* transpose a 4x4 word matrix held in a,b,c,d; t is scratch */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* load/store 4 quadwords at offsets o, o+16, o+32, o+48 from %1 */
#define LOAD4(o, a, b, c, d)\
	"movq "#o"(%1), " #a "		\n\t"\
	"movq "#o"+16(%1), " #b "	\n\t"\
	"movq "#o"+32(%1), " #c "	\n\t"\
	"movq "#o"+48(%1), " #d "	\n\t"

#define STORE4(o, a, b, c, d)\
	"movq "#a", "#o"(%1)		\n\t"\
	"movq "#b", "#o"+16(%1)		\n\t"\
	"movq "#c", "#o"+32(%1)		\n\t"\
	"movq "#d", "#o"+48(%1)		\n\t"\
/* SATD: sum of absolute values of the 8x8 two-dimensional Hadamard
 * transform of the difference between src1 and src2.  The 16-bit
 * difference block is built by diff_pixels_mmx into temp, then rows and
 * columns are transformed in-place in two 4-column halves. */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48                           // vertical pass, left half

        "movq %%mm7, 112(%1)		\n\t"  // spill mm7: needed as scratch

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48                           // vertical pass, right half

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48                           // horizontal pass, lower half
        "movq %%mm7, 64(%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"  // partial sum saved

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48                           // horizontal pass, upper half
        "movq %%mm7, (%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"  // fold in the saved partial sum
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1		\n\t"  // horizontal reduction of 4 words
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movd %%mm0, %0			\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

/* 16x16 version assembled from four 8x8 calls */
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 653
3178ee4c
MN
/* for full-pel copies the no-rounding put is identical to the rounding one */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

/* one vertical step of the qpel lowpass filter:
 * out = OP(clip((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5));
 * m3..m6 hold neighbouring rows, in0/in1/in2/in7 are further taps,
 * mm4/mm5/mm6 are clobbered, mm7 must be 0 (used by OP) */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4	\n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
        "movq "#in7", " #m3 "		\n\t" /* d */\
        "movq "#in0", %%mm5		\n\t" /* D */\
        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5		\n\t" /* C */\
        "movq "#in2", %%mm6		\n\t" /* B */\
        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5		\n\t"\
        "packuswb %%mm5, %%mm5		\n\t"\
        OP(%%mm5, out, %%mm7, d)
677
/* Instantiate the horizontal quarter-pel lowpass filters for one output
 * operation (put/avg etc.):
 *   dst = OP(clip((20*a - 6*b + 3*c - d + ROUNDER) >> 5)) per pixel,
 * with edge-mirrored taps at the right border.
 * The _mmx2 versions do everything in asm (pshufw shuffles); the _3dnow
 * versions compute the filter in C and use MMX only to round/pack/OP. */
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
/* 16-pixel-wide rows, MMX2 */\
void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7		\n\t"\
        "1:				\n\t"\
        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5		\n\t" /* b */\
        "paddw %%mm2, %%mm6		\n\t" /* c */\
        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0		\n\t" /* a */\
        "paddw %%mm1, %%mm5		\n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
        "paddw %6, %%mm6		\n\t"\
        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0		\n\t"\
        "movq %%mm0, %5			\n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0		\n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5		\n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6		\n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0		\n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5		\n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0		\n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5		\n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2		\n\t" /* b */\
        "paddw %%mm5, %%mm3		\n\t" /* c */\
        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
        "movq %%mm6, %%mm2		\n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6		\n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2		\n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6		\n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1		\n\t" /* a */\
        "paddw %%mm6, %%mm4		\n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3		\n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1		\n\t"\
        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3		\n\t"\
        "movq %5, %%mm1			\n\t"\
        "packuswb %%mm3, %%mm1		\n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1		\n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4		\n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3		\n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1		\n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4		\n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1		\n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4		\n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5		\n\t" /* b */\
        "paddw %%mm4, %%mm0		\n\t" /* c */\
        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
        "psubw %%mm5, %%mm0		\n\t" /* c - 2b */\
        "movq %%mm3, %%mm5		\n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3		\n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3		\n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2		\n\t" /* d */\
        "psubw %%mm2, %%mm0		\n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2		\n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2		\n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5		\n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6		\n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0		\n\t"\
        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0		\n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3		\n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6		\n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4	\n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4		\n\t" /* c */\
        "paddw %%mm2, %%mm5		\n\t" /* d */\
        "paddw %%mm6, %%mm6		\n\t" /* 2b */\
        "psubw %%mm6, %%mm4		\n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3		\n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4		\n\t"\
        "paddw %%mm3, %%mm4		\n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4		\n\t"\
        "packuswb %%mm4, %%mm0		\n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "addl %3, %0			\n\t"\
        "addl %4, %1			\n\t"\
        "decl %2			\n\t"\
        " jnz 1b			\n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
/* 16-pixel-wide rows, 3DNow!: filter computed in C, MMX rounds/packs */\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0		\n\t"\
            "movq 8(%0), %%mm1		\n\t"\
            "paddw %2, %%mm0		\n\t"\
            "paddw %2, %%mm1		\n\t"\
            "psraw $5, %%mm0		\n\t"\
            "psraw $5, %%mm1		\n\t"\
            "packuswb %%mm1, %%mm0	\n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0		\n\t"\
            "movq 24(%0), %%mm1		\n\t"\
            "paddw %2, %%mm0		\n\t"\
            "paddw %2, %%mm1		\n\t"\
            "psraw $5, %%mm0		\n\t"\
            "psraw $5, %%mm1		\n\t"\
            "packuswb %%mm1, %%mm0	\n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 8-pixel-wide rows, MMX2 */\
void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7		\n\t"\
        "1:				\n\t"\
        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5		\n\t" /* b */\
        "paddw %%mm2, %%mm6		\n\t" /* c */\
        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0		\n\t" /* a */\
        "paddw %%mm1, %%mm5		\n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
        "paddw %6, %%mm6		\n\t"\
        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0		\n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5		\n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5		\n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1		\n\t" /* a */\
        "paddw %%mm6, %%mm2		\n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6	\n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3		\n\t" /* c */\
        "paddw %%mm5, %%mm4		\n\t" /* d */\
        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3		\n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1		\n\t"\
        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3		\n\t"\
        "packuswb %%mm3, %%mm0		\n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "addl %3, %0			\n\t"\
        "addl %4, %1			\n\t"\
        "decl %2			\n\t"\
        " jnz 1b			\n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
/* 8-pixel-wide rows, 3DNow!: filter computed in C, MMX rounds/packs */\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0		\n\t"\
            "movq 8(%0), %%mm1		\n\t"\
            "paddw %2, %%mm0		\n\t"\
            "paddw %2, %%mm1		\n\t"\
            "psraw $5, %%mm0		\n\t"\
            "psraw $5, %%mm1		\n\t"\
            "packuswb %%mm1, %%mm0	\n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
942
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) generates, for one CPU flavour
 * (MMX token: mmx2 or 3dnow) and one rounding mode (ROUNDER/RND), the
 * vertical lowpass filters and the 16 quarter-pel motion-compensation
 * entry points (qpel{8,16}_mcXY) used via dsputil's qpel_pixels_tab.
 * OP is the store/average macro (PUT_OP / AVG_MMX2_OP / AVG_3DNOW_OP)
 * pasted into the inline asm of QPEL_V_LOW.
 */
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    /* pass 1: zero-extend 17 rows of 16 source bytes to 16bit words in temp[],
       split into four planes of 17 rows (columns 0-3, 4-7, 8-11, 12-15) */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    /* pass 2: for each of the 4 column planes, run the vertical filter
       (QPEL_V_LOW: the 20a-6b+3c-d tap structure, edge-mirrored at both
       ends) over 16 output rows, writing two rows per QPEL_V_LOW pair */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "addl $136, %0                  \n\t"   /* advance to next column plane: 17 rows * 8 bytes */\
        "addl %6, %1                    \n\t"   /* %6 = 4-14*dstStride: step dst right by 4, back up 14 rows */\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
        : "memory"\
    );\
}\
\
/* 8-wide variant of the above: 9 input rows, 2 column planes of 4.
   NOTE(review): generated without 'static', unlike the 16-wide variant
   above — likely an oversight; confirm nothing links to it externally
   before changing the linkage. */\
void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*4];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "addl $72, %0                   \n\t"   /* next column plane: 9 rows * 8 bytes */\
        "addl %6, %1                    \n\t"   /* %6 = 4-6*dstStride */\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
        : "memory"\
    );\
}\
\
/* mcXY entry points below: presumably X,Y are the quarter-pel fractional
   offsets (in 1/4-pel units) of the motion vector — TODO confirm against
   dsputil's qpel_pixels_tab indexing. Each builds the needed half-pel
   intermediates ('half', 'halfH', 'halfV', 'halfHV') on the stack with the
   put##RND lowpass filters, then combines them with pixels*_l2/_l4 averaging
   or writes/averages them into dst via OPNAME. */\
static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
}\
/* diagonal positions: average of source, H-filtered, V-filtered and
   HV-filtered planes packed contiguously in 'half' for pixels8_l4_mmx */\
static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src    , 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[9*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
/* 16x16 versions: identical structure, plane stride 256 instead of 64 */\
static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src    , 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
1325
1326
/* Store/average fragments pasted into the generated inline asm.
 * a = source MMX register, b = destination memory operand, temp = scratch
 * register, size = move width suffix (d/q).
 * PUT_OP: plain store of the computed result.
 * AVG_*_OP: load current dst, average with the result (pavgusb on 3DNow!,
 * pavgb on MMX2/SSE integer), store back. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1336
/* Instantiate the qpel function families.  QPEL_BASE generates the shared
 * horizontal lowpass code, QPEL_OP the vertical filters plus all 16 mcXY
 * entry points per CPU flavour.  ff_pw_16 / ff_pw_15 are the rounding
 * constants for the rounding and no-rounding variants respectively. */
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1346
/* Dead code: no-op stub assigned to every dsp function pointer by the
 * disabled speed-testing block at the bottom of dsputil_init_mmx().
 * NOTE(review): the empty '()' parameter list is an old-style unprototyped
 * declaration; '(void)' would be the strict form if this is re-enabled. */
#if 0
static void just_return() { return; }
#endif
d6a4c0b1 1350
826f429a
MN
/* Wire one qpel table slot to the put/put_no_rnd/avg variants of the same
 * implementation.  Expands to three statements; call sites invoke it
 * without a trailing semicolon, so it intentionally is NOT wrapped in
 * do { } while(0) — do not use it inside an unbraced if/else. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;
1355
/* Runtime dispatch: probe the CPU (mm_support) and fill the DSPContext
 * function-pointer tables with the best available x86 implementations.
 * Baseline MMX pointers are installed first, then selectively overridden
 * by MMX2 (a.k.a. MMXEXT) or, failing that, 3DNow! variants.
 * NOTE(review): the 'mask' parameter is not used in this function —
 * presumably reserved for selectively enabling features; confirm against
 * the generic dsputil_init caller. */
void dsputil_init_mmx(DSPContext* c, unsigned mask)
{
    mm_flags = mm_support();
#if 0
    /* disabled: dump detected CPU capabilities */
    fprintf(stderr, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        fprintf(stderr, " mmx");
    if (mm_flags & MM_MMXEXT)
        fprintf(stderr, " mmxext");
    if (mm_flags & MM_3DNOW)
        fprintf(stderr, " 3dnow");
    if (mm_flags & MM_SSE)
        fprintf(stderr, " sse");
    if (mm_flags & MM_SSE2)
        fprintf(stderr, " sse2");
    fprintf(stderr, "\n");
#endif

    if (mm_flags & MM_MMX) {
        /* plain MMX baseline */
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->pix_abs16x16     = pix_abs16x16_mmx;
        c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
        c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
        c->pix_abs8x8     = pix_abs8x8_mmx;
        c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
        c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
        c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;

        /* half-pel copy/average tables: [0]=16x16, [1]=8x8;
           index within: 0=full-pel, 1=x half, 2=y half, 3=xy half */
        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        /* full-pel needs no rounding, hence the shared put_pixels16_mmx */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;

        c->add_bytes= add_bytes_mmx;
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->sad[0]= sad16x16_mmx;
        c->sad[1]= sad8x8_mmx;

        if (mm_flags & MM_MMXEXT) {
            /* MMX2 overrides (pavgb, psadbw, pshufw available) */
            c->pix_abs16x16     = pix_abs16x16_mmx2;
            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx2;
            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;

            c->pix_abs8x8     = pix_abs8x8_mmx2;
            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx2;
            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;

            c->sad[0]= sad16x16_mmx2;
            c->sad[1]= sad8x8_mmx2;

            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;

#if 1
            /* quarter-pel tables: index = mv_y_frac*4 + mv_x_frac */
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
        } else if (mm_flags & MM_3DNOW) {
            /* 3DNow! overrides (pavgusb available, no MMX2 extensions) */
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;

            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
        }
    }

#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}
4f12a497
FB
1599
1600/* remove any non bit exact operation (testing purpose). NOTE that
1601 this function should be kept as small as possible because it is
1602 always difficult to test automatically non bit exact cases. */
eb4b3dd3 1603void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1604{
1605 if (mm_flags & MM_MMX) {
b3184779 1606 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1607 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1608 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1609 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1610 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1611 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1612 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 1613
b3184779 1614 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1615 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1616 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1617 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1618 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1619 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1620 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
1621 }
1622 }
1623}