add/diff_bytes bugfix patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>)
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
23
7d650cb5 24int mm_flags; /* multimedia extension flags */
eb4b3dd3 25/* FIXME use them in static form */
ba6802de
MN
26int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
27int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
31int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
32int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
36int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
37int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
41int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
42int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
1457ab52
MN
46int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
47int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
48int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
49int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
50
de6d9b64 51/* pixel operations */
a7bd8797
MN
52static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
53static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
54static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 55
826f429a
MN
56static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
57static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
58static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
59static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
60
d6a4c0b1
ZK
61#define JUMPALIGN() __asm __volatile (".balign 8"::)
62#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
63
fca0f0e5
ZK
64#define MOVQ_WONE(regd) \
65 __asm __volatile ( \
66 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
67 "psrlw $15, %%" #regd ::)
68
69#define MOVQ_BFE(regd) \
70 __asm __volatile ( \
71 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
72 "paddb %%" #regd ", %%" #regd " \n\t" ::)
73
d6a4c0b1 74#ifndef PIC
fca0f0e5 75#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
d6a4c0b1
ZK
76#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
77#else
78// for shared library it's better to use this way for accessing constants
79// pcmpeqd -> -1
fca0f0e5 80#define MOVQ_BONE(regd) \
d6a4c0b1 81 __asm __volatile ( \
fca0f0e5
ZK
82 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
83 "psrlw $15, %%" #regd " \n\t" \
84 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
d6a4c0b1
ZK
85
86#define MOVQ_WTWO(regd) \
87 __asm __volatile ( \
fca0f0e5
ZK
88 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
89 "psrlw $15, %%" #regd " \n\t" \
90 "psllw $1, %%" #regd " \n\t"::)
a7bd8797 91
d6a4c0b1
ZK
92#endif
93
fca0f0e5 94// using regr as temporary and for the output result
def60345 95// first argument is unmodifed and second is trashed
39825f31
ZK
96// regfe is supposed to contain 0xfefefefefefefefe
97#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
91abb473
ZK
98 "movq " #rega ", " #regr " \n\t"\
99 "pand " #regb ", " #regr " \n\t"\
def60345 100 "pxor " #rega ", " #regb " \n\t"\
39825f31 101 "pand " #regfe "," #regb " \n\t"\
def60345 102 "psrlq $1, " #regb " \n\t"\
91abb473 103 "paddb " #regb ", " #regr " \n\t"
def60345 104
39825f31 105#define PAVGB_MMX(rega, regb, regr, regfe) \
91abb473
ZK
106 "movq " #rega ", " #regr " \n\t"\
107 "por " #regb ", " #regr " \n\t"\
def60345 108 "pxor " #rega ", " #regb " \n\t"\
39825f31 109 "pand " #regfe "," #regb " \n\t"\
def60345 110 "psrlq $1, " #regb " \n\t"\
91abb473 111 "psubb " #regb ", " #regr " \n\t"
def60345 112
39825f31 113// mm6 is supposed to contain 0xfefefefefefefefe
6aa6ea8e
ZK
114#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
115 "movq " #rega ", " #regr " \n\t"\
116 "movq " #regc ", " #regp " \n\t"\
117 "pand " #regb ", " #regr " \n\t"\
118 "pand " #regd ", " #regp " \n\t"\
119 "pxor " #rega ", " #regb " \n\t"\
120 "pxor " #regc ", " #regd " \n\t"\
fca0f0e5
ZK
121 "pand %%mm6, " #regb " \n\t"\
122 "pand %%mm6, " #regd " \n\t"\
6aa6ea8e
ZK
123 "psrlq $1, " #regb " \n\t"\
124 "psrlq $1, " #regd " \n\t"\
125 "paddb " #regb ", " #regr " \n\t"\
126 "paddb " #regd ", " #regp " \n\t"
127
128#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
129 "movq " #rega ", " #regr " \n\t"\
130 "movq " #regc ", " #regp " \n\t"\
131 "por " #regb ", " #regr " \n\t"\
132 "por " #regd ", " #regp " \n\t"\
133 "pxor " #rega ", " #regb " \n\t"\
134 "pxor " #regc ", " #regd " \n\t"\
fca0f0e5
ZK
135 "pand %%mm6, " #regb " \n\t"\
136 "pand %%mm6, " #regd " \n\t"\
6aa6ea8e
ZK
137 "psrlq $1, " #regd " \n\t"\
138 "psrlq $1, " #regb " \n\t"\
139 "psubb " #regb ", " #regr " \n\t"\
140 "psubb " #regd ", " #regp " \n\t"
141
91abb473
ZK
142/***********************************/
143/* MMX no rounding */
144#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 145#define SET_RND MOVQ_WONE
6aa6ea8e 146#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 147#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 148
91abb473
ZK
149#include "dsputil_mmx_rnd.h"
150
151#undef DEF
fca0f0e5 152#undef SET_RND
6aa6ea8e 153#undef PAVGBP
39825f31 154#undef PAVGB
91abb473
ZK
155/***********************************/
156/* MMX rounding */
157
158#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 159#define SET_RND MOVQ_WTWO
6aa6ea8e 160#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 161#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 162
91abb473
ZK
163#include "dsputil_mmx_rnd.h"
164
165#undef DEF
fca0f0e5 166#undef SET_RND
6aa6ea8e 167#undef PAVGBP
39825f31 168#undef PAVGB
a7bd8797 169
de6d9b64
FB
170/***********************************/
171/* 3Dnow specific */
172
173#define DEF(x) x ## _3dnow
174/* for Athlons PAVGUSB is prefered */
175#define PAVGB "pavgusb"
176
177#include "dsputil_mmx_avg.h"
178
179#undef DEF
180#undef PAVGB
181
182/***********************************/
183/* MMX2 specific */
184
607dce96 185#define DEF(x) x ## _mmx2
de6d9b64
FB
186
187/* Introduced only in MMX2 set */
188#define PAVGB "pavgb"
189
190#include "dsputil_mmx_avg.h"
191
192#undef DEF
193#undef PAVGB
194
195/***********************************/
196/* standard MMX */
197
/* Widen an 8x8 block of unsigned bytes (pixels, row stride line_size)
 * into 64 signed 16-bit DCTELEMs at block[0..63].  %%mm7 is zeroed so
 * punpck{l,h}bw against it zero-extends each byte to a word.  Two source
 * rows are converted per iteration; %%eax counts from -128 up to 0 in
 * steps of 32 bytes, doubling as loop counter and store offset relative
 * to block+64.  Leaves MMX state dirty (caller is expected to emms). */
198static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
199{
607dce96
MN
200 asm volatile(
201 "movl $-128, %%eax \n\t"
202 "pxor %%mm7, %%mm7 \n\t"
203 ".balign 16 \n\t"
204 "1: \n\t"
205 "movq (%0), %%mm0 \n\t"
206 "movq (%0, %2), %%mm2 \n\t"
207 "movq %%mm0, %%mm1 \n\t"
208 "movq %%mm2, %%mm3 \n\t"
209 "punpcklbw %%mm7, %%mm0 \n\t"
210 "punpckhbw %%mm7, %%mm1 \n\t"
211 "punpcklbw %%mm7, %%mm2 \n\t"
212 "punpckhbw %%mm7, %%mm3 \n\t"
213 "movq %%mm0, (%1, %%eax)\n\t"
214 "movq %%mm1, 8(%1, %%eax)\n\t"
215 "movq %%mm2, 16(%1, %%eax)\n\t"
216 "movq %%mm3, 24(%1, %%eax)\n\t"
217 "addl %3, %0 \n\t"
218 "addl $32, %%eax \n\t"
219 "js 1b \n\t"
220 : "+r" (pixels)
221 : "r" (block+64), "r" (line_size), "r" (line_size*2)
222 : "%eax"
223 );
de6d9b64
FB
224}
225
/* Compute block[i] = s1[i] - s2[i] as signed 16-bit words over an 8x8
 * area (both sources read with the same stride).  Bytes are widened via
 * punpck{l,h}bw with zeroed %%mm7 before psubw, so the difference is
 * exact (range -255..255).  One row (8 pixels -> 16 bytes of output)
 * per iteration; %%eax runs -128..0 as combined counter/offset. */
1457ab52 226static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
9dbcbd92
MN
227{
228 asm volatile(
607dce96 229 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 230 "movl $-128, %%eax \n\t"
607dce96 231 ".balign 16 \n\t"
9dbcbd92
MN
232 "1: \n\t"
233 "movq (%0), %%mm0 \n\t"
234 "movq (%1), %%mm2 \n\t"
235 "movq %%mm0, %%mm1 \n\t"
236 "movq %%mm2, %%mm3 \n\t"
237 "punpcklbw %%mm7, %%mm0 \n\t"
238 "punpckhbw %%mm7, %%mm1 \n\t"
239 "punpcklbw %%mm7, %%mm2 \n\t"
240 "punpckhbw %%mm7, %%mm3 \n\t"
241 "psubw %%mm2, %%mm0 \n\t"
242 "psubw %%mm3, %%mm1 \n\t"
243 "movq %%mm0, (%2, %%eax)\n\t"
244 "movq %%mm1, 8(%2, %%eax)\n\t"
245 "addl %3, %0 \n\t"
246 "addl %3, %1 \n\t"
247 "addl $16, %%eax \n\t"
248 "jnz 1b \n\t"
249 : "+r" (s1), "+r" (s2)
250 : "r" (block+64), "r" (stride)
251 : "%eax"
252 );
253}
254
/* Store an 8x8 block of signed 16-bit coefficients as unsigned bytes at
 * pixels (row stride line_size), clamping each value to 0..255 via
 * packuswb's unsigned saturation.  Fully unrolled as two asm statements
 * of four rows each; the second half uses an "r" operand for p instead
 * of "m" to work around poor code generation (see inline comment). */
eb4b3dd3 255void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
256{
257 const DCTELEM *p;
258 UINT8 *pix;
de6d9b64
FB
259
260 /* read the pixels */
261 p = block;
262 pix = pixels;
d6a4c0b1 263 /* unrolled loop */
de6d9b64 264 __asm __volatile(
a822a479
NK
265 "movq %3, %%mm0\n\t"
266 "movq 8%3, %%mm1\n\t"
267 "movq 16%3, %%mm2\n\t"
268 "movq 24%3, %%mm3\n\t"
269 "movq 32%3, %%mm4\n\t"
270 "movq 40%3, %%mm5\n\t"
271 "movq 48%3, %%mm6\n\t"
272 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
273 "packuswb %%mm1, %%mm0\n\t"
274 "packuswb %%mm3, %%mm2\n\t"
275 "packuswb %%mm5, %%mm4\n\t"
276 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
277 "movq %%mm0, (%0)\n\t"
278 "movq %%mm2, (%0, %1)\n\t"
279 "movq %%mm4, (%0, %1, 2)\n\t"
280 "movq %%mm6, (%0, %2)\n\t"
281 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
282 :"memory");
283 pix += line_size*4;
284 p += 32;
d6a4c0b1
ZK
285
286 // if here would be an exact copy of the code above
287 // compiler would generate some very strange code
288 // thus using "r"
289 __asm __volatile(
290 "movq (%3), %%mm0\n\t"
291 "movq 8(%3), %%mm1\n\t"
292 "movq 16(%3), %%mm2\n\t"
293 "movq 24(%3), %%mm3\n\t"
294 "movq 32(%3), %%mm4\n\t"
295 "movq 40(%3), %%mm5\n\t"
296 "movq 48(%3), %%mm6\n\t"
297 "movq 56(%3), %%mm7\n\t"
298 "packuswb %%mm1, %%mm0\n\t"
299 "packuswb %%mm3, %%mm2\n\t"
300 "packuswb %%mm5, %%mm4\n\t"
301 "packuswb %%mm7, %%mm6\n\t"
302 "movq %%mm0, (%0)\n\t"
303 "movq %%mm2, (%0, %1)\n\t"
304 "movq %%mm4, (%0, %1, 2)\n\t"
305 "movq %%mm6, (%0, %2)\n\t"
306 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
307 :"memory");
de6d9b64
FB
308}
309
/* Add an 8x8 block of signed 16-bit coefficients onto the existing
 * bytes at pixels, clamping the result to 0..255.  The destination
 * bytes are widened (punpck with zeroed %%mm7), added with signed
 * saturation (paddsw), then repacked with unsigned saturation
 * (packuswb).  Two rows are processed per do/while iteration (i=4). */
eb4b3dd3 310void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
311{
312 const DCTELEM *p;
313 UINT8 *pix;
314 int i;
315
316 /* read the pixels */
317 p = block;
318 pix = pixels;
d6a4c0b1
ZK
319 MOVQ_ZERO(mm7);
320 i = 4;
cd8e5f96 321 do {
de6d9b64 322 __asm __volatile(
cd8e5f96
ZK
323 "movq (%2), %%mm0\n\t"
324 "movq 8(%2), %%mm1\n\t"
325 "movq 16(%2), %%mm2\n\t"
326 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
327 "movq %0, %%mm4\n\t"
328 "movq %1, %%mm6\n\t"
329 "movq %%mm4, %%mm5\n\t"
330 "punpcklbw %%mm7, %%mm4\n\t"
331 "punpckhbw %%mm7, %%mm5\n\t"
332 "paddsw %%mm4, %%mm0\n\t"
333 "paddsw %%mm5, %%mm1\n\t"
334 "movq %%mm6, %%mm5\n\t"
335 "punpcklbw %%mm7, %%mm6\n\t"
336 "punpckhbw %%mm7, %%mm5\n\t"
337 "paddsw %%mm6, %%mm2\n\t"
338 "paddsw %%mm5, %%mm3\n\t"
339 "packuswb %%mm1, %%mm0\n\t"
340 "packuswb %%mm3, %%mm2\n\t"
341 "movq %%mm0, %0\n\t"
342 "movq %%mm2, %1\n\t"
a822a479 343 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 344 :"r"(p)
de6d9b64
FB
345 :"memory");
346 pix += line_size*2;
347 p += 16;
cd8e5f96 348 } while (--i);
de6d9b64
FB
349}
350
/* Straight 8-byte-wide block copy: h rows from pixels to block, both
 * with row stride line_size.  Four rows are copied per loop iteration
 * ("subl $4"), so h must be a positive multiple of 4 or the jnz loop
 * never terminates.  %%eax holds 2*line_size for double-row stepping. */
b3184779 351static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
de6d9b64 352{
39825f31 353 __asm __volatile(
31ddcf98 354 "lea (%3, %3), %%eax \n\t"
52af45ad 355 ".balign 8 \n\t"
31ddcf98
ZK
356 "1: \n\t"
357 "movq (%1), %%mm0 \n\t"
358 "movq (%1, %3), %%mm1 \n\t"
359 "movq %%mm0, (%2) \n\t"
360 "movq %%mm1, (%2, %3) \n\t"
361 "addl %%eax, %1 \n\t"
362 "addl %%eax, %2 \n\t"
363 "movq (%1), %%mm0 \n\t"
364 "movq (%1, %3), %%mm1 \n\t"
365 "movq %%mm0, (%2) \n\t"
366 "movq %%mm1, (%2, %3) \n\t"
367 "addl %%eax, %1 \n\t"
368 "addl %%eax, %2 \n\t"
369 "subl $4, %0 \n\t"
370 "jnz 1b \n\t"
371 : "+g"(h), "+r" (pixels), "+r" (block)
372 : "r"(line_size)
373 : "%eax", "memory"
374 );
de6d9b64
FB
375}
376
b3184779
MN
/* 16-byte-wide variant of put_pixels8_mmx: copies h rows of 16 bytes
 * (two movq per row) from pixels to block, stride line_size.  Same
 * structure: four rows per iteration, so h must be a positive multiple
 * of 4; %%eax caches 2*line_size for the double-row pointer advance. */
377static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
378{
379 __asm __volatile(
380 "lea (%3, %3), %%eax \n\t"
381 ".balign 8 \n\t"
382 "1: \n\t"
383 "movq (%1), %%mm0 \n\t"
384 "movq 8(%1), %%mm4 \n\t"
385 "movq (%1, %3), %%mm1 \n\t"
386 "movq 8(%1, %3), %%mm5 \n\t"
387 "movq %%mm0, (%2) \n\t"
388 "movq %%mm4, 8(%2) \n\t"
389 "movq %%mm1, (%2, %3) \n\t"
390 "movq %%mm5, 8(%2, %3) \n\t"
391 "addl %%eax, %1 \n\t"
392 "addl %%eax, %2 \n\t"
393 "movq (%1), %%mm0 \n\t"
394 "movq 8(%1), %%mm4 \n\t"
395 "movq (%1, %3), %%mm1 \n\t"
396 "movq 8(%1, %3), %%mm5 \n\t"
397 "movq %%mm0, (%2) \n\t"
398 "movq %%mm4, 8(%2) \n\t"
399 "movq %%mm1, (%2, %3) \n\t"
400 "movq %%mm5, 8(%2, %3) \n\t"
401 "addl %%eax, %1 \n\t"
402 "addl %%eax, %2 \n\t"
403 "subl $4, %0 \n\t"
404 "jnz 1b \n\t"
405 : "+g"(h), "+r" (pixels), "+r" (block)
406 : "r"(line_size)
407 : "%eax", "memory"
408 );
409}
410
649c00c9
MN
/* Zero six consecutive 8x8 DCT blocks (6 * 64 * sizeof(DCTELEM) =
 * 768 bytes) starting at blocks.  %%eax runs from -768 to 0 against a
 * base pointer biased by +768, writing 32 bytes of zeros (from the
 * pxor-cleared %%mm7) per iteration. */
411static void clear_blocks_mmx(DCTELEM *blocks)
412{
39825f31 413 __asm __volatile(
649c00c9
MN
414 "pxor %%mm7, %%mm7 \n\t"
415 "movl $-128*6, %%eax \n\t"
416 "1: \n\t"
417 "movq %%mm7, (%0, %%eax) \n\t"
418 "movq %%mm7, 8(%0, %%eax) \n\t"
419 "movq %%mm7, 16(%0, %%eax) \n\t"
420 "movq %%mm7, 24(%0, %%eax) \n\t"
421 "addl $32, %%eax \n\t"
422 " js 1b \n\t"
423 : : "r" (((int)blocks)+128*6)
424 : "%eax"
425 );
426}
427
084c726b
MN
/* Sum all 256 bytes of a 16x16 pixel block (stride line_size) and
 * return the total.  Rows are accumulated as 16-bit words in %%mm6
 * (max possible sum 16*16*255 = 65280 fits in one word), then the four
 * word lanes are folded with two shift+add steps and the low 16 bits
 * extracted.  index counts up from -line_size*16 to 0 against a biased
 * base pointer, doubling as loop counter and row offset. */
428static int pix_sum16_mmx(UINT8 * pix, int line_size){
429 const int h=16;
430 int sum;
431 int index= -line_size*h;
432
433 __asm __volatile(
434 "pxor %%mm7, %%mm7 \n\t"
435 "pxor %%mm6, %%mm6 \n\t"
436 "1: \n\t"
437 "movq (%2, %1), %%mm0 \n\t"
438 "movq (%2, %1), %%mm1 \n\t"
439 "movq 8(%2, %1), %%mm2 \n\t"
440 "movq 8(%2, %1), %%mm3 \n\t"
441 "punpcklbw %%mm7, %%mm0 \n\t"
442 "punpckhbw %%mm7, %%mm1 \n\t"
443 "punpcklbw %%mm7, %%mm2 \n\t"
444 "punpckhbw %%mm7, %%mm3 \n\t"
445 "paddw %%mm0, %%mm1 \n\t"
446 "paddw %%mm2, %%mm3 \n\t"
447 "paddw %%mm1, %%mm3 \n\t"
448 "paddw %%mm3, %%mm6 \n\t"
449 "addl %3, %1 \n\t"
450 " js 1b \n\t"
451 "movq %%mm6, %%mm5 \n\t"
452 "psrlq $32, %%mm6 \n\t"
453 "paddw %%mm5, %%mm6 \n\t"
454 "movq %%mm6, %%mm5 \n\t"
455 "psrlq $16, %%mm6 \n\t"
456 "paddw %%mm5, %%mm6 \n\t"
457 "movd %%mm6, %0 \n\t"
458 "andl $0xFFFF, %0 \n\t"
459 : "=&r" (sum), "+r" (index)
460 : "r" (pix - index), "r" (line_size)
461 );
462
463 return sum;
464}
465
11f18faf
MN
/* Byte-wise dst[i] += src[i] for 0 <= i < w, with wrap-around (paddb,
 * no saturation).  The MMX loop handles 16 bytes per iteration while
 * i < w-15; the scalar tail finishes the remaining 0..15 bytes.
 * NOTE(review): the "jb" compare is unsigned, so for w < 16 the operand
 * w-15 wraps to a huge unsigned value and the MMX loop runs (and reads/
 * writes past the buffers) anyway — callers presumably always pass
 * w >= 16; confirm before reusing this elsewhere. */
466static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
467 int i=0;
468 asm volatile(
469 "1: \n\t"
470 "movq (%1, %0), %%mm0 \n\t"
471 "movq (%2, %0), %%mm1 \n\t"
472 "paddb %%mm0, %%mm1 \n\t"
473 "movq %%mm1, (%2, %0) \n\t"
474 "movq 8(%1, %0), %%mm0 \n\t"
475 "movq 8(%2, %0), %%mm1 \n\t"
476 "paddb %%mm0, %%mm1 \n\t"
477 "movq %%mm1, 8(%2, %0) \n\t"
478 "addl $16, %0 \n\t"
479 "cmpl %3, %0 \n\t"
480 " jb 1b \n\t"
481 : "+r" (i)
482 : "r"(src), "r"(dst), "r"(w-15)
483 );
484 for(; i<w; i++)
485 dst[i+0] += src[i+0];
486}
487
/* Byte-wise dst[i] = src1[i] - src2[i] for 0 <= i < w, with wrap-around
 * (psubb, no saturation).  16 bytes per MMX iteration while i < w-15,
 * scalar tail for the last 0..15 bytes.  dst may alias src1 or src2:
 * each 8-byte group is fully loaded before its store.
 * NOTE(review): same unsigned "jb" vs. w-15 caveat as add_bytes_mmx —
 * behavior for w < 16 is unsafe; confirm callers guarantee w >= 16. */
488static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
489 int i=0;
490 asm volatile(
491 "1: \n\t"
492 "movq (%2, %0), %%mm0 \n\t"
493 "movq (%1, %0), %%mm1 \n\t"
494 "psubb %%mm0, %%mm1 \n\t"
495 "movq %%mm1, (%3, %0) \n\t"
496 "movq 8(%2, %0), %%mm0 \n\t"
497 "movq 8(%1, %0), %%mm1 \n\t"
498 "psubb %%mm0, %%mm1 \n\t"
499 "movq %%mm1, 8(%3, %0) \n\t"
500 "addl $16, %0 \n\t"
501 "cmpl %4, %0 \n\t"
502 " jb 1b \n\t"
503 : "+r" (i)
504 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
505 );
506 for(; i<w; i++)
507 dst[i+0] = src1[i+0]-src2[i+0];
508}
1457ab52
MN
509#define LBUTTERFLY(a,b)\
510 "paddw " #b ", " #a " \n\t"\
511 "paddw " #b ", " #b " \n\t"\
512 "psubw " #a ", " #b " \n\t"
513
514#define HADAMARD48\
515 LBUTTERFLY(%%mm0, %%mm1)\
516 LBUTTERFLY(%%mm2, %%mm3)\
517 LBUTTERFLY(%%mm4, %%mm5)\
518 LBUTTERFLY(%%mm6, %%mm7)\
519 \
520 LBUTTERFLY(%%mm0, %%mm2)\
521 LBUTTERFLY(%%mm1, %%mm3)\
522 LBUTTERFLY(%%mm4, %%mm6)\
523 LBUTTERFLY(%%mm5, %%mm7)\
524 \
525 LBUTTERFLY(%%mm0, %%mm4)\
526 LBUTTERFLY(%%mm1, %%mm5)\
527 LBUTTERFLY(%%mm2, %%mm6)\
528 LBUTTERFLY(%%mm3, %%mm7)
529
530#define MMABS(a,z)\
531 "pxor " #z ", " #z " \n\t"\
532 "pcmpgtw " #a ", " #z " \n\t"\
533 "pxor " #z ", " #a " \n\t"\
534 "psubw " #z ", " #a " \n\t"
535
536#define MMABS_SUM(a,z, sum)\
537 "pxor " #z ", " #z " \n\t"\
538 "pcmpgtw " #a ", " #z " \n\t"\
539 "pxor " #z ", " #a " \n\t"\
540 "psubw " #z ", " #a " \n\t"\
541 "paddusw " #a ", " #sum " \n\t"
542
543
544#define SBUTTERFLY(a,b,t,n)\
545 "movq " #a ", " #t " \n\t" /* abcd */\
546 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
547 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
548
549#define TRANSPOSE4(a,b,c,d,t)\
550 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
551 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
552 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
553 SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
554
555#define LOAD4(o, a, b, c, d)\
556 "movq "#o"(%1), " #a " \n\t"\
557 "movq "#o"+16(%1), " #b " \n\t"\
558 "movq "#o"+32(%1), " #c " \n\t"\
559 "movq "#o"+48(%1), " #d " \n\t"
560
561#define STORE4(o, a, b, c, d)\
562 "movq "#a", "#o"(%1) \n\t"\
563 "movq "#b", "#o"+16(%1) \n\t"\
564 "movq "#c", "#o"+32(%1) \n\t"\
565 "movq "#d", "#o"+48(%1) \n\t"\
566
/* SATD-style comparison metric: computes the 8x8 pixel difference
 * src1-src2 (via diff_pixels_mmx into temp), applies a two-pass 8x8
 * Hadamard transform (HADAMARD48 butterflies + TRANSPOSE4 between the
 * horizontal and vertical passes, staged through temp because only
 * eight MMX registers exist), then sums the absolute values of all
 * transform coefficients with saturating adds (MMABS/MMABS_SUM,
 * paddusw).  The four word lanes of the accumulator are folded with
 * two shift+add steps; only the low 16 bits of the result are valid,
 * hence the final mask.  The s parameter is unused here — it exists to
 * match the comparison-function signature (see sad16x16_mmx etc.). */
567static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
568 uint64_t temp[16] __align8;
569 int sum=0;
570
571 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
11f18faf 572
1457ab52
MN
573 asm volatile(
574 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
575 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
576
577 HADAMARD48
578
579 "movq %%mm7, 112(%1) \n\t"
580
581 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
582 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
583
584 "movq 112(%1), %%mm7 \n\t"
585 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
586 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
587
588 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
589 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
590
591 HADAMARD48
592
593 "movq %%mm7, 120(%1) \n\t"
594
595 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
596 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
597
598 "movq 120(%1), %%mm7 \n\t"
599 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
600 "movq %%mm7, %%mm5 \n\t"//FIXME remove
601 "movq %%mm6, %%mm7 \n\t"
602 "movq %%mm0, %%mm6 \n\t"
603// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
604
605 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
606// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
607
608 HADAMARD48
609 "movq %%mm7, 64(%1) \n\t"
610 MMABS(%%mm0, %%mm7)
611 MMABS_SUM(%%mm1, %%mm7, %%mm0)
612 MMABS_SUM(%%mm2, %%mm7, %%mm0)
613 MMABS_SUM(%%mm3, %%mm7, %%mm0)
614 MMABS_SUM(%%mm4, %%mm7, %%mm0)
615 MMABS_SUM(%%mm5, %%mm7, %%mm0)
616 MMABS_SUM(%%mm6, %%mm7, %%mm0)
617 "movq 64(%1), %%mm1 \n\t"
618 MMABS_SUM(%%mm1, %%mm7, %%mm0)
619 "movq %%mm0, 64(%1) \n\t"
620
621 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
622 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
623
624 HADAMARD48
625 "movq %%mm7, (%1) \n\t"
626 MMABS(%%mm0, %%mm7)
627 MMABS_SUM(%%mm1, %%mm7, %%mm0)
628 MMABS_SUM(%%mm2, %%mm7, %%mm0)
629 MMABS_SUM(%%mm3, %%mm7, %%mm0)
630 MMABS_SUM(%%mm4, %%mm7, %%mm0)
631 MMABS_SUM(%%mm5, %%mm7, %%mm0)
632 MMABS_SUM(%%mm6, %%mm7, %%mm0)
633 "movq (%1), %%mm1 \n\t"
634 MMABS_SUM(%%mm1, %%mm7, %%mm0)
635 "movq 64(%1), %%mm1 \n\t"
636 MMABS_SUM(%%mm1, %%mm7, %%mm0)
637
638 "movq %%mm0, %%mm1 \n\t"
639 "psrlq $32, %%mm0 \n\t"
640 "paddusw %%mm1, %%mm0 \n\t"
641 "movq %%mm0, %%mm1 \n\t"
642 "psrlq $16, %%mm0 \n\t"
643 "paddusw %%mm1, %%mm0 \n\t"
644 "movd %%mm0, %0 \n\t"
645
646 : "=r" (sum)
647 : "r"(temp)
648 );
649 return sum&0xFFFF;
650}
651
652WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 653
3178ee4c
MN
654#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
655#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
656
826f429a
MN
657#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
658 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
c296f66b 659 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
826f429a
MN
660 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
661 "movq "#in7", " #m3 " \n\t" /* d */\
662 "movq "#in0", %%mm5 \n\t" /* D */\
663 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
664 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
665 "movq "#in1", %%mm5 \n\t" /* C */\
666 "movq "#in2", %%mm6 \n\t" /* B */\
667 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
668 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
669 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
670 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
c296f66b 671 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
826f429a
MN
672 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
673 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
674 "psraw $5, %%mm5 \n\t"\
675 "packuswb %%mm5, %%mm5 \n\t"\
676 OP(%%mm5, out, %%mm7, d)
677
3178ee4c 678#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
826f429a
MN
679void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
680 uint64_t temp;\
681\
682 asm volatile(\
683 "pxor %%mm7, %%mm7 \n\t"\
684 "1: \n\t"\
685 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
686 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
687 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
688 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
689 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
690 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
691 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
692 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
693 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
694 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
695 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
696 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
697 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
698 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
699 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
700 "paddw %%mm3, %%mm5 \n\t" /* b */\
701 "paddw %%mm2, %%mm6 \n\t" /* c */\
702 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
703 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
704 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 705 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
706 "paddw %%mm4, %%mm0 \n\t" /* a */\
707 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 708 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 709 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 710 "paddw %6, %%mm6 \n\t"\
826f429a
MN
711 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
712 "psraw $5, %%mm0 \n\t"\
c296f66b 713 "movq %%mm0, %5 \n\t"\
826f429a
MN
714 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
715 \
716 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
717 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
718 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
719 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
720 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
721 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
722 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
723 "paddw %%mm0, %%mm2 \n\t" /* b */\
724 "paddw %%mm5, %%mm3 \n\t" /* c */\
725 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
726 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
727 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
728 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
729 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
730 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
c296f66b 731 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a
MN
732 "paddw %%mm2, %%mm1 \n\t" /* a */\
733 "paddw %%mm6, %%mm4 \n\t" /* d */\
c296f66b 734 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
826f429a 735 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
c296f66b 736 "paddw %6, %%mm1 \n\t"\
826f429a
MN
737 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
738 "psraw $5, %%mm3 \n\t"\
c296f66b 739 "movq %5, %%mm1 \n\t"\
826f429a 740 "packuswb %%mm3, %%mm1 \n\t"\
3178ee4c 741 OP_MMX2(%%mm1, (%1),%%mm4, q)\
826f429a
MN
742 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
743 \
744 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
745 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
746 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
747 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
748 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
749 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
750 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
751 "paddw %%mm1, %%mm5 \n\t" /* b */\
752 "paddw %%mm4, %%mm0 \n\t" /* c */\
753 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
754 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
755 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
756 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
c296f66b 757 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
826f429a
MN
758 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
759 "paddw %%mm3, %%mm2 \n\t" /* d */\
760 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
761 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
762 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
763 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
764 "paddw %%mm2, %%mm6 \n\t" /* a */\
c296f66b
MN
765 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
766 "paddw %6, %%mm0 \n\t"\
826f429a
MN
767 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
768 "psraw $5, %%mm0 \n\t"\
769 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
770 \
771 "paddw %%mm5, %%mm3 \n\t" /* a */\
772 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
773 "paddw %%mm4, %%mm6 \n\t" /* b */\
774 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
775 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
776 "paddw %%mm1, %%mm4 \n\t" /* c */\
777 "paddw %%mm2, %%mm5 \n\t" /* d */\
778 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
779 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
c296f66b
MN
780 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
781 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
826f429a 782 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 783 "paddw %6, %%mm4 \n\t"\
826f429a
MN
784 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
785 "psraw $5, %%mm4 \n\t"\
786 "packuswb %%mm4, %%mm0 \n\t"\
3178ee4c 787 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
826f429a
MN
788 \
789 "addl %3, %0 \n\t"\
790 "addl %4, %1 \n\t"\
791 "decl %2 \n\t"\
792 " jnz 1b \n\t"\
5a508a98 793 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
794 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
795 : "memory"\
826f429a
MN
796 );\
797}\
798\
799static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
800 int i;\
801 int16_t temp[16];\
802 /* quick HACK, XXX FIXME MUST be optimized */\
803 for(i=0; i<h; i++)\
804 {\
805 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
806 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
807 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
808 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
809 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
810 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
811 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
812 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
813 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
814 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
815 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
816 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
817 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
818 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
819 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
820 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
821 asm volatile(\
822 "movq (%0), %%mm0 \n\t"\
823 "movq 8(%0), %%mm1 \n\t"\
824 "paddw %2, %%mm0 \n\t"\
825 "paddw %2, %%mm1 \n\t"\
826 "psraw $5, %%mm0 \n\t"\
827 "psraw $5, %%mm1 \n\t"\
828 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 829 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a
MN
830 "movq 16(%0), %%mm0 \n\t"\
831 "movq 24(%0), %%mm1 \n\t"\
832 "paddw %2, %%mm0 \n\t"\
833 "paddw %2, %%mm1 \n\t"\
834 "psraw $5, %%mm0 \n\t"\
835 "psraw $5, %%mm1 \n\t"\
836 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 837 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
826f429a 838 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 839 : "memory"\
826f429a
MN
840 );\
841 dst+=dstStride;\
842 src+=srcStride;\
843 }\
844}\
845\
826f429a
MN
846void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
847 uint64_t temp;\
848\
849 asm volatile(\
850 "pxor %%mm7, %%mm7 \n\t"\
851 "1: \n\t"\
852 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
853 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
854 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
855 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
856 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
857 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
858 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
859 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
860 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
861 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
862 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
863 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
864 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
865 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
866 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
867 "paddw %%mm3, %%mm5 \n\t" /* b */\
868 "paddw %%mm2, %%mm6 \n\t" /* c */\
869 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
870 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
871 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 872 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
873 "paddw %%mm4, %%mm0 \n\t" /* a */\
874 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 875 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 876 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 877 "paddw %6, %%mm6 \n\t"\
826f429a
MN
878 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
879 "psraw $5, %%mm0 \n\t"\
880 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
881 \
882 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
883 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
884 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
885 "paddw %%mm5, %%mm1 \n\t" /* a */\
886 "paddw %%mm6, %%mm2 \n\t" /* b */\
887 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
888 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
889 "paddw %%mm6, %%mm3 \n\t" /* c */\
890 "paddw %%mm5, %%mm4 \n\t" /* d */\
891 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
892 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
c296f66b
MN
893 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
894 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a 895 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 896 "paddw %6, %%mm1 \n\t"\
826f429a
MN
897 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
898 "psraw $5, %%mm3 \n\t"\
899 "packuswb %%mm3, %%mm0 \n\t"\
3178ee4c 900 OP_MMX2(%%mm0, (%1), %%mm4, q)\
826f429a
MN
901 \
902 "addl %3, %0 \n\t"\
903 "addl %4, %1 \n\t"\
904 "decl %2 \n\t"\
c296f66b 905 " jnz 1b \n\t"\
5a508a98 906 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
907 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
908 : "memory"\
826f429a
MN
909 );\
910}\
911\
912static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
913 int i;\
914 int16_t temp[8];\
915 /* quick HACK, XXX FIXME MUST be optimized */\
916 for(i=0; i<h; i++)\
917 {\
918 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
919 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
920 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
921 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
922 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
923 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
924 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
925 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
926 asm volatile(\
927 "movq (%0), %%mm0 \n\t"\
928 "movq 8(%0), %%mm1 \n\t"\
929 "paddw %2, %%mm0 \n\t"\
930 "paddw %2, %%mm1 \n\t"\
931 "psraw $5, %%mm0 \n\t"\
932 "psraw $5, %%mm1 \n\t"\
933 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 934 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a 935 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 936 :"memory"\
826f429a
MN
937 );\
938 dst+=dstStride;\
939 src+=srcStride;\
940 }\
3178ee4c
MN
941}
942
943#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
944\
945static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
946 uint64_t temp[17*4];\
947 uint64_t *temp_ptr= temp;\
948 int count= 17;\
949\
950 /*FIXME unroll */\
951 asm volatile(\
952 "pxor %%mm7, %%mm7 \n\t"\
953 "1: \n\t"\
954 "movq (%0), %%mm0 \n\t"\
955 "movq (%0), %%mm1 \n\t"\
956 "movq 8(%0), %%mm2 \n\t"\
957 "movq 8(%0), %%mm3 \n\t"\
958 "punpcklbw %%mm7, %%mm0 \n\t"\
959 "punpckhbw %%mm7, %%mm1 \n\t"\
960 "punpcklbw %%mm7, %%mm2 \n\t"\
961 "punpckhbw %%mm7, %%mm3 \n\t"\
962 "movq %%mm0, (%1) \n\t"\
963 "movq %%mm1, 17*8(%1) \n\t"\
5a508a98
MN
964 "movq %%mm2, 2*17*8(%1) \n\t"\
965 "movq %%mm3, 3*17*8(%1) \n\t"\
3178ee4c
MN
966 "addl $8, %1 \n\t"\
967 "addl %3, %0 \n\t"\
968 "decl %2 \n\t"\
969 " jnz 1b \n\t"\
970 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
5a508a98 971 : "r" (srcStride)\
0b093b6f 972 : "memory"\
3178ee4c
MN
973 );\
974 \
975 temp_ptr= temp;\
976 count=4;\
977 \
978/*FIXME reorder for speed */\
3178ee4c
MN
979 asm volatile(\
980 /*"pxor %%mm7, %%mm7 \n\t"*/\
3178ee4c
MN
981 "1: \n\t"\
982 "movq (%0), %%mm0 \n\t"\
983 "movq 8(%0), %%mm1 \n\t"\
984 "movq 16(%0), %%mm2 \n\t"\
985 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
986 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
987 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
3178ee4c 988 "addl %4, %1 \n\t"\
c296f66b 989 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
3178ee4c 990 \
c296f66b 991 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
3178ee4c 992 "addl %4, %1 \n\t"\
c296f66b
MN
993 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
994 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
3178ee4c 995 "addl %4, %1 \n\t"\
c296f66b
MN
996 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
997 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
3178ee4c 998 "addl %4, %1 \n\t"\
c296f66b
MN
999 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1000 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
3178ee4c 1001 "addl %4, %1 \n\t"\
c296f66b
MN
1002 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1003 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
3178ee4c 1004 "addl %4, %1 \n\t"\
c296f66b 1005 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
3178ee4c 1006 \
c296f66b 1007 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
3178ee4c 1008 "addl %4, %1 \n\t" \
c296f66b
MN
1009 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1010 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
3178ee4c
MN
1011 \
1012 "addl $136, %0 \n\t"\
c296f66b 1013 "addl %6, %1 \n\t"\
3178ee4c
MN
1014 "decl %2 \n\t"\
1015 " jnz 1b \n\t"\
3178ee4c 1016 \
5a508a98 1017 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
c296f66b 1018 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
0b093b6f 1019 :"memory"\
3178ee4c 1020 );\
826f429a
MN
1021}\
1022\
3178ee4c 1023void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
826f429a
MN
1024 uint64_t temp[9*4];\
1025 uint64_t *temp_ptr= temp;\
1026 int count= 9;\
1027\
1028 /*FIXME unroll */\
1029 asm volatile(\
1030 "pxor %%mm7, %%mm7 \n\t"\
1031 "1: \n\t"\
1032 "movq (%0), %%mm0 \n\t"\
1033 "movq (%0), %%mm1 \n\t"\
1034 "punpcklbw %%mm7, %%mm0 \n\t"\
1035 "punpckhbw %%mm7, %%mm1 \n\t"\
1036 "movq %%mm0, (%1) \n\t"\
1037 "movq %%mm1, 9*8(%1) \n\t"\
1038 "addl $8, %1 \n\t"\
1039 "addl %3, %0 \n\t"\
1040 "decl %2 \n\t"\
1041 " jnz 1b \n\t"\
1042 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1043 : "r" (srcStride)\
0b093b6f 1044 : "memory"\
826f429a
MN
1045 );\
1046 \
1047 temp_ptr= temp;\
1048 count=2;\
1049 \
1050/*FIXME reorder for speed */\
1051 asm volatile(\
1052 /*"pxor %%mm7, %%mm7 \n\t"*/\
1053 "1: \n\t"\
1054 "movq (%0), %%mm0 \n\t"\
1055 "movq 8(%0), %%mm1 \n\t"\
1056 "movq 16(%0), %%mm2 \n\t"\
1057 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1058 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1059 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
826f429a 1060 "addl %4, %1 \n\t"\
c296f66b 1061 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
826f429a 1062 \
c296f66b 1063 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
826f429a 1064 "addl %4, %1 \n\t"\
c296f66b 1065 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
826f429a 1066 \
c296f66b 1067 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
826f429a 1068 "addl %4, %1 \n\t"\
c296f66b
MN
1069 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1070 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
826f429a
MN
1071 \
1072 "addl $72, %0 \n\t"\
c296f66b 1073 "addl %6, %1 \n\t"\
826f429a
MN
1074 "decl %2 \n\t"\
1075 " jnz 1b \n\t"\
1076 \
c296f66b
MN
1077 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1078 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
0b093b6f
MN
1079 : "memory"\
1080 );\
3178ee4c 1081}\
826f429a
MN
1082\
1083static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1084 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1085}\
1086\
1087static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1088 uint64_t temp[8];\
826f429a
MN
1089 uint8_t * const half= (uint8_t*)temp;\
1090 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1091 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1092}\
1093\
1094static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1095 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1096}\
1097\
1098static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1099 uint64_t temp[8];\
826f429a
MN
1100 uint8_t * const half= (uint8_t*)temp;\
1101 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1102 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1103}\
1104\
1105static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1106 uint64_t temp[8];\
826f429a 1107 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1108 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1109 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1110}\
1111\
1112static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1113 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1114}\
1115\
1116static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1117 uint64_t temp[8];\
826f429a 1118 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1119 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1120 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1121}\
1122static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1123 uint64_t half[8 + 9];\
1124 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1125 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1126 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1127 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1128 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1129 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a
MN
1130}\
1131static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1132 uint64_t half[8 + 9];\
1133 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1134 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1135 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1136 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1137 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1138 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a
MN
1139}\
1140static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1141 uint64_t half[8 + 9];\
1142 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1143 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1144 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1145 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1146 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1147 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a
MN
1148}\
1149static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1150 uint64_t half[8 + 9];\
1151 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1152 uint8_t * const halfHV= ((uint8_t*)half);\
1153 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1154 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1155 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1156 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a
MN
1157}\
1158static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1159 uint64_t half[8 + 9];\
826f429a
MN
1160 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1161 uint8_t * const halfHV= ((uint8_t*)half);\
1162 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1163 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1164 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1165}\
1166static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1167 uint64_t half[8 + 9];\
826f429a
MN
1168 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1169 uint8_t * const halfHV= ((uint8_t*)half);\
1170 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1171 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1172 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1173}\
1174static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1175 uint64_t half[8 + 9];\
1176 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1177 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1178 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1179 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1180}\
1181static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1182 uint64_t half[8 + 9];\
1183 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1184 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1185 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1186 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1187}\
1188static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1189 uint64_t half[9];\
826f429a
MN
1190 uint8_t * const halfH= ((uint8_t*)half);\
1191 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1192 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1193}\
1194static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1195 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
826f429a
MN
1196}\
1197\
1198static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1199 uint64_t temp[32];\
1200 uint8_t * const half= (uint8_t*)temp;\
1201 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1202 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1203}\
1204\
1205static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1206 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1207}\
1208\
1209static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1210 uint64_t temp[32];\
1211 uint8_t * const half= (uint8_t*)temp;\
1212 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1213 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1214}\
1215\
1216static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1217 uint64_t temp[32];\
1218 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1219 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1220 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1221}\
1222\
1223static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1224 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1225}\
1226\
1227static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1228 uint64_t temp[32];\
1229 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1230 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1231 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1232}\
1233static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1234 uint64_t half[16*2 + 17*2];\
1235 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1236 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1237 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1238 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1239 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1240 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a
MN
1241}\
1242static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1243 uint64_t half[16*2 + 17*2];\
1244 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1245 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1246 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1247 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1248 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1249 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a
MN
1250}\
1251static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1252 uint64_t half[16*2 + 17*2];\
1253 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1254 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1255 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1256 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1257 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1258 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a
MN
1259}\
1260static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1261 uint64_t half[16*2 + 17*2];\
1262 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1263 uint8_t * const halfHV= ((uint8_t*)half);\
1264 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1265 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1266 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1267 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a
MN
1268}\
1269static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1270 uint64_t half[16*2 + 17*2];\
1271 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1272 uint8_t * const halfHV= ((uint8_t*)half);\
1273 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1274 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1275 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1276}\
1277static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1278 uint64_t half[16*2 + 17*2];\
1279 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1280 uint8_t * const halfHV= ((uint8_t*)half);\
1281 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1282 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1283 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1284}\
1285static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1286 uint64_t half[17*2];\
1287 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1289 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1290 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1291}\
1292static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1293 uint64_t half[17*2];\
1294 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1295 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1296 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1297 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1298}\
1299static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1300 uint64_t half[17*2];\
1301 uint8_t * const halfH= ((uint8_t*)half);\
1302 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1303 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1304}
1305
1306
1307#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1308#define AVG_3DNOW_OP(a,b,temp, size) \
826f429a
MN
1309"mov" #size " " #b ", " #temp " \n\t"\
1310"pavgusb " #temp ", " #a " \n\t"\
1311"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1312#define AVG_MMX2_OP(a,b,temp, size) \
826f429a
MN
1313"mov" #size " " #b ", " #temp " \n\t"\
1314"pavgb " #temp ", " #a " \n\t"\
1315"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1316
1317QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1318QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1319QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1320QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1321QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1322QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1323QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1324QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a
MN
1325QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1326
61a4e8ae 1327#if 0
d6a4c0b1 1328static void just_return() { return; }
61a4e8ae 1329#endif
d6a4c0b1 1330
826f429a
MN
1331#define SET_QPEL_FUNC(postfix1, postfix2) \
1332 c->put_ ## postfix1 = put_ ## postfix2;\
1333 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1334 c->avg_ ## postfix1 = avg_ ## postfix2;
1335
eb4b3dd3 1336void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
1337{
1338 mm_flags = mm_support();
1565dabc
LB
1339#if 0
1340 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1341 if (mm_flags & MM_MMX)
1565dabc 1342 fprintf(stderr, " mmx");
de6d9b64 1343 if (mm_flags & MM_MMXEXT)
1565dabc 1344 fprintf(stderr, " mmxext");
de6d9b64 1345 if (mm_flags & MM_3DNOW)
1565dabc 1346 fprintf(stderr, " 3dnow");
de6d9b64 1347 if (mm_flags & MM_SSE)
1565dabc 1348 fprintf(stderr, " sse");
de6d9b64 1349 if (mm_flags & MM_SSE2)
1565dabc
LB
1350 fprintf(stderr, " sse2");
1351 fprintf(stderr, "\n");
de6d9b64
FB
1352#endif
1353
1354 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
1355 c->get_pixels = get_pixels_mmx;
1356 c->diff_pixels = diff_pixels_mmx;
1357 c->put_pixels_clamped = put_pixels_clamped_mmx;
1358 c->add_pixels_clamped = add_pixels_clamped_mmx;
1359 c->clear_blocks = clear_blocks_mmx;
1360 c->pix_sum = pix_sum16_mmx;
1361
1362 c->pix_abs16x16 = pix_abs16x16_mmx;
1363 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1364 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1365 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1366 c->pix_abs8x8 = pix_abs8x8_mmx;
1367 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1368 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1369 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
1370
1371 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1372 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1373 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1374 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1375
1376 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1377 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1378 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1379 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1380
1381 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1382 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1383 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1384 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1385
1386 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1387 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1388 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1389 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1390
1391 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1392 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1393 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1394 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1395
1396 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1397 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1398 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1399 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1400
1401 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1402 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1403 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1404 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1405
1406 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1407 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1408 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1409 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1410
11f18faf
MN
1411 c->add_bytes= add_bytes_mmx;
1412 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1413
1414 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1415 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1416
1417 c->sad[0]= sad16x16_mmx;
1418 c->sad[1]= sad8x8_mmx;
1419
de6d9b64 1420 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1421 c->pix_abs16x16 = pix_abs16x16_mmx2;
1422 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1423 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1424 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
1425
1426 c->pix_abs8x8 = pix_abs8x8_mmx2;
1427 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1428 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1429 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
1430
1457ab52
MN
1431 c->sad[0]= sad16x16_mmx2;
1432 c->sad[1]= sad8x8_mmx2;
1433
eb4b3dd3
ZK
1434 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1435 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1436 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1437 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1438
1439 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1440 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1441 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1442 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1443
1444 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1445 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1446 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1447 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1448
1449 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1450 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1451 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1452 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3178ee4c 1453
c296f66b 1454#if 1
826f429a
MN
1455 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1456 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1457 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1458 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1459 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1460 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1461 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1462 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1463 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1464 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1465 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1466 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1467 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1468 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1469 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1470 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1471 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1472 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1473 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1474 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1475 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1476 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1477 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1478 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1479 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1480 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1481 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1482 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1483 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1484 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1485 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1486 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
c296f66b 1487#endif
de6d9b64 1488 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
1489 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1490 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1491 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1492 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1493
1494 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1495 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1496 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1497 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1498
1499 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1500 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1501 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1502 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1503
1504 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1505 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1506 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1507 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
db794953 1508
826f429a
MN
1509 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1510 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1511 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1512 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1513 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1514 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1515 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1516 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1517 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1518 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1519 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1520 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1521 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1522 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1523 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1524 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1525 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1526 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1527 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1528 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1529 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1530 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1531 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1532 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1533 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1534 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1535 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1536 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1537 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1538 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1539 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1540 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1541 }
1542 }
d6a4c0b1
ZK
1543
1544#if 0
1545 // for speed testing
1546 get_pixels = just_return;
1547 put_pixels_clamped = just_return;
1548 add_pixels_clamped = just_return;
1549
1550 pix_abs16x16 = just_return;
1551 pix_abs16x16_x2 = just_return;
1552 pix_abs16x16_y2 = just_return;
1553 pix_abs16x16_xy2 = just_return;
1554
1555 put_pixels_tab[0] = just_return;
1556 put_pixels_tab[1] = just_return;
1557 put_pixels_tab[2] = just_return;
1558 put_pixels_tab[3] = just_return;
1559
1560 put_no_rnd_pixels_tab[0] = just_return;
1561 put_no_rnd_pixels_tab[1] = just_return;
1562 put_no_rnd_pixels_tab[2] = just_return;
1563 put_no_rnd_pixels_tab[3] = just_return;
1564
1565 avg_pixels_tab[0] = just_return;
1566 avg_pixels_tab[1] = just_return;
1567 avg_pixels_tab[2] = just_return;
1568 avg_pixels_tab[3] = just_return;
1569
1570 avg_no_rnd_pixels_tab[0] = just_return;
1571 avg_no_rnd_pixels_tab[1] = just_return;
1572 avg_no_rnd_pixels_tab[2] = just_return;
1573 avg_no_rnd_pixels_tab[3] = just_return;
1574
d6a4c0b1
ZK
1575 //av_fdct = just_return;
1576 //ff_idct = just_return;
1577#endif
de6d9b64 1578}
4f12a497
FB
1579
1580/* remove any non bit exact operation (testing purpose). NOTE that
1581 this function should be kept as small as possible because it is
1582 always difficult to test automatically non bit exact cases. */
eb4b3dd3 1583void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1584{
1585 if (mm_flags & MM_MMX) {
b3184779 1586 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1587 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1588 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1589 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1590 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1591 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1592 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 1593
b3184779 1594 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1595 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1596 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1597 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1598 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1599 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1600 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
1601 }
1602 }
1603}