/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"

int mm_flags; /* multimedia extension flags */
/* FIXME use them in static form */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);

/* pixel operations */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

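/* note: these loaders build constants in-register instead of loading
   them from memory: pcmpeqd of a register with itself yields all ones
   (0xFFFFFFFFFFFFFFFF); psrlw $15 then leaves 0x0001 in each word
   (MOVQ_WONE), while paddb wraps 0xFF+0xFF to 0xFE in each byte
   (MOVQ_BFE), giving the 0xfefefefefefefefe mask used by the PAVGB
   macros below. In scalar form, roughly:
       uint64_t all1 = ~0ULL;                  // pcmpeqd
       uint64_t wone = 0x0001000100010001ULL;  // psrlw $15, per word
       uint64_t bfe  = 0xFEFEFEFEFEFEFEFEULL;  // paddb, per byte
*/
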
#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it's better to access the constants this way
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

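/* note: both macros are the classic branch-free byte average; the
   0xfe mask discards the bits that psrlq would otherwise shift across
   byte boundaries. Per byte, in sketch form:
       avg_down(a,b) = (a & b) + (((a ^ b) & 0xFE) >> 1)   // (a+b)>>1
       avg_up(a,b)   = (a | b) - (((a ^ b) & 0xFE) >> 1)   // (a+b+1)>>1
   PAVGB_MMX_NO_RND rounds down, PAVGB_MMX rounds up (like pavgb). */
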
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)  PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)         PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)  PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)         PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

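/* note: dsputil_mmx_rnd.h is used as a template -- it is included
   twice, and DEF/SET_RND/PAVGBP/PAVGB select the name suffix, the
   rounder and the averaging primitive of each expansion, so the first
   include generates the *_no_rnd_*_mmx functions and the second the
   rounding *_mmx ones. */
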
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0 \n\t"
        "addl $32, %%eax \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}

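/* note: get_pixels widens an 8x8 block of bytes into 16bit DCTELEMs,
   two rows (32 output bytes) per iteration; %%eax runs from -128 up
   to 0, which is why the destination operand is biased to block+64
   and the loop ends on a plain sign test. Scalar equivalent, roughly:
       for(i=0; i<8; i++)
           for(j=0; j<8; j++)
               block[i*8+j]= pixels[i*line_size+j];
*/
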
static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movl $-128, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0 \n\t"
        "addl %3, %1 \n\t"
        "addl $16, %%eax \n\t"
        "jnz 1b \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}

void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq %3, %%mm0\n\t"
        "movq 8%3, %%mm1\n\t"
        "movq 16%3, %%mm2\n\t"
        "movq 24%3, %%mm3\n\t"
        "movq 32%3, %%mm4\n\t"
        "movq 40%3, %%mm5\n\t"
        "movq 48%3, %%mm6\n\t"
        "movq 56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code; thus we use an "r" constraint
    // for the block pointer here
    __asm __volatile(
        "movq (%3), %%mm0\n\t"
        "movq 8(%3), %%mm1\n\t"
        "movq 16(%3), %%mm2\n\t"
        "movq 24(%3), %%mm3\n\t"
        "movq 32(%3), %%mm4\n\t"
        "movq 40(%3), %%mm5\n\t"
        "movq 48(%3), %%mm6\n\t"
        "movq 56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}

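/* note: the clamping is done by packuswb, which saturates each signed
   16bit coefficient to the [0,255] byte range on the way back. */
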
void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0\n\t"
            "movq 8(%2), %%mm1\n\t"
            "movq 16(%2), %%mm2\n\t"
            "movq 24(%2), %%mm3\n\t"
            "movq %0, %%mm4\n\t"
            "movq %1, %%mm6\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm4, %%mm0\n\t"
            "paddsw %%mm5, %%mm1\n\t"
            "movq %%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm6, %%mm2\n\t"
            "paddsw %%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "movq %%mm0, %0\n\t"
            "movq %%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax \n\t"
        ".balign 8 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax \n\t"
        ".balign 8 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movl $-128*6, %%eax \n\t"
        "1: \n\t"
        "movq %%mm7, (%0, %%eax) \n\t"
        "movq %%mm7, 8(%0, %%eax) \n\t"
        "movq %%mm7, 16(%0, %%eax) \n\t"
        "movq %%mm7, 24(%0, %%eax) \n\t"
        "addl $32, %%eax \n\t"
        " js 1b \n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}

static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "addl %3, %1 \n\t"
        " js 1b \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (line_size)
    );

    return sum;
}

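/* note: after the loop mm6 holds 4 partial word sums; the two
   shift+add steps are a horizontal add folding them into the low
   word, and the final andl keeps just those 16 bits. */
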
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "addl $16, %0 \n\t"
        "cmpl %3, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "addl $16, %0 \n\t"
        "cmpl %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
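
/* note: both helpers handle 16 bytes per MMX iteration while i<w-15
   and finish the remaining 0..15 bytes in the scalar tail loop, so w
   needs no particular alignment or padding. */
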
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a " \n\t"\
    "paddw " #b ", " #b " \n\t"\
    "psubw " #a ", " #b " \n\t"

#define HADAMARD48\
    LBUTTERFLY(%%mm0, %%mm1)\
    LBUTTERFLY(%%mm2, %%mm3)\
    LBUTTERFLY(%%mm4, %%mm5)\
    LBUTTERFLY(%%mm6, %%mm7)\
    \
    LBUTTERFLY(%%mm0, %%mm2)\
    LBUTTERFLY(%%mm1, %%mm3)\
    LBUTTERFLY(%%mm4, %%mm6)\
    LBUTTERFLY(%%mm5, %%mm7)\
    \
    LBUTTERFLY(%%mm0, %%mm4)\
    LBUTTERFLY(%%mm1, %%mm5)\
    LBUTTERFLY(%%mm2, %%mm6)\
    LBUTTERFLY(%%mm3, %%mm7)

#define MMABS(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"\
    "paddusw " #a ", " #sum " \n\t"


#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t " \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a " \n\t"\
    "movq "#o"+16(%1), " #b " \n\t"\
    "movq "#o"+32(%1), " #c " \n\t"\
    "movq "#o"+48(%1), " #d " \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+16(%1) \n\t"\
    "movq "#c", "#o"+32(%1) \n\t"\
    "movq "#d", "#o"+48(%1) \n\t"\

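/* note: LBUTTERFLY(a,b) is a 2 register butterfly: after a+=b; b+=b;
   b-=a the registers hold a+b and b-a (the flipped sign of the
   difference is harmless here, only absolute values get summed).
   MMABS is the usual branch-free abs: z= a<0 ? -1 : 0, then
   a= (a^z)-z. */
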
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1) \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1) \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5 \n\t" //FIXME remove
        "movq %%mm6, %%mm7 \n\t"
        "movq %%mm0, %%mm6 \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1) \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1 \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1) \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1) \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1 \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1 \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddusw %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $16, %%mm0 \n\t"
        "paddusw %%mm1, %%mm0 \n\t"
        "movd %%mm0, %0 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)

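/* note: hadamard8_diff is a SATD-style metric: the pixel differences
   are run through an 8x8 Hadamard transform (rows, transpose,
   columns) and the absolute transform coefficients are summed;
   WARPER88_1616 (from dsputil.h) builds the 16x16 version out of
   four calls to the 8x8 kernel. */
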
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
    "movq " #pw_20 ", %%mm4 \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
    "movq "#in7", " #m3 " \n\t" /* d */\
    "movq "#in0", %%mm5 \n\t" /* D */\
    "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5 \n\t" /* C */\
    "movq "#in2", %%mm6 \n\t" /* B */\
    "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
    "pmullw " #pw_3 ", %%mm5 \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rounder */\
    "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)

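/* note: this emits one output row of the MPEG4 quarter-pel lowpass
   filter; with x1..x4 the symmetric pair sums from nearest to
   farthest taps, each pixel is
       (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5
   clamped to [0,255] by packuswb -- the scalar loops in the *_3dnow
   h_lowpass fallbacks below spell out the same formula. */
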
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP)\
void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pushl %0 \n\t"\
        "pushl %1 \n\t"\
        "pushl %2 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5 \n\t" /* b */\
        "paddw %%mm2, %%mm6 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0 \n\t" /* a */\
        "paddw %%mm1, %%mm5 \n\t" /* d */\
        "pmullw %5, %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
        "paddw %8, %%mm6 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        "movq %%mm0, %7 \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2 \n\t" /* b */\
        "paddw %%mm5, %%mm3 \n\t" /* c */\
        "paddw %%mm2, %%mm2 \n\t" /* 2b */\
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
        "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1 \n\t" /* a */\
        "paddw %%mm6, %%mm4 \n\t" /* d */\
        "pmullw %5, %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
        "paddw %8, %%mm1 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3 \n\t"\
        "movq %7, %%mm1 \n\t"\
        "packuswb %%mm3, %%mm1 \n\t"\
        OP(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5 \n\t" /* b */\
        "paddw %%mm4, %%mm0 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
        "pmullw %6, %%mm0 \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2 \n\t" /* d */\
        "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6 \n\t" /* a */\
        "pmullw %5, %%mm6 \n\t" /* 20a */\
        "paddw %8, %%mm0 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3 \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6 \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4 \n\t" /* c */\
        "paddw %%mm2, %%mm5 \n\t" /* d */\
        "paddw %%mm6, %%mm6 \n\t" /* 2b */\
        "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
        "pmullw %5, %%mm3 \n\t" /* 20a */\
        "pmullw %6, %%mm4 \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
        "paddw %8, %%mm4 \n\t"\
        "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm0 \n\t"\
        OP(%%mm0, 8(%1), %%mm4, q)\
        \
        "addl %3, %0 \n\t"\
        "addl %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        "popl %2 \n\t"\
        "popl %1 \n\t"\
        "popl %0 \n\t"\
        :: "r"(src), "r"(dst), "r"(h),\
           "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0 \n\t"\
            "movq 8(%0), %%mm1 \n\t"\
            "paddw %2, %%mm0 \n\t"\
            "paddw %2, %%mm1 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            OP(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0 \n\t"\
            "movq 24(%0), %%mm1 \n\t"\
            "paddw %2, %%mm0 \n\t"\
            "paddw %2, %%mm1 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            OP(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
void OPNAME ## mpeg4_qpel16_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "movq 8(%0), %%mm2 \n\t"\
        "movq 8(%0), %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 17*8(%1) \n\t"\
        "movq %%mm2, (%1, %4) \n\t"\
        "movq %%mm3, (%1, %5) \n\t"\
        "addl $8, %1 \n\t"\
        "addl %3, %0 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
/*FIXME remove push/pop gcc 2.95 bug workaround here and in the other 3 lowpass filters */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "pushl %0 \n\t"\
        "pushl %1 \n\t"\
        "pushl %2 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "addl $136, %0 \n\t"\
        "addl %8, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        "popl %2 \n\t"\
        "popl %1 \n\t"\
        "popl %0 \n\t"\
        \
        :: "r"(temp_ptr), "r"(dst), "r"(count),\
           "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
    );\
}\
void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pushl %0 \n\t"\
        "pushl %1 \n\t"\
        "pushl %2 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5 \n\t" /* b */\
        "paddw %%mm2, %%mm6 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0 \n\t" /* a */\
        "paddw %%mm1, %%mm5 \n\t" /* d */\
        "pmullw %5, %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
        "paddw %8, %%mm6 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5 \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1 \n\t" /* a */\
        "paddw %%mm6, %%mm2 \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3 \n\t" /* c */\
        "paddw %%mm5, %%mm4 \n\t" /* d */\
        "paddw %%mm2, %%mm2 \n\t" /* 2b */\
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
        "pmullw %5, %%mm1 \n\t" /* 20a */\
        "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
        "paddw %8, %%mm1 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3 \n\t"\
        "packuswb %%mm3, %%mm0 \n\t"\
        OP(%%mm0, (%1), %%mm4, q)\
        \
        "addl %3, %0 \n\t"\
        "addl %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        "popl %2 \n\t"\
        "popl %1 \n\t"\
        "popl %0 \n\t"\
        :: "r"(src), "r"(dst), "r"(h),\
           "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0 \n\t"\
            "movq 8(%0), %%mm1 \n\t"\
            "paddw %2, %%mm0 \n\t"\
            "paddw %2, %%mm1 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            OP(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
void OPNAME ## mpeg4_qpel8_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*4];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 9*8(%1) \n\t"\
        "addl $8, %1 \n\t"\
        "addl %3, %0 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        "pushl %0 \n\t"\
        "pushl %1 \n\t"\
        "pushl %2 \n\t"\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "addl %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "addl $72, %0 \n\t"\
        "addl %8, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        "popl %2 \n\t"\
        "popl %1 \n\t"\
        "popl %0 \n\t"\
        \
        :: "r"(temp_ptr), "r"(dst), "r"(count),\
           "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\
    );\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    put_pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[9*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    put_pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, halfH, stride, 16);\
}


#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)

#undef AVG_OP
#define AVG_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

#if 0
static void just_return() { return; }
#endif

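/* note: OP is the only difference between the put_* and avg_*
   variants -- PUT_OP is a plain store, while AVG_OP averages the
   result with the existing destination pixels (pavgusb on 3DNow!,
   pavgb on MMX2) before storing. */
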
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

eb4b3dd3 1371void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
1372{
1373 mm_flags = mm_support();
1565dabc
LB
1374#if 0
1375 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1376 if (mm_flags & MM_MMX)
1565dabc 1377 fprintf(stderr, " mmx");
de6d9b64 1378 if (mm_flags & MM_MMXEXT)
1565dabc 1379 fprintf(stderr, " mmxext");
de6d9b64 1380 if (mm_flags & MM_3DNOW)
1565dabc 1381 fprintf(stderr, " 3dnow");
de6d9b64 1382 if (mm_flags & MM_SSE)
1565dabc 1383 fprintf(stderr, " sse");
de6d9b64 1384 if (mm_flags & MM_SSE2)
1565dabc
LB
1385 fprintf(stderr, " sse2");
1386 fprintf(stderr, "\n");
de6d9b64
FB
1387#endif
1388
1389 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
1390 c->get_pixels = get_pixels_mmx;
1391 c->diff_pixels = diff_pixels_mmx;
1392 c->put_pixels_clamped = put_pixels_clamped_mmx;
1393 c->add_pixels_clamped = add_pixels_clamped_mmx;
1394 c->clear_blocks = clear_blocks_mmx;
1395 c->pix_sum = pix_sum16_mmx;
1396
1397 c->pix_abs16x16 = pix_abs16x16_mmx;
1398 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1399 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1400 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1401 c->pix_abs8x8 = pix_abs8x8_mmx;
1402 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1403 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1404 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
1405
1406 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1407 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1408 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1409 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1410
1411 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1412 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1413 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1414 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1415
1416 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1417 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1418 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1419 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1420
1421 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1422 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1423 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1424 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1425
1426 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1427 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1428 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1429 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1430
1431 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1432 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1433 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1434 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1435
1436 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1437 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1438 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1439 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1440
1441 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1442 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1443 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1444 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1445
11f18faf
MN
1446 c->add_bytes= add_bytes_mmx;
1447 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1448
1449 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1450 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1451
1452 c->sad[0]= sad16x16_mmx;
1453 c->sad[1]= sad8x8_mmx;
1454
de6d9b64 1455 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1456 c->pix_abs16x16 = pix_abs16x16_mmx2;
1457 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1458 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1459 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
1460
1461 c->pix_abs8x8 = pix_abs8x8_mmx2;
1462 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1463 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1464 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
1465
1457ab52
MN
1466 c->sad[0]= sad16x16_mmx2;
1467 c->sad[1]= sad8x8_mmx2;
1468
eb4b3dd3
ZK
1469 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1470 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1471 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1472 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1473
1474 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1475 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1476 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1477 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1478
1479 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1480 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1481 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1482 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1483
1484 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1485 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1486 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1487 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
826f429a
MN
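            /* quarter-pel motion compensation: the table index is x + 4*y,
               matching the mc<x><y> suffix of each function (mc00 = no
               subpel offset, mc33 = 3/4-pel in both directions) */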
1488 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1489 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1490 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1493 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1494 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1495 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1496 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1497 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1498 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1499 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1500 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1501 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1502 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1503 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1504 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1505 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1506 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1507 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1508 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1509 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1510 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1511 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1512 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1513 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1514 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1515 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1516 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1517 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1518 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1519 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
de6d9b64 1520 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
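            /* 3DNow! provides pavgusb, AMD's byte-averaging instruction,
               so pre-MMX2 AMD parts get dedicated halfpel variants */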
1521 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1522 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1523 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1524 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1525
1526 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1527 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1528 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1529 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1530
1531 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1532 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1533 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1534 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1535
1536 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1537 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1538 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1539 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
826f429a
MN
1540
1541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1546 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1547 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1548 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1549 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1550 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1551 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1552 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1553 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1554 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1555 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1556 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1562 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1563 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1564 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1565 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1566 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1567 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1568 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1569 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1570 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1571 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1572 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1573 }
1574 }
d6a4c0b1
ZK
1575
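    /* Disabled benchmarking aid: routing every hook to just_return measures
       the caller's dispatch and memory overhead without doing any DSP
       work. */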
1576#if 0
1577 // for speed testing
1578 c->get_pixels = just_return;
1579 c->put_pixels_clamped = just_return;
1580 c->add_pixels_clamped = just_return;
1581
1582 c->pix_abs16x16 = just_return;
1583 c->pix_abs16x16_x2 = just_return;
1584 c->pix_abs16x16_y2 = just_return;
1585 c->pix_abs16x16_xy2 = just_return;
1586
1587 c->put_pixels_tab[0][0] = just_return;
1588 c->put_pixels_tab[0][1] = just_return;
1589 c->put_pixels_tab[0][2] = just_return;
1590 c->put_pixels_tab[0][3] = just_return;
1591
1592 c->put_no_rnd_pixels_tab[0][0] = just_return;
1593 c->put_no_rnd_pixels_tab[0][1] = just_return;
1594 c->put_no_rnd_pixels_tab[0][2] = just_return;
1595 c->put_no_rnd_pixels_tab[0][3] = just_return;
1596
1597 c->avg_pixels_tab[0][0] = just_return;
1598 c->avg_pixels_tab[0][1] = just_return;
1599 c->avg_pixels_tab[0][2] = just_return;
1600 c->avg_pixels_tab[0][3] = just_return;
1601
1602 c->avg_no_rnd_pixels_tab[0][0] = just_return;
1603 c->avg_no_rnd_pixels_tab[0][1] = just_return;
1604 c->avg_no_rnd_pixels_tab[0][2] = just_return;
1605 c->avg_no_rnd_pixels_tab[0][3] = just_return;
1606
d6a4c0b1
ZK
1607 //av_fdct = just_return;
1608 //ff_idct = just_return;
1609#endif
de6d9b64 1610}
4f12a497
FB
1611
1612/* Remove any non-bit-exact operations (for testing purposes). NOTE that
1613 this function should be kept as small as possible, because non-bit-exact
1614 cases are always difficult to test automatically. */
eb4b3dd3 1615void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1616{
1617 if (mm_flags & MM_MMX) {
b3184779 1618 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1619 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1620 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1621 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1622 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1623 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1624 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 1625
b3184779 1626 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1627 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1628 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1629 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1630 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1631 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1632 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
4f12a497
FB
1633 }
1634 }
1635}
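
/* Usage sketch (hypothetical, not part of the original file): a regression
 * harness that needs identical output on every x86 CPU could do
 *
 *     DSPContext c;
 *     dsputil_init(&c, 0);                // assumed generic entry point
 *     dsputil_set_bit_exact_mmx(&c, 0);   // mask is currently unused here
 *
 * after which the rounding-sensitive x2/y2/xy2 hooks point back at the
 * plain, bit-exact MMX versions installed above.
 */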