b frame segfault fix
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
21
22#include "../dsputil.h"
23
int mm_flags; /* multimedia extension flags */

/* FIXME use them in static form */
/* 16x16 / 8x8 sum-of-absolute-differences entry points; the _x2/_y2/_xy2
 * variants compare against half-pel interpolated references.  Implemented
 * elsewhere (presumably the MMX motion-estimation file — not visible here). */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

/* SAD variants that additionally take a context pointer as first argument. */
int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
50
/* pixel operations */
/* lane-replicated constants for the MMX code below; 8-byte aligned so a
 * single movq can load them */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 8 x byte 1 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 x word 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 x word 2 */

/* word constants used by the qpel filter code */
static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; /* 4 x word 20 */
static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;  /* 4 x word 3 */
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; /* 4 x word 16 */
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; /* 4 x word 15 */
60
d6a4c0b1
ZK
/* emit an 8-byte alignment directive at the current code position */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* regd = 0 */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* regd = 0x0001000100010001 (word 1 per lane), synthesized in registers:
 * all-ones then logical shift right leaves the low bit of each word set */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* regd = 0xfefefefefefefefe: per-byte -1 + -1 = -2 = 0xfe */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* non-PIC build: the constants can be addressed directly from memory */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* regd = 0x0101010101010101 (byte 1 per lane) */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* regd = 0x0002000200020002 (word 2 per lane) */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
93
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* byte average rounding DOWN: regr = (a & b) + (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* byte average rounding UP: regr = (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* two byte averages at once (rounding down): regr = avg(rega,regb),
 * regp = avg(regc,regd); mm6 must hold the 0xfe.. mask */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* two byte averages at once (rounding up); mm6 must hold the 0xfe.. mask */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
141
91abb473
ZK
/***********************************/
/* MMX no rounding */
/* instantiate the put/avg pixel templates with the truncating (no-round)
 * average; generated function names: *_no_rnd_*_mmx */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

/* same templates with the rounding average; names: *_*_mmx */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
a7bd8797 169
de6d9b64
FB
/***********************************/
/* 3Dnow specific */

/* instantiate the averaging templates using the single-instruction
 * byte average available on each CPU family */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
194
195/***********************************/
196/* standard MMX */
197
/* Expand an 8x8 block of bytes at 'pixels' (rows 'line_size' apart) into
 * the 64 16-bit coefficients of 'block'.  Two rows are converted per
 * iteration; %%eax counts the byte offset from block+64 (-128 up to 0). */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax          \n\t"
        "pxor %%mm7, %%mm7          \n\t"  /* mm7 = 0, for zero-extension */
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"  /* row 0 */
        "movq (%0, %2), %%mm2       \n\t"  /* row 1 */
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"  /* low 4 bytes -> words */
        "punpckhbw %%mm7, %%mm1     \n\t"  /* high 4 bytes -> words */
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "movq %%mm0, (%1, %%eax)    \n\t"
        "movq %%mm1, 8(%1, %%eax)   \n\t"
        "movq %%mm2, 16(%1, %%eax)  \n\t"
        "movq %%mm3, 24(%1, %%eax)  \n\t"
        "addl %3, %0                \n\t"  /* pixels += 2*line_size */
        "addl $32, %%eax            \n\t"
        "js 1b                      \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
225
/* block[i] = s1[i] - s2[i] for an 8x8 region, widened to 16-bit words.
 * One row (8 differences) is produced per iteration; %%eax runs from
 * -128 to 0 as the byte offset from block+64. */
static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7          \n\t"  /* zero for byte->word unpack */
        "movl $-128, %%eax          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%1), %%mm2           \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "psubw %%mm2, %%mm0         \n\t"
        "psubw %%mm3, %%mm1         \n\t"
        "movq %%mm0, (%2, %%eax)    \n\t"
        "movq %%mm1, 8(%2, %%eax)   \n\t"
        "addl %3, %0                \n\t"
        "addl %3, %1                \n\t"
        "addl $16, %%eax            \n\t"
        "jnz 1b                     \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
254
/* Store an 8x8 block of 16-bit coefficients as bytes at 'pixels',
 * clamping each value to 0..255 (packuswb saturates).  Four rows are
 * written per asm statement; the function processes 8 rows total. */
void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq   %3, %%mm0\n\t"
        "movq  8%3, %%mm1\n\t"      /* rows interleaved low/high halves */
        "movq 16%3, %%mm2\n\t"
        "movq 24%3, %%mm3\n\t"
        "movq 32%3, %%mm4\n\t"
        "movq 40%3, %%mm5\n\t"
        "movq 48%3, %%mm6\n\t"
        "movq 56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t" /* words -> saturated bytes */
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq   (%3), %%mm0\n\t"
        "movq  8(%3), %%mm1\n\t"
        "movq 16(%3), %%mm2\n\t"
        "movq 24(%3), %%mm3\n\t"
        "movq 32(%3), %%mm4\n\t"
        "movq 40(%3), %%mm5\n\t"
        "movq 48(%3), %%mm6\n\t"
        "movq 56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
309
/* pixels[x] = clamp(pixels[x] + block[x]) over an 8x8 region.
 * The existing bytes are widened to words, added with signed saturation
 * to the coefficients, then re-packed with unsigned saturation (0..255).
 * Two rows are handled per iteration, four iterations total. */
void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);                       /* mm7 = 0 for unpacking */
    i = 4;
    do {
        __asm __volatile(
            "movq   (%2), %%mm0\n\t"      /* coefficients, row 0 */
            "movq  8(%2), %%mm1\n\t"
            "movq 16(%2), %%mm2\n\t"      /* coefficients, row 1 */
            "movq 24(%2), %%mm3\n\t"
            "movq %0, %%mm4\n\t"          /* existing pixels, row 0 */
            "movq %1, %%mm6\n\t"          /* existing pixels, row 1 */
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm4, %%mm0\n\t"
            "paddsw %%mm5, %%mm1\n\t"
            "movq %%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm6, %%mm2\n\t"
            "paddsw %%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"   /* clamp to 0..255 */
            "packuswb %%mm3, %%mm2\n\t"
            "movq %%mm0, %0\n\t"
            "movq %%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
350
/* Copy an 8-pixel-wide block of 'h' rows from 'pixels' to 'block'
 * (both strided by line_size).  The loop is unrolled to 4 rows per
 * iteration, so h must be a multiple of 4. */
static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"  /* eax = 2*line_size */
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
376
b3184779
MN
/* Copy a 16-pixel-wide block of 'h' rows (two movq per row).  Unrolled
 * to 4 rows per iteration, so h must be a multiple of 4. */
static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"  /* eax = 2*line_size */
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
410
649c00c9
MN
/* Zero six consecutive 64-coefficient DCT blocks (6*128 = 768 bytes).
 * %%eax runs from -128*6 up to 0 as an offset from blocks+128*6. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128*6, %%eax        \n\t"
        "1:                         \n\t"
        "movq %%mm7, (%0, %%eax)    \n\t"
        "movq %%mm7, 8(%0, %%eax)   \n\t"
        "movq %%mm7, 16(%0, %%eax)  \n\t"
        "movq %%mm7, 24(%0, %%eax)  \n\t"
        "addl $32, %%eax            \n\t"
        " js 1b                     \n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}
427
084c726b
MN
/* Sum all 256 pixels of a 16x16 block.  The word-lane accumulator mm6 is
 * folded down at the end and the result masked to 16 bits (the upper word
 * lanes would otherwise leak in through the movd). */
static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;           /* counts rows up to 0 */

    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "pxor %%mm6, %%mm6          \n\t"  /* word-lane accumulator */
        "1:                         \n\t"
        /* each qword is loaded twice: one copy unpacks the low bytes,
         * the other the high bytes */
        "movq (%2, %1), %%mm0       \n\t"
        "movq (%2, %1), %%mm1       \n\t"
        "movq 8(%2, %1), %%mm2      \n\t"
        "movq 8(%2, %1), %%mm3      \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "paddw %%mm0, %%mm1         \n\t"
        "paddw %%mm2, %%mm3         \n\t"
        "paddw %%mm1, %%mm3         \n\t"
        "paddw %%mm3, %%mm6         \n\t"
        "addl %3, %1                \n\t"
        " js 1b                     \n\t"
        /* horizontal reduction of the four word lanes */
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $32, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $16, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movd %%mm6, %0             \n\t"
        "andl $0xFFFF, %0           \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (line_size)
    );

    return sum;
}
465
11f18faf
MN
466static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
467 int i=0;
468 asm volatile(
469 "1: \n\t"
470 "movq (%1, %0), %%mm0 \n\t"
471 "movq (%2, %0), %%mm1 \n\t"
472 "paddb %%mm0, %%mm1 \n\t"
473 "movq %%mm1, (%2, %0) \n\t"
474 "movq 8(%1, %0), %%mm0 \n\t"
475 "movq 8(%2, %0), %%mm1 \n\t"
476 "paddb %%mm0, %%mm1 \n\t"
477 "movq %%mm1, 8(%2, %0) \n\t"
478 "addl $16, %0 \n\t"
479 "cmpl %3, %0 \n\t"
480 " jb 1b \n\t"
481 : "+r" (i)
482 : "r"(src), "r"(dst), "r"(w-15)
483 );
484 for(; i<w; i++)
485 dst[i+0] += src[i+0];
486}
487
488static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
489 int i=0;
490 asm volatile(
491 "1: \n\t"
492 "movq (%2, %0), %%mm0 \n\t"
493 "movq (%1, %0), %%mm1 \n\t"
494 "psubb %%mm0, %%mm1 \n\t"
495 "movq %%mm1, (%3, %0) \n\t"
496 "movq 8(%2, %0), %%mm0 \n\t"
497 "movq 8(%1, %0), %%mm1 \n\t"
498 "psubb %%mm0, %%mm1 \n\t"
499 "movq %%mm1, 8(%3, %0) \n\t"
500 "addl $16, %0 \n\t"
501 "cmpl %4, %0 \n\t"
502 " jb 1b \n\t"
503 : "+r" (i)
504 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
505 );
506 for(; i<w; i++)
507 dst[i+0] = src1[i+0]-src2[i+0];
508}
1457ab52
MN
/* butterfly on word lanes: a <- a_old + b_old, b <- b_old - a_old */
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a "           \n\t"\
    "paddw " #b ", " #b "           \n\t"\
    "psubw " #a ", " #b "           \n\t"

/* three butterfly stages over mm0..mm7: an 8-point Hadamard transform
 * across the eight registers (4 word lanes each) */
#define HADAMARD48\
        LBUTTERFLY(%%mm0, %%mm1)\
        LBUTTERFLY(%%mm2, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm5)\
        LBUTTERFLY(%%mm6, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm2)\
        LBUTTERFLY(%%mm1, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm6)\
        LBUTTERFLY(%%mm5, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm4)\
        LBUTTERFLY(%%mm1, %%mm5)\
        LBUTTERFLY(%%mm2, %%mm6)\
        LBUTTERFLY(%%mm3, %%mm7)

/* a = |a| per word lane, z trashed (sign-mask trick: a = (a^m) - m) */
#define MMABS(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

/* sum += |a| per word lane (saturating add), z trashed */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"\
    "paddusw " #a ", " #sum "       \n\t"

/* interleave low/high halves of a and b; t receives the high half */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "            \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

/* 4x4 word transpose built from three SBUTTERFLY passes */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* load/store 4 qwords from a 16-byte-strided word matrix at offset o */
#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "      \n\t"\
        "movq "#o"+16(%1), " #b "   \n\t"\
        "movq "#o"+32(%1), " #c "   \n\t"\
        "movq "#o"+48(%1), " #d "   \n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)        \n\t"\
        "movq "#b", "#o"+16(%1)     \n\t"\
        "movq "#c", "#o"+32(%1)     \n\t"\
        "movq "#d", "#o"+48(%1)     \n\t"\
/* SATD-style comparison: sum of absolute values of the 2D Hadamard
 * transform of the 8x8 difference src1 - src2.  The difference block is
 * transformed column-wise, transposed in two 4x4 halves (spilling via
 * temp), transformed again, then |.|-accumulated.  's' is unused here
 * (kept for the comparison-function signature). */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)        \n\t" /* spill: TRANSPOSE4 needs mm7 */

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7        \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)        \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7        \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5          \n\t"//FIXME remove
        "movq %%mm6, %%mm7          \n\t"
        "movq %%mm0, %%mm6          \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)         \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1         \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)         \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)           \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1           \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1         \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        /* fold the four word lanes of mm0 into one value */
        "movq %%mm0, %%mm1          \n\t"
        "psrlq $32, %%mm0           \n\t"
        "paddusw %%mm1, %%mm0       \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "psrlq $16, %%mm0           \n\t"
        "paddusw %%mm1, %%mm0       \n\t"
        "movd %%mm0, %0             \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

/* build the 16x16 version from four 8x8 calls */
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 653
3178ee4c
MN
/* plain copies involve no averaging, so the no-rounding variants are
 * identical to the normal ones */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
656
826f429a
MN
/* One packed output row of the vertical qpel lowpass filter
 * (20,-6,3,-1 tap weights, >>5).  m3..m6 and in0..in7 are word-expanded
 * source rows; OP writes the packuswb'd result to 'out'. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "         \n\t" /* x1 */\
    "movq " #pw_20 ", %%mm4         \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4          \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "           \n\t" /* d */\
    "movq "#in0", %%mm5             \n\t" /* D */\
    "paddw " #m3 ", %%mm5           \n\t" /* x4 */\
    "psubw %%mm5, %%mm4             \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5             \n\t" /* C */\
    "movq "#in2", %%mm6             \n\t" /* B */\
    "paddw " #m6 ", %%mm5           \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6           \n\t" /* x2 */\
    "paddw %%mm6, %%mm6             \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5             \n\t" /* -2x2 + x3 */\
    "pmullw " #pw_3 ", %%mm5        \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4          \n\t" /* x2 */\
    "paddw %%mm4, %%mm5             \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm5          \n\t"\
    OP(%%mm5, out, %%mm7, d)
677
/* Instantiate the horizontal qpel lowpass filters ((20,-6,3,-1)/32 taps
 * with edge replication) for one put/avg flavor:
 *   - 16-wide and 8-wide MMX2 versions (pshufw-based),
 *   - 16-wide and 8-wide 3DNow! versions (scalar filter + MMX pack, a
 *     documented quick hack).
 * The pushl/popl around the MMX2 asm preserve the "r" operands that the
 * loop itself modifies. */
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pushl %0                   \n\t"\
        "pushl %1                   \n\t"\
        "pushl %2                   \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq  (%0), %%mm0          \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1          \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2          \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0     \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1     \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3          \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4          \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2            \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3           \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4           \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2     \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3     \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4     \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5         \n\t" /* b */\
        "paddw %%mm2, %%mm6         \n\t" /* c */\
        "paddw %%mm5, %%mm5         \n\t" /* 2b */\
        "psubw %%mm5, %%mm6         \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw %6, %%mm6           \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0         \n\t" /* a */\
        "paddw %%mm1, %%mm5         \n\t" /* d */\
        "pmullw %5, %%mm0           \n\t" /* 20a */\
        "psubw %%mm5, %%mm0         \n\t" /* 20a - d */\
        "paddw %8, %%mm6            \n\t"\
        "paddw %%mm6, %%mm0         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0            \n\t"\
        "movq %%mm0, %7             \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0          \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5          \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6          \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0            \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5           \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0     \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5     \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2         \n\t" /* b */\
        "paddw %%mm5, %%mm3         \n\t" /* c */\
        "paddw %%mm2, %%mm2         \n\t" /* 2b */\
        "psubw %%mm2, %%mm3         \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2          \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6           \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2     \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6     \n\t" /* 0I0J0K0L */\
        "pmullw %6, %%mm3           \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1         \n\t" /* a */\
        "paddw %%mm6, %%mm4         \n\t" /* d */\
        "pmullw %5, %%mm1           \n\t" /* 20a */\
        "psubw %%mm4, %%mm3         \n\t" /* - 6b +3c - d */\
        "paddw %8, %%mm1            \n\t"\
        "paddw %%mm1, %%mm3         \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3            \n\t"\
        "movq %7, %%mm1             \n\t"\
        "packuswb %%mm3, %%mm1      \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1          \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4          \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3          \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1            \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4           \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1     \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4     \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5         \n\t" /* b */\
        "paddw %%mm4, %%mm0         \n\t" /* c */\
        "paddw %%mm5, %%mm5         \n\t" /* 2b */\
        "psubw %%mm5, %%mm0         \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5          \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3           \n\t" /* MNOPQ000 */\
        "pmullw %6, %%mm0           \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3     \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2         \n\t" /* d */\
        "psubw %%mm2, %%mm0         \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2          \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2     \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5     \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6         \n\t" /* a */\
        "pmullw %5, %%mm6           \n\t" /* 20a */\
        "paddw %8, %%mm0            \n\t"\
        "paddw %%mm6, %%mm0         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0            \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3         \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6         \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4         \n\t" /* c */\
        "paddw %%mm2, %%mm5         \n\t" /* d */\
        "paddw %%mm6, %%mm6         \n\t" /* 2b */\
        "psubw %%mm6, %%mm4         \n\t" /* c - 2b */\
        "pmullw %5, %%mm3           \n\t" /* 20a */\
        "pmullw %6, %%mm4           \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3         \n\t" /* -6b + 3c - d */\
        "paddw %8, %%mm4            \n\t"\
        "paddw %%mm3, %%mm4         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4            \n\t"\
        "packuswb %%mm4, %%mm0      \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "addl %3, %0                \n\t"\
        "addl %4, %1                \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        "popl %2                    \n\t"\
        "popl %1                    \n\t"\
        "popl %0                    \n\t"\
        :: "r"(src), "r"(dst), "r"(h),\
           "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0       \n\t"\
            "movq 8(%0), %%mm1      \n\t"\
            "paddw %2, %%mm0        \n\t"\
            "paddw %2, %%mm1        \n\t"\
            "psraw $5, %%mm0        \n\t"\
            "psraw $5, %%mm1        \n\t"\
            "packuswb %%mm1, %%mm0  \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0     \n\t"\
            "movq 24(%0), %%mm1     \n\t"\
            "paddw %2, %%mm0        \n\t"\
            "paddw %2, %%mm1        \n\t"\
            "psraw $5, %%mm0        \n\t"\
            "psraw $5, %%mm1        \n\t"\
            "packuswb %%mm1, %%mm0  \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pushl %0                   \n\t"\
        "pushl %1                   \n\t"\
        "pushl %2                   \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq  (%0), %%mm0          \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1          \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2          \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0     \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1     \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3          \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4          \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2            \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3           \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4           \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2     \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3     \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4     \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5         \n\t" /* b */\
        "paddw %%mm2, %%mm6         \n\t" /* c */\
        "paddw %%mm5, %%mm5         \n\t" /* 2b */\
        "psubw %%mm5, %%mm6         \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw %6, %%mm6           \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0         \n\t" /* a */\
        "paddw %%mm1, %%mm5         \n\t" /* d */\
        "pmullw %5, %%mm0           \n\t" /* 20a */\
        "psubw %%mm5, %%mm0         \n\t" /* 20a - d */\
        "paddw %8, %%mm6            \n\t"\
        "paddw %%mm6, %%mm0         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0            \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5          \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5     \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1         \n\t" /* a */\
        "paddw %%mm6, %%mm2         \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3         \n\t" /* c */\
        "paddw %%mm5, %%mm4         \n\t" /* d */\
        "paddw %%mm2, %%mm2         \n\t" /* 2b */\
        "psubw %%mm2, %%mm3         \n\t" /* c - 2b */\
        "pmullw %5, %%mm1           \n\t" /* 20a */\
        "pmullw %6, %%mm3           \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3         \n\t" /* -6b + 3c - d */\
        "paddw %8, %%mm1            \n\t"\
        "paddw %%mm1, %%mm3         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3            \n\t"\
        "packuswb %%mm3, %%mm0      \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "addl %3, %0                \n\t"\
        "addl %4, %1                \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        "popl %2                    \n\t"\
        "popl %1                    \n\t"\
        "popl %0                    \n\t"\
        :: "r"(src), "r"(dst), "r"(h),\
           "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0       \n\t"\
            "movq 8(%0), %%mm1      \n\t"\
            "paddw %2, %%mm0        \n\t"\
            "paddw %2, %%mm1        \n\t"\
            "psraw $5, %%mm0        \n\t"\
            "psraw $5, %%mm1        \n\t"\
            "packuswb %%mm1, %%mm0  \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
950
951#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
952\
953static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954 uint64_t temp[17*4];\
955 uint64_t *temp_ptr= temp;\
956 int count= 17;\
957\
958 /*FIXME unroll */\
959 asm volatile(\
960 "pxor %%mm7, %%mm7 \n\t"\
961 "1: \n\t"\
962 "movq (%0), %%mm0 \n\t"\
963 "movq (%0), %%mm1 \n\t"\
964 "movq 8(%0), %%mm2 \n\t"\
965 "movq 8(%0), %%mm3 \n\t"\
966 "punpcklbw %%mm7, %%mm0 \n\t"\
967 "punpckhbw %%mm7, %%mm1 \n\t"\
968 "punpcklbw %%mm7, %%mm2 \n\t"\
969 "punpckhbw %%mm7, %%mm3 \n\t"\
970 "movq %%mm0, (%1) \n\t"\
971 "movq %%mm1, 17*8(%1) \n\t"\
972 "movq %%mm2, (%1, %4) \n\t"\
973 "movq %%mm3, (%1, %5) \n\t"\
974 "addl $8, %1 \n\t"\
975 "addl %3, %0 \n\t"\
976 "decl %2 \n\t"\
977 " jnz 1b \n\t"\
978 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
979 : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
980 );\
981 \
982 temp_ptr= temp;\
983 count=4;\
984 \
985/*FIXME reorder for speed */\
986/*FIXME remove push/pop gcc 2.95 bug workaround here and in the other 3 lowpass filters */\
987 asm volatile(\
988 /*"pxor %%mm7, %%mm7 \n\t"*/\
989 "pushl %0 \n\t"\
990 "pushl %1 \n\t"\
991 "pushl %2 \n\t"\
992 "1: \n\t"\
993 "movq (%0), %%mm0 \n\t"\
994 "movq 8(%0), %%mm1 \n\t"\
995 "movq 16(%0), %%mm2 \n\t"\
996 "movq 24(%0), %%mm3 \n\t"\
997 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
998 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
999 "addl %4, %1 \n\t"\
1000 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1001 \
1002 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1003 "addl %4, %1 \n\t"\
1004 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1005 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1006 "addl %4, %1 \n\t"\
1007 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1008 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1009 "addl %4, %1 \n\t"\
1010 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1011 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1012 "addl %4, %1 \n\t"\
1013 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1014 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1015 "addl %4, %1 \n\t"\
1016 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1017 \
1018 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1019 "addl %4, %1 \n\t" \
1020 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1021 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1022 \
1023 "addl $136, %0 \n\t"\
1024 "addl %8, %1 \n\t"\
1025 "decl %2 \n\t"\
1026 " jnz 1b \n\t"\
1027 "popl %2 \n\t"\
1028 "popl %1 \n\t"\
1029 "popl %0 \n\t"\
1030 \
1031 :: "r"(temp_ptr), "r"(dst), "r"(count),\
1032 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
1033 );\
826f429a
MN
1034}\
1035\
3178ee4c 1036void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
826f429a
MN
1037 uint64_t temp[9*4];\
1038 uint64_t *temp_ptr= temp;\
1039 int count= 9;\
1040\
1041 /*FIXME unroll */\
1042 asm volatile(\
1043 "pxor %%mm7, %%mm7 \n\t"\
1044 "1: \n\t"\
1045 "movq (%0), %%mm0 \n\t"\
1046 "movq (%0), %%mm1 \n\t"\
1047 "punpcklbw %%mm7, %%mm0 \n\t"\
1048 "punpckhbw %%mm7, %%mm1 \n\t"\
1049 "movq %%mm0, (%1) \n\t"\
1050 "movq %%mm1, 9*8(%1) \n\t"\
1051 "addl $8, %1 \n\t"\
1052 "addl %3, %0 \n\t"\
1053 "decl %2 \n\t"\
1054 " jnz 1b \n\t"\
1055 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1056 : "r" (srcStride)\
1057 );\
1058 \
1059 temp_ptr= temp;\
1060 count=2;\
1061 \
1062/*FIXME reorder for speed */\
1063 asm volatile(\
3643bd9c
MN
1064 "pushl %0 \n\t"\
1065 "pushl %1 \n\t"\
1066 "pushl %2 \n\t"\
826f429a
MN
1067 /*"pxor %%mm7, %%mm7 \n\t"*/\
1068 "1: \n\t"\
1069 "movq (%0), %%mm0 \n\t"\
1070 "movq 8(%0), %%mm1 \n\t"\
1071 "movq 16(%0), %%mm2 \n\t"\
1072 "movq 24(%0), %%mm3 \n\t"\
1073 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1074 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1075 "addl %4, %1 \n\t"\
1076 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1077 \
1078 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1079 "addl %4, %1 \n\t"\
1080 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1081 \
1082 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1083 "addl %4, %1 \n\t"\
1084 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1085 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1086 \
1087 "addl $72, %0 \n\t"\
1088 "addl %8, %1 \n\t"\
1089 "decl %2 \n\t"\
1090 " jnz 1b \n\t"\
3643bd9c
MN
1091 "popl %2 \n\t"\
1092 "popl %1 \n\t"\
1093 "popl %0 \n\t"\
826f429a 1094 \
3643bd9c
MN
1095 :: "r"(temp_ptr), "r"(dst), "r"(count),\
1096 "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\
826f429a 1097 );\
3178ee4c 1098}\
826f429a
MN
1099\
1100static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1101 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1102}\
1103\
1104static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1105 uint64_t temp[32];\
1106 uint8_t * const half= (uint8_t*)temp;\
1107 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1108 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1109}\
1110\
1111static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1112 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1113}\
1114\
1115static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1116 uint64_t temp[32];\
1117 uint8_t * const half= (uint8_t*)temp;\
1118 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1119 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1120}\
1121\
1122static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1123 uint64_t temp[32];\
1124 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1125 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1126 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1127}\
1128\
1129static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1130 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1131}\
1132\
1133static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1134 uint64_t temp[32];\
1135 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1136 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1137 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1138}\
1139static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1140 uint64_t half[8*2 + 8*2 + 18*2];\
1141 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1142 uint8_t * const halfV= ((uint8_t*)half);\
1143 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1144 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c
MN
1145 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1146 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1147 OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
1148}\
1149static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1150 uint64_t half[8*2 + 8*2 + 18*2];\
1151 uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1152 uint8_t * const halfV= ((uint8_t*)half);\
1153 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1154 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c
MN
1155 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1156 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1157 OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
1158}\
1159static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1160 uint64_t half[8*2 + 8*2 + 9*2];\
1161 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1162 uint8_t * const halfV= ((uint8_t*)half);\
1163 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1164 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c
MN
1165 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1166 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1167 OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
1168}\
1169static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1170 uint64_t half[8*2 + 8*2 + 9*2];\
1171 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1172 uint8_t * const halfV= ((uint8_t*)half);\
1173 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1174 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\
3178ee4c
MN
1175 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1176 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1177 OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
1178}\
1179static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1180 uint64_t half[8*2 + 9*2];\
1181 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1182 uint8_t * const halfHV= ((uint8_t*)half);\
1183 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1184 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1185 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1186}\
1187static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1188 uint64_t half[8*2 + 9*2];\
1189 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1190 uint8_t * const halfHV= ((uint8_t*)half);\
1191 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1192 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1193 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1194}\
1195static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1196 uint64_t half[8*2 + 8*2 + 9*2];\
1197 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1198 uint8_t * const halfV= ((uint8_t*)half);\
1199 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1200 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c
MN
1201 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1202 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1203 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1204}\
1205static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1206 uint64_t half[8*2 + 8*2 + 9*2];\
1207 uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1208 uint8_t * const halfV= ((uint8_t*)half);\
1209 uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1210 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c
MN
1211 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1212 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1213 OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1214}\
1215static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1216 uint64_t half[9*2];\
1217 uint8_t * const halfH= ((uint8_t*)half);\
1218 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1219 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1220}\
1221static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1222 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
826f429a
MN
1223}\
1224\
1225static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1226 uint64_t temp[32];\
1227 uint8_t * const half= (uint8_t*)temp;\
1228 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1229 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1230}\
1231\
1232static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1233 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1234}\
1235\
1236static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1237 uint64_t temp[32];\
1238 uint8_t * const half= (uint8_t*)temp;\
1239 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1240 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1241}\
1242\
1243static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1244 uint64_t temp[32];\
1245 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1246 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1247 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1248}\
1249\
1250static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1251 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1252}\
1253\
1254static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1255 uint64_t temp[32];\
1256 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1257 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1258 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1259}\
1260static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1261 uint64_t half[16*2 + 16*2 + 18*2];\
1262 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1263 uint8_t * const halfV= ((uint8_t*)half);\
1264 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1265 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c
MN
1266 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1267 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1268 OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
1269}\
1270static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1271 uint64_t half[16*2 + 16*2 + 18*2];\
1272 uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1273 uint8_t * const halfV= ((uint8_t*)half);\
1274 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1275 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c
MN
1276 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1277 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1278 OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
1279}\
1280static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1281 uint64_t half[16*2 + 16*2 + 17*2];\
1282 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1283 uint8_t * const halfV= ((uint8_t*)half);\
1284 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1285 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c
MN
1286 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1287 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1288 OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
1289}\
1290static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1291 uint64_t half[16*2 + 16*2 + 17*2];\
1292 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1293 uint8_t * const halfV= ((uint8_t*)half);\
1294 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1295 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\
3178ee4c
MN
1296 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1297 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1298 OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
1299}\
1300static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1301 uint64_t half[16*2 + 17*2];\
1302 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1303 uint8_t * const halfHV= ((uint8_t*)half);\
1304 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1305 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1306 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1307}\
1308static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1309 uint64_t half[16*2 + 17*2];\
1310 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1311 uint8_t * const halfHV= ((uint8_t*)half);\
1312 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1313 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1314 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1315}\
1316static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1317 uint64_t half[16*2 + 16*2 + 17*2];\
1318 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1319 uint8_t * const halfV= ((uint8_t*)half);\
1320 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1321 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c
MN
1322 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1323 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1324 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1325}\
1326static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1327 uint64_t half[16*2 + 16*2 + 17*2];\
1328 uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1329 uint8_t * const halfV= ((uint8_t*)half);\
1330 uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1331 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c
MN
1332 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1333 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1334 OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1335}\
1336static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1337 uint64_t half[17*2];\
1338 uint8_t * const halfH= ((uint8_t*)half);\
1339 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1341}
1342
1343
1344#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1345#define AVG_3DNOW_OP(a,b,temp, size) \
826f429a
MN
1346"mov" #size " " #b ", " #temp " \n\t"\
1347"pavgusb " #temp ", " #a " \n\t"\
1348"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1349#define AVG_MMX2_OP(a,b,temp, size) \
826f429a
MN
1350"mov" #size " " #b ", " #temp " \n\t"\
1351"pavgb " #temp ", " #a " \n\t"\
1352"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1353
1354QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1355QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1356QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1357QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1358QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1359QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1360QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1361QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a
MN
1362QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1363
61a4e8ae 1364#if 0
d6a4c0b1 1365static void just_return() { return; }
61a4e8ae 1366#endif
d6a4c0b1 1367
826f429a
MN
1368#define SET_QPEL_FUNC(postfix1, postfix2) \
1369 c->put_ ## postfix1 = put_ ## postfix2;\
1370 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1371 c->avg_ ## postfix1 = avg_ ## postfix2;
1372
eb4b3dd3 1373void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
1374{
1375 mm_flags = mm_support();
1565dabc
LB
1376#if 0
1377 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1378 if (mm_flags & MM_MMX)
1565dabc 1379 fprintf(stderr, " mmx");
de6d9b64 1380 if (mm_flags & MM_MMXEXT)
1565dabc 1381 fprintf(stderr, " mmxext");
de6d9b64 1382 if (mm_flags & MM_3DNOW)
1565dabc 1383 fprintf(stderr, " 3dnow");
de6d9b64 1384 if (mm_flags & MM_SSE)
1565dabc 1385 fprintf(stderr, " sse");
de6d9b64 1386 if (mm_flags & MM_SSE2)
1565dabc
LB
1387 fprintf(stderr, " sse2");
1388 fprintf(stderr, "\n");
de6d9b64
FB
1389#endif
1390
1391 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
1392 c->get_pixels = get_pixels_mmx;
1393 c->diff_pixels = diff_pixels_mmx;
1394 c->put_pixels_clamped = put_pixels_clamped_mmx;
1395 c->add_pixels_clamped = add_pixels_clamped_mmx;
1396 c->clear_blocks = clear_blocks_mmx;
1397 c->pix_sum = pix_sum16_mmx;
1398
1399 c->pix_abs16x16 = pix_abs16x16_mmx;
1400 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1401 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1402 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1403 c->pix_abs8x8 = pix_abs8x8_mmx;
1404 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1405 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1406 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
1407
1408 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1409 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1410 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1411 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1412
1413 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1414 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1415 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1416 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1417
1418 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1419 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1420 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1421 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1422
1423 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1424 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1425 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1426 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1427
1428 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1429 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1430 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1431 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1432
1433 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1434 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1435 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1436 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1437
1438 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1439 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1440 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1441 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1442
1443 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1444 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1445 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1446 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1447
11f18faf
MN
1448 c->add_bytes= add_bytes_mmx;
1449 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1450
1451 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1452 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1453
1454 c->sad[0]= sad16x16_mmx;
1455 c->sad[1]= sad8x8_mmx;
1456
de6d9b64 1457 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1458 c->pix_abs16x16 = pix_abs16x16_mmx2;
1459 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1460 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
1461 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
1462
1463 c->pix_abs8x8 = pix_abs8x8_mmx2;
1464 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
1465 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
1466 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
1467
1457ab52
MN
1468 c->sad[0]= sad16x16_mmx2;
1469 c->sad[1]= sad8x8_mmx2;
1470
eb4b3dd3
ZK
1471 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1472 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1473 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1474 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1475
1476 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1477 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1478 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1479 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1480
1481 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1482 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1483 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1484 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1485
1486 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1487 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1488 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1489 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3178ee4c 1490
826f429a
MN
1491 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1492 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1493 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1494 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1495 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1496 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1497 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1498 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1499 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1500 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1501 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1502 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1503 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1504 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1505 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1506 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1507 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1508 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1509 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1510 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1511 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1512 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1513 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1514 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1515 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1516 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1517 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1518 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1519 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1520 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1521 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1522 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
de6d9b64 1523 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
1524 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1525 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1526 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1527 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1528
1529 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1530 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1531 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1533
1534 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1535 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1536 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1537 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1538
1539 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1540 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1541 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1542 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
826f429a
MN
1543
1544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1546 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1547 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1548 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1549 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1550 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1551 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1552 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1553 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1554 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1555 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1556 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1557 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1558 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1559 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1562 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1563 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1564 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1565 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1566 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1567 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1568 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1569 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1570 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1571 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1572 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1573 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1574 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1575 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1576 }
1577 }
d6a4c0b1
ZK
1578
1579#if 0
1580 // for speed testing
1581 get_pixels = just_return;
1582 put_pixels_clamped = just_return;
1583 add_pixels_clamped = just_return;
1584
1585 pix_abs16x16 = just_return;
1586 pix_abs16x16_x2 = just_return;
1587 pix_abs16x16_y2 = just_return;
1588 pix_abs16x16_xy2 = just_return;
1589
1590 put_pixels_tab[0] = just_return;
1591 put_pixels_tab[1] = just_return;
1592 put_pixels_tab[2] = just_return;
1593 put_pixels_tab[3] = just_return;
1594
1595 put_no_rnd_pixels_tab[0] = just_return;
1596 put_no_rnd_pixels_tab[1] = just_return;
1597 put_no_rnd_pixels_tab[2] = just_return;
1598 put_no_rnd_pixels_tab[3] = just_return;
1599
1600 avg_pixels_tab[0] = just_return;
1601 avg_pixels_tab[1] = just_return;
1602 avg_pixels_tab[2] = just_return;
1603 avg_pixels_tab[3] = just_return;
1604
1605 avg_no_rnd_pixels_tab[0] = just_return;
1606 avg_no_rnd_pixels_tab[1] = just_return;
1607 avg_no_rnd_pixels_tab[2] = just_return;
1608 avg_no_rnd_pixels_tab[3] = just_return;
1609
d6a4c0b1
ZK
1610 //av_fdct = just_return;
1611 //ff_idct = just_return;
1612#endif
de6d9b64 1613}
4f12a497
FB
1614
1615/* remove any non bit exact operation (testing purpose). NOTE that
1616 this function should be kept as small as possible because it is
1617 always difficult to test automatically non bit exact cases. */
eb4b3dd3 1618void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1619{
1620 if (mm_flags & MM_MMX) {
b3184779 1621 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1622 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1623 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1624 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1625 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1626 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1627 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 1628
b3184779 1629 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1630 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
1631 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
1632 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
1633 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
1634 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
1635 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
1636 }
1637 }
1638}