gcc 3.2.2 -O3 bug workaround (older gcc versions are very likely affected too, but this was not checked)
[libav.git] / libavcodec / i386 / dsputil_mmx.c
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"

int mm_flags; /* multimedia extension flags */
/* FIXME use them in static form */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);

int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);

int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);

/* pixel operations */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it is better to synthesize the constants in a
// register than to access them through memory
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

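#if 0
/* Scalar sketch of the constants the PIC variants above synthesize in a
 * register (illustrative only, not compiled; these helper names are
 * hypothetical): */
static uint64_t movq_bone_ref(void)
{
    uint64_t r = ~0ULL;                 /* pcmpeqd: all bits set        */
    r &= 0x0001000100010001ULL;         /* psrlw $15: 0x0001 per word   */
    r |= r << 8;                        /* packuswb: 0x01 in every byte */
    return r;                           /* == mm_bone                   */
}
static uint64_t movq_wtwo_ref(void)
{
    uint64_t r = ~0ULL;                 /* pcmpeqd: all bits set        */
    r &= 0x0001000100010001ULL;         /* psrlw $15: 0x0001 per word   */
    return r << 1;                      /* psllw $1: 0x0002 per word    */
}
#endif
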
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

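#if 0
/* Scalar reference for the byte-averaging trick above (illustrative only,
 * not compiled). Since a+b == 2*(a|b) - (a^b) == 2*(a&b) + (a^b), the
 * average can be formed without unpacking bytes to words; the 0xfe mask
 * keeps the quadword psrlq from leaking bits between neighbouring bytes. */
static inline UINT8 avgb_ref(UINT8 a, UINT8 b)
{
    return (a | b) - (((a ^ b) & 0xfe) >> 1);  /* rounding: (a+b+1)>>1 */
}
static inline UINT8 avgb_no_rnd_ref(UINT8 a, UINT8 b)
{
    return (a & b) + (((a ^ b) & 0xfe) >> 1);  /* no rounding: (a+b)>>1 */
}
#endif
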
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax          \n\t"
        "pxor %%mm7, %%mm7          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%0, %2), %%mm2       \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "movq %%mm0, (%1, %%eax)    \n\t"
        "movq %%mm1, 8(%1, %%eax)   \n\t"
        "movq %%mm2, 16(%1, %%eax)  \n\t"
        "movq %%mm3, 24(%1, %%eax)  \n\t"
        "addl %3, %0                \n\t"
        "addl $32, %%eax            \n\t"
        "js 1b                      \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128, %%eax          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%1), %%mm2           \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "psubw %%mm2, %%mm0         \n\t"
        "psubw %%mm3, %%mm1         \n\t"
        "movq %%mm0, (%2, %%eax)    \n\t"
        "movq %%mm1, 8(%2, %%eax)   \n\t"
        "addl %3, %0                \n\t"
        "addl %3, %1                \n\t"
        "addl $16, %%eax            \n\t"
        "jnz 1b                     \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}

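#if 0
/* Scalar sketch of diff_pixels_mmx above (illustrative only, not
 * compiled): widen two 8x8 byte blocks to 16-bit and store their
 * difference; get_pixels_mmx is the same pattern with a single source. */
static void diff_pixels_ref(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}
#endif
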
void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq   %3, %%mm0       \n\t"
        "movq   8%3, %%mm1      \n\t"
        "movq   16%3, %%mm2     \n\t"
        "movq   24%3, %%mm3     \n\t"
        "movq   32%3, %%mm4     \n\t"
        "movq   40%3, %%mm5     \n\t"
        "movq   48%3, %%mm6     \n\t"
        "movq   56%3, %%mm7     \n\t"
        "packuswb %%mm1, %%mm0  \n\t"
        "packuswb %%mm3, %%mm2  \n\t"
        "packuswb %%mm5, %%mm4  \n\t"
        "packuswb %%mm7, %%mm6  \n\t"
        "movq   %%mm0, (%0)     \n\t"
        "movq   %%mm2, (%0, %1) \n\t"
        "movq   %%mm4, (%0, %1, 2)\n\t"
        "movq   %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, thus the "r" constraint
    __asm __volatile(
        "movq   (%3), %%mm0     \n\t"
        "movq   8(%3), %%mm1    \n\t"
        "movq   16(%3), %%mm2   \n\t"
        "movq   24(%3), %%mm3   \n\t"
        "movq   32(%3), %%mm4   \n\t"
        "movq   40(%3), %%mm5   \n\t"
        "movq   48(%3), %%mm6   \n\t"
        "movq   56(%3), %%mm7   \n\t"
        "packuswb %%mm1, %%mm0  \n\t"
        "packuswb %%mm3, %%mm2  \n\t"
        "packuswb %%mm5, %%mm4  \n\t"
        "packuswb %%mm7, %%mm6  \n\t"
        "movq   %%mm0, (%0)     \n\t"
        "movq   %%mm2, (%0, %1) \n\t"
        "movq   %%mm4, (%0, %1, 2)\n\t"
        "movq   %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
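
#if 0
/* Scalar sketch of put_pixels_clamped_mmx above (illustrative only, not
 * compiled): packuswb saturates each signed 16-bit coefficient to 0..255. */
static void put_pixels_clamped_ref(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}
#endif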

void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq   (%2), %%mm0     \n\t"
            "movq   8(%2), %%mm1    \n\t"
            "movq   16(%2), %%mm2   \n\t"
            "movq   24(%2), %%mm3   \n\t"
            "movq   %0, %%mm4       \n\t"
            "movq   %1, %%mm6       \n\t"
            "movq   %%mm4, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0    \n\t"
            "paddsw %%mm5, %%mm1    \n\t"
            "movq   %%mm6, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2    \n\t"
            "paddsw %%mm5, %%mm3    \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq   %%mm0, %0       \n\t"
            "movq   %%mm2, %1       \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128*6, %%eax        \n\t"
        "1:                         \n\t"
        "movq %%mm7, (%0, %%eax)    \n\t"
        "movq %%mm7, 8(%0, %%eax)   \n\t"
        "movq %%mm7, 16(%0, %%eax)  \n\t"
        "movq %%mm7, 24(%0, %%eax)  \n\t"
        "addl $32, %%eax            \n\t"
        " js 1b                     \n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}
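
#if 0
/* Equivalent scalar form of clear_blocks_mmx (illustrative only, not
 * compiled; assumes <string.h> for memset): zero six 8x8 DCTELEM blocks. */
static void clear_blocks_ref(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(DCTELEM));
}
#endif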

static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "pxor %%mm6, %%mm6          \n\t"
        "1:                         \n\t"
        "movq (%2, %1), %%mm0       \n\t"
        "movq (%2, %1), %%mm1       \n\t"
        "movq 8(%2, %1), %%mm2      \n\t"
        "movq 8(%2, %1), %%mm3      \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "paddw %%mm0, %%mm1         \n\t"
        "paddw %%mm2, %%mm3         \n\t"
        "paddw %%mm1, %%mm3         \n\t"
        "paddw %%mm3, %%mm6         \n\t"
        "addl %3, %1                \n\t"
        " js 1b                     \n\t"
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $32, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $16, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movd %%mm6, %0             \n\t"
        "andl $0xFFFF, %0           \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (line_size)
    );

    return sum;
}
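
#if 0
/* Scalar reference for pix_sum16_mmx (illustrative only, not compiled).
 * A 16x16 block of bytes sums to at most 255*256 < 65536, which is why
 * the MMX version can accumulate safely in packed 16-bit words. */
static int pix_sum16_ref(UINT8 *pix, int line_size)
{
    int i, j, sum = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            sum += pix[j];
        pix += line_size;
    }
    return sum;
}
#endif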

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:                         \n\t"
        "movq  (%1, %0), %%mm0      \n\t"
        "movq  (%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%2, %0)       \n\t"
        "movq 8(%1, %0), %%mm0      \n\t"
        "movq 8(%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%2, %0)      \n\t"
        "addl $16, %0               \n\t"
        "cmpl %3, %0                \n\t"
        " jb 1b                     \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:                         \n\t"
        "movq  (%2, %0), %%mm0      \n\t"
        "movq  (%1, %0), %%mm1      \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%3, %0)       \n\t"
        "movq 8(%2, %0), %%mm0      \n\t"
        "movq 8(%1, %0), %%mm1      \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%3, %0)      \n\t"
        "addl $16, %0               \n\t"
        "cmpl %4, %0                \n\t"
        " jb 1b                     \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a "           \n\t"\
    "paddw " #b ", " #b "           \n\t"\
    "psubw " #a ", " #b "           \n\t"

#define HADAMARD48\
        LBUTTERFLY(%%mm0, %%mm1)\
        LBUTTERFLY(%%mm2, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm5)\
        LBUTTERFLY(%%mm6, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm2)\
        LBUTTERFLY(%%mm1, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm6)\
        LBUTTERFLY(%%mm5, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm4)\
        LBUTTERFLY(%%mm1, %%mm5)\
        LBUTTERFLY(%%mm2, %%mm6)\
        LBUTTERFLY(%%mm3, %%mm7)

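#if 0
/* Scalar sketch of one LBUTTERFLY stage (illustrative only, not compiled):
 * paddw/paddw/psubw replace (a,b) with (a+b, b-a), a 2-point Hadamard
 * butterfly; HADAMARD48 applies three such stages across eight registers. */
static inline void lbutterfly_ref(int *a, int *b)
{
    int s = *a + *b;   /* paddw b,a            -> a+b                 */
    int d = *b - *a;   /* paddw b,b; psubw a,b -> 2b - (a+b) == b - a */
    *a = s;
    *b = d;
}
#endif
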
#define MMABS(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"\
    "paddusw " #a ", " #sum "       \n\t"


#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "            \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "          \n\t"\
    "movq "#o"+16(%1), " #b "       \n\t"\
    "movq "#o"+32(%1), " #c "       \n\t"\
    "movq "#o"+48(%1), " #d "       \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)            \n\t"\
    "movq "#b", "#o"+16(%1)         \n\t"\
    "movq "#c", "#o"+32(%1)         \n\t"\
    "movq "#d", "#o"+48(%1)         \n\t"\

static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)        \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7        \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)        \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7        \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5          \n\t"//FIXME remove
        "movq %%mm6, %%mm7          \n\t"
        "movq %%mm0, %%mm6          \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)         \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1         \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)         \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)           \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1           \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1         \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1          \n\t"
        "psrlq $32, %%mm0           \n\t"
        "paddusw %%mm1, %%mm0       \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "psrlq $16, %%mm0           \n\t"
        "paddusw %%mm1, %%mm0       \n\t"
        "movd %%mm0, %0             \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "             \n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4              \n\t" /* 20x1 */\
        "movq "#in7", " #m3 "               \n\t" /* d */\
        "movq "#in0", %%mm5                 \n\t" /* D */\
        "paddw " #m3 ", %%mm5               \n\t" /* x4 */\
        "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5                 \n\t" /* C */\
        "movq "#in2", %%mm6                 \n\t" /* B */\
        "paddw " #m6 ", %%mm5               \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6               \n\t" /* x2 */\
        "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4              \n\t" /* x2 */\
        "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5                    \n\t"\
        "packuswb %%mm5, %%mm5              \n\t"\
        OP(%%mm5, out, %%mm7, d)

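#if 0
/* Scalar sketch of the four-tap filter computed by QPEL_V_LOW above
 * (illustrative only, not compiled; the helper name is hypothetical):
 *   out = clip8((20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5)
 * where x1..x4 are the symmetric pair sums around the half-pel position,
 * matching the scalar 3dnow fallbacks further below. */
static inline UINT8 qpel_lowpass_ref(int x1, int x2, int x3, int x4, int rounder)
{
    int v = (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (UINT8)v;
}
#endif
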
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1), %%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "addl %3, %0                      \n\t"\
        "addl %4, %1                      \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+r"(src), "+r"(dst), "+g"(h)\
        : "r"(srcStride), "r"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0         \n\t"\
            "movq 24(%0), %%mm1         \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "addl %3, %0                      \n\t"\
        "addl %4, %1                      \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+r"(src), "+r"(dst), "+r"(h)\
        : "r"(srcStride), "r"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, (%1, %4)           \n\t"\
        "movq %%mm3, (%1, %5)           \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "addl $136, %0                  \n\t"\
        "addl %6, %1                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+r"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
    );\
}\
\
void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*4];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "addl $72, %0                   \n\t"\
        "addl %6, %1                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
    );\
}\
\
static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[8*2 + 8*2 + 9*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[9*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 18*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[16*2 + 16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
    uint8_t * const halfV= ((uint8_t*)half);\
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}


#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

void dsputil_init_mmx(DSPContext* c, unsigned mask)
{
    mm_flags = mm_support();
#if 0
    fprintf(stderr, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        fprintf(stderr, " mmx");
    if (mm_flags & MM_MMXEXT)
        fprintf(stderr, " mmxext");
    if (mm_flags & MM_3DNOW)
        fprintf(stderr, " 3dnow");
    if (mm_flags & MM_SSE)
        fprintf(stderr, " sse");
    if (mm_flags & MM_SSE2)
        fprintf(stderr, " sse2");
    fprintf(stderr, "\n");
#endif

    if (mm_flags & MM_MMX) {
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->pix_abs16x16     = pix_abs16x16_mmx;
        c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
        c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
        c->pix_abs8x8     = pix_abs8x8_mmx;
        c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
        c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
        c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;

        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;

        c->add_bytes= add_bytes_mmx;
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->sad[0]= sad16x16_mmx;
        c->sad[1]= sad8x8_mmx;

        if (mm_flags & MM_MMXEXT) {
            c->pix_abs16x16     = pix_abs16x16_mmx2;
            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx2;
            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;

            c->pix_abs8x8     = pix_abs8x8_mmx2;
            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx2;
            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;

            c->sad[0] = sad16x16_mmx2;
            c->sad[1] = sad8x8_mmx2;

            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;

#if 1
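            /* qpel_pixels_tab appears to be indexed [size][dy*4 + dx]:
               entry dy*4+dx maps to mcXY with X = dx quarter-pels
               horizontally and Y = dy vertically (inferred from the mc
               numbering below). */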
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
    } else if (mm_flags & MM_3DNOW) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;

        SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
        }
    }

#if 0
    // for speed testing
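    // NOTE: this disabled block still uses the old global function-pointer
    // names from before the DSPContext conversion; it would need c->
    // prefixes (and the _tab[][] layout above) to compile if re-enabled.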
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}

/* Remove any non-bit-exact operations (for testing purposes). NOTE: this
   function should be kept as small as possible because it is always
   difficult to automatically test non-bit-exact cases. */
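/* Background (an assumption based on the instructions involved): the MMX2
   and 3DNOW paths rely on pavgb/pavgusb, which round upward when averaging,
   while the plain MMX versions restored below match the reference C
   rounding exactly. */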
void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
{
    if (mm_flags & MM_MMX) {
        /* MMX2 & 3DNOW */
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        if (mm_flags & MM_MMXEXT) {
            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
        }
    }
}
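
/* Hypothetical usage sketch (no such caller exists in this file): after the
   generic C initialization has filled in the remaining pointers, a
   regression test could install the SIMD routines and then strip the
   approximate ones again:

       DSPContext ctx;
       dsputil_init_mmx(&ctx, 0);
       dsputil_set_bit_exact_mmx(&ctx, 0);

   (the mask argument is currently unused by both functions) */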