/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"
#include "../simple_idct.h"

extern const uint8_t ff_h263_loop_filter_strength[32];

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it is better to synthesize these constants in
// registers rather than to load them from memory
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
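
/* Illustrative note (added, not in the original): in PIC code, loading a
 * global constant would go through GOT-relative addressing, so the PIC
 * variants above build the constants in registers instead:
 *
 *   pcmpeqd reg,reg   ->  0xFFFFFFFFFFFFFFFF   (all ones)
 *   psrlw   $15,reg   ->  0x0001000100010001   (word ones, MOVQ_WONE)
 *   packuswb reg,reg  ->  0x0101010101010101   (MOVQ_BONE)
 *   psllw   $1,reg    ->  0x0002000200020002   (MOVQ_WTWO)
 */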

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

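/* A scalar sketch of the byte-averaging identity used by PAVGB_MMX* above
 * and PAVGBP_MMX* below (added for clarity). For bytes a and b:
 *
 *   (a + b)     >> 1  ==  (a & b) + (((a ^ b) & 0xFE) >> 1)   // no rounding
 *   (a + b + 1) >> 1  ==  (a | b) - (((a ^ b) & 0xFE) >> 1)   // rounding
 *
 * The 0xFE mask (regfe/mm6 = 0xFEFE...FE) is needed because psrlq shifts
 * across the whole 64-bit quadword, so each byte's low bit must be cleared
 * before the shift to keep it from leaking into the neighbouring byte.
 */
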
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

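/* Note added for the reader: dsputil_mmx_rnd.h is included twice above with
 * different DEF/SET_RND/PAVGBP/PAVGB bindings, so the same template expands
 * into both the rounding and the no-rounding function families. The
 * dsputil_mmx_avg.h includes below play the same trick, binding PAVGB to
 * either "pavgusb" (3DNow!) or "pavgb" (MMX2).
 */
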
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* on Athlons, PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax          \n\t"
        "pxor %%mm7, %%mm7          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%0, %2), %%mm2       \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "movq %%mm0, (%1, %%eax)    \n\t"
        "movq %%mm1, 8(%1, %%eax)   \n\t"
        "movq %%mm2, 16(%1, %%eax)  \n\t"
        "movq %%mm3, 24(%1, %%eax)  \n\t"
        "addl %3, %0                \n\t"
        "addl $32, %%eax            \n\t"
        "js 1b                      \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
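
/* For reference, the scalar semantics of get_pixels_mmx (the plain C
 * version; added here only as documentation):
 *
 *   for (i = 0; i < 8; i++) {
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = pixels[j];   // widen bytes to 16-bit DCTELEMs
 *       pixels += line_size;
 *   }
 */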

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128, %%eax          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%1), %%mm2           \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "psubw %%mm2, %%mm0         \n\t"
        "psubw %%mm3, %%mm1         \n\t"
        "movq %%mm0, (%2, %%eax)    \n\t"
        "movq %%mm1, 8(%2, %%eax)   \n\t"
        "addl %3, %0                \n\t"
        "addl %3, %1                \n\t"
        "addl $16, %%eax            \n\t"
        "jnz 1b                     \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq %3, %%mm0\n\t"
        "movq 8%3, %%mm1\n\t"
        "movq 16%3, %%mm2\n\t"
        "movq 24%3, %%mm3\n\t"
        "movq 32%3, %%mm4\n\t"
        "movq 40%3, %%mm5\n\t"
        "movq 48%3, %%mm6\n\t"
        "movq 56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // an exact copy of the code above would make the compiler generate
    // some very strange code, hence the "r" constraint for the block
    // pointer in this second half
    __asm __volatile(
        "movq (%3), %%mm0\n\t"
        "movq 8(%3), %%mm1\n\t"
        "movq 16(%3), %%mm2\n\t"
        "movq 24(%3), %%mm3\n\t"
        "movq 32(%3), %%mm4\n\t"
        "movq 40(%3), %%mm5\n\t"
        "movq 48(%3), %%mm6\n\t"
        "movq 56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0\n\t"
            "movq 8(%2), %%mm1\n\t"
            "movq 16(%2), %%mm2\n\t"
            "movq 24(%2), %%mm3\n\t"
            "movq %0, %%mm4\n\t"
            "movq %1, %%mm6\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm4, %%mm0\n\t"
            "paddsw %%mm5, %%mm1\n\t"
            "movq %%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm6, %%mm2\n\t"
            "paddsw %%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "movq %%mm0, %0\n\t"
            "movq %%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax        \n\t"
        ".balign 8                  \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "addl %%eax, %1             \n\t"
        "addl %%eax, %2             \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128*6, %%eax        \n\t"
        "1:                         \n\t"
        "movq %%mm7, (%0, %%eax)    \n\t"
        "movq %%mm7, 8(%0, %%eax)   \n\t"
        "movq %%mm7, 16(%0, %%eax)  \n\t"
        "movq %%mm7, 24(%0, %%eax)  \n\t"
        "addl $32, %%eax            \n\t"
        " js 1b                     \n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "pxor %%mm6, %%mm6          \n\t"
        "1:                         \n\t"
        "movq (%2, %1), %%mm0       \n\t"
        "movq (%2, %1), %%mm1       \n\t"
        "movq 8(%2, %1), %%mm2      \n\t"
        "movq 8(%2, %1), %%mm3      \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "paddw %%mm0, %%mm1         \n\t"
        "paddw %%mm2, %%mm3         \n\t"
        "paddw %%mm1, %%mm3         \n\t"
        "paddw %%mm3, %%mm6         \n\t"
        "addl %3, %1                \n\t"
        " js 1b                     \n\t"
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $32, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movq %%mm6, %%mm5          \n\t"
        "psrlq $16, %%mm6           \n\t"
        "paddw %%mm5, %%mm6         \n\t"
        "movd %%mm6, %0             \n\t"
        "andl $0xFFFF, %0           \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (line_size)
    );

    return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:                         \n\t"
        "movq (%1, %0), %%mm0       \n\t"
        "movq (%2, %0), %%mm1       \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%2, %0)       \n\t"
        "movq 8(%1, %0), %%mm0      \n\t"
        "movq 8(%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%2, %0)      \n\t"
        "addl $16, %0               \n\t"
        "cmpl %3, %0                \n\t"
        " jb 1b                     \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
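
/* Note added for clarity: the asm loop above handles 16 bytes per iteration
 * while i < w-15, and the scalar loop then finishes the remaining 0..15
 * bytes, so add_bytes_mmx works for any width w.
 */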

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movq %0, %%mm0             \n\t"
        "movq %0, %%mm1             \n\t"
        "movq %3, %%mm2             \n\t"
        "movq %3, %%mm3             \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "psubw %%mm2, %%mm0         \n\t"
        "psubw %%mm3, %%mm1         \n\t"
        "movq %1, %%mm2             \n\t"
        "movq %1, %%mm3             \n\t"
        "movq %2, %%mm4             \n\t"
        "movq %2, %%mm5             \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "punpcklbw %%mm7, %%mm4     \n\t"
        "punpckhbw %%mm7, %%mm5     \n\t"
        "psubw %%mm2, %%mm4         \n\t"
        "psubw %%mm3, %%mm5         \n\t"
        "psllw $2, %%mm4            \n\t"
        "psllw $2, %%mm5            \n\t"
        "paddw %%mm0, %%mm4         \n\t"
        "paddw %%mm1, %%mm5         \n\t"
        "pxor %%mm6, %%mm6          \n\t"
        "pcmpgtw %%mm4, %%mm6       \n\t"
        "pcmpgtw %%mm5, %%mm7       \n\t"
        "pxor %%mm6, %%mm4          \n\t"
        "pxor %%mm7, %%mm5          \n\t"
        "psubw %%mm6, %%mm4         \n\t"
        "psubw %%mm7, %%mm5         \n\t"
        "psrlw $3, %%mm4            \n\t"
        "psrlw $3, %%mm5            \n\t"
        "packuswb %%mm5, %%mm4      \n\t" //abs(d)
        "packsswb %%mm7, %%mm6      \n\t" //sign(d)
        "pxor %%mm7, %%mm7          \n\t"
        "movd %4, %%mm2             \n\t"
        "punpcklbw %%mm2, %%mm2     \n\t"
        "punpcklbw %%mm2, %%mm2     \n\t"
        "punpcklbw %%mm2, %%mm2     \n\t" //2*strength
        "psubusb %%mm4, %%mm2       \n\t" // S(2*strength - abs(d))
        "movq %%mm2, %%mm3          \n\t" // S(2*strength - abs(d))
        "psubusb %%mm4, %%mm3       \n\t" // S(S(2*strength - abs(d)) - abs(d))
        "psubb %%mm3, %%mm2         \n\t" // MIN(abs(d), S(2*strength - abs(d)))
        "movq %1, %%mm3             \n\t"
        "movq %2, %%mm4             \n\t"
        "pxor %%mm6, %%mm3          \n\t"
        "pxor %%mm6, %%mm4          \n\t"
        "paddusb %%mm2, %%mm3       \n\t"
        "psubusb %%mm2, %%mm4       \n\t"
        "pxor %%mm6, %%mm3          \n\t"
        "pxor %%mm6, %%mm4          \n\t"
        "movq %%mm3, %1             \n\t"
        "movq %%mm4, %2             \n\t"
        "paddusb %%mm2, %%mm2       \n\t"
        "packsswb %%mm1, %%mm0      \n\t"
        "pcmpgtb %%mm0, %%mm7       \n\t"
        "pxor %%mm7, %%mm0          \n\t"
        "psubb %%mm7, %%mm0         \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "psubusb %%mm2, %%mm0       \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "pand %5, %%mm1             \n\t"
        "psrlw $2, %%mm1            \n\t"
        "pxor %%mm7, %%mm1          \n\t"
        "psubb %%mm7, %%mm1         \n\t"
        "movq %0, %%mm3             \n\t"
        "movq %3, %%mm4             \n\t"
        "psubb %%mm1, %%mm3         \n\t"
        "paddb %%mm1, %%mm4         \n\t"
        "movq %%mm3, %0             \n\t"
        "movq %%mm4, %3             \n\t"

        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}
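
/* Rough scalar outline of the loop filter above, added as a reading aid
 * (a paraphrase, not a drop-in replacement). With p0..p3 being the four
 * rows around the block edge:
 *
 *   d  = (p0 - p3 + 4*(p2 - p1)) / 8;
 *   d1 = sign(d) * MIN(|d|, MAX(0, 2*strength - |d|));   // tent clamp
 *   p1 += d1;  p2 -= d1;                                 // saturated
 *   d2 = clip((p0 - p3)/4, -|d1|/2, |d1|/2);
 *   p0 -= d2;  p3 += d2;
 */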

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                            pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "addl %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
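
/* Scalar meaning of pix_norm1_mmx, added for reference: the sum of squares
 * over a 16x16 block,
 *
 *   for (i = 0; i < 16; i++, pix += line_size)
 *       for (j = 0; j < 16; j++)
 *           tmp += pix[j] * pix[j];
 */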

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get the absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "addl %3,%0\n"
        "addl %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "%ecx");
    return tmp;
}
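
/* Scalar meaning of sse16_mmx, added for reference: the sum of squared
 * differences over a 16x16 block,
 *
 *   for (i = 0; i < 16; i++, pix1 += line_size, pix2 += line_size)
 *       for (j = 0; j < 16; j++)
 *           tmp += (pix1[j] - pix2[j]) * (pix1[j] - pix2[j]);
 *
 * The psubusb/por pair above computes |pix1 - pix2| without a signed
 * subtract: one of the two saturating subtractions is zero, the other
 * holds the magnitude.
 */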

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:                         \n\t"
        "movq (%2, %0), %%mm0       \n\t"
        "movq (%1, %0), %%mm1       \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%3, %0)       \n\t"
        "movq 8(%2, %0), %%mm0      \n\t"
        "movq 8(%1, %0), %%mm1      \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%3, %0)      \n\t"
        "addl $16, %0               \n\t"
        "cmpl %4, %0                \n\t"
        " jb 1b                     \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                         \n\t"
        "movq -1(%1, %0), %%mm0     \n\t" // LT
        "movq (%1, %0), %%mm1       \n\t" // T
        "movq -1(%2, %0), %%mm2     \n\t" // L
        "movq (%2, %0), %%mm3       \n\t" // X
        "movq %%mm2, %%mm4          \n\t" // L
        "psubb %%mm0, %%mm2         \n\t"
        "paddb %%mm1, %%mm2         \n\t" // L + T - LT
        "movq %%mm4, %%mm5          \n\t" // L
        "pmaxub %%mm1, %%mm4        \n\t" // max(T, L)
        "pminub %%mm5, %%mm1        \n\t" // min(T, L)
        "pminub %%mm2, %%mm4        \n\t"
        "pmaxub %%mm1, %%mm4        \n\t"
        "psubb %%mm4, %%mm3         \n\t" // dst - pred
        "movq %%mm3, (%3, %0)       \n\t"
        "addl $8, %0                \n\t"
        "cmpl %4, %0                \n\t"
        " jb 1b                     \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left = src2[w-1];
}
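
/* Note added for the reader: this is the HuffYUV median predictor,
 * pred = median(L, T, L + T - LT). The median of three values can be
 * computed branchlessly per byte as
 *
 *   median(a, b, c) = max(min(a, b), min(max(a, b), c))
 *
 * which is exactly what the pminub/pmaxub sequence above implements.
 */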

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "             \n\t"\
    "paddw " #b2 ", " #a2 "             \n\t"\
    "paddw " #b1 ", " #b1 "             \n\t"\
    "paddw " #b2 ", " #b2 "             \n\t"\
    "psubw " #a1 ", " #b1 "             \n\t"\
    "psubw " #a2 ", " #b2 "             \n\t"
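
/* Added note: LBUTTERFLY2 performs two butterflies at once,
 * (a, b) -> (a + b, b - a), computed as a += b; b += b; b -= a,
 * i.e. b becomes 2*b_old - (a_old + b_old) = b_old - a_old.
 */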

#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

#define MMABS(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "             \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "               \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "             \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "               \n\t"\
    "paddusw " #a ", " #sum "           \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "               \n\t"\
    "pmaxsw " #z ", " #a "              \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "               \n\t"\
    "pmaxsw " #z ", " #a "              \n\t"\
    "paddusw " #a ", " #sum "           \n\t"

#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "                \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "       \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "       \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "              \n\t"\
    "movq "#o"+16(%1), " #b "           \n\t"\
    "movq "#o"+32(%1), " #c "           \n\t"\
    "movq "#o"+48(%1), " #d "           \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)                \n\t"\
    "movq "#b", "#o"+16(%1)             \n\t"\
    "movq "#c", "#o"+32(%1)             \n\t"\
    "movq "#d", "#o"+48(%1)             \n\t"\

static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t"//FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1              \n\t"
        "psrlq $32, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "psrlq $16, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
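
/* Added note: this is a SATD-style metric. diff_pixels_mmx leaves an 8x8
 * block of 16-bit differences in temp; the code then runs Hadamard
 * butterflies over the rows, transposes 4x4 quadrants, transforms again,
 * and returns the saturated sum of absolute transformed coefficients
 * (accumulated in one word per lane, hence the final sum & 0xFFFF).
 */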

static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t"//FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1              \n\t"
        "psrlq $32, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "psrlq $16, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}


WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "             \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4              \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "               \n\t" /* d */\
    "movq "#in0", %%mm5                 \n\t" /* D */\
    "paddw " #m3 ", %%mm5               \n\t" /* x4 */\
    "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5                 \n\t" /* C */\
    "movq "#in2", %%mm6                 \n\t" /* B */\
    "paddw " #m6 ", %%mm5               \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6               \n\t" /* x2 */\
    "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4              \n\t" /* 20x1 - x4 + rnd */\
    "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                    \n\t"\
    "packuswb %%mm5, %%mm5              \n\t"\
    OP(%%mm5, out, %%mm7, d)

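/* Added note: QPEL_V_LOW evaluates one row of the MPEG-4 half-pel lowpass
 * filter, (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5, where x1..x4 are the
 * symmetric tap-pair sums around the interpolated position. The scalar
 * form of the same filter is spelled out in the temp[] expressions of the
 * _3dnow fallback functions below.
 */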
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1), %%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "addl %3, %0                      \n\t"\
        "addl %4, %1                      \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0         \n\t"\
            "movq 24(%0), %%mm1         \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "addl %3, %0                      \n\t"\
        "addl %4, %1                      \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "addl $136, %0                  \n\t"\
        "addl %6, %1                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*4];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "addl $8, %1                    \n\t"\
        "addl %3, %0                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" (srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7            \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "addl %4, %1                    \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "addl $72, %0                   \n\t"\
        "addl %6, %1                    \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
        : "memory"\
    );\
}\
826f429a 1384\
0c1a9eda 1385static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1386 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1387}\
1388\
0c1a9eda 1389static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1390 uint64_t temp[8];\
826f429a
MN
1391 uint8_t * const half= (uint8_t*)temp;\
1392 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1393 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1394}\
1395\
0c1a9eda 1396static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1397 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1398}\
1399\
0c1a9eda 1400static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1401 uint64_t temp[8];\
826f429a
MN
1402 uint8_t * const half= (uint8_t*)temp;\
1403 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1404 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1405}\
1406\
0c1a9eda 1407static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1408 uint64_t temp[8];\
826f429a 1409 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1410 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1411 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1412}\
1413\
0c1a9eda 1414static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1415 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1416}\
1417\
0c1a9eda 1418static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1419 uint64_t temp[8];\
826f429a 1420 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1421 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1422 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1423}\
0c1a9eda 1424static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1425 uint64_t half[8 + 9];\
1426 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1427 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1428 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1429 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1430 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1431 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1432}\
0c1a9eda 1433static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1434 uint64_t half[8 + 9];\
1435 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1436 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1437 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1438 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1439 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1440 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1441}\
0c1a9eda 1442static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1443 uint64_t half[8 + 9];\
1444 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1445 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1446 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1447 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1448 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1449 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1450}\
0c1a9eda 1451static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1452 uint64_t half[8 + 9];\
1453 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1454 uint8_t * const halfHV= ((uint8_t*)half);\
1455 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1456 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1457 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1458 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1459}\
0c1a9eda 1460static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1461 uint64_t half[8 + 9];\
826f429a
MN
1462 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1463 uint8_t * const halfHV= ((uint8_t*)half);\
1464 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1465 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1466 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1467}\
0c1a9eda 1468static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1469 uint64_t half[8 + 9];\
826f429a
MN
1470 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1471 uint8_t * const halfHV= ((uint8_t*)half);\
1472 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1473 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1474 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1475}\
0c1a9eda 1476static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1477 uint64_t half[8 + 9];\
1478 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1479 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1480 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1481 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1482}\
0c1a9eda 1483static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1484 uint64_t half[8 + 9];\
1485 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1486 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1487 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1488 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1489}\
0c1a9eda 1490static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1491 uint64_t half[9];\
826f429a
MN
1492 uint8_t * const halfH= ((uint8_t*)half);\
1493 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1494 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1495}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}

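/* Illustrative sketch, not part of the original code: the mcXY suffixes
   above encode the quarter-pel sub-position (X horizontal, Y vertical, in
   quarter-pel units). mc00 is a plain copy, mc20 a half-pel horizontal
   lowpass, and mc10/mc30 average the lowpassed plane with the unfiltered
   pixels to reach quarter-pel accuracy. Roughly, for the 8x8 mc10 case,
   assuming a hypothetical lowpass8() helper with the semantics of the
   mpeg4_qpel8_h_lowpass functions: */
#if 0
static void qpel8_mc10_sketch(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t half[8 * 8];
    int x, y;
    lowpass8(half, src, 8, stride);             /* half-pel horizontal filter */
    for (y = 0; y < 8; y++)
        for (x = 0; x < 8; x++)                 /* rounding average of src and half */
            dst[y * stride + x] = (src[y * stride + x] + half[y * 8 + x] + 1) >> 1;
}
#endif
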
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"

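/* Illustrative note: PUT_OP just stores the filtered result, while the AVG
   ops blend it with what is already in the destination (motion-compensated
   averaging). pavgusb (3DNow!) and pavgb (MMX2) both compute a byte-wise
   rounding average; per byte lane this is the following scalar operation: */
#if 0
static inline int rnd_avg8(int a, int b)
{
    return (a + b + 1) >> 1;    /* what pavgb / pavgusb compute per byte */
}
#endif
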
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

#if 0
static void just_return() { return; }
#endif

#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;

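/* e.g. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) installs the
   put_, put_no_rnd_ and avg_ variants of one qpel position in a single line. */
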
/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

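/* Illustrative sketch, not part of the original code: the wrappers above run
   the IDCT in place, then either store (put) or accumulate (add) the result,
   saturating each sample to 0..255. The put case, in scalar C: */
#if 0
static void put_pixels_clamped_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j, v;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            v = block[i * 8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;  /* saturate to a byte */
        }
        pixels += line_size;                            /* next row of the frame */
    }
    /* the add variant adds v to the existing pixel before saturating */
}
#endif
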
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

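    /* Illustrative note: dsp_mask lets a caller override autodetection, e.g.
       avctx->dsp_mask = MM_MMXEXT | MM_SSE disables those extensions, while
       avctx->dsp_mask = FF_MM_FORCE | MM_MMXEXT forces the listed bits on
       even if mm_support() did not report them. */
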
#if 0
    fprintf(stderr, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        fprintf(stderr, " mmx");
    if (mm_flags & MM_MMXEXT)
        fprintf(stderr, " mmxext");
    if (mm_flags & MM_3DNOW)
        fprintf(stderr, " 3dnow");
    if (mm_flags & MM_SSE)
        fprintf(stderr, " sse");
    if (mm_flags & MM_SSE2)
        fprintf(stderr, " sse2");
    fprintf(stderr, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS

        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
            c->idct_put= ff_simple_idct_put_mmx;
            c->idct_add= ff_simple_idct_add_mmx;
            c->idct    = ff_simple_idct_mmx;
            c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
            if(mm_flags & MM_MMXEXT){
                c->idct_put= ff_libmpeg2mmx2_idct_put;
                c->idct_add= ff_libmpeg2mmx2_idct_add;
                c->idct    = ff_mmxext_idct;
            }else{
                c->idct_put= ff_libmpeg2mmx_idct_put;
                c->idct_add= ff_libmpeg2mmx_idct_add;
                c->idct    = ff_mmx_idct;
            }
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }

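        /* Illustrative note: idct_permutation_type tells the generic dsputil
           code how the selected IDCT reorders coefficients, so scantables can
           be permuted once at init time instead of per block, roughly:
               for (i = 0; i < 64; i++)
                   permutated[i] = perm[scantable[i]];
        */
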
#ifdef CONFIG_ENCODERS
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
#endif //CONFIG_ENCODERS
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
#ifdef CONFIG_ENCODERS
        c->pix_sum = pix_sum16_mmx;
#endif //CONFIG_ENCODERS

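        /* Illustrative note: the pixel op tables are indexed by block size
           and half-pel sub-position: first index 0 = 16x16 blocks, 1 = 8x8;
           second index dxy = ((motion_y & 1) << 1) | (motion_x & 1), i.e.
           0 = full pel, 1 = half-pel x, 2 = half-pel y, 3 = both. */
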
        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;

        c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = sse16_mmx;
#endif //CONFIG_ENCODERS

        c->h263_v_loop_filter= h263_v_loop_filter_mmx;

        if (mm_flags & MM_MMXEXT) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;

#ifdef CONFIG_ENCODERS
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
#endif //CONFIG_ENCODERS

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
            }

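            /* Illustrative note: pavgb always rounds up, so the no-rnd and
               xy2 averaging variants built on it approximate the reference
               rounding rather than matching it bit for bit; they are
               therefore skipped when CODEC_FLAG_BITEXACT is requested. */
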
#if 1
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        } else if (mm_flags & MM_3DNOW) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
        }
    }

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS
#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}