h263 MV prediction doesnt match mpeg4, for some slices configurations (fixes RV20...
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
b0368839 23#include "../simple_idct.h"
de6d9b64 24
/* Runtime CPU capability bitmask (MMX/MMX2/3DNow!/SSE...), filled in by the
 * dsputil init code elsewhere in the project. */
int mm_flags; /* multimedia extension flags */

/* pixel operations */
/* Packed rounding constants for the byte/word averaging code below.
 * mm_bone = 0x01 in each byte, mm_wone = 1 in each 16-bit word,
 * mm_wtwo = 2 in each word.  8-byte alignment is required for movq loads. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* Packed-word filter constants (20, 3, 16, 15 in each 16-bit lane) used by
 * the MPEG-4 quarter-pel lowpass filters further down in this file. */
static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
36
d6a4c0b1
ZK
/* Emit a .balign so the following jump target is 8-byte aligned. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear an MMX register (pxor reg,reg). */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* Load 0x0001000100010001 (word 1 in every lane) without touching memory:
 * pcmpeqd gives all-ones, psrlw $15 leaves bit 0 of each word. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Load 0xfefefefefefefefe: all-ones bytes added to themselves
 * (0xff + 0xff wraps to 0xfe in each byte). */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC builds may reference the in-memory constants directly. */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* Synthesize 0x01 in every byte: words of 1, then pack to bytes. */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* Synthesize 2 in every 16-bit word: word of 1, shifted left once. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
69
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average without rounding: (a & b) + ((a ^ b) >> 1), where the
 * mask regfe (0xfe bytes) stops carries from crossing byte lanes after the
 * 64-bit psrlq. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb " 	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"

/* Byte-wise average with rounding up: (a | b) - ((a ^ b) >> 1). */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* Paired variant: two no-round byte averages at once
 * (rega/regb -> regr and regc/regd -> regp); mm6 holds the 0xfe mask. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pand " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regb " 	\n\t"\
    "psrlq $1, " #regd " 	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"\
    "paddb " #regd ", " #regp "	\n\t"

/* Paired variant of the rounding average; mm6 holds the 0xfe mask. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "por  " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regd "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"\
    "psubb " #regd ", " #regp "	\n\t"
117
91abb473
ZK
/***********************************/
/* MMX no rounding */
/* Instantiate the put/avg pixel templates four times.  Each section sets the
 * name-mangling macro DEF plus the averaging primitives, pulls in a template
 * header, then undefines everything for the next instantiation. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
170
171/***********************************/
172/* standard MMX */
173
#ifdef CONFIG_ENCODERS
/* Copy an 8x8 block of 8-bit pixels into block[0..63] as 16-bit DCT
 * coefficients.  Two source rows are handled per loop iteration; %%eax runs
 * from -128 to 0 in 32-byte steps, indexing backwards from block+64 so the
 * loop exits (js) exactly when the whole block is written.  mm7 stays zero
 * and is used to widen bytes to words via punpck{l,h}bw. */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax	\n\t"
        "pxor %%mm7, %%mm7	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%0, %2), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl $32, %%eax	\n\t"
        "js 1b			\n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}

/* Store the 8x8 difference (s1 - s2) into block[0..63] as 16-bit words.
 * Same backwards-indexing scheme as get_pixels_mmx, one row of each source
 * per iteration (16 output bytes -> "addl $16").  Used by the SATD code
 * below via hadamard8_diff_*. */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7	\n\t"
        "movl $-128, %%eax	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%1), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "psubw %%mm2, %%mm0	\n\t"
        "psubw %%mm3, %%mm1	\n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl %3, %1		\n\t"
        "addl $16, %%eax	\n\t"
        "jnz 1b			\n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
#endif //CONFIG_ENCODERS
9dbcbd92 232
/* Write an 8x8 block of 16-bit coefficients to 8-bit pixels, saturating each
 * value to 0..255 via packuswb.  Four rows are handled per asm block (8 movq
 * loads = 64 coefficients per two blocks total); the function body is the
 * same code twice, advancing p/pix between the halves. */
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq	%3, %%mm0\n\t"
        "movq	8%3, %%mm1\n\t"
        "movq	16%3, %%mm2\n\t"
        "movq	24%3, %%mm3\n\t"
        "movq	32%3, %%mm4\n\t"
        "movq	40%3, %%mm5\n\t"
        "movq	48%3, %%mm6\n\t"
        "movq	56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq	(%3), %%mm0\n\t"
        "movq	8(%3), %%mm1\n\t"
        "movq	16(%3), %%mm2\n\t"
        "movq	24(%3), %%mm3\n\t"
        "movq	32(%3), %%mm4\n\t"
        "movq	40(%3), %%mm5\n\t"
        "movq	48(%3), %%mm6\n\t"
        "movq	56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
287
/* Add an 8x8 block of 16-bit coefficients to existing 8-bit pixels with
 * signed-saturating adds (paddsw) and clamp the result back to 0..255
 * (packuswb).  Two rows per iteration, four iterations.  mm7 is cleared
 * once up front and used to widen pixel bytes to words. */
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq	(%2), %%mm0\n\t"
            "movq	8(%2), %%mm1\n\t"
            "movq	16(%2), %%mm2\n\t"
            "movq	24(%2), %%mm3\n\t"
            "movq	%0, %%mm4\n\t"
            "movq	%1, %%mm6\n\t"
            "movq	%%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm4, %%mm0\n\t"
            "paddsw	%%mm5, %%mm1\n\t"
            "movq	%%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm6, %%mm2\n\t"
            "paddsw	%%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "movq	%%mm0, %0\n\t"
            "movq	%%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
328
/* Copy an 8-wide block of h rows from pixels to block (both stride
 * line_size).  Four rows per loop iteration (code duplicated twice inside
 * the loop), so h must be a multiple of 4 — which holds for the 8x8/8x16
 * callers this is registered for. */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax	\n\t"
        ".balign 8		\n\t"
        "1:			\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "subl $4, %0		\n\t"
        "jnz 1b			\n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
354
/* Copy a 16-wide block of h rows (two movq per row).  Same structure as
 * put_pixels8_mmx: four rows per iteration, so h must be a multiple of 4. */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax	\n\t"
        ".balign 8		\n\t"
        "1:			\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq 8(%1), %%mm4	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq 8(%1, %3), %%mm5	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm4, 8(%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "movq %%mm5, 8(%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq 8(%1), %%mm4	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq 8(%1, %3), %%mm5	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm4, 8(%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "movq %%mm5, 8(%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "subl $4, %0		\n\t"
        "jnz 1b			\n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
388
649c00c9
MN
/* Zero six consecutive 64-entry DCTELEM blocks (6*128 bytes) with movq
 * stores.  The base register holds the END of the region and %%eax counts
 * up from -128*6 to 0, 32 bytes per iteration.
 * NOTE(review): the (int)blocks cast truncates the pointer on 64-bit
 * targets — this code is 32-bit x86 only, as the addl/eax usage throughout
 * the file also assumes. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7		\n\t"
        "movl $-128*6, %%eax		\n\t"
        "1:				\n\t"
        "movq %%mm7, (%0, %%eax)	\n\t"
        "movq %%mm7, 8(%0, %%eax)	\n\t"
        "movq %%mm7, 16(%0, %%eax)	\n\t"
        "movq %%mm7, 24(%0, %%eax)	\n\t"
        "addl $32, %%eax		\n\t"
        " js 1b				\n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}
405
#ifdef CONFIG_ENCODERS
/* Sum of all pixels in a 16x16 block.  Rows are accumulated as packed
 * 16-bit words in mm6 (16 bytes per row, 16 rows: max 255*256 fits a word),
 * then the four word lanes are folded with two shift+add steps and the low
 * 16 bits extracted. */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7		\n\t"
                "pxor %%mm6, %%mm6		\n\t"
                "1:				\n\t"
                "movq (%2, %1), %%mm0		\n\t"
                "movq (%2, %1), %%mm1		\n\t"
                "movq 8(%2, %1), %%mm2		\n\t"
                "movq 8(%2, %1), %%mm3		\n\t"
                "punpcklbw %%mm7, %%mm0		\n\t"
                "punpckhbw %%mm7, %%mm1		\n\t"
                "punpcklbw %%mm7, %%mm2		\n\t"
                "punpckhbw %%mm7, %%mm3		\n\t"
                "paddw %%mm0, %%mm1		\n\t"
                "paddw %%mm2, %%mm3		\n\t"
                "paddw %%mm1, %%mm3		\n\t"
                "paddw %%mm3, %%mm6		\n\t"
                "addl %3, %1			\n\t"
                " js 1b				\n\t"
                "movq %%mm6, %%mm5		\n\t"
                "psrlq $32, %%mm6		\n\t"
                "paddw %%mm5, %%mm6		\n\t"
                "movq %%mm6, %%mm5		\n\t"
                "psrlq $16, %%mm6		\n\t"
                "paddw %%mm5, %%mm6		\n\t"
                "movd %%mm6, %0			\n\t"
                "andl $0xFFFF, %0		\n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" (line_size)
        );

        return sum;
}
#endif //CONFIG_ENCODERS
084c726b 445
11f18faf
MN
/* dst[i] += src[i] for i in [0, w): 16 bytes per asm iteration up to the
 * last full 16-byte chunk (w-15 bound), then a scalar tail loop.
 * NOTE(review): the asm loop body runs once before the bound is checked, so
 * w < 16 would read/write past the buffers (and with jb's unsigned compare
 * a negative w-15 keeps the loop running) — callers must pass w >= 16. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%1, %0), %%mm0		\n\t"
        "movq  (%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%2, %0)		\n\t"
        "movq 8(%1, %0), %%mm0		\n\t"
        "movq 8(%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%2, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %3, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
467
764ef400 468#ifdef CONFIG_ENCODERS
2a006cd3
FL
/* Sum of squares of all pixels in a 16x16 block (the "norm1" energy used by
 * the encoder).  Each row is widened to words, squared and pair-summed with
 * pmaddwd into 32-bit lanes accumulated in mm7; the two dword lanes are
 * folded at the end. */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"	/* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"	/* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n"	/* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n"	/* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"	/* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n"	/* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n"	/* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"	/* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"	/* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"	/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "addl %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
511
/* Sum of squared errors between two 16x16 blocks.  The per-byte absolute
 * difference is computed with the classic unsigned-saturating subtract both
 * ways + por trick, widened to words, squared/pair-summed with pmaddwd and
 * accumulated in mm7.  First parameter (MpegEncContext*) is unused here. */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"	/* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: substract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"	/* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "addl %3,%0\n"
      "addl %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "%ecx");
    return tmp;
}
569
11f18faf
MN
/* dst[i] = src1[i] - src2[i] (byte-wise, wrapping) for i in [0, w):
 * 16 bytes per asm iteration with a scalar tail, mirroring add_bytes_mmx.
 * NOTE(review): like add_bytes_mmx, the asm body executes once before the
 * w-15 bound is tested, so callers must pass w >= 16. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%2, %0), %%mm0		\n\t"
        "movq  (%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%3, %0)		\n\t"
        "movq 8(%2, %0), %%mm0		\n\t"
        "movq 8(%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%3, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
84705403
MN
591
/* HuffYUV median-prediction residual: dst[i] = src2[i] - median(L, T, L+T-LT)
 * where L is the left neighbour, T the top and LT the top-left.  The median
 * of three is computed branchlessly with MMX2 pmaxub/pminub, 8 pixels per
 * iteration; element 0 (which needs *left/*left_top instead of in-row
 * neighbours) is patched up in C afterwards, and the trackers for the next
 * slice are updated from the last column. */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i=0;
    uint8_t l, lt;

    asm volatile(
        "1:				\n\t"
        "movq  -1(%1, %0), %%mm0	\n\t" // LT
        "movq  (%1, %0), %%mm1		\n\t" // T
        "movq  -1(%2, %0), %%mm2	\n\t" // L
        "movq  (%2, %0), %%mm3		\n\t" // X
        "movq %%mm2, %%mm4		\n\t" // L
        "psubb %%mm0, %%mm2		\n\t"
        "paddb %%mm1, %%mm2		\n\t" // L + T - LT
        "movq %%mm4, %%mm5		\n\t" // L
        "pmaxub %%mm1, %%mm4		\n\t" // max(T, L)
        "pminub %%mm5, %%mm1		\n\t" // min(T, L)
        "pminub %%mm2, %%mm4		\n\t"
        "pmaxub %%mm1, %%mm4		\n\t"
        "psubb %%mm4, %%mm3		\n\t" // dst - pred
        "movq %%mm3, (%3, %0)		\n\t"
        "addl $8, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
627
8e0a3db7
MN
/* Two butterflies at once: (a,b) -> (a+b, b-a) for both register pairs,
 * building block of the Walsh-Hadamard transform below. */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "		\n\t"\
    "paddw " #b2 ", " #a2 "		\n\t"\
    "paddw " #b1 ", " #b1 "		\n\t"\
    "paddw " #b2 ", " #b2 "		\n\t"\
    "psubw " #a1 ", " #b1 "		\n\t"\
    "psubw " #a2 ", " #b2 "		\n\t"

/* Full 3-stage length-8 Hadamard transform over mm0..mm7 (word lanes). */
#define HADAMARD48\
        LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
        LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

/* |a| via sign-mask trick (z = a<0 ? -1 : 0; a = (a^z) - z); plain MMX. */
#define MMABS(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"

/* |a| as above, then saturating-accumulate into sum. */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "		\n\t"

/* MMX2 |a|: max(a, -a) via pmaxsw — shorter dependency chain. */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "psubw " #a ", " #z "		\n\t"\
    "pmaxsw " #z ", " #a "		\n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "psubw " #a ", " #z "		\n\t"\
    "pmaxsw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "		\n\t"

/* Interleave a and b; together with TRANSPOSE4 this transposes a 4x4 block
 * of 16-bit words across registers. */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "		\n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* Load / store four rows (16-byte row stride) of the temp block at byte
 * offset o; used by the hadamard8_diff_* functions below. */
#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "		\n\t"\
        "movq "#o"+16(%1), " #b "	\n\t"\
        "movq "#o"+32(%1), " #c "	\n\t"\
        "movq "#o"+48(%1), " #d "	\n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)		\n\t"\
        "movq "#b", "#o"+16(%1)		\n\t"\
        "movq "#c", "#o"+32(%1)		\n\t"\
        "movq "#d", "#o"+48(%1)		\n\t"\

/* SATD comparison metric: sum of absolute values of the 8x8 2D Hadamard
 * transform of (src1 - src2).  The 16-bit difference block lives in temp[];
 * each 4x8 half is transformed (HADAMARD48), transposed (TRANSPOSE4),
 * transformed again, and the |.|s are saturating-accumulated in word lanes,
 * hence the final fold and &0xFFFF.  The void *s context pointer is unused.
 * Plain-MMX variant (MMABS/MMABS_SUM); see hadamard8_diff_mmx2 below. */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1		\n\t"
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movd %%mm0, %0			\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
775
8e0a3db7
MN
/* MMX2 variant of hadamard8_diff_mmx: identical structure, but the absolute
 * values use pmaxsw (MMABS_MMX2/MMABS_SUM_MMX2) instead of the MMX
 * sign-mask sequence, saving an instruction per abs. */
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1		\n\t"
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movd %%mm0, %0			\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
860
861
/* Wrap the 8x8 SATD functions into 16x16 comparators (four 8x8 calls). */
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

/* put_pixels with no averaging rounds nothing, so the no_rnd variants are
 * simple aliases of the plain copies. */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
868
826f429a
MN
/* One output row of the vertical quarter-pel lowpass filter:
 * (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, clamped to bytes, where
 * x1..x4 are the symmetric tap sums built from the in*/m* row registers.
 * The -6x2+3x3 term is formed as 3*(x3 - 2*x2) using ff_pw_3. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4		\n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
        "movq "#in7", " #m3 "		\n\t" /* d */\
        "movq "#in0", %%mm5		\n\t" /* D */\
        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5		\n\t" /* C */\
        "movq "#in2", %%mm6		\n\t" /* B */\
        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5	\n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5		\n\t"\
        "packuswb %%mm5, %%mm5		\n\t"\
        OP(%%mm5, out, %%mm7, d)
889
3178ee4c 890#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
5c91a675 891static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
892 uint64_t temp;\
893\
894 asm volatile(\
895 "pxor %%mm7, %%mm7 \n\t"\
896 "1: \n\t"\
897 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
898 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
899 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
900 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
901 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
902 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
903 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
904 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
905 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
906 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
907 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
908 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
909 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
910 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
911 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
912 "paddw %%mm3, %%mm5 \n\t" /* b */\
913 "paddw %%mm2, %%mm6 \n\t" /* c */\
914 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
915 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
916 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 917 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
918 "paddw %%mm4, %%mm0 \n\t" /* a */\
919 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 920 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 921 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 922 "paddw %6, %%mm6 \n\t"\
826f429a
MN
923 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
924 "psraw $5, %%mm0 \n\t"\
c296f66b 925 "movq %%mm0, %5 \n\t"\
826f429a
MN
926 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
927 \
928 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
929 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
930 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
931 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
932 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
933 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
934 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
935 "paddw %%mm0, %%mm2 \n\t" /* b */\
936 "paddw %%mm5, %%mm3 \n\t" /* c */\
937 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
938 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
939 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
940 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
941 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
942 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
c296f66b 943 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a
MN
944 "paddw %%mm2, %%mm1 \n\t" /* a */\
945 "paddw %%mm6, %%mm4 \n\t" /* d */\
c296f66b 946 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
826f429a 947 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
c296f66b 948 "paddw %6, %%mm1 \n\t"\
826f429a
MN
949 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
950 "psraw $5, %%mm3 \n\t"\
c296f66b 951 "movq %5, %%mm1 \n\t"\
826f429a 952 "packuswb %%mm3, %%mm1 \n\t"\
3178ee4c 953 OP_MMX2(%%mm1, (%1),%%mm4, q)\
826f429a
MN
954 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
955 \
956 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
957 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
958 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
959 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
960 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
961 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
962 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
963 "paddw %%mm1, %%mm5 \n\t" /* b */\
964 "paddw %%mm4, %%mm0 \n\t" /* c */\
965 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
966 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
967 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
968 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
c296f66b 969 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
826f429a
MN
970 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
971 "paddw %%mm3, %%mm2 \n\t" /* d */\
972 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
973 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
974 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
975 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
976 "paddw %%mm2, %%mm6 \n\t" /* a */\
c296f66b
MN
977 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
978 "paddw %6, %%mm0 \n\t"\
826f429a
MN
979 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
980 "psraw $5, %%mm0 \n\t"\
981 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
982 \
983 "paddw %%mm5, %%mm3 \n\t" /* a */\
984 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
985 "paddw %%mm4, %%mm6 \n\t" /* b */\
986 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
987 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
988 "paddw %%mm1, %%mm4 \n\t" /* c */\
989 "paddw %%mm2, %%mm5 \n\t" /* d */\
990 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
991 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
c296f66b
MN
992 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
993 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
826f429a 994 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 995 "paddw %6, %%mm4 \n\t"\
826f429a
MN
996 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
997 "psraw $5, %%mm4 \n\t"\
998 "packuswb %%mm4, %%mm0 \n\t"\
3178ee4c 999 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
826f429a
MN
1000 \
1001 "addl %3, %0 \n\t"\
1002 "addl %4, %1 \n\t"\
1003 "decl %2 \n\t"\
1004 " jnz 1b \n\t"\
5a508a98 1005 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
1006 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1007 : "memory"\
826f429a
MN
1008 );\
1009}\
1010\
1011static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1012 int i;\
1013 int16_t temp[16];\
1014 /* quick HACK, XXX FIXME MUST be optimized */\
1015 for(i=0; i<h; i++)\
1016 {\
1017 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1018 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1019 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1020 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1021 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1022 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1023 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1024 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1025 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1026 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1027 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1028 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1029 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1030 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1031 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1032 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1033 asm volatile(\
1034 "movq (%0), %%mm0 \n\t"\
1035 "movq 8(%0), %%mm1 \n\t"\
1036 "paddw %2, %%mm0 \n\t"\
1037 "paddw %2, %%mm1 \n\t"\
1038 "psraw $5, %%mm0 \n\t"\
1039 "psraw $5, %%mm1 \n\t"\
1040 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 1041 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a
MN
1042 "movq 16(%0), %%mm0 \n\t"\
1043 "movq 24(%0), %%mm1 \n\t"\
1044 "paddw %2, %%mm0 \n\t"\
1045 "paddw %2, %%mm1 \n\t"\
1046 "psraw $5, %%mm0 \n\t"\
1047 "psraw $5, %%mm1 \n\t"\
1048 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 1049 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
826f429a 1050 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 1051 : "memory"\
826f429a
MN
1052 );\
1053 dst+=dstStride;\
1054 src+=srcStride;\
1055 }\
1056}\
1057\
5c91a675 1058static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
1059 uint64_t temp;\
1060\
1061 asm volatile(\
1062 "pxor %%mm7, %%mm7 \n\t"\
1063 "1: \n\t"\
1064 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1065 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1066 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1067 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1068 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1069 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1070 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1071 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1072 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1073 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1074 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1075 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1076 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1077 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1078 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1079 "paddw %%mm3, %%mm5 \n\t" /* b */\
1080 "paddw %%mm2, %%mm6 \n\t" /* c */\
1081 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1082 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1083 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 1084 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
1085 "paddw %%mm4, %%mm0 \n\t" /* a */\
1086 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 1087 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 1088 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 1089 "paddw %6, %%mm6 \n\t"\
826f429a
MN
1090 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1091 "psraw $5, %%mm0 \n\t"\
1092 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1093 \
1094 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1095 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1096 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1097 "paddw %%mm5, %%mm1 \n\t" /* a */\
1098 "paddw %%mm6, %%mm2 \n\t" /* b */\
1099 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1100 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1101 "paddw %%mm6, %%mm3 \n\t" /* c */\
1102 "paddw %%mm5, %%mm4 \n\t" /* d */\
1103 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1104 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
c296f66b
MN
1105 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1106 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a 1107 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 1108 "paddw %6, %%mm1 \n\t"\
826f429a
MN
1109 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1110 "psraw $5, %%mm3 \n\t"\
1111 "packuswb %%mm3, %%mm0 \n\t"\
3178ee4c 1112 OP_MMX2(%%mm0, (%1), %%mm4, q)\
826f429a
MN
1113 \
1114 "addl %3, %0 \n\t"\
1115 "addl %4, %1 \n\t"\
1116 "decl %2 \n\t"\
c296f66b 1117 " jnz 1b \n\t"\
5a508a98 1118 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
1119 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1120 : "memory"\
826f429a
MN
1121 );\
1122}\
1123\
1124static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1125 int i;\
1126 int16_t temp[8];\
1127 /* quick HACK, XXX FIXME MUST be optimized */\
1128 for(i=0; i<h; i++)\
1129 {\
1130 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1131 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1132 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1133 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1134 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1135 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1136 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1137 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1138 asm volatile(\
1139 "movq (%0), %%mm0 \n\t"\
1140 "movq 8(%0), %%mm1 \n\t"\
1141 "paddw %2, %%mm0 \n\t"\
1142 "paddw %2, %%mm1 \n\t"\
1143 "psraw $5, %%mm0 \n\t"\
1144 "psraw $5, %%mm1 \n\t"\
1145 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 1146 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a 1147 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 1148 :"memory"\
826f429a
MN
1149 );\
1150 dst+=dstStride;\
1151 src+=srcStride;\
1152 }\
3178ee4c
MN
1153}
1154
1155#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1156\
1157static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1158 uint64_t temp[17*4];\
1159 uint64_t *temp_ptr= temp;\
1160 int count= 17;\
1161\
1162 /*FIXME unroll */\
1163 asm volatile(\
1164 "pxor %%mm7, %%mm7 \n\t"\
1165 "1: \n\t"\
1166 "movq (%0), %%mm0 \n\t"\
1167 "movq (%0), %%mm1 \n\t"\
1168 "movq 8(%0), %%mm2 \n\t"\
1169 "movq 8(%0), %%mm3 \n\t"\
1170 "punpcklbw %%mm7, %%mm0 \n\t"\
1171 "punpckhbw %%mm7, %%mm1 \n\t"\
1172 "punpcklbw %%mm7, %%mm2 \n\t"\
1173 "punpckhbw %%mm7, %%mm3 \n\t"\
1174 "movq %%mm0, (%1) \n\t"\
1175 "movq %%mm1, 17*8(%1) \n\t"\
5a508a98
MN
1176 "movq %%mm2, 2*17*8(%1) \n\t"\
1177 "movq %%mm3, 3*17*8(%1) \n\t"\
3178ee4c
MN
1178 "addl $8, %1 \n\t"\
1179 "addl %3, %0 \n\t"\
1180 "decl %2 \n\t"\
1181 " jnz 1b \n\t"\
1182 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
5a508a98 1183 : "r" (srcStride)\
0b093b6f 1184 : "memory"\
3178ee4c
MN
1185 );\
1186 \
1187 temp_ptr= temp;\
1188 count=4;\
1189 \
1190/*FIXME reorder for speed */\
3178ee4c
MN
1191 asm volatile(\
1192 /*"pxor %%mm7, %%mm7 \n\t"*/\
3178ee4c
MN
1193 "1: \n\t"\
1194 "movq (%0), %%mm0 \n\t"\
1195 "movq 8(%0), %%mm1 \n\t"\
1196 "movq 16(%0), %%mm2 \n\t"\
1197 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1198 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1199 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
3178ee4c 1200 "addl %4, %1 \n\t"\
c296f66b 1201 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
3178ee4c 1202 \
c296f66b 1203 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
3178ee4c 1204 "addl %4, %1 \n\t"\
c296f66b
MN
1205 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1206 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
3178ee4c 1207 "addl %4, %1 \n\t"\
c296f66b
MN
1208 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1209 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
3178ee4c 1210 "addl %4, %1 \n\t"\
c296f66b
MN
1211 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1212 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
3178ee4c 1213 "addl %4, %1 \n\t"\
c296f66b
MN
1214 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1215 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
3178ee4c 1216 "addl %4, %1 \n\t"\
c296f66b 1217 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
3178ee4c 1218 \
c296f66b 1219 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
3178ee4c 1220 "addl %4, %1 \n\t" \
c296f66b
MN
1221 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1222 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
3178ee4c
MN
1223 \
1224 "addl $136, %0 \n\t"\
c296f66b 1225 "addl %6, %1 \n\t"\
3178ee4c
MN
1226 "decl %2 \n\t"\
1227 " jnz 1b \n\t"\
3178ee4c 1228 \
5a508a98 1229 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
c296f66b 1230 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
0b093b6f 1231 :"memory"\
3178ee4c 1232 );\
826f429a
MN
1233}\
1234\
5c91a675 1235static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
826f429a
MN
1236 uint64_t temp[9*4];\
1237 uint64_t *temp_ptr= temp;\
1238 int count= 9;\
1239\
1240 /*FIXME unroll */\
1241 asm volatile(\
1242 "pxor %%mm7, %%mm7 \n\t"\
1243 "1: \n\t"\
1244 "movq (%0), %%mm0 \n\t"\
1245 "movq (%0), %%mm1 \n\t"\
1246 "punpcklbw %%mm7, %%mm0 \n\t"\
1247 "punpckhbw %%mm7, %%mm1 \n\t"\
1248 "movq %%mm0, (%1) \n\t"\
1249 "movq %%mm1, 9*8(%1) \n\t"\
1250 "addl $8, %1 \n\t"\
1251 "addl %3, %0 \n\t"\
1252 "decl %2 \n\t"\
1253 " jnz 1b \n\t"\
1254 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1255 : "r" (srcStride)\
0b093b6f 1256 : "memory"\
826f429a
MN
1257 );\
1258 \
1259 temp_ptr= temp;\
1260 count=2;\
1261 \
1262/*FIXME reorder for speed */\
1263 asm volatile(\
1264 /*"pxor %%mm7, %%mm7 \n\t"*/\
1265 "1: \n\t"\
1266 "movq (%0), %%mm0 \n\t"\
1267 "movq 8(%0), %%mm1 \n\t"\
1268 "movq 16(%0), %%mm2 \n\t"\
1269 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1270 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1271 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
826f429a 1272 "addl %4, %1 \n\t"\
c296f66b 1273 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
826f429a 1274 \
c296f66b 1275 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
826f429a 1276 "addl %4, %1 \n\t"\
c296f66b 1277 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
826f429a 1278 \
c296f66b 1279 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
826f429a 1280 "addl %4, %1 \n\t"\
c296f66b
MN
1281 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1282 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
826f429a
MN
1283 \
1284 "addl $72, %0 \n\t"\
c296f66b 1285 "addl %6, %1 \n\t"\
826f429a
MN
1286 "decl %2 \n\t"\
1287 " jnz 1b \n\t"\
1288 \
c296f66b
MN
1289 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1290 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
0b093b6f
MN
1291 : "memory"\
1292 );\
3178ee4c 1293}\
826f429a 1294\
0c1a9eda 1295static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1296 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1297}\
1298\
0c1a9eda 1299static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1300 uint64_t temp[8];\
826f429a
MN
1301 uint8_t * const half= (uint8_t*)temp;\
1302 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1303 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1304}\
1305\
0c1a9eda 1306static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1307 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1308}\
1309\
0c1a9eda 1310static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1311 uint64_t temp[8];\
826f429a
MN
1312 uint8_t * const half= (uint8_t*)temp;\
1313 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1314 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1315}\
1316\
0c1a9eda 1317static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1318 uint64_t temp[8];\
826f429a 1319 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1320 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1321 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1322}\
1323\
0c1a9eda 1324static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1325 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1326}\
1327\
0c1a9eda 1328static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1329 uint64_t temp[8];\
826f429a 1330 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1331 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1332 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1333}\
0c1a9eda 1334static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1335 uint64_t half[8 + 9];\
1336 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1337 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1338 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1339 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1340 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1341 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1342}\
0c1a9eda 1343static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1344 uint64_t half[8 + 9];\
1345 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1346 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1347 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1348 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1349 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1350 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1351}\
0c1a9eda 1352static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1353 uint64_t half[8 + 9];\
1354 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1355 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1356 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1357 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1358 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1359 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1360}\
0c1a9eda 1361static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1362 uint64_t half[8 + 9];\
1363 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1364 uint8_t * const halfHV= ((uint8_t*)half);\
1365 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1366 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1367 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1368 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1369}\
0c1a9eda 1370static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1371 uint64_t half[8 + 9];\
826f429a
MN
1372 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1373 uint8_t * const halfHV= ((uint8_t*)half);\
1374 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1375 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1376 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1377}\
0c1a9eda 1378static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1379 uint64_t half[8 + 9];\
826f429a
MN
1380 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1381 uint8_t * const halfHV= ((uint8_t*)half);\
1382 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1383 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1384 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1385}\
0c1a9eda 1386static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1387 uint64_t half[8 + 9];\
1388 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1389 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1390 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1391 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1392}\
0c1a9eda 1393static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1394 uint64_t half[8 + 9];\
1395 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1396 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1397 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1398 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1399}\
0c1a9eda 1400static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1401 uint64_t half[9];\
826f429a
MN
1402 uint8_t * const halfH= ((uint8_t*)half);\
1403 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1404 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1405}\
0c1a9eda 1406static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1407 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
826f429a
MN
1408}\
1409\
0c1a9eda 1410static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1411 uint64_t temp[32];\
1412 uint8_t * const half= (uint8_t*)temp;\
1413 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1414 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1415}\
1416\
0c1a9eda 1417static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1418 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1419}\
1420\
0c1a9eda 1421static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1422 uint64_t temp[32];\
1423 uint8_t * const half= (uint8_t*)temp;\
1424 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1425 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1426}\
1427\
0c1a9eda 1428static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1429 uint64_t temp[32];\
1430 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1431 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1432 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1433}\
1434\
0c1a9eda 1435static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1436 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1437}\
1438\
0c1a9eda 1439static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1440 uint64_t temp[32];\
1441 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1442 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1443 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1444}\
0c1a9eda 1445static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1446 uint64_t half[16*2 + 17*2];\
1447 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1448 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1449 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1450 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1451 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1452 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1453}\
0c1a9eda 1454static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1455 uint64_t half[16*2 + 17*2];\
1456 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1457 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1458 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1459 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1460 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1461 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1462}\
0c1a9eda 1463static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1464 uint64_t half[16*2 + 17*2];\
1465 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1466 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1467 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1468 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1469 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1470 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1471}\
0c1a9eda 1472static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1473 uint64_t half[16*2 + 17*2];\
1474 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1475 uint8_t * const halfHV= ((uint8_t*)half);\
1476 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1477 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1478 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1479 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1480}\
0c1a9eda 1481static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1482 uint64_t half[16*2 + 17*2];\
1483 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1484 uint8_t * const halfHV= ((uint8_t*)half);\
1485 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1486 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1487 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1488}\
0c1a9eda 1489static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1490 uint64_t half[16*2 + 17*2];\
1491 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1492 uint8_t * const halfHV= ((uint8_t*)half);\
1493 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1494 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1495 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1496}\
0c1a9eda 1497static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1498 uint64_t half[17*2];\
1499 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1500 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1501 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1502 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1503}\
0c1a9eda 1504static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1505 uint64_t half[17*2];\
1506 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1507 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1508 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1509 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1510}\
0c1a9eda 1511static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1512 uint64_t half[17*2];\
1513 uint8_t * const halfH= ((uint8_t*)half);\
1514 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1515 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1516}
1517
1518
1519#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1520#define AVG_3DNOW_OP(a,b,temp, size) \
826f429a
MN
1521"mov" #size " " #b ", " #temp " \n\t"\
1522"pavgusb " #temp ", " #a " \n\t"\
1523"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1524#define AVG_MMX2_OP(a,b,temp, size) \
826f429a
MN
1525"mov" #size " " #b ", " #temp " \n\t"\
1526"pavgb " #temp ", " #a " \n\t"\
1527"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1528
1529QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1530QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1531QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1532QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1533QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1534QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1535QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1536QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a
MN
1537QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1538
61a4e8ae 1539#if 0
d6a4c0b1 1540static void just_return() { return; }
61a4e8ae 1541#endif
d6a4c0b1 1542
826f429a
MN
1543#define SET_QPEL_FUNC(postfix1, postfix2) \
1544 c->put_ ## postfix1 = put_ ## postfix2;\
1545 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1546 c->avg_ ## postfix1 = avg_ ## postfix2;
b0368839
MN
1547
1548/* external functions, from idct_mmx.c */
1549void ff_mmx_idct(DCTELEM *block);
1550void ff_mmxext_idct(DCTELEM *block);
1551
1552/* XXX: those functions should be suppressed ASAP when all IDCTs are
1553 converted */
1554static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1555{
1556 ff_mmx_idct (block);
1557 put_pixels_clamped_mmx(block, dest, line_size);
1558}
1559static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1560{
1561 ff_mmx_idct (block);
1562 add_pixels_clamped_mmx(block, dest, line_size);
1563}
1564static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1565{
1566 ff_mmxext_idct (block);
1567 put_pixels_clamped_mmx(block, dest, line_size);
1568}
1569static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1570{
1571 ff_mmxext_idct (block);
1572 add_pixels_clamped_mmx(block, dest, line_size);
1573}
826f429a 1574
b0368839 1575void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
de6d9b64
FB
1576{
1577 mm_flags = mm_support();
63f60826 1578
e5247b5f
ZK
1579 if (avctx->dsp_mask) {
1580 if (avctx->dsp_mask & FF_MM_FORCE)
1581 mm_flags |= (avctx->dsp_mask & 0xffff);
1582 else
1583 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1584 }
63f60826 1585
1565dabc
LB
1586#if 0
1587 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1588 if (mm_flags & MM_MMX)
1565dabc 1589 fprintf(stderr, " mmx");
de6d9b64 1590 if (mm_flags & MM_MMXEXT)
1565dabc 1591 fprintf(stderr, " mmxext");
de6d9b64 1592 if (mm_flags & MM_3DNOW)
1565dabc 1593 fprintf(stderr, " 3dnow");
de6d9b64 1594 if (mm_flags & MM_SSE)
1565dabc 1595 fprintf(stderr, " sse");
de6d9b64 1596 if (mm_flags & MM_SSE2)
1565dabc
LB
1597 fprintf(stderr, " sse2");
1598 fprintf(stderr, "\n");
de6d9b64
FB
1599#endif
1600
1601 if (mm_flags & MM_MMX) {
b0368839
MN
1602 const int dct_algo = avctx->dct_algo;
1603 const int idct_algo= avctx->idct_algo;
1604
5fd74135 1605#ifdef CONFIG_ENCODERS
cf3bf5bb
MN
1606 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1607 if(mm_flags & MM_MMXEXT){
1608 c->fdct = ff_fdct_mmx2;
1609 }else{
1610 c->fdct = ff_fdct_mmx;
1611 }
1612 }
5fd74135 1613#endif //CONFIG_ENCODERS
b0368839
MN
1614
1615 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
1616 c->idct_put= ff_simple_idct_put_mmx;
1617 c->idct_add= ff_simple_idct_add_mmx;
4fb518c3 1618 c->idct = ff_simple_idct_mmx;
b0368839
MN
1619 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
1620 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
1621 if(mm_flags & MM_MMXEXT){
1622 c->idct_put= ff_libmpeg2mmx2_idct_put;
1623 c->idct_add= ff_libmpeg2mmx2_idct_add;
4fb518c3 1624 c->idct = ff_mmxext_idct;
b0368839
MN
1625 }else{
1626 c->idct_put= ff_libmpeg2mmx_idct_put;
1627 c->idct_add= ff_libmpeg2mmx_idct_add;
4fb518c3 1628 c->idct = ff_mmx_idct;
b0368839
MN
1629 }
1630 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
1631 }
1632
764ef400 1633#ifdef CONFIG_ENCODERS
eb4b3dd3
ZK
1634 c->get_pixels = get_pixels_mmx;
1635 c->diff_pixels = diff_pixels_mmx;
764ef400 1636#endif //CONFIG_ENCODERS
eb4b3dd3
ZK
1637 c->put_pixels_clamped = put_pixels_clamped_mmx;
1638 c->add_pixels_clamped = add_pixels_clamped_mmx;
1639 c->clear_blocks = clear_blocks_mmx;
764ef400 1640#ifdef CONFIG_ENCODERS
eb4b3dd3 1641 c->pix_sum = pix_sum16_mmx;
764ef400 1642#endif //CONFIG_ENCODERS
eb4b3dd3 1643
eb4b3dd3
ZK
1644 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1645 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1646 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1647 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1648
1649 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1650 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1651 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1652 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1653
1654 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1655 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1656 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1657 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1658
1659 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1660 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1661 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1662 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1663
1664 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1665 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1666 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1667 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1668
1669 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1670 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1671 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1672 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1673
1674 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1675 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1676 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1677 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1678
1679 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1680 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1681 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1682 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1683
11f18faf 1684 c->add_bytes= add_bytes_mmx;
764ef400 1685#ifdef CONFIG_ENCODERS
11f18faf 1686 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1687
1688 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1689 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1690
2a006cd3
FL
1691 c->pix_norm1 = pix_norm1_mmx;
1692 c->sse[0] = sse16_mmx;
764ef400 1693#endif //CONFIG_ENCODERS
1457ab52 1694
de6d9b64 1695 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1696 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1697 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
eb4b3dd3
ZK
1698
1699 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1700 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1701 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
eb4b3dd3
ZK
1702
1703 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1704 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
eb4b3dd3
ZK
1705
1706 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1707 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1708 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
b0368839 1709
764ef400 1710#ifdef CONFIG_ENCODERS
8e0a3db7
MN
1711 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1712 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
764ef400 1713#endif //CONFIG_ENCODERS
8e0a3db7 1714
b0368839
MN
1715 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1716 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1717 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1718 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1719 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1720 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1721 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1722 }
3178ee4c 1723
c296f66b 1724#if 1
826f429a
MN
1725 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1726 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1727 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1728 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1729 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1730 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1731 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1732 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1733 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1734 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1735 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1736 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1737 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1738 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1739 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1740 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1741 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1742 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1743 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1744 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1745 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1746 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1747 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1748 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1749 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1750 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1751 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1752 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1753 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1754 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1755 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1756 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
c296f66b 1757#endif
84705403
MN
1758
1759 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
de6d9b64 1760 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
1761 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1762 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
eb4b3dd3
ZK
1763
1764 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1765 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1766 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
eb4b3dd3
ZK
1767
1768 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1769 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eb4b3dd3
ZK
1770
1771 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1772 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1773 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
b0368839
MN
1774
1775 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1776 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1777 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1778 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1779 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1780 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1781 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
1782 }
db794953 1783
826f429a
MN
1784 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1785 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1786 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1787 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1788 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1789 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1790 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1791 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1792 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1793 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1794 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1795 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1796 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1797 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1798 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1799 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1800 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1801 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1802 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1803 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1804 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1805 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1806 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1807 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1808 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1809 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1810 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1811 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1812 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1813 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1814 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1815 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1816 }
1817 }
b0368839 1818
764ef400 1819#ifdef CONFIG_ENCODERS
b0368839 1820 dsputil_init_pix_mmx(c, avctx);
764ef400 1821#endif //CONFIG_ENCODERS
d6a4c0b1
ZK
1822#if 0
1823 // for speed testing
1824 get_pixels = just_return;
1825 put_pixels_clamped = just_return;
1826 add_pixels_clamped = just_return;
1827
1828 pix_abs16x16 = just_return;
1829 pix_abs16x16_x2 = just_return;
1830 pix_abs16x16_y2 = just_return;
1831 pix_abs16x16_xy2 = just_return;
1832
1833 put_pixels_tab[0] = just_return;
1834 put_pixels_tab[1] = just_return;
1835 put_pixels_tab[2] = just_return;
1836 put_pixels_tab[3] = just_return;
1837
1838 put_no_rnd_pixels_tab[0] = just_return;
1839 put_no_rnd_pixels_tab[1] = just_return;
1840 put_no_rnd_pixels_tab[2] = just_return;
1841 put_no_rnd_pixels_tab[3] = just_return;
1842
1843 avg_pixels_tab[0] = just_return;
1844 avg_pixels_tab[1] = just_return;
1845 avg_pixels_tab[2] = just_return;
1846 avg_pixels_tab[3] = just_return;
1847
1848 avg_no_rnd_pixels_tab[0] = just_return;
1849 avg_no_rnd_pixels_tab[1] = just_return;
1850 avg_no_rnd_pixels_tab[2] = just_return;
1851 avg_no_rnd_pixels_tab[3] = just_return;
1852
d6a4c0b1
ZK
1853 //av_fdct = just_return;
1854 //ff_idct = just_return;
1855#endif
de6d9b64 1856}