fixing inaccurate frame_rate
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
23
int mm_flags; /* multimedia extension flags */

/* pixel operations */
/* 64-bit packed constants; 8-byte alignment allows a single movq load. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* eight bytes of 0x01 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* four words of 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* four words of 2 */

/* packed-word constants used by the qpel filter code below (pmullw/paddw operands) */
static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
35
d6a4c0b1
ZK
/* align the next instruction (typically a loop label) on an 8-byte boundary */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* zero an MMX register */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* synthesize four packed words of 1 without touching memory:
 * pcmpeqd -> all ones, psrlw $15 -> 0x0001 per word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* synthesize 0xfefefefefefefefe: all ones (-1 per byte) doubled -> 0xfe per byte */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* non-PIC build: the static constants above can be referenced directly */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* 0x01 per byte: words of 1 (see MOVQ_WONE), then packuswb packs them to bytes */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* words of 2: words of 1 shifted left by one */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
68
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* byte average rounding DOWN: (a & b) + (((a ^ b) & 0xfe) >> 1)
 * — the pand/pxor split avoids byte overflow; regfe masks the bits
 * shifted across byte lanes by the 64-bit psrlq. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"

/* byte average rounding UP: (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* paired variant of PAVGB_MMX_NO_RND: averages (rega,regb)->regr and
 * (regc,regd)->regp in one interleaved sequence; mm6 holds the 0xfe mask */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pand " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regb " 	\n\t"\
    "psrlq $1, " #regd " 	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"\
    "paddb " #regd ", " #regp "	\n\t"

/* paired variant of PAVGB_MMX (round up); mm6 holds the 0xfe mask */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "por  " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regd "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"\
    "psubb " #regd ", " #regp "	\n\t"
116
91abb473
ZK
/***********************************/
/* MMX no rounding */
/* Template instantiation: dsputil_mmx_rnd.h is included twice, once per
 * rounding mode.  DEF names the generated functions, SET_RND loads the
 * rounding constant, PAVGB/PAVGBP select the averaging primitives. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
a7bd8797 144
de6d9b64
FB
/***********************************/
/* 3Dnow specific */

/* Same template trick for the averaging functions: dsputil_mmx_avg.h is
 * instantiated once with the 3DNow byte-average instruction and once with
 * the MMX2 one. */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
169
170/***********************************/
171/* standard MMX */
172
/* Expand an 8x8 block of unsigned bytes into 64 signed 16-bit DCTELEMs.
 * %%eax runs from -128 up to 0 as a byte offset relative to block+64, so
 * it doubles as the loop counter (js 1b).  Each iteration widens two
 * source rows (line_size apart) with punpckl/hbw against a zeroed mm7. */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax	\n\t"
        "pxor %%mm7, %%mm7	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%0, %2), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0		\n\t" /* advance pixels by 2*line_size */
        "addl $32, %%eax	\n\t"
        "js 1b			\n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
200
/* block[0..63] = s1 - s2 as signed 16-bit words for an 8x8 area.
 * One row of 8 pixels per iteration; %%eax counts from -128 to 0 and
 * addresses the output relative to block+64 (jnz 1b loops on it). */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7	\n\t"
        "movl $-128, %%eax	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%1), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t" /* widen bytes to words before subtracting */
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "psubw %%mm2, %%mm0	\n\t"
        "psubw %%mm3, %%mm1	\n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl %3, %1		\n\t"
        "addl $16, %%eax	\n\t"
        "jnz 1b			\n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
229
/* Store an 8x8 block of 16-bit DCT coefficients as bytes, clamped to
 * [0,255] via packuswb (unsigned saturation).  Two hand-unrolled asm
 * bodies of four rows each cover the eight output rows. */
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq	%3, %%mm0\n\t"
        "movq	8%3, %%mm1\n\t"
        "movq	16%3, %%mm2\n\t"
        "movq	24%3, %%mm3\n\t"
        "movq	32%3, %%mm4\n\t"
        "movq	40%3, %%mm5\n\t"
        "movq	48%3, %%mm6\n\t"
        "movq	56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t" /* clamp word pairs to unsigned bytes */
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq	(%3), %%mm0\n\t"
        "movq	8(%3), %%mm1\n\t"
        "movq	16(%3), %%mm2\n\t"
        "movq	24(%3), %%mm3\n\t"
        "movq	32(%3), %%mm4\n\t"
        "movq	40(%3), %%mm5\n\t"
        "movq	48(%3), %%mm6\n\t"
        "movq	56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
284
/* Add an 8x8 block of 16-bit DCT coefficients to the existing pixels,
 * with signed saturating adds (paddsw) and a final unsigned-saturating
 * pack back to bytes.  Four iterations of two rows each. */
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);                     /* mm7 = 0 for the byte->word unpacks */
    i = 4;
    do {
        __asm __volatile(
            "movq	(%2), %%mm0\n\t"
            "movq	8(%2), %%mm1\n\t"
            "movq	16(%2), %%mm2\n\t"
            "movq	24(%2), %%mm3\n\t"
            "movq	%0, %%mm4\n\t"
            "movq	%1, %%mm6\n\t"
            "movq	%%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm4, %%mm0\n\t"
            "paddsw	%%mm5, %%mm1\n\t"
            "movq	%%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm6, %%mm2\n\t"
            "paddsw	%%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t" /* clamp results to [0,255] */
            "packuswb %%mm3, %%mm2\n\t"
            "movq	%%mm0, %0\n\t"
            "movq	%%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
325
/* Copy an 8-pixel-wide area of h rows from pixels to block.  Unrolled to
 * four rows per iteration (%%eax = 2*line_size), so h must be a multiple
 * of 4 — NOTE(review): the "subl $4" loop assumes this; confirm callers. */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax	\n\t"
        ".balign 8		\n\t"
        "1:			\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "subl $4, %0		\n\t"
        "jnz 1b			\n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
351
/* Copy a 16-pixel-wide area of h rows (two movq per row).  Unrolled to
 * four rows per iteration (%%eax = 2*line_size), so h must be a multiple
 * of 4 — NOTE(review): the "subl $4" loop assumes this; confirm callers. */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax	\n\t"
        ".balign 8		\n\t"
        "1:			\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq 8(%1), %%mm4	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq 8(%1, %3), %%mm5	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm4, 8(%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "movq %%mm5, 8(%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "movq (%1), %%mm0	\n\t"
        "movq 8(%1), %%mm4	\n\t"
        "movq (%1, %3), %%mm1	\n\t"
        "movq 8(%1, %3), %%mm5	\n\t"
        "movq %%mm0, (%2)	\n\t"
        "movq %%mm4, 8(%2)	\n\t"
        "movq %%mm1, (%2, %3)	\n\t"
        "movq %%mm5, 8(%2, %3)	\n\t"
        "addl %%eax, %1		\n\t"
        "addl %%eax, %2		\n\t"
        "subl $4, %0		\n\t"
        "jnz 1b			\n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
385
649c00c9
MN
/* Zero six 64-entry DCTELEM blocks (6*128 = 768 bytes) with movq stores.
 * %%eax runs from -128*6 to 0 relative to blocks+768.
 * NOTE(review): the (int) cast of the pointer assumes 32-bit pointers
 * (this is x86-32-only code). */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7		\n\t"
        "movl $-128*6, %%eax	\n\t"
        "1:			\n\t"
        "movq %%mm7, (%0, %%eax)	\n\t"
        "movq %%mm7, 8(%0, %%eax)	\n\t"
        "movq %%mm7, 16(%0, %%eax)	\n\t"
        "movq %%mm7, 24(%0, %%eax)	\n\t"
        "addl $32, %%eax		\n\t"
        " js 1b				\n\t"
        : : "r" (((int)blocks)+128*6)
        : "%eax"
    );
}
402
/* Sum all pixels of a 16x16 block.  Accumulates in packed 16-bit words,
 * which cannot overflow (16*16*255 = 65280 < 65536), then reduces
 * horizontally and masks to 16 bits.  index counts up from -16*line_size
 * to 0 relative to pix+16*line_size. */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7		\n\t"
        "pxor %%mm6, %%mm6		\n\t" /* mm6 = running packed-word sum */
        "1:				\n\t"
        "movq (%2, %1), %%mm0	\n\t"
        "movq (%2, %1), %%mm1	\n\t"
        "movq 8(%2, %1), %%mm2	\n\t"
        "movq 8(%2, %1), %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "paddw %%mm0, %%mm1		\n\t"
        "paddw %%mm2, %%mm3		\n\t"
        "paddw %%mm1, %%mm3		\n\t"
        "paddw %%mm3, %%mm6		\n\t"
        "addl %3, %1		\n\t"
        " js 1b			\n\t"
        "movq %%mm6, %%mm5		\n\t" /* horizontal reduction of 4 words */
        "psrlq $32, %%mm6		\n\t"
        "paddw %%mm5, %%mm6		\n\t"
        "movq %%mm6, %%mm5		\n\t"
        "psrlq $16, %%mm6		\n\t"
        "paddw %%mm5, %%mm6		\n\t"
        "movd %%mm6, %0		\n\t"
        "andl $0xFFFF, %0		\n\t" /* keep only the low word of the sum */
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (line_size)
    );

    return sum;
}
440
11f18faf
MN
/* dst[i] += src[i] for i in [0,w), bytewise (wrapping paddb).
 * The asm loop handles 16 bytes per iteration while i < w-15; the scalar
 * tail loop below finishes any remaining 0..15 bytes. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%1, %0), %%mm0		\n\t"
        "movq  (%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%2, %0)		\n\t"
        "movq 8(%1, %0), %%mm0		\n\t"
        "movq 8(%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%2, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %3, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
462
2a006cd3
FL
/* Sum of squared pixel values over a 16x16 block (16 rows of 16 bytes).
 * Bytes are widened to words, squared-and-paired with pmaddwd into
 * 32-bit lanes, accumulated in mm7, then reduced to a single dword. */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"	/* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"	/* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"	/* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"	/* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n"	/* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"	/* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"	/* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"	/* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"	/* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
        "pmaddwd %%mm2,%%mm2\n"	/* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"	/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "addl %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
505
/* Sum of squared errors between two 16x16 blocks.
 * v: opaque context pointer, unused here — NOTE(review): present only to
 * match the comparison-function signature; confirm against dsputil.h.
 * Absolute differences are computed branch-free with two unsigned
 * saturating subtractions OR'd together, then squared via pmaddwd. */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"	/* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"	/* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: substract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"	/* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "addl %3,%0\n"
        "addl %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
    return tmp;
}
563
11f18faf
MN
/* dst[i] = src1[i] - src2[i] bytewise (wrapping psubb) for i in [0,w).
 * 16 bytes per asm iteration while i < w-15; the scalar tail loop
 * finishes the remaining 0..15 bytes. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%2, %0), %%mm0		\n\t"
        "movq  (%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%3, %0)		\n\t"
        "movq 8(%2, %0), %%mm0		\n\t"
        "movq 8(%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%3, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1457ab52
MN
/* butterfly on packed words: a' = a+b, b' = b-a (old values) */
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a "		\n\t"\
    "paddw " #b ", " #b "		\n\t"\
    "psubw " #a ", " #b "		\n\t"

/* full 8-point Hadamard butterfly network across mm0..mm7
 * (three stages of pairwise LBUTTERFLYs) */
#define HADAMARD48\
        LBUTTERFLY(%%mm0, %%mm1)\
        LBUTTERFLY(%%mm2, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm5)\
        LBUTTERFLY(%%mm6, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm2)\
        LBUTTERFLY(%%mm1, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm6)\
        LBUTTERFLY(%%mm5, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm4)\
        LBUTTERFLY(%%mm1, %%mm5)\
        LBUTTERFLY(%%mm2, %%mm6)\
        LBUTTERFLY(%%mm3, %%mm7)

/* a = |a| per word, using z as scratch sign mask (a^sign - sign) */
#define MMABS(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"

/* like MMABS, then accumulate |a| into sum with unsigned saturation */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "		\n\t"


/* interleave the words of a and b: low halves into a, high halves into t */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "		\n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\

/* 4x4 word transpose of a,b,c,d using t as scratch;
 * note the transposed rows end up in a,d,t,c (see inline comments) */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* load/store 4 quadwords at byte offsets o, o+16, o+32, o+48 from the
 * 16-bit temp buffer in %1 */
#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "		\n\t"\
        "movq "#o"+16(%1), " #b "	\n\t"\
        "movq "#o"+32(%1), " #c "	\n\t"\
        "movq "#o"+48(%1), " #d "	\n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)		\n\t"\
        "movq "#b", "#o"+16(%1)		\n\t"\
        "movq "#c", "#o"+32(%1)		\n\t"\
        "movq "#d", "#o"+48(%1)		\n\t"\

/* 8x8 SATD: Hadamard-transformed sum of absolute differences between
 * src1 and src2.
 * The pixel differences go into a 128-byte temp buffer; two passes of
 * HADAMARD48 + TRANSPOSE4 apply the transform along both axes (left and
 * right 4-column halves handled separately because only 8 MMX registers
 * exist), then all |coefficients| are accumulated with paddusw.
 * s: opaque context pointer, unused — NOTE(review): matches the
 * comparison-function signature; confirm against dsputil.h.
 * Returns the low 16 bits of the saturating sum. */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t" /* spill mm7: TRANSPOSE4 needs a scratch reg */

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t" /* spill so mm7 can serve as MMABS scratch */
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t" /* partial sum of the second half */

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t" /* add back the first partial sum */
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        "movq %%mm0, %%mm1		\n\t" /* horizontal reduction of 4 words */
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movd %%mm0, %0			\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

/* generate the 16x16 variant as four 8x8 calls */
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 729
3178ee4c
MN
/* a plain copy involves no averaging, so the "no rounding" put variants
 * are the same functions as the normal puts */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
732
826f429a
MN
/* One output row of the vertical qpel lowpass filter with taps
 * (20,-6,3,-1): computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, packs to
 * bytes and emits via OP.  m3..m6 hold the already-paired inner rows;
 * in0/in1/in2/in7 are the outer-row memory operands.  Clobbers mm4-mm6
 * and uses mm7 as OP scratch.  pw_20/pw_3 parameters are unused here —
 * the constants are referenced through MANGLE() instead. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4	\n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
    "movq "#in7", " #m3 "		\n\t" /* d */\
    "movq "#in0", %%mm5		\n\t" /* D */\
    "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
    "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5		\n\t" /* C */\
    "movq "#in2", %%mm6		\n\t" /* B */\
    "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
    "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
    "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
    "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5		\n\t"\
    "packuswb %%mm5, %%mm5	\n\t"\
    OP(%%mm5, out, %%mm7, d)
753
3178ee4c 754#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
5c91a675 755static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
756 uint64_t temp;\
757\
758 asm volatile(\
759 "pxor %%mm7, %%mm7 \n\t"\
760 "1: \n\t"\
761 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
762 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
763 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
764 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
765 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
766 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
767 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
768 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
769 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
770 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
771 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
772 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
773 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
774 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
775 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
776 "paddw %%mm3, %%mm5 \n\t" /* b */\
777 "paddw %%mm2, %%mm6 \n\t" /* c */\
778 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
779 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
780 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 781 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
782 "paddw %%mm4, %%mm0 \n\t" /* a */\
783 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 784 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 785 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 786 "paddw %6, %%mm6 \n\t"\
826f429a
MN
787 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
788 "psraw $5, %%mm0 \n\t"\
c296f66b 789 "movq %%mm0, %5 \n\t"\
826f429a
MN
790 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
791 \
792 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
793 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
794 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
795 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
796 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
797 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
798 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
799 "paddw %%mm0, %%mm2 \n\t" /* b */\
800 "paddw %%mm5, %%mm3 \n\t" /* c */\
801 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
802 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
803 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
804 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
805 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
806 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
c296f66b 807 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a
MN
808 "paddw %%mm2, %%mm1 \n\t" /* a */\
809 "paddw %%mm6, %%mm4 \n\t" /* d */\
c296f66b 810 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
826f429a 811 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
c296f66b 812 "paddw %6, %%mm1 \n\t"\
826f429a
MN
813 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
814 "psraw $5, %%mm3 \n\t"\
c296f66b 815 "movq %5, %%mm1 \n\t"\
826f429a 816 "packuswb %%mm3, %%mm1 \n\t"\
3178ee4c 817 OP_MMX2(%%mm1, (%1),%%mm4, q)\
826f429a
MN
818 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
819 \
820 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
821 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
822 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
823 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
824 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
825 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
826 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
827 "paddw %%mm1, %%mm5 \n\t" /* b */\
828 "paddw %%mm4, %%mm0 \n\t" /* c */\
829 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
830 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
831 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
832 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
c296f66b 833 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
826f429a
MN
834 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
835 "paddw %%mm3, %%mm2 \n\t" /* d */\
836 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
837 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
838 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
839 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
840 "paddw %%mm2, %%mm6 \n\t" /* a */\
c296f66b
MN
841 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
842 "paddw %6, %%mm0 \n\t"\
826f429a
MN
843 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
844 "psraw $5, %%mm0 \n\t"\
845 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
846 \
847 "paddw %%mm5, %%mm3 \n\t" /* a */\
848 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
849 "paddw %%mm4, %%mm6 \n\t" /* b */\
850 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
851 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
852 "paddw %%mm1, %%mm4 \n\t" /* c */\
853 "paddw %%mm2, %%mm5 \n\t" /* d */\
854 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
855 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
c296f66b
MN
856 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
857 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
826f429a 858 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 859 "paddw %6, %%mm4 \n\t"\
826f429a
MN
860 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
861 "psraw $5, %%mm4 \n\t"\
862 "packuswb %%mm4, %%mm0 \n\t"\
3178ee4c 863 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
826f429a
MN
864 \
865 "addl %3, %0 \n\t"\
866 "addl %4, %1 \n\t"\
867 "decl %2 \n\t"\
868 " jnz 1b \n\t"\
5a508a98 869 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
870 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
871 : "memory"\
826f429a
MN
872 );\
873}\
874\
875static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
876 int i;\
877 int16_t temp[16];\
878 /* quick HACK, XXX FIXME MUST be optimized */\
879 for(i=0; i<h; i++)\
880 {\
881 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
882 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
883 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
884 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
885 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
886 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
887 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
888 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
889 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
890 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
891 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
892 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
893 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
894 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
895 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
896 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
897 asm volatile(\
898 "movq (%0), %%mm0 \n\t"\
899 "movq 8(%0), %%mm1 \n\t"\
900 "paddw %2, %%mm0 \n\t"\
901 "paddw %2, %%mm1 \n\t"\
902 "psraw $5, %%mm0 \n\t"\
903 "psraw $5, %%mm1 \n\t"\
904 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 905 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a
MN
906 "movq 16(%0), %%mm0 \n\t"\
907 "movq 24(%0), %%mm1 \n\t"\
908 "paddw %2, %%mm0 \n\t"\
909 "paddw %2, %%mm1 \n\t"\
910 "psraw $5, %%mm0 \n\t"\
911 "psraw $5, %%mm1 \n\t"\
912 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 913 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
826f429a 914 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 915 : "memory"\
826f429a
MN
916 );\
917 dst+=dstStride;\
918 src+=srcStride;\
919 }\
920}\
921\
5c91a675 922static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
923 uint64_t temp;\
924\
925 asm volatile(\
926 "pxor %%mm7, %%mm7 \n\t"\
927 "1: \n\t"\
928 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
929 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
930 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
931 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
932 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
933 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
934 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
935 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
936 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
937 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
938 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
939 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
940 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
941 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
942 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
943 "paddw %%mm3, %%mm5 \n\t" /* b */\
944 "paddw %%mm2, %%mm6 \n\t" /* c */\
945 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
946 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
947 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 948 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
949 "paddw %%mm4, %%mm0 \n\t" /* a */\
950 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 951 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 952 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 953 "paddw %6, %%mm6 \n\t"\
826f429a
MN
954 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
955 "psraw $5, %%mm0 \n\t"\
956 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
957 \
958 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
959 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
960 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
961 "paddw %%mm5, %%mm1 \n\t" /* a */\
962 "paddw %%mm6, %%mm2 \n\t" /* b */\
963 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
964 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
965 "paddw %%mm6, %%mm3 \n\t" /* c */\
966 "paddw %%mm5, %%mm4 \n\t" /* d */\
967 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
968 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
c296f66b
MN
969 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
970 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a 971 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 972 "paddw %6, %%mm1 \n\t"\
826f429a
MN
973 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
974 "psraw $5, %%mm3 \n\t"\
975 "packuswb %%mm3, %%mm0 \n\t"\
3178ee4c 976 OP_MMX2(%%mm0, (%1), %%mm4, q)\
826f429a
MN
977 \
978 "addl %3, %0 \n\t"\
979 "addl %4, %1 \n\t"\
980 "decl %2 \n\t"\
c296f66b 981 " jnz 1b \n\t"\
5a508a98 982 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
983 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
984 : "memory"\
826f429a
MN
985 );\
986}\
987\
988static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
989 int i;\
990 int16_t temp[8];\
991 /* quick HACK, XXX FIXME MUST be optimized */\
992 for(i=0; i<h; i++)\
993 {\
994 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
995 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
996 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
997 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
998 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
999 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1000 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1001 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1002 asm volatile(\
1003 "movq (%0), %%mm0 \n\t"\
1004 "movq 8(%0), %%mm1 \n\t"\
1005 "paddw %2, %%mm0 \n\t"\
1006 "paddw %2, %%mm1 \n\t"\
1007 "psraw $5, %%mm0 \n\t"\
1008 "psraw $5, %%mm1 \n\t"\
1009 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 1010 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a 1011 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 1012 :"memory"\
826f429a
MN
1013 );\
1014 dst+=dstStride;\
1015 src+=srcStride;\
1016 }\
3178ee4c
MN
1017}
1018
1019#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1020\
1021static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1022 uint64_t temp[17*4];\
1023 uint64_t *temp_ptr= temp;\
1024 int count= 17;\
1025\
1026 /*FIXME unroll */\
1027 asm volatile(\
1028 "pxor %%mm7, %%mm7 \n\t"\
1029 "1: \n\t"\
1030 "movq (%0), %%mm0 \n\t"\
1031 "movq (%0), %%mm1 \n\t"\
1032 "movq 8(%0), %%mm2 \n\t"\
1033 "movq 8(%0), %%mm3 \n\t"\
1034 "punpcklbw %%mm7, %%mm0 \n\t"\
1035 "punpckhbw %%mm7, %%mm1 \n\t"\
1036 "punpcklbw %%mm7, %%mm2 \n\t"\
1037 "punpckhbw %%mm7, %%mm3 \n\t"\
1038 "movq %%mm0, (%1) \n\t"\
1039 "movq %%mm1, 17*8(%1) \n\t"\
5a508a98
MN
1040 "movq %%mm2, 2*17*8(%1) \n\t"\
1041 "movq %%mm3, 3*17*8(%1) \n\t"\
3178ee4c
MN
1042 "addl $8, %1 \n\t"\
1043 "addl %3, %0 \n\t"\
1044 "decl %2 \n\t"\
1045 " jnz 1b \n\t"\
1046 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
5a508a98 1047 : "r" (srcStride)\
0b093b6f 1048 : "memory"\
3178ee4c
MN
1049 );\
1050 \
1051 temp_ptr= temp;\
1052 count=4;\
1053 \
1054/*FIXME reorder for speed */\
3178ee4c
MN
1055 asm volatile(\
1056 /*"pxor %%mm7, %%mm7 \n\t"*/\
3178ee4c
MN
1057 "1: \n\t"\
1058 "movq (%0), %%mm0 \n\t"\
1059 "movq 8(%0), %%mm1 \n\t"\
1060 "movq 16(%0), %%mm2 \n\t"\
1061 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1062 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1063 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
3178ee4c 1064 "addl %4, %1 \n\t"\
c296f66b 1065 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
3178ee4c 1066 \
c296f66b 1067 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
3178ee4c 1068 "addl %4, %1 \n\t"\
c296f66b
MN
1069 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1070 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
3178ee4c 1071 "addl %4, %1 \n\t"\
c296f66b
MN
1072 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1073 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
3178ee4c 1074 "addl %4, %1 \n\t"\
c296f66b
MN
1075 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1076 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
3178ee4c 1077 "addl %4, %1 \n\t"\
c296f66b
MN
1078 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1079 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
3178ee4c 1080 "addl %4, %1 \n\t"\
c296f66b 1081 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
3178ee4c 1082 \
c296f66b 1083 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
3178ee4c 1084 "addl %4, %1 \n\t" \
c296f66b
MN
1085 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1086 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
3178ee4c
MN
1087 \
1088 "addl $136, %0 \n\t"\
c296f66b 1089 "addl %6, %1 \n\t"\
3178ee4c
MN
1090 "decl %2 \n\t"\
1091 " jnz 1b \n\t"\
3178ee4c 1092 \
5a508a98 1093 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
c296f66b 1094 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
0b093b6f 1095 :"memory"\
3178ee4c 1096 );\
826f429a
MN
1097}\
1098\
5c91a675 1099static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
826f429a
MN
1100 uint64_t temp[9*4];\
1101 uint64_t *temp_ptr= temp;\
1102 int count= 9;\
1103\
1104 /*FIXME unroll */\
1105 asm volatile(\
1106 "pxor %%mm7, %%mm7 \n\t"\
1107 "1: \n\t"\
1108 "movq (%0), %%mm0 \n\t"\
1109 "movq (%0), %%mm1 \n\t"\
1110 "punpcklbw %%mm7, %%mm0 \n\t"\
1111 "punpckhbw %%mm7, %%mm1 \n\t"\
1112 "movq %%mm0, (%1) \n\t"\
1113 "movq %%mm1, 9*8(%1) \n\t"\
1114 "addl $8, %1 \n\t"\
1115 "addl %3, %0 \n\t"\
1116 "decl %2 \n\t"\
1117 " jnz 1b \n\t"\
1118 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1119 : "r" (srcStride)\
0b093b6f 1120 : "memory"\
826f429a
MN
1121 );\
1122 \
1123 temp_ptr= temp;\
1124 count=2;\
1125 \
1126/*FIXME reorder for speed */\
1127 asm volatile(\
1128 /*"pxor %%mm7, %%mm7 \n\t"*/\
1129 "1: \n\t"\
1130 "movq (%0), %%mm0 \n\t"\
1131 "movq 8(%0), %%mm1 \n\t"\
1132 "movq 16(%0), %%mm2 \n\t"\
1133 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1134 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1135 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
826f429a 1136 "addl %4, %1 \n\t"\
c296f66b 1137 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
826f429a 1138 \
c296f66b 1139 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
826f429a 1140 "addl %4, %1 \n\t"\
c296f66b 1141 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
826f429a 1142 \
c296f66b 1143 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
826f429a 1144 "addl %4, %1 \n\t"\
c296f66b
MN
1145 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1146 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
826f429a
MN
1147 \
1148 "addl $72, %0 \n\t"\
c296f66b 1149 "addl %6, %1 \n\t"\
826f429a
MN
1150 "decl %2 \n\t"\
1151 " jnz 1b \n\t"\
1152 \
c296f66b
MN
1153 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1154 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
0b093b6f
MN
1155 : "memory"\
1156 );\
3178ee4c 1157}\
826f429a 1158\
0c1a9eda 1159static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1160 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1161}\
1162\
0c1a9eda 1163static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1164 uint64_t temp[8];\
826f429a
MN
1165 uint8_t * const half= (uint8_t*)temp;\
1166 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1167 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1168}\
1169\
0c1a9eda 1170static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1171 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1172}\
1173\
0c1a9eda 1174static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1175 uint64_t temp[8];\
826f429a
MN
1176 uint8_t * const half= (uint8_t*)temp;\
1177 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1178 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1179}\
1180\
0c1a9eda 1181static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1182 uint64_t temp[8];\
826f429a 1183 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1184 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1185 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1186}\
1187\
0c1a9eda 1188static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1189 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1190}\
1191\
0c1a9eda 1192static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1193 uint64_t temp[8];\
826f429a 1194 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1195 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1196 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1197}\
0c1a9eda 1198static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1199 uint64_t half[8 + 9];\
1200 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1201 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1202 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1203 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1204 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1205 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1206}\
0c1a9eda 1207static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1208 uint64_t half[8 + 9];\
1209 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1210 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1211 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1212 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1213 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1214 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1215}\
0c1a9eda 1216static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1217 uint64_t half[8 + 9];\
1218 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1219 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1220 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1221 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1222 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1223 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1224}\
0c1a9eda 1225static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1226 uint64_t half[8 + 9];\
1227 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1228 uint8_t * const halfHV= ((uint8_t*)half);\
1229 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1230 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1231 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1232 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1233}\
0c1a9eda 1234static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1235 uint64_t half[8 + 9];\
826f429a
MN
1236 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1237 uint8_t * const halfHV= ((uint8_t*)half);\
1238 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1239 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1240 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1241}\
0c1a9eda 1242static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1243 uint64_t half[8 + 9];\
826f429a
MN
1244 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1245 uint8_t * const halfHV= ((uint8_t*)half);\
1246 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1247 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1248 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1249}\
0c1a9eda 1250static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1251 uint64_t half[8 + 9];\
1252 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1253 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1254 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1255 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1256}\
0c1a9eda 1257static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1258 uint64_t half[8 + 9];\
1259 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1260 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1261 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1262 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1263}\
0c1a9eda 1264static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1265 uint64_t half[9];\
826f429a
MN
1266 uint8_t * const halfH= ((uint8_t*)half);\
1267 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1268 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1269}\
0c1a9eda 1270static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1271 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
826f429a
MN
1272}\
1273\
0c1a9eda 1274static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1275 uint64_t temp[32];\
1276 uint8_t * const half= (uint8_t*)temp;\
1277 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1278 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1279}\
1280\
0c1a9eda 1281static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1282 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1283}\
1284\
0c1a9eda 1285static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1286 uint64_t temp[32];\
1287 uint8_t * const half= (uint8_t*)temp;\
1288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1289 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1290}\
1291\
0c1a9eda 1292static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1293 uint64_t temp[32];\
1294 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1295 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1296 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1297}\
1298\
0c1a9eda 1299static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1300 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1301}\
1302\
0c1a9eda 1303static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1304 uint64_t temp[32];\
1305 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1306 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1307 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1308}\
0c1a9eda 1309static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1310 uint64_t half[16*2 + 17*2];\
1311 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1312 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1313 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1314 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1315 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1316 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1317}\
0c1a9eda 1318static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1319 uint64_t half[16*2 + 17*2];\
1320 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1321 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1322 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1323 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1324 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1325 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1326}\
0c1a9eda 1327static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1328 uint64_t half[16*2 + 17*2];\
1329 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1330 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1331 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1332 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1333 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1334 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1335}\
0c1a9eda 1336static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1337 uint64_t half[16*2 + 17*2];\
1338 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1339 uint8_t * const halfHV= ((uint8_t*)half);\
1340 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1341 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1342 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1343 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1344}\
0c1a9eda 1345static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1346 uint64_t half[16*2 + 17*2];\
1347 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1348 uint8_t * const halfHV= ((uint8_t*)half);\
1349 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1350 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1351 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1352}\
0c1a9eda 1353static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1354 uint64_t half[16*2 + 17*2];\
1355 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1356 uint8_t * const halfHV= ((uint8_t*)half);\
1357 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1358 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1359 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1360}\
0c1a9eda 1361static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1362 uint64_t half[17*2];\
1363 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1364 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1365 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1366 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1367}\
0c1a9eda 1368static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1369 uint64_t half[17*2];\
1370 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1371 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1372 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1373 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1374}\
0c1a9eda 1375static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1376 uint64_t half[17*2];\
1377 uint8_t * const halfH= ((uint8_t*)half);\
1378 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1379 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1380}
1381
1382
1383#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1384#define AVG_3DNOW_OP(a,b,temp, size) \
826f429a
MN
1385"mov" #size " " #b ", " #temp " \n\t"\
1386"pavgusb " #temp ", " #a " \n\t"\
1387"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1388#define AVG_MMX2_OP(a,b,temp, size) \
826f429a
MN
1389"mov" #size " " #b ", " #temp " \n\t"\
1390"pavgb " #temp ", " #a " \n\t"\
1391"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1392
1393QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1394QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1395QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1396QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1397QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1398QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1399QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1400QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a
MN
1401QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1402
61a4e8ae 1403#if 0
d6a4c0b1 1404static void just_return() { return; }
61a4e8ae 1405#endif
d6a4c0b1 1406
826f429a
MN
1407#define SET_QPEL_FUNC(postfix1, postfix2) \
1408 c->put_ ## postfix1 = put_ ## postfix2;\
1409 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1410 c->avg_ ## postfix1 = avg_ ## postfix2;
1411
eb4b3dd3 1412void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
1413{
1414 mm_flags = mm_support();
1565dabc
LB
1415#if 0
1416 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1417 if (mm_flags & MM_MMX)
1565dabc 1418 fprintf(stderr, " mmx");
de6d9b64 1419 if (mm_flags & MM_MMXEXT)
1565dabc 1420 fprintf(stderr, " mmxext");
de6d9b64 1421 if (mm_flags & MM_3DNOW)
1565dabc 1422 fprintf(stderr, " 3dnow");
de6d9b64 1423 if (mm_flags & MM_SSE)
1565dabc 1424 fprintf(stderr, " sse");
de6d9b64 1425 if (mm_flags & MM_SSE2)
1565dabc
LB
1426 fprintf(stderr, " sse2");
1427 fprintf(stderr, "\n");
de6d9b64
FB
1428#endif
1429
1430 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
1431 c->get_pixels = get_pixels_mmx;
1432 c->diff_pixels = diff_pixels_mmx;
1433 c->put_pixels_clamped = put_pixels_clamped_mmx;
1434 c->add_pixels_clamped = add_pixels_clamped_mmx;
1435 c->clear_blocks = clear_blocks_mmx;
1436 c->pix_sum = pix_sum16_mmx;
1437
eb4b3dd3
ZK
1438 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1439 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1440 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1441 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1442
1443 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1444 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1445 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1446 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1447
1448 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1449 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1450 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1451 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1452
1453 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1454 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1455 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1456 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1457
1458 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1459 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1460 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1461 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1462
1463 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1464 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1465 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1466 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1467
1468 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1469 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1470 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1471 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1472
1473 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1474 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1475 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1476 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1477
11f18faf
MN
1478 c->add_bytes= add_bytes_mmx;
1479 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1480
1481 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1482 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1483
2a006cd3
FL
1484 c->pix_norm1 = pix_norm1_mmx;
1485 c->sse[0] = sse16_mmx;
1457ab52 1486
de6d9b64 1487 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1488 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1489 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1490 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1491 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1492
1493 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1494 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1495 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1496 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1497
1498 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1499 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1500 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1501 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1502
1503 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1504 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1505 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1506 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3178ee4c 1507
c296f66b 1508#if 1
826f429a
MN
1509 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1510 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1511 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1512 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1513 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1514 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1515 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1516 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1517 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1518 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1519 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1520 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1521 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1522 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1523 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1524 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1525 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1526 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1527 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1528 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1529 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1530 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1531 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1532 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1533 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1534 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1535 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1536 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1537 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1538 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1539 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1540 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
c296f66b 1541#endif
de6d9b64 1542 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
1543 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1544 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1545 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1546 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1547
1548 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1549 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1550 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1551 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1552
1553 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1554 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1555 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1556 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1557
1558 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1559 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1560 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1561 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
db794953 1562
826f429a
MN
1563 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1564 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1565 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1566 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1567 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1568 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1569 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1570 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1571 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1572 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1573 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1574 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1575 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1576 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1577 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1578 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1579 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1580 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1581 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1582 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1583 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1584 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1585 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1586 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1587 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1588 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1589 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1590 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1591 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1592 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1593 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1594 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1595 }
1596 }
5c91a675 1597 dsputil_init_pix_mmx(c, mask);
d6a4c0b1
ZK
1598#if 0
1599 // for speed testing
1600 get_pixels = just_return;
1601 put_pixels_clamped = just_return;
1602 add_pixels_clamped = just_return;
1603
1604 pix_abs16x16 = just_return;
1605 pix_abs16x16_x2 = just_return;
1606 pix_abs16x16_y2 = just_return;
1607 pix_abs16x16_xy2 = just_return;
1608
1609 put_pixels_tab[0] = just_return;
1610 put_pixels_tab[1] = just_return;
1611 put_pixels_tab[2] = just_return;
1612 put_pixels_tab[3] = just_return;
1613
1614 put_no_rnd_pixels_tab[0] = just_return;
1615 put_no_rnd_pixels_tab[1] = just_return;
1616 put_no_rnd_pixels_tab[2] = just_return;
1617 put_no_rnd_pixels_tab[3] = just_return;
1618
1619 avg_pixels_tab[0] = just_return;
1620 avg_pixels_tab[1] = just_return;
1621 avg_pixels_tab[2] = just_return;
1622 avg_pixels_tab[3] = just_return;
1623
1624 avg_no_rnd_pixels_tab[0] = just_return;
1625 avg_no_rnd_pixels_tab[1] = just_return;
1626 avg_no_rnd_pixels_tab[2] = just_return;
1627 avg_no_rnd_pixels_tab[3] = just_return;
1628
d6a4c0b1
ZK
1629 //av_fdct = just_return;
1630 //ff_idct = just_return;
1631#endif
de6d9b64 1632}
4f12a497
FB
1633
1634/* remove any non bit exact operation (testing purpose). NOTE that
1635 this function should be kept as small as possible because it is
1636 always difficult to test automatically non bit exact cases. */
eb4b3dd3 1637void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1638{
1639 if (mm_flags & MM_MMX) {
b3184779 1640 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1641 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1642 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1643 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1644 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1645 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1646 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4f12a497 1647 }
5c91a675 1648 dsputil_set_bit_exact_pix_mmx(c, mask);
4f12a497 1649}