-fPIC compileable
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
23
/* Runtime CPU capability bitmask (MM_MMX, MM_MMXEXT, ...), filled in at init. */
int mm_flags; /* multimedia extension flags */
/* FIXME use them in static form */
void dsputil_init_pix_mmx(DSPContext* c, unsigned mask);
de6d9b64 28/* pixel operations */
/* 64-bit packed constants used as "m" operands by the MMX code below.
 * 8-byte alignment guarantees a single aligned movq load.
 * mm_* are byte/word splat values; ff_pw_* are packed-word filter
 * coefficients for the qpel (quarter-pel) lowpass filters. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 8 x 0x01 bytes  */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 x 1   words   */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 x 2   words   */

static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; /* 4 x 20 words */
static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;  /* 4 x 3  words */
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; /* 4 x 16 words */
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; /* 4 x 15 words */
37
/* Align the next asm jump target to an 8-byte boundary. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Zero an MMX register: regd = 0. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* Load 4 x 0x0001 words into regd without touching memory:
 * pcmpeqd gives all-ones, psrlw $15 leaves the low bit of each word. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Load 0xfefefefefefefefe into regd (byte mask used by the averaging
 * macros): all-ones bytes doubled -> 0xfe per byte. */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: the constants above can be referenced directly. */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* PIC build: synthesize the constants in-register to avoid a GOT-relative
 * memory reference (-fPIC compileable, see file header). */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
70
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average without rounding: regr = (rega + regb) >> 1,
 * computed as (a & b) + (((a ^ b) & 0xfe) >> 1) to avoid overflow. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb " 	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"

/* Byte-wise average with rounding: regr = (rega + regb + 1) >> 1,
 * computed as (a | b) - (((a ^ b) & 0xfe) >> 1). */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pand " #regfe "," #regb "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* Paired variant of PAVGB_MMX_NO_RND: averages (rega,regb) into regr and
 * (regc,regd) into regp in one interleaved sequence; uses mm6 as the
 * 0xfe mask, trashes regb and regd. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "pand " #regb ", " #regr "	\n\t"\
    "pand " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regb " 	\n\t"\
    "psrlq $1, " #regd " 	\n\t"\
    "paddb " #regb ", " #regr "	\n\t"\
    "paddb " #regd ", " #regp "	\n\t"

/* Paired variant of PAVGB_MMX (rounding); same register contract. */
#define PAVGBP_MMX(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "	\n\t"\
    "movq " #regc ", " #regp "	\n\t"\
    "por  " #regb ", " #regr "	\n\t"\
    "por  " #regd ", " #regp "	\n\t"\
    "pxor " #rega ", " #regb "	\n\t"\
    "pxor " #regc ", " #regd "	\n\t"\
    "pand %%mm6, " #regb "	\n\t"\
    "pand %%mm6, " #regd "	\n\t"\
    "psrlq $1, " #regd "	\n\t"\
    "psrlq $1, " #regb "	\n\t"\
    "psubb " #regb ", " #regr "	\n\t"\
    "psubb " #regd ", " #regp "	\n\t"
118
/***********************************/
/* MMX no rounding */
/* Instantiate the generic put/avg pixel templates in dsputil_mmx_rnd.h
 * twice: once with the no-rounding averaging macros (names get a
 * "_no_rnd_" infix), once with the rounding ones. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)		PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
a7bd8797 146
/***********************************/
/* 3Dnow specific */

/* Instantiate the single-instruction-average templates in
 * dsputil_mmx_avg.h for 3DNow! (pavgusb) and MMX2 (pavgb). */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is prefered */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
171
172/***********************************/
173/* standard MMX */
174
/* Expand an 8x8 block of 8-bit pixels into 64 16-bit DCTELEMs in block[].
 * Two rows (16 pixels) are widened per iteration via punpck{l,h}bw with
 * a zero register; %%eax counts -128..0 so the loop ends on a sign test. */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax	\n\t"
        "pxor %%mm7, %%mm7	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%0, %2), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "movq %%mm0, (%1, %%eax)\n\t"
        "movq %%mm1, 8(%1, %%eax)\n\t"
        "movq %%mm2, 16(%1, %%eax)\n\t"
        "movq %%mm3, 24(%1, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl $32, %%eax	\n\t"
        "js 1b			\n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
202
/* block[i] = s1[i] - s2[i] for an 8x8 region, widened to 16-bit words.
 * One row (8 pixels from each source) per iteration; %%eax runs -128..0
 * as the output offset relative to block+64. */
static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7	\n\t"
        "movl $-128, %%eax	\n\t"
        ".balign 16		\n\t"
        "1:			\n\t"
        "movq (%0), %%mm0	\n\t"
        "movq (%1), %%mm2	\n\t"
        "movq %%mm0, %%mm1	\n\t"
        "movq %%mm2, %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0	\n\t"
        "punpckhbw %%mm7, %%mm1	\n\t"
        "punpcklbw %%mm7, %%mm2	\n\t"
        "punpckhbw %%mm7, %%mm3	\n\t"
        "psubw %%mm2, %%mm0	\n\t"
        "psubw %%mm3, %%mm1	\n\t"
        "movq %%mm0, (%2, %%eax)\n\t"
        "movq %%mm1, 8(%2, %%eax)\n\t"
        "addl %3, %0		\n\t"
        "addl %3, %1		\n\t"
        "addl $16, %%eax	\n\t"
        "jnz 1b			\n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
231
/* Store a 64-coefficient IDCT output block as 8x8 unsigned pixels,
 * clamping each 16-bit value to 0..255 via packuswb. Two fully unrolled
 * asm statements each convert 4 rows (32 coefficients). */
void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq	%3, %%mm0\n\t"
        "movq	8%3, %%mm1\n\t"
        "movq	16%3, %%mm2\n\t"
        "movq	24%3, %%mm3\n\t"
        "movq	32%3, %%mm4\n\t"
        "movq	40%3, %%mm5\n\t"
        "movq	48%3, %%mm6\n\t"
        "movq	56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq	(%3), %%mm0\n\t"
        "movq	8(%3), %%mm1\n\t"
        "movq	16(%3), %%mm2\n\t"
        "movq	24(%3), %%mm3\n\t"
        "movq	32(%3), %%mm4\n\t"
        "movq	40(%3), %%mm5\n\t"
        "movq	48(%3), %%mm6\n\t"
        "movq	56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq	%%mm0, (%0)\n\t"
        "movq	%%mm2, (%0, %1)\n\t"
        "movq	%%mm4, (%0, %1, 2)\n\t"
        "movq	%%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
286
/* Add a 64-coefficient IDCT residual block onto an existing 8x8 pixel
 * area, with signed-saturating word adds (paddsw) and a final unsigned
 * clamp to 0..255 (packuswb). Two rows per iteration, four iterations. */
void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);                 /* mm7 = 0, used to widen bytes to words */
    i = 4;
    do {
        __asm __volatile(
            "movq	(%2), %%mm0\n\t"
            "movq	8(%2), %%mm1\n\t"
            "movq	16(%2), %%mm2\n\t"
            "movq	24(%2), %%mm3\n\t"
            "movq	%0, %%mm4\n\t"
            "movq	%1, %%mm6\n\t"
            "movq	%%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm4, %%mm0\n\t"
            "paddsw	%%mm5, %%mm1\n\t"
            "movq	%%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw	%%mm6, %%mm2\n\t"
            "paddsw	%%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "movq	%%mm0, %0\n\t"
            "movq	%%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
327
/* Copy an 8-pixel-wide block of h rows from pixels to block (both with
 * the same line_size stride). The loop is unrolled 4 rows deep, so h is
 * expected to be a multiple of 4 (caller contract in dsputil). */
static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
	 "lea (%3, %3), %%eax		\n\t"	/* eax = 2*line_size */
	 ".balign 8			\n\t"
	 "1:				\n\t"
	 "movq (%1), %%mm0		\n\t"
	 "movq (%1, %3), %%mm1		\n\t"
	 "movq %%mm0, (%2)		\n\t"
	 "movq %%mm1, (%2, %3)		\n\t"
	 "addl %%eax, %1		\n\t"
	 "addl %%eax, %2       		\n\t"
	 "movq (%1), %%mm0		\n\t"
	 "movq (%1, %3), %%mm1		\n\t"
	 "movq %%mm0, (%2)		\n\t"
	 "movq %%mm1, (%2, %3)		\n\t"
	 "addl %%eax, %1		\n\t"
	 "addl %%eax, %2       		\n\t"
	 "subl $4, %0			\n\t"
	 "jnz 1b			\n\t"
	 : "+g"(h), "+r" (pixels),  "+r" (block)
	 : "r"(line_size)
	 : "%eax", "memory"
	);
}
353
/* Copy a 16-pixel-wide block of h rows (two movq per row). Unrolled
 * 4 rows deep like put_pixels8_mmx; h must be a multiple of 4. */
static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
	 "lea (%3, %3), %%eax		\n\t"	/* eax = 2*line_size */
	 ".balign 8			\n\t"
	 "1:				\n\t"
	 "movq (%1), %%mm0		\n\t"
	 "movq 8(%1), %%mm4		\n\t"
	 "movq (%1, %3), %%mm1		\n\t"
	 "movq 8(%1, %3), %%mm5		\n\t"
	 "movq %%mm0, (%2)		\n\t"
	 "movq %%mm4, 8(%2)		\n\t"
	 "movq %%mm1, (%2, %3)		\n\t"
	 "movq %%mm5, 8(%2, %3)		\n\t"
	 "addl %%eax, %1		\n\t"
	 "addl %%eax, %2       		\n\t"
	 "movq (%1), %%mm0		\n\t"
	 "movq 8(%1), %%mm4		\n\t"
	 "movq (%1, %3), %%mm1		\n\t"
	 "movq 8(%1, %3), %%mm5		\n\t"
	 "movq %%mm0, (%2)		\n\t"
	 "movq %%mm4, 8(%2)		\n\t"
	 "movq %%mm1, (%2, %3)		\n\t"
	 "movq %%mm5, 8(%2, %3)		\n\t"
	 "addl %%eax, %1		\n\t"
	 "addl %%eax, %2       		\n\t"
	 "subl $4, %0			\n\t"
	 "jnz 1b			\n\t"
	 : "+g"(h), "+r" (pixels),  "+r" (block)
	 : "r"(line_size)
	 : "%eax", "memory"
	);
}
387
/* Zero 6 consecutive 64-element DCTELEM blocks (6*128 bytes), 32 bytes
 * per iteration. The base pointer is biased by +128*6 so %%eax can count
 * from -128*6 up to 0 and terminate on a sign test.
 * NOTE(review): the (int) pointer cast is only valid on 32-bit targets —
 * fine here since this whole file is i386-specific asm. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7		\n\t"
                "movl $-128*6, %%eax	\n\t"
                "1:			\n\t"
                "movq %%mm7, (%0, %%eax)	\n\t"
                "movq %%mm7, 8(%0, %%eax)	\n\t"
                "movq %%mm7, 16(%0, %%eax)	\n\t"
                "movq %%mm7, 24(%0, %%eax)	\n\t"
                "addl $32, %%eax	\n\t"
                " js 1b			\n\t"
                : : "r" (((int)blocks)+128*6)
                : "%eax"
        );
}
404
/* Sum all 256 pixels of a 16x16 block. Bytes are widened to words and
 * accumulated in mm6; the final horizontal reduction folds the four
 * word lanes together, and the low 16 bits are returned (16*16*255
 * fits in 16 bits, so no overflow). Negative index counts up to 0. */
static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    int index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7		\n\t"
                "pxor %%mm6, %%mm6		\n\t"
                "1:				\n\t"
                "movq (%2, %1), %%mm0		\n\t"
                "movq (%2, %1), %%mm1		\n\t"
                "movq 8(%2, %1), %%mm2		\n\t"
                "movq 8(%2, %1), %%mm3		\n\t"
                "punpcklbw %%mm7, %%mm0		\n\t"
                "punpckhbw %%mm7, %%mm1		\n\t"
                "punpcklbw %%mm7, %%mm2		\n\t"
                "punpckhbw %%mm7, %%mm3		\n\t"
                "paddw %%mm0, %%mm1		\n\t"
                "paddw %%mm2, %%mm3		\n\t"
                "paddw %%mm1, %%mm3		\n\t"
                "paddw %%mm3, %%mm6		\n\t"
                "addl %3, %1			\n\t"
                " js 1b				\n\t"
                "movq %%mm6, %%mm5		\n\t"
                "psrlq $32, %%mm6		\n\t"
                "paddw %%mm5, %%mm6		\n\t"
                "movq %%mm6, %%mm5		\n\t"
                "psrlq $16, %%mm6		\n\t"
                "paddw %%mm5, %%mm6		\n\t"
                "movd %%mm6, %0			\n\t"
                "andl $0xFFFF, %0		\n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" (line_size)
        );

        return sum;
}
442
/* dst[i] += src[i] for i in [0, w), byte-wise with wraparound (paddb).
 * The asm loop handles 16 bytes at a time up to w-15; the scalar tail
 * loop finishes the remaining (up to 15) bytes. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%1, %0), %%mm0		\n\t"
        "movq  (%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%2, %0)		\n\t"
        "movq 8(%1, %0), %%mm0		\n\t"
        "movq 8(%2, %0), %%mm1		\n\t"
        "paddb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%2, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %3, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
464
/* Sum of squares of all pixels in a 16x16 block (the "norm1" used for
 * scene-change / quality metrics). Words are squared pairwise with
 * pmaddwd and accumulated as 32-bit dwords in mm7. */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"	/* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"	/* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"	/* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"	/* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"	/* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"	/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "addl %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
507
/* Sum of squared errors between two 16x16 blocks. Absolute byte
 * differences are built with the dual-psubusb/por trick (MMX has no
 * byte abs), widened to words, squared with pmaddwd and accumulated
 * as dwords in mm7. The first parameter (context) is unused. */
static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"	/* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: substract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "addl %3,%0\n"
      "addl %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
    return tmp;
}
565
/* dst[i] = src1[i] - src2[i] for i in [0, w), byte-wise with wraparound
 * (psubb). 16 bytes per asm iteration up to w-15; scalar tail loop
 * finishes the rest. Counterpart of add_bytes_mmx. */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i=0;
    asm volatile(
        "1:				\n\t"
        "movq  (%2, %0), %%mm0		\n\t"
        "movq  (%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, (%3, %0)		\n\t"
        "movq 8(%2, %0), %%mm0		\n\t"
        "movq 8(%1, %0), %%mm1		\n\t"
        "psubb %%mm0, %%mm1		\n\t"
        "movq %%mm1, 8(%3, %0)		\n\t"
        "addl $16, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* Butterfly step of the Hadamard transform on two word registers:
 * on exit a = a+b, b = (old a) - (old b)... expressed without a temp:
 * a += b; b += b; b -= a. */
#define LBUTTERFLY(a,b)\
	"paddw " #a ", " #b "		\n\t"\
	"paddw " #b ", " #b "		\n\t"\
	"psubw " #a ", " #b "		\n\t"

/* Full 3-stage 8-point Hadamard butterfly network over mm0..mm7
 * (4 words per register, i.e. 4 independent transforms in parallel). */
#define HADAMARD48\
	LBUTTERFLY(%%mm0, %%mm1)\
	LBUTTERFLY(%%mm2, %%mm3)\
	LBUTTERFLY(%%mm4, %%mm5)\
	LBUTTERFLY(%%mm6, %%mm7)\
	\
	LBUTTERFLY(%%mm0, %%mm2)\
	LBUTTERFLY(%%mm1, %%mm3)\
	LBUTTERFLY(%%mm4, %%mm6)\
	LBUTTERFLY(%%mm5, %%mm7)\
	\
	LBUTTERFLY(%%mm0, %%mm4)\
	LBUTTERFLY(%%mm1, %%mm5)\
	LBUTTERFLY(%%mm2, %%mm6)\
	LBUTTERFLY(%%mm3, %%mm7)

/* Word-wise absolute value of a; z is trashed (becomes the sign mask). */
#define MMABS(a,z)\
	"pxor " #z ", " #z "		\n\t"\
	"pcmpgtw " #a ", " #z "		\n\t"\
	"pxor " #z ", " #a "		\n\t"\
	"psubw " #z ", " #a "		\n\t"

/* MMABS followed by saturating accumulation: sum += |a| (paddusw). */
#define MMABS_SUM(a,z, sum)\
	"pxor " #z ", " #z "		\n\t"\
	"pcmpgtw " #a ", " #z "		\n\t"\
	"pxor " #z ", " #a "		\n\t"\
	"psubw " #z ", " #a "		\n\t"\
	"paddusw " #a ", " #sum "	\n\t"

/* Interleave low/high words (or dwords, per n suffix) of a and b:
 * a gets the low halves, t the high halves; used to build transposes. */
#define SBUTTERFLY(a,b,t,n)\
	"movq " #a ", " #t "		\n\t" /* abcd */\
	"punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
	"punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\

/* Transpose a 4x4 word matrix held in a,b,c,d using t as scratch.
 * Note the permuted output register assignment (see comments). */
#define TRANSPOSE4(a,b,c,d,t)\
	SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
	SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
	SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
	SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* Load / store four rows of the 16-bit temp matrix at byte offset o
 * (rows are 16 bytes apart; operand %1 is the temp base pointer). */
#define LOAD4(o, a, b, c, d)\
	"movq "#o"(%1), " #a "		\n\t"\
	"movq "#o"+16(%1), " #b "	\n\t"\
	"movq "#o"+32(%1), " #c "	\n\t"\
	"movq "#o"+48(%1), " #d "	\n\t"

#define STORE4(o, a, b, c, d)\
	"movq "#a", "#o"(%1)		\n\t"\
	"movq "#b", "#o"+16(%1)		\n\t"\
	"movq "#c", "#o"+32(%1)		\n\t"\
	"movq "#d", "#o"+48(%1)		\n\t"\

/* SATD-style 8x8 comparison: difference of src1/src2 (via
 * diff_pixels_mmx), 2-D 8x8 Hadamard transform (rows then columns,
 * done as 4x4 sub-tiles with transposes through the temp buffer),
 * then the sum of absolute transform coefficients. The unused first
 * parameter matches the generic compare-function signature. */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
    uint64_t temp[16] __align8;
    int sum=0;

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 		\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        /* horizontal reduction of the four word lanes of mm0 */
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0		\n\t"
        "movd %%mm0, %0			\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}

/* Generate the 16x16 variant by summing four 8x8 calls. */
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 731
/* For plain MMX, "no rounding" put is identical to the normal put. */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

/* One output row of the vertical MPEG-4 qpel 6-tap lowpass filter:
 * out = clip((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5), where the taps
 * come from the row registers/memory operands. Trashes mm4-mm6; OP is
 * the final put/avg store macro. Constants are referenced via MANGLE
 * so the asm also assembles in PIC builds. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
        "movq "MANGLE(ff_pw_20)", %%mm4	\n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
        "movq "#in7", " #m3 "		\n\t" /* d */\
        "movq "#in0", %%mm5		\n\t" /* D */\
        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
        "movq "#in1", %%mm5		\n\t" /* C */\
        "movq "#in2", %%mm6		\n\t" /* B */\
        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
        "psraw $5, %%mm5		\n\t"\
        "packuswb %%mm5, %%mm5		\n\t"\
        OP(%%mm5, out, %%mm7, d)

3178ee4c 756#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
5c91a675 757static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
758 uint64_t temp;\
759\
760 asm volatile(\
761 "pxor %%mm7, %%mm7 \n\t"\
762 "1: \n\t"\
763 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
764 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
765 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
766 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
767 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
768 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
769 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
770 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
771 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
772 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
773 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
774 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
775 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
776 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
777 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
778 "paddw %%mm3, %%mm5 \n\t" /* b */\
779 "paddw %%mm2, %%mm6 \n\t" /* c */\
780 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
781 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
782 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 783 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
784 "paddw %%mm4, %%mm0 \n\t" /* a */\
785 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 786 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 787 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 788 "paddw %6, %%mm6 \n\t"\
826f429a
MN
789 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
790 "psraw $5, %%mm0 \n\t"\
c296f66b 791 "movq %%mm0, %5 \n\t"\
826f429a
MN
792 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
793 \
794 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
795 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
796 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
797 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
798 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
799 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
800 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
801 "paddw %%mm0, %%mm2 \n\t" /* b */\
802 "paddw %%mm5, %%mm3 \n\t" /* c */\
803 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
804 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
805 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
806 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
807 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
808 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
c296f66b 809 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a
MN
810 "paddw %%mm2, %%mm1 \n\t" /* a */\
811 "paddw %%mm6, %%mm4 \n\t" /* d */\
c296f66b 812 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
826f429a 813 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
c296f66b 814 "paddw %6, %%mm1 \n\t"\
826f429a
MN
815 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
816 "psraw $5, %%mm3 \n\t"\
c296f66b 817 "movq %5, %%mm1 \n\t"\
826f429a 818 "packuswb %%mm3, %%mm1 \n\t"\
3178ee4c 819 OP_MMX2(%%mm1, (%1),%%mm4, q)\
826f429a
MN
820 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
821 \
822 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
823 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
824 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
825 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
826 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
827 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
828 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
829 "paddw %%mm1, %%mm5 \n\t" /* b */\
830 "paddw %%mm4, %%mm0 \n\t" /* c */\
831 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
832 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
833 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
834 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
c296f66b 835 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
826f429a
MN
836 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
837 "paddw %%mm3, %%mm2 \n\t" /* d */\
838 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
839 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
840 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
841 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
842 "paddw %%mm2, %%mm6 \n\t" /* a */\
c296f66b
MN
843 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
844 "paddw %6, %%mm0 \n\t"\
826f429a
MN
845 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
846 "psraw $5, %%mm0 \n\t"\
847 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
848 \
849 "paddw %%mm5, %%mm3 \n\t" /* a */\
850 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
851 "paddw %%mm4, %%mm6 \n\t" /* b */\
852 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
853 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
854 "paddw %%mm1, %%mm4 \n\t" /* c */\
855 "paddw %%mm2, %%mm5 \n\t" /* d */\
856 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
857 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
c296f66b
MN
858 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
859 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
826f429a 860 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 861 "paddw %6, %%mm4 \n\t"\
826f429a
MN
862 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
863 "psraw $5, %%mm4 \n\t"\
864 "packuswb %%mm4, %%mm0 \n\t"\
3178ee4c 865 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
826f429a
MN
866 \
867 "addl %3, %0 \n\t"\
868 "addl %4, %1 \n\t"\
869 "decl %2 \n\t"\
870 " jnz 1b \n\t"\
5a508a98 871 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
872 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
873 : "memory"\
826f429a
MN
874 );\
875}\
876\
877static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
878 int i;\
879 int16_t temp[16];\
880 /* quick HACK, XXX FIXME MUST be optimized */\
881 for(i=0; i<h; i++)\
882 {\
883 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
884 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
885 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
886 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
887 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
888 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
889 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
890 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
891 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
892 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
893 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
894 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
895 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
896 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
897 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
898 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
899 asm volatile(\
900 "movq (%0), %%mm0 \n\t"\
901 "movq 8(%0), %%mm1 \n\t"\
902 "paddw %2, %%mm0 \n\t"\
903 "paddw %2, %%mm1 \n\t"\
904 "psraw $5, %%mm0 \n\t"\
905 "psraw $5, %%mm1 \n\t"\
906 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 907 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a
MN
908 "movq 16(%0), %%mm0 \n\t"\
909 "movq 24(%0), %%mm1 \n\t"\
910 "paddw %2, %%mm0 \n\t"\
911 "paddw %2, %%mm1 \n\t"\
912 "psraw $5, %%mm0 \n\t"\
913 "psraw $5, %%mm1 \n\t"\
914 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 915 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
826f429a 916 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 917 : "memory"\
826f429a
MN
918 );\
919 dst+=dstStride;\
920 src+=srcStride;\
921 }\
922}\
923\
5c91a675 924static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
826f429a
MN
925 uint64_t temp;\
926\
927 asm volatile(\
928 "pxor %%mm7, %%mm7 \n\t"\
929 "1: \n\t"\
930 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
931 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
932 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
933 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
934 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
935 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
936 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
937 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
938 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
939 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
940 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
941 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
942 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
943 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
944 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
945 "paddw %%mm3, %%mm5 \n\t" /* b */\
946 "paddw %%mm2, %%mm6 \n\t" /* c */\
947 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
948 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
949 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
c296f66b 950 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
826f429a
MN
951 "paddw %%mm4, %%mm0 \n\t" /* a */\
952 "paddw %%mm1, %%mm5 \n\t" /* d */\
c296f66b 953 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
826f429a 954 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
c296f66b 955 "paddw %6, %%mm6 \n\t"\
826f429a
MN
956 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
957 "psraw $5, %%mm0 \n\t"\
958 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
959 \
960 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
961 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
962 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
963 "paddw %%mm5, %%mm1 \n\t" /* a */\
964 "paddw %%mm6, %%mm2 \n\t" /* b */\
965 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
966 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
967 "paddw %%mm6, %%mm3 \n\t" /* c */\
968 "paddw %%mm5, %%mm4 \n\t" /* d */\
969 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
970 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
c296f66b
MN
971 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
972 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
826f429a 973 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
c296f66b 974 "paddw %6, %%mm1 \n\t"\
826f429a
MN
975 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
976 "psraw $5, %%mm3 \n\t"\
977 "packuswb %%mm3, %%mm0 \n\t"\
3178ee4c 978 OP_MMX2(%%mm0, (%1), %%mm4, q)\
826f429a
MN
979 \
980 "addl %3, %0 \n\t"\
981 "addl %4, %1 \n\t"\
982 "decl %2 \n\t"\
c296f66b 983 " jnz 1b \n\t"\
5a508a98 984 : "+a"(src), "+c"(dst), "+m"(h)\
0b093b6f
MN
985 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
986 : "memory"\
826f429a
MN
987 );\
988}\
989\
990static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
991 int i;\
992 int16_t temp[8];\
993 /* quick HACK, XXX FIXME MUST be optimized */\
994 for(i=0; i<h; i++)\
995 {\
996 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
997 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
998 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
999 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1000 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1001 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1002 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1003 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1004 asm volatile(\
1005 "movq (%0), %%mm0 \n\t"\
1006 "movq 8(%0), %%mm1 \n\t"\
1007 "paddw %2, %%mm0 \n\t"\
1008 "paddw %2, %%mm1 \n\t"\
1009 "psraw $5, %%mm0 \n\t"\
1010 "psraw $5, %%mm1 \n\t"\
1011 "packuswb %%mm1, %%mm0 \n\t"\
3178ee4c 1012 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
826f429a 1013 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
0b093b6f 1014 :"memory"\
826f429a
MN
1015 );\
1016 dst+=dstStride;\
1017 src+=srcStride;\
1018 }\
3178ee4c
MN
1019}
1020
1021#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1022\
1023static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1024 uint64_t temp[17*4];\
1025 uint64_t *temp_ptr= temp;\
1026 int count= 17;\
1027\
1028 /*FIXME unroll */\
1029 asm volatile(\
1030 "pxor %%mm7, %%mm7 \n\t"\
1031 "1: \n\t"\
1032 "movq (%0), %%mm0 \n\t"\
1033 "movq (%0), %%mm1 \n\t"\
1034 "movq 8(%0), %%mm2 \n\t"\
1035 "movq 8(%0), %%mm3 \n\t"\
1036 "punpcklbw %%mm7, %%mm0 \n\t"\
1037 "punpckhbw %%mm7, %%mm1 \n\t"\
1038 "punpcklbw %%mm7, %%mm2 \n\t"\
1039 "punpckhbw %%mm7, %%mm3 \n\t"\
1040 "movq %%mm0, (%1) \n\t"\
1041 "movq %%mm1, 17*8(%1) \n\t"\
5a508a98
MN
1042 "movq %%mm2, 2*17*8(%1) \n\t"\
1043 "movq %%mm3, 3*17*8(%1) \n\t"\
3178ee4c
MN
1044 "addl $8, %1 \n\t"\
1045 "addl %3, %0 \n\t"\
1046 "decl %2 \n\t"\
1047 " jnz 1b \n\t"\
1048 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
5a508a98 1049 : "r" (srcStride)\
0b093b6f 1050 : "memory"\
3178ee4c
MN
1051 );\
1052 \
1053 temp_ptr= temp;\
1054 count=4;\
1055 \
1056/*FIXME reorder for speed */\
3178ee4c
MN
1057 asm volatile(\
1058 /*"pxor %%mm7, %%mm7 \n\t"*/\
3178ee4c
MN
1059 "1: \n\t"\
1060 "movq (%0), %%mm0 \n\t"\
1061 "movq 8(%0), %%mm1 \n\t"\
1062 "movq 16(%0), %%mm2 \n\t"\
1063 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1064 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1065 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
3178ee4c 1066 "addl %4, %1 \n\t"\
c296f66b 1067 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
3178ee4c 1068 \
c296f66b 1069 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
3178ee4c 1070 "addl %4, %1 \n\t"\
c296f66b
MN
1071 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1072 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
3178ee4c 1073 "addl %4, %1 \n\t"\
c296f66b
MN
1074 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1075 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
3178ee4c 1076 "addl %4, %1 \n\t"\
c296f66b
MN
1077 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1078 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
3178ee4c 1079 "addl %4, %1 \n\t"\
c296f66b
MN
1080 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1081 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
3178ee4c 1082 "addl %4, %1 \n\t"\
c296f66b 1083 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
3178ee4c 1084 \
c296f66b 1085 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
3178ee4c 1086 "addl %4, %1 \n\t" \
c296f66b
MN
1087 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1088 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
3178ee4c
MN
1089 \
1090 "addl $136, %0 \n\t"\
c296f66b 1091 "addl %6, %1 \n\t"\
3178ee4c
MN
1092 "decl %2 \n\t"\
1093 " jnz 1b \n\t"\
3178ee4c 1094 \
5a508a98 1095 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
c296f66b 1096 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
0b093b6f 1097 :"memory"\
3178ee4c 1098 );\
826f429a
MN
1099}\
1100\
5c91a675 1101static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
826f429a
MN
1102 uint64_t temp[9*4];\
1103 uint64_t *temp_ptr= temp;\
1104 int count= 9;\
1105\
1106 /*FIXME unroll */\
1107 asm volatile(\
1108 "pxor %%mm7, %%mm7 \n\t"\
1109 "1: \n\t"\
1110 "movq (%0), %%mm0 \n\t"\
1111 "movq (%0), %%mm1 \n\t"\
1112 "punpcklbw %%mm7, %%mm0 \n\t"\
1113 "punpckhbw %%mm7, %%mm1 \n\t"\
1114 "movq %%mm0, (%1) \n\t"\
1115 "movq %%mm1, 9*8(%1) \n\t"\
1116 "addl $8, %1 \n\t"\
1117 "addl %3, %0 \n\t"\
1118 "decl %2 \n\t"\
1119 " jnz 1b \n\t"\
1120 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1121 : "r" (srcStride)\
0b093b6f 1122 : "memory"\
826f429a
MN
1123 );\
1124 \
1125 temp_ptr= temp;\
1126 count=2;\
1127 \
1128/*FIXME reorder for speed */\
1129 asm volatile(\
1130 /*"pxor %%mm7, %%mm7 \n\t"*/\
1131 "1: \n\t"\
1132 "movq (%0), %%mm0 \n\t"\
1133 "movq 8(%0), %%mm1 \n\t"\
1134 "movq 16(%0), %%mm2 \n\t"\
1135 "movq 24(%0), %%mm3 \n\t"\
c296f66b
MN
1136 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1137 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
826f429a 1138 "addl %4, %1 \n\t"\
c296f66b 1139 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
826f429a 1140 \
c296f66b 1141 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
826f429a 1142 "addl %4, %1 \n\t"\
c296f66b 1143 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
826f429a 1144 \
c296f66b 1145 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
826f429a 1146 "addl %4, %1 \n\t"\
c296f66b
MN
1147 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1148 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
826f429a
MN
1149 \
1150 "addl $72, %0 \n\t"\
c296f66b 1151 "addl %6, %1 \n\t"\
826f429a
MN
1152 "decl %2 \n\t"\
1153 " jnz 1b \n\t"\
1154 \
c296f66b
MN
1155 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1156 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
0b093b6f
MN
1157 : "memory"\
1158 );\
3178ee4c 1159}\
826f429a
MN
1160\
1161static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1162 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
826f429a
MN
1163}\
1164\
1165static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1166 uint64_t temp[8];\
826f429a
MN
1167 uint8_t * const half= (uint8_t*)temp;\
1168 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1169 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1170}\
1171\
1172static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1173 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1174}\
1175\
1176static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1177 uint64_t temp[8];\
826f429a
MN
1178 uint8_t * const half= (uint8_t*)temp;\
1179 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1180 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1181}\
1182\
1183static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1184 uint64_t temp[8];\
826f429a 1185 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1186 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1187 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1188}\
1189\
1190static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1191 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1192}\
1193\
1194static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1195 uint64_t temp[8];\
826f429a 1196 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1197 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
826f429a
MN
1198 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1199}\
1200static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1201 uint64_t half[8 + 9];\
1202 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1203 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1204 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1205 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1206 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1207 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a
MN
1208}\
1209static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1210 uint64_t half[8 + 9];\
1211 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1212 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1213 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1214 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1215 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1216 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
826f429a
MN
1217}\
1218static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1219 uint64_t half[8 + 9];\
1220 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1221 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1222 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953 1223 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1224 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1225 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a
MN
1226}\
1227static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1228 uint64_t half[8 + 9];\
1229 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1230 uint8_t * const halfHV= ((uint8_t*)half);\
1231 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1232 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1233 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
db794953 1234 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a
MN
1235}\
1236static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1237 uint64_t half[8 + 9];\
826f429a
MN
1238 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1239 uint8_t * const halfHV= ((uint8_t*)half);\
1240 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1241 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1242 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1243}\
1244static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1245 uint64_t half[8 + 9];\
826f429a
MN
1246 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1247 uint8_t * const halfHV= ((uint8_t*)half);\
1248 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1249 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
826f429a
MN
1250 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1251}\
1252static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1253 uint64_t half[8 + 9];\
1254 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1255 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1256 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
1257 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1258}\
1259static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1260 uint64_t half[8 + 9];\
1261 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1262 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
db794953
MN
1263 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
1264 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1265}\
1266static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953 1267 uint64_t half[9];\
826f429a
MN
1268 uint8_t * const halfH= ((uint8_t*)half);\
1269 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1270 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a
MN
1271}\
1272static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1273 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
826f429a
MN
1274}\
1275\
1276static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1277 uint64_t temp[32];\
1278 uint8_t * const half= (uint8_t*)temp;\
1279 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1280 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1281}\
1282\
1283static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1284 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1285}\
1286\
1287static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1288 uint64_t temp[32];\
1289 uint8_t * const half= (uint8_t*)temp;\
1290 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1291 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1292}\
1293\
1294static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1295 uint64_t temp[32];\
1296 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1297 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1298 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1299}\
1300\
1301static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
3178ee4c 1302 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1303}\
1304\
1305static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1306 uint64_t temp[32];\
1307 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1308 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
826f429a
MN
1309 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1310}\
1311static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1312 uint64_t half[16*2 + 17*2];\
1313 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1314 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1315 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1316 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1317 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1318 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a
MN
1319}\
1320static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1321 uint64_t half[16*2 + 17*2];\
1322 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1323 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1324 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1325 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1326 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1327 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
826f429a
MN
1328}\
1329static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1330 uint64_t half[16*2 + 17*2];\
1331 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1332 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1333 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953 1334 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1335 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1336 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a
MN
1337}\
1338static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1339 uint64_t half[16*2 + 17*2];\
1340 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1341 uint8_t * const halfHV= ((uint8_t*)half);\
1342 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1343 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1344 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
db794953 1345 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a
MN
1346}\
1347static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1348 uint64_t half[16*2 + 17*2];\
1349 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1350 uint8_t * const halfHV= ((uint8_t*)half);\
1351 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1352 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1353 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1354}\
1355static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1356 uint64_t half[16*2 + 17*2];\
1357 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1358 uint8_t * const halfHV= ((uint8_t*)half);\
1359 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1360 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
826f429a
MN
1361 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1362}\
1363static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1364 uint64_t half[17*2];\
1365 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1366 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1367 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
1368 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1369}\
1370static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
db794953
MN
1371 uint64_t half[17*2];\
1372 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1373 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
db794953
MN
1374 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
1375 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1376}\
1377static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1378 uint64_t half[17*2];\
1379 uint8_t * const halfH= ((uint8_t*)half);\
1380 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1381 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a
MN
1382}
1383
1384
1385#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1386#define AVG_3DNOW_OP(a,b,temp, size) \
826f429a
MN
1387"mov" #size " " #b ", " #temp " \n\t"\
1388"pavgusb " #temp ", " #a " \n\t"\
1389"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1390#define AVG_MMX2_OP(a,b,temp, size) \
826f429a
MN
1391"mov" #size " " #b ", " #temp " \n\t"\
1392"pavgb " #temp ", " #a " \n\t"\
1393"mov" #size " " #a ", " #b " \n\t"
3178ee4c
MN
1394
1395QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1396QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1397QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1398QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1399QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1400QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1401QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1402QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a
MN
1403QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1404
61a4e8ae 1405#if 0
d6a4c0b1 1406static void just_return() { return; }
61a4e8ae 1407#endif
d6a4c0b1 1408
826f429a
MN
1409#define SET_QPEL_FUNC(postfix1, postfix2) \
1410 c->put_ ## postfix1 = put_ ## postfix2;\
1411 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1412 c->avg_ ## postfix1 = avg_ ## postfix2;
1413
eb4b3dd3 1414void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
1415{
1416 mm_flags = mm_support();
1565dabc
LB
1417#if 0
1418 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 1419 if (mm_flags & MM_MMX)
1565dabc 1420 fprintf(stderr, " mmx");
de6d9b64 1421 if (mm_flags & MM_MMXEXT)
1565dabc 1422 fprintf(stderr, " mmxext");
de6d9b64 1423 if (mm_flags & MM_3DNOW)
1565dabc 1424 fprintf(stderr, " 3dnow");
de6d9b64 1425 if (mm_flags & MM_SSE)
1565dabc 1426 fprintf(stderr, " sse");
de6d9b64 1427 if (mm_flags & MM_SSE2)
1565dabc
LB
1428 fprintf(stderr, " sse2");
1429 fprintf(stderr, "\n");
de6d9b64
FB
1430#endif
1431
1432 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
1433 c->get_pixels = get_pixels_mmx;
1434 c->diff_pixels = diff_pixels_mmx;
1435 c->put_pixels_clamped = put_pixels_clamped_mmx;
1436 c->add_pixels_clamped = add_pixels_clamped_mmx;
1437 c->clear_blocks = clear_blocks_mmx;
1438 c->pix_sum = pix_sum16_mmx;
1439
eb4b3dd3
ZK
1440 c->put_pixels_tab[0][0] = put_pixels16_mmx;
1441 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
1442 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
1443 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
1444
1445 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
1446 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1447 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1448 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
1449
1450 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
1451 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
1452 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
1453 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1454
1455 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
1456 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
1457 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
1458 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
1459
1460 c->put_pixels_tab[1][0] = put_pixels8_mmx;
1461 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
1462 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
1463 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
1464
1465 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
1466 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1467 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1468 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
1469
1470 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
1471 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
1472 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
1473 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1474
1475 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
1476 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
1477 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
1478 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
826f429a 1479
11f18faf
MN
1480 c->add_bytes= add_bytes_mmx;
1481 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
1482
1483 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1484 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1485
2a006cd3
FL
1486 c->pix_norm1 = pix_norm1_mmx;
1487 c->sse[0] = sse16_mmx;
1457ab52 1488
de6d9b64 1489 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
1490 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1491 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1492 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1493 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1494
1495 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1496 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1497 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1498 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1499
1500 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1501 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1502 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1503 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1504
1505 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1506 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1507 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1508 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3178ee4c 1509
c296f66b 1510#if 1
826f429a
MN
1511 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1512 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1513 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1514 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
1515 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
1516 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
1517 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
1518 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
1519 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
1520 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
1521 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
1522 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
1523 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
1524 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
1525 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
1526 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
1527 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
1528 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
1529 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
1530 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
1531 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
1532 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
1533 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
1534 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
1535 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
1536 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
1537 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
1538 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
1539 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
1540 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
1541 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
1542 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
c296f66b 1543#endif
de6d9b64 1544 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
1545 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1546 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1547 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1548 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1549
1550 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1551 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1552 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1553 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1554
1555 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1556 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1557 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1558 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1559
1560 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1561 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1562 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1563 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
db794953 1564
826f429a
MN
1565 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1566 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1567 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1568 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1569 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
1570 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
1571 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
1572 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
1573 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
1574 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
1575 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
1576 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
1577 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
1578 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
1579 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
1580 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
1581 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
1582 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
1583 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
1584 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
1585 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
1586 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
1587 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
1588 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
1589 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
1590 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
1591 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
1592 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
1593 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
1594 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1595 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1596 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
de6d9b64
FB
1597 }
1598 }
5c91a675 1599 dsputil_init_pix_mmx(c, mask);
d6a4c0b1
ZK
1600#if 0
1601 // for speed testing
1602 get_pixels = just_return;
1603 put_pixels_clamped = just_return;
1604 add_pixels_clamped = just_return;
1605
1606 pix_abs16x16 = just_return;
1607 pix_abs16x16_x2 = just_return;
1608 pix_abs16x16_y2 = just_return;
1609 pix_abs16x16_xy2 = just_return;
1610
1611 put_pixels_tab[0] = just_return;
1612 put_pixels_tab[1] = just_return;
1613 put_pixels_tab[2] = just_return;
1614 put_pixels_tab[3] = just_return;
1615
1616 put_no_rnd_pixels_tab[0] = just_return;
1617 put_no_rnd_pixels_tab[1] = just_return;
1618 put_no_rnd_pixels_tab[2] = just_return;
1619 put_no_rnd_pixels_tab[3] = just_return;
1620
1621 avg_pixels_tab[0] = just_return;
1622 avg_pixels_tab[1] = just_return;
1623 avg_pixels_tab[2] = just_return;
1624 avg_pixels_tab[3] = just_return;
1625
1626 avg_no_rnd_pixels_tab[0] = just_return;
1627 avg_no_rnd_pixels_tab[1] = just_return;
1628 avg_no_rnd_pixels_tab[2] = just_return;
1629 avg_no_rnd_pixels_tab[3] = just_return;
1630
d6a4c0b1
ZK
1631 //av_fdct = just_return;
1632 //ff_idct = just_return;
1633#endif
de6d9b64 1634}
4f12a497
FB
1635
1636/* remove any non bit exact operation (testing purpose). NOTE that
1637 this function should be kept as small as possible because it is
1638 always difficult to test automatically non bit exact cases. */
eb4b3dd3 1639void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
1640{
1641 if (mm_flags & MM_MMX) {
b3184779 1642 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
1643 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1644 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1645 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1646 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1647 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1648 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4f12a497 1649 }
5c91a675 1650 dsputil_set_bit_exact_pix_mmx(c, mask);
4f12a497 1651}