/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */
/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for a shared library it's better to access these constants this way
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t "       \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
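/* What TRANSPOSE4 computes, as a scalar sketch (reference only; the
 * macro performs the same 4x4 transpose of 16-bit words in two rounds
 * of punpckl/punpckh interleaves): */
static inline void transpose4x4_w_ref(uint16_t dst[4][4], const uint16_t src[4][4]){
    int i, j;
    for(i=0; i<4; i++)
        for(j=0; j<4; j++)
            dst[j][i]= src[i][j];
}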
/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    asm volatile(
        "movq   %3, %%mm0               \n\t"
        "movq   8%3, %%mm1              \n\t"
        "movq   16%3, %%mm2             \n\t"
        "movq   24%3, %%mm3             \n\t"
        "movq   32%3, %%mm4             \n\t"
        "movq   40%3, %%mm5             \n\t"
        "movq   48%3, %%mm6             \n\t"
        "movq   56%3, %%mm7             \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus using "r"
    asm volatile(
        "movq   (%3), %%mm0             \n\t"
        "movq   8(%3), %%mm1            \n\t"
        "movq   16(%3), %%mm2           \n\t"
        "movq   24(%3), %%mm3           \n\t"
        "movq   32(%3), %%mm4           \n\t"
        "movq   40(%3), %%mm5           \n\t"
        "movq   48(%3), %%mm6           \n\t"
        "movq   56(%3), %%mm7           \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}
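/* For reference: the two blocks above are the MMX form of
 *     for(i=0; i<64; i++) pixels[(i>>3)*line_size + (i&7)] = av_clip_uint8(block[i]);
 * packuswb performs exactly this signed 16-bit to unsigned 8-bit saturation. */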
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        asm volatile(
            "movq   (%2), %%mm0     \n\t"
            "movq   8(%2), %%mm1    \n\t"
            "movq   16(%2), %%mm2   \n\t"
            "movq   24(%2), %%mm3   \n\t"
            "movq   %0, %%mm4       \n\t"
            "movq   %1, %%mm6       \n\t"
            "movq   %%mm4, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0    \n\t"
            "paddsw %%mm5, %%mm1    \n\t"
            "movq   %%mm6, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2    \n\t"
            "paddsw %%mm5, %%mm3    \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq   %%mm0, %0       \n\t"
            "movq   %%mm2, %1       \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
static void clear_blocks_mmx(DCTELEM *blocks)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128*6, %%"REG_a"         \n\t"
        "1:                             \n\t"
        "movq %%mm7, (%0, %%"REG_a")    \n\t"
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"             \n\t"
        " js 1b                         \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}
#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
    );

    return sum;
}
#endif //CONFIG_ENCODERS
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
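/* H263_LOOP_FILTER below filters one 8-pixel-wide block edge: operands
 * %0..%3 hold the four lines straddling the edge (two on each side),
 * %4 is 2*strength and %5 the ff_pb_FC mask. The filtered lines end up
 * in mm5/mm3/mm4/mm6 respectively, which is what the "// 5 3 4 6" note
 * at the call site in h263_h_loop_filter_mmx() refers to. */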
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7              \n\t"\
    "movq  %0, %%mm0                \n\t"\
    "movq  %0, %%mm1                \n\t"\
    "movq  %3, %%mm2                \n\t"\
    "movq  %3, %%mm3                \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    "movq  %1, %%mm2                \n\t"\
    "movq  %1, %%mm3                \n\t"\
    "movq  %2, %%mm4                \n\t"\
    "movq  %2, %%mm5                \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride),
           "r" ((long)(3*stride))
        : "memory"
    );
    }
}
#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
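/* Scalar model of sse8_mmx/sse16_mmx (a reference sketch). The assembly
 * obtains |a-b| without widening by OR'ing the two saturating byte
 * subtractions, since one of them is always zero: */
static inline int sse_ref(const uint8_t *pix1, const uint8_t *pix2, int w, int line_size, int h){
    int x, y, s=0;
    for(y=0; y<h; y++){
        for(x=0; x<w; x++){
            int d= pix1[x] - pix2[x];
            s += d*d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}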
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"      /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"      /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"      /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n"   /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n"   /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
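/* Scalar model of the hf_noise metric (a sketch, not bit-exact to the
 * register scheduling above): it accumulates the absolute vertical
 * difference of horizontal first differences, i.e. second-order
 * high-frequency energy. */
static inline int hf_noise_ref(const uint8_t *pix, int w, int line_size, int h){
    int x, y, s=0;
    for(y=0; y<h-1; y++){
        for(x=0; x<w-1; x++){
            int d= (pix[x] - pix[x+1]) - (pix[x+line_size] - pix[x+line_size+1]);
            s += FFABS(d);
        }
        pix += line_size;
    }
    return s;
}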
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;
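    /* The min/max chain below evaluates the branchless identity
     * median(L, T, L+T-LT) == max(min(L,T), min(max(L,T), L+T-LT)),
     * i.e. the HuffYUV median predictor, for 8 pixels at a time. */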
1487 "movq -1(%1, %0), %%mm0 \n\t" // LT
1488 "movq (%1, %0), %%mm1 \n\t" // T
1489 "movq -1(%2, %0), %%mm2 \n\t" // L
1490 "movq (%2, %0), %%mm3 \n\t" // X
1491 "movq %%mm2, %%mm4 \n\t" // L
1492 "psubb %%mm0, %%mm2 \n\t"
1493 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1494 "movq %%mm4, %%mm5 \n\t" // L
1495 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1496 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1497 "pminub %%mm2, %%mm4 \n\t"
1498 "pmaxub %%mm1, %%mm4 \n\t"
1499 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1500 "movq %%mm3, (%3, %0) \n\t"
1505 : "r"(src1
), "r"(src2
), "r"(dst
), "r"((long)w
)
1511 dst
[0]= src2
[0] - mid_pred(l
, src1
[0], (l
+ src1
[0] - lt
)&0xFF);
1513 *left_top
= src1
[w
-1];
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
#ifdef ARCH_X86_64
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa %%xmm8, "#g"              \n\t"
#else
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    "movdqa "#h", "#t"                \n\t"\
    SBUTTERFLY(a,b,h,wd,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    "movdqa "#h", "#t"                \n\t"\
    "movdqa 16"#t", "#h"              \n\t"\
    SBUTTERFLY(h,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(h,d,e,qdq,dqa)\
    "movdqa "#h", 16"#t"              \n\t"\
    "movdqa "#t", "#h"                \n\t"\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa 16"#t", "#g"              \n\t"
#endif
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5            \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7            \n\t"\
        "movq %%mm0, %%mm6            \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)           \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2           \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0         \n\t"\
        "movq %%mm0, 64(%1)           \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0        \n\t"\
        "paddusw %%mm1, %%mm0         \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
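// The result is the SATD (sum of absolute transformed differences) of
// one 8x8 block: difference, 8-point Hadamard on rows, transpose,
// Hadamard on columns, then a saturated sum of absolute values.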
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef HSUM
#undef MMABS

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef DCT_SAD
#undef HSUM
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4              \n"
        "1:                             \n"
        "sub $8, %0                     \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"
        "jg 1b                          \n"
        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        "movd %%mm4, %1                 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
#endif //CONFIG_ENCODERS
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "             \n\t" /* d */\
    "movq "#in0", %%mm5               \n\t" /* D */\
    "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5               \n\t" /* C */\
    "movq "#in2", %%mm6               \n\t" /* B */\
    "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                  \n\t"\
    "packuswb %%mm5, %%mm5            \n\t"\
    OP(%%mm5, out, %%mm7, d)
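// Per output pixel, QPEL_V_LOW evaluates the MPEG-4 quarter-pel lowpass
//     out = clip_uint8((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
// with x1..x4 the pairwise tap sums named in the comments above, i.e.
// the 8-tap filter (-1, 3, -6, 20, 20, -6, 3, -1)/32 (the scalar
// "quick HACK" versions below spell out the same formula).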
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
\
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
\
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
\
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1          \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2          \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0     \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1     \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3          \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4          \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2            \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3           \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4           \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2     \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3     \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4     \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5         \n\t" /* b */\
        "paddw %%mm2, %%mm6         \n\t" /* c */\
        "paddw %%mm5, %%mm5         \n\t" /* 2b */\
        "psubw %%mm5, %%mm6         \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0         \n\t" /* a */\
        "paddw %%mm1, %%mm5         \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0         \n\t" /* 20a - d */\
        "paddw %6, %%mm6            \n\t"\
        "paddw %%mm6, %%mm0         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0            \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movd 5(%0), %%mm5          \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5     \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1         \n\t" /* a */\
        "paddw %%mm6, %%mm2         \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3         \n\t" /* c */\
        "paddw %%mm5, %%mm4         \n\t" /* d */\
        "paddw %%mm2, %%mm2         \n\t" /* 2b */\
        "psubw %%mm2, %%mm3         \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3         \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1            \n\t"\
        "paddw %%mm1, %%mm3         \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3            \n\t"\
        "packuswb %%mm3, %%mm0      \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq (%0), %%mm1           \n\t"\
        "movq 8(%0), %%mm2          \n\t"\
        "movq 8(%0), %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "movq %%mm0, (%1)           \n\t"\
        "movq %%mm1, 17*8(%1)       \n\t"\
        "movq %%mm2, 2*17*8(%1)     \n\t"\
        "movq %%mm3, 3*17*8(%1)     \n\t"\
        "add $8, %1                 \n\t"\
        "add %3, %0                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=4;\
\
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7        \n\t"*/\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq 8(%0), %%mm1          \n\t"\
        "movq 16(%0), %%mm2         \n\t"\
        "movq 24(%0), %%mm3         \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0               \n\t"\
        "add %6, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
\
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq (%0), %%mm1           \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "movq %%mm0, (%1)           \n\t"\
        "movq %%mm1, 9*8(%1)        \n\t"\
        "add $8, %1                 \n\t"\
        "add %3, %0                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=2;\
\
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7        \n\t"*/\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq 8(%0), %%mm1          \n\t"\
        "movq 16(%0), %%mm2         \n\t"\
        "movq 24(%0), %%mm3         \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
\
        "add $72, %0                \n\t"\
        "add %6, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
\
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
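    /* Naming note (added): qpelN_mcXY computes the (X/4, Y/4) quarter-pel  */\
    /* position; mc00 is a plain copy, mc20/mc02 the pure h/v lowpass, and  */\
    /* the mixed cases blend a shifted source or the halfH intermediate     */\
    /* into the filtered result with the pixelsN_l2 averagers.              */\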
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
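/*
 * For readability: a scalar model of the 20/-6/3/-1 lowpass used by the MMX
 * code above (an illustrative sketch, not part of the original file; edge
 * taps are mirrored exactly as in the temp[] tables of the 3DNow! versions).
 */
#if 0
static void mpeg4_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                     int w, int rounder)
{
    int i;
    for (i = 0; i < w; i++) {
        int a = src[i    ] + src[i + 1];  /* inner pair, weight 20 */
        int b = src[i - 1] + src[i + 2];  /* next pair,  weight -6 */
        int c = src[i - 2] + src[i + 3];  /* next pair,  weight  3 */
        int d = src[i - 3] + src[i + 4];  /* outer pair, weight -1 */
        int v = (a * 20 - b * 6 + c * 3 - d + rounder) >> 5;
        dst[i] = v < 0 ? 0 : v > 255 ? 255 : v;  /* packuswb-style clamp */
    }
}
#endif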
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
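/*
 * Each instantiation above stamps out a whole function family, e.g.
 * QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) generates
 * put_no_rnd_mpeg4_qpel16_v_lowpass_mmx2(), put_no_rnd_qpel8_mc32_mmx2()
 * and friends, with ff_pw_15 as the ROUNDER of the no-rounding variants.
 */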
/***********************************/
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,        1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,       -1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,        stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,  -stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,        stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,        stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,  -stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1,-stride, -1)\
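
/*
 * Expansion example (added note): QPEL_2TAP_L3(put_, 8, mmx2, 30, 1, -1, 0)
 * defines put_2tap_qpel8_mc30_mmx2() as
 *     put_2tap_qpel8_l3_mmx2(dst, src+1, stride, 8, -1, 0);
 * a cheap 3-point blend standing in for the true quarter-pel filter.
 */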
QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
static void just_return() { return; }
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;
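/*
 * Usage sketch (illustrative; the actual assignments live in the dsputil
 * init code and depend on mm_flags):
 *     SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
 * fills the put_, put_no_rnd_ and avg_ table slots in one line.
 */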
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    asm volatile(
        "movd %0, %%mm6             \n\t"
        "pxor %%mm7, %%mm7          \n\t"
        "punpcklwd %%mm6, %%mm6     \n\t"
        "punpcklwd %%mm6, %%mm6     \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            asm volatile(
                "movq %0, %%mm4     \n\t"
                "movq %1, %%mm5     \n\t"
                "paddw %2, %%mm4    \n\t"
                "paddw %3, %%mm5    \n\t"
                "movq %%mm4, %0     \n\t"
                "movq %%mm5, %1     \n\t"
                "psrlw $12, %%mm4   \n\t"
                "psrlw $12, %%mm5   \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            asm volatile(
                "movq %%mm6, %%mm2  \n\t"
                "movq %%mm6, %%mm1  \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0  \n\t"
                "movq %%mm4, %%mm3  \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd %4, %%mm5     \n\t"
                "movd %3, %%mm4     \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd %2, %%mm5     \n\t"
                "movd %1, %%mm4     \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %5, %%mm1    \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0    \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0     \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}
#ifdef CONFIG_ENCODERS

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
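/*
 * Scalar models of the three multiply-high flavours listed above
 * (illustrative only):
 */
#if 0
static int pmulhw_model  (int16_t a, int16_t b){ return (a*b) >> 16; }
static int pmulhrw_model (int16_t a, int16_t b){ return (a*b + 0x8000) >> 16; }
static int pmulhrsw_model(int16_t a, int16_t b){ return (a*b + 0x4000) >> 15; }
#endif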
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "           \n\t"\
    "pmulhw " #s ", "#y "           \n\t"\
    "paddw " #o ", "#x "            \n\t"\
    "paddw " #o ", "#y "            \n\t"\
    "psraw $1, "#x "                \n\t"\
    "psraw $1, "#y "                \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "          \n\t"\
    "pmulhrw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"       \n\t"\
    "paddd "#t", "#a"               \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "         \n\t"\
    "pmulhrsw " #s ", "#y "         \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3 */

#endif /* CONFIG_ENCODERS */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
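/*
 * Wiring note (sketch, illustrative): the init code assigns these as e.g.
 *     c->prefetch = prefetch_mmx2;   // or prefetch_3dnow
 * so motion compensation can hint one upcoming source row per call.
 */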
#include "h264dsp_mmx.c"
#ifdef CONFIG_CAVS_DECODER
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
#endif

#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
#endif
/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#ifdef CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
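/*
 * These wrappers pair an IDCT with the clamped put/add so they can be
 * plugged into DSPContext. Selection sketch (illustrative; the real logic
 * in the dsputil init code checks avctx->idct_algo first):
 */
#if 0
    c->idct_put = ff_idct_xvid_mmx2_put;
    c->idct_add = ff_idct_xvid_mmx2_add;
#endif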
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq %0, %%mm0         \n\t"
            "movq %1, %%mm1         \n\t"
            "movq %%mm0, %%mm2      \n\t"
            "movq %%mm1, %%mm3      \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
            "pslld $31, %%mm2       \n\t" // keep only the sign bit
            "pxor %%mm2, %%mm1      \n\t"
            "movq %%mm3, %%mm4      \n\t"
            "pand %%mm1, %%mm3      \n\t"
            "pandn %%mm1, %%mm4     \n\t"
            "pfadd %%mm0, %%mm3     \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub %%mm4, %%mm0     \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq %%mm3, %1         \n\t"
            "movq %%mm0, %0         \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    asm volatile("femms");
}
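/*
 * Scalar model of the magnitude/angle decoupling done by the 3DNow! and SSE
 * blocks (illustrative; mirrors the generic C path elsewhere in FFmpeg):
 */
#if 0
static void vorbis_inverse_coupling_ref(float *mag, float *ang, int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) { ang[i] = mag[i] - ang[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
        } else {
            if (ang[i] > 0.0) { ang[i] += mag[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
        }
    }
}
#endif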
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
        "movaps %0, %%xmm5          \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps %0, %%xmm0      \n\t"
            "movaps %1, %%xmm1      \n\t"
            "xorps %%xmm2, %%xmm2   \n\t"
            "xorps %%xmm3, %%xmm3   \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps %%xmm5, %%xmm2   \n\t" // keep only the sign bit
            "xorps %%xmm2, %%xmm1   \n\t"
            "movaps %%xmm3, %%xmm4  \n\t"