Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
[libav.git] / libavcodec / i386 / dsputil_mmx.c
1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25 #include "dsputil.h"
26 #include "dsputil_mmx.h"
27 #include "simple_idct.h"
28 #include "mpegvideo.h"
29 #include "x86_cpu.h"
30 #include "mmx.h"
31 #include "vp3dsp_mmx.h"
32 #include "vp3dsp_sse2.h"
33 #include "h263.h"
34
35 //#undef NDEBUG
36 //#include <assert.h>
37
38 extern void ff_idct_xvid_mmx(short *block);
39 extern void ff_idct_xvid_mmx2(short *block);
40
41 int mm_flags; /* multimedia extension flags */
42
43 /* pixel operations */
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
46
47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
49
50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5 ) = 0x0005000500050005ULL;
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
55 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
61 DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
62
63 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
69
70 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
71 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
72
73 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
74 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
75
76 #define MOVQ_WONE(regd) \
77 __asm __volatile ( \
78 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
79 "psrlw $15, %%" #regd ::)
80
81 #define MOVQ_BFE(regd) \
82 __asm __volatile ( \
83 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
84 "paddb %%" #regd ", %%" #regd " \n\t" ::)
85
86 #ifndef PIC
87 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
88 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
89 #else
90 // for shared libraries it is better to access constants this way
91 // pcmpeqd -> -1
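//              e.g. for MOVQ_BONE:  pcmpeqd  -> each dword = 0xFFFFFFFF
//                                   psrlw 15 -> each word  = 0x0001
//                                   packuswb -> each byte  = 0x01  (same value as ff_bone)
//              MOVQ_WTWO instead adds a psllw 1 step, giving 0x0002 per word (ff_wtwo)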
92 #define MOVQ_BONE(regd) \
93 __asm __volatile ( \
94 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
95 "psrlw $15, %%" #regd " \n\t" \
96 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
97
98 #define MOVQ_WTWO(regd) \
99 __asm __volatile ( \
100 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
101 "psrlw $15, %%" #regd " \n\t" \
102 "psllw $1, %%" #regd " \n\t"::)
103
104 #endif
105
106 // using regr as temporary and for the output result
107 // first argument is unmodified and second is trashed
108 // regfe is supposed to contain 0xfefefefefefefefe
109 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
110 "movq " #rega ", " #regr " \n\t"\
111 "pand " #regb ", " #regr " \n\t"\
112 "pxor " #rega ", " #regb " \n\t"\
113 "pand " #regfe "," #regb " \n\t"\
114 "psrlq $1, " #regb " \n\t"\
115 "paddb " #regb ", " #regr " \n\t"
116
117 #define PAVGB_MMX(rega, regb, regr, regfe) \
118 "movq " #rega ", " #regr " \n\t"\
119 "por " #regb ", " #regr " \n\t"\
120 "pxor " #rega ", " #regb " \n\t"\
121 "pand " #regfe "," #regb " \n\t"\
122 "psrlq $1, " #regb " \n\t"\
123 "psubb " #regb ", " #regr " \n\t"
124
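// For reference, a scalar sketch of the per-byte averaging trick used above
// (the 0xFE mask keeps the psrlq shift from leaking bits across byte lanes):
//     no_rnd:  (a & b) + (((a ^ b) & 0xFE) >> 1)  ==  (a + b)     >> 1
//     rnd:     (a | b) - (((a ^ b) & 0xFE) >> 1)  ==  (a + b + 1) >> 1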
125 // mm6 is supposed to contain 0xfefefefefefefefe
126 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
127 "movq " #rega ", " #regr " \n\t"\
128 "movq " #regc ", " #regp " \n\t"\
129 "pand " #regb ", " #regr " \n\t"\
130 "pand " #regd ", " #regp " \n\t"\
131 "pxor " #rega ", " #regb " \n\t"\
132 "pxor " #regc ", " #regd " \n\t"\
133 "pand %%mm6, " #regb " \n\t"\
134 "pand %%mm6, " #regd " \n\t"\
135 "psrlq $1, " #regb " \n\t"\
136 "psrlq $1, " #regd " \n\t"\
137 "paddb " #regb ", " #regr " \n\t"\
138 "paddb " #regd ", " #regp " \n\t"
139
140 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
141 "movq " #rega ", " #regr " \n\t"\
142 "movq " #regc ", " #regp " \n\t"\
143 "por " #regb ", " #regr " \n\t"\
144 "por " #regd ", " #regp " \n\t"\
145 "pxor " #rega ", " #regb " \n\t"\
146 "pxor " #regc ", " #regd " \n\t"\
147 "pand %%mm6, " #regb " \n\t"\
148 "pand %%mm6, " #regd " \n\t"\
149 "psrlq $1, " #regd " \n\t"\
150 "psrlq $1, " #regb " \n\t"\
151 "psubb " #regb ", " #regr " \n\t"\
152 "psubb " #regd ", " #regp " \n\t"
153
154 /***********************************/
155 /* MMX no rounding */
156 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
157 #define SET_RND MOVQ_WONE
158 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
159 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
160
161 #include "dsputil_mmx_rnd.h"
162
163 #undef DEF
164 #undef SET_RND
165 #undef PAVGBP
166 #undef PAVGB
167 /***********************************/
168 /* MMX rounding */
169
170 #define DEF(x, y) x ## _ ## y ##_mmx
171 #define SET_RND MOVQ_WTWO
172 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
173 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
174
175 #include "dsputil_mmx_rnd.h"
176
177 #undef DEF
178 #undef SET_RND
179 #undef PAVGBP
180 #undef PAVGB
181
182 /***********************************/
183 /* 3Dnow specific */
184
185 #define DEF(x) x ## _3dnow
186 #define PAVGB "pavgusb"
187
188 #include "dsputil_mmx_avg.h"
189
190 #undef DEF
191 #undef PAVGB
192
193 /***********************************/
194 /* MMX2 specific */
195
196 #define DEF(x) x ## _mmx2
197
198 /* Introduced only in MMX2 set */
199 #define PAVGB "pavgb"
200
201 #include "dsputil_mmx_avg.h"
202
203 #undef DEF
204 #undef PAVGB
205
206 #define SBUTTERFLY(a,b,t,n,m)\
207 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
208 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
209 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
210
211 #define TRANSPOSE4(a,b,c,d,t)\
212 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
213 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
214 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
215 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
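// TRANSPOSE4 transposes a 4x4 matrix of 16-bit words held one row per register in
// a,b,c,d; note the permuted outputs: row0 lands in a, row1 in d, row2 in t, row3 in c.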
216
217 /***********************************/
218 /* standard MMX */
219
220 #ifdef CONFIG_ENCODERS
221 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
222 {
223 asm volatile(
224 "mov $-128, %%"REG_a" \n\t"
225 "pxor %%mm7, %%mm7 \n\t"
226 ASMALIGN(4)
227 "1: \n\t"
228 "movq (%0), %%mm0 \n\t"
229 "movq (%0, %2), %%mm2 \n\t"
230 "movq %%mm0, %%mm1 \n\t"
231 "movq %%mm2, %%mm3 \n\t"
232 "punpcklbw %%mm7, %%mm0 \n\t"
233 "punpckhbw %%mm7, %%mm1 \n\t"
234 "punpcklbw %%mm7, %%mm2 \n\t"
235 "punpckhbw %%mm7, %%mm3 \n\t"
236 "movq %%mm0, (%1, %%"REG_a") \n\t"
237 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
238 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
239 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
240 "add %3, %0 \n\t"
241 "add $32, %%"REG_a" \n\t"
242 "js 1b \n\t"
243 : "+r" (pixels)
244 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
245 : "%"REG_a
246 );
247 }
248
249 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
250 {
251 asm volatile(
252 "pxor %%mm7, %%mm7 \n\t"
253 "mov $-128, %%"REG_a" \n\t"
254 ASMALIGN(4)
255 "1: \n\t"
256 "movq (%0), %%mm0 \n\t"
257 "movq (%1), %%mm2 \n\t"
258 "movq %%mm0, %%mm1 \n\t"
259 "movq %%mm2, %%mm3 \n\t"
260 "punpcklbw %%mm7, %%mm0 \n\t"
261 "punpckhbw %%mm7, %%mm1 \n\t"
262 "punpcklbw %%mm7, %%mm2 \n\t"
263 "punpckhbw %%mm7, %%mm3 \n\t"
264 "psubw %%mm2, %%mm0 \n\t"
265 "psubw %%mm3, %%mm1 \n\t"
266 "movq %%mm0, (%2, %%"REG_a") \n\t"
267 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
268 "add %3, %0 \n\t"
269 "add %3, %1 \n\t"
270 "add $16, %%"REG_a" \n\t"
271 "jnz 1b \n\t"
272 : "+r" (s1), "+r" (s2)
273 : "r" (block+64), "r" ((long)stride)
274 : "%"REG_a
275 );
276 }
277 #endif //CONFIG_ENCODERS
278
279 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
280 {
281 const DCTELEM *p;
282 uint8_t *pix;
283
284 /* read the pixels */
285 p = block;
286 pix = pixels;
287 /* unrolled loop */
288 __asm __volatile(
289 "movq %3, %%mm0 \n\t"
290 "movq 8%3, %%mm1 \n\t"
291 "movq 16%3, %%mm2 \n\t"
292 "movq 24%3, %%mm3 \n\t"
293 "movq 32%3, %%mm4 \n\t"
294 "movq 40%3, %%mm5 \n\t"
295 "movq 48%3, %%mm6 \n\t"
296 "movq 56%3, %%mm7 \n\t"
297 "packuswb %%mm1, %%mm0 \n\t"
298 "packuswb %%mm3, %%mm2 \n\t"
299 "packuswb %%mm5, %%mm4 \n\t"
300 "packuswb %%mm7, %%mm6 \n\t"
301 "movq %%mm0, (%0) \n\t"
302 "movq %%mm2, (%0, %1) \n\t"
303 "movq %%mm4, (%0, %1, 2) \n\t"
304 "movq %%mm6, (%0, %2) \n\t"
305 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
306 :"memory");
307 pix += line_size*4;
308 p += 32;
309
310 // if this were an exact copy of the code above, the
311 // compiler would generate some very strange code,
312 // thus "r" is used here
313 __asm __volatile(
314 "movq (%3), %%mm0 \n\t"
315 "movq 8(%3), %%mm1 \n\t"
316 "movq 16(%3), %%mm2 \n\t"
317 "movq 24(%3), %%mm3 \n\t"
318 "movq 32(%3), %%mm4 \n\t"
319 "movq 40(%3), %%mm5 \n\t"
320 "movq 48(%3), %%mm6 \n\t"
321 "movq 56(%3), %%mm7 \n\t"
322 "packuswb %%mm1, %%mm0 \n\t"
323 "packuswb %%mm3, %%mm2 \n\t"
324 "packuswb %%mm5, %%mm4 \n\t"
325 "packuswb %%mm7, %%mm6 \n\t"
326 "movq %%mm0, (%0) \n\t"
327 "movq %%mm2, (%0, %1) \n\t"
328 "movq %%mm4, (%0, %1, 2) \n\t"
329 "movq %%mm6, (%0, %2) \n\t"
330 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
331 :"memory");
332 }
333
334 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
335 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
336
337 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
338 {
339 int i;
340
341 movq_m2r(*vector128, mm1);
342 for (i = 0; i < 8; i++) {
343 movq_m2r(*(block), mm0);
344 packsswb_m2r(*(block + 4), mm0);
345 block += 8;
346 paddb_r2r(mm1, mm0);
347 movq_r2m(mm0, *pixels);
348 pixels += line_size;
349 }
350 }
351
352 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
353 {
354 const DCTELEM *p;
355 uint8_t *pix;
356 int i;
357
358 /* read the pixels */
359 p = block;
360 pix = pixels;
361 MOVQ_ZERO(mm7);
362 i = 4;
363 do {
364 __asm __volatile(
365 "movq (%2), %%mm0 \n\t"
366 "movq 8(%2), %%mm1 \n\t"
367 "movq 16(%2), %%mm2 \n\t"
368 "movq 24(%2), %%mm3 \n\t"
369 "movq %0, %%mm4 \n\t"
370 "movq %1, %%mm6 \n\t"
371 "movq %%mm4, %%mm5 \n\t"
372 "punpcklbw %%mm7, %%mm4 \n\t"
373 "punpckhbw %%mm7, %%mm5 \n\t"
374 "paddsw %%mm4, %%mm0 \n\t"
375 "paddsw %%mm5, %%mm1 \n\t"
376 "movq %%mm6, %%mm5 \n\t"
377 "punpcklbw %%mm7, %%mm6 \n\t"
378 "punpckhbw %%mm7, %%mm5 \n\t"
379 "paddsw %%mm6, %%mm2 \n\t"
380 "paddsw %%mm5, %%mm3 \n\t"
381 "packuswb %%mm1, %%mm0 \n\t"
382 "packuswb %%mm3, %%mm2 \n\t"
383 "movq %%mm0, %0 \n\t"
384 "movq %%mm2, %1 \n\t"
385 :"+m"(*pix), "+m"(*(pix+line_size))
386 :"r"(p)
387 :"memory");
388 pix += line_size*2;
389 p += 16;
390 } while (--i);
391 }
392
393 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
394 {
395 __asm __volatile(
396 "lea (%3, %3), %%"REG_a" \n\t"
397 ASMALIGN(3)
398 "1: \n\t"
399 "movd (%1), %%mm0 \n\t"
400 "movd (%1, %3), %%mm1 \n\t"
401 "movd %%mm0, (%2) \n\t"
402 "movd %%mm1, (%2, %3) \n\t"
403 "add %%"REG_a", %1 \n\t"
404 "add %%"REG_a", %2 \n\t"
405 "movd (%1), %%mm0 \n\t"
406 "movd (%1, %3), %%mm1 \n\t"
407 "movd %%mm0, (%2) \n\t"
408 "movd %%mm1, (%2, %3) \n\t"
409 "add %%"REG_a", %1 \n\t"
410 "add %%"REG_a", %2 \n\t"
411 "subl $4, %0 \n\t"
412 "jnz 1b \n\t"
413 : "+g"(h), "+r" (pixels), "+r" (block)
414 : "r"((long)line_size)
415 : "%"REG_a, "memory"
416 );
417 }
418
419 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
420 {
421 __asm __volatile(
422 "lea (%3, %3), %%"REG_a" \n\t"
423 ASMALIGN(3)
424 "1: \n\t"
425 "movq (%1), %%mm0 \n\t"
426 "movq (%1, %3), %%mm1 \n\t"
427 "movq %%mm0, (%2) \n\t"
428 "movq %%mm1, (%2, %3) \n\t"
429 "add %%"REG_a", %1 \n\t"
430 "add %%"REG_a", %2 \n\t"
431 "movq (%1), %%mm0 \n\t"
432 "movq (%1, %3), %%mm1 \n\t"
433 "movq %%mm0, (%2) \n\t"
434 "movq %%mm1, (%2, %3) \n\t"
435 "add %%"REG_a", %1 \n\t"
436 "add %%"REG_a", %2 \n\t"
437 "subl $4, %0 \n\t"
438 "jnz 1b \n\t"
439 : "+g"(h), "+r" (pixels), "+r" (block)
440 : "r"((long)line_size)
441 : "%"REG_a, "memory"
442 );
443 }
444
445 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
446 {
447 __asm __volatile(
448 "lea (%3, %3), %%"REG_a" \n\t"
449 ASMALIGN(3)
450 "1: \n\t"
451 "movq (%1), %%mm0 \n\t"
452 "movq 8(%1), %%mm4 \n\t"
453 "movq (%1, %3), %%mm1 \n\t"
454 "movq 8(%1, %3), %%mm5 \n\t"
455 "movq %%mm0, (%2) \n\t"
456 "movq %%mm4, 8(%2) \n\t"
457 "movq %%mm1, (%2, %3) \n\t"
458 "movq %%mm5, 8(%2, %3) \n\t"
459 "add %%"REG_a", %1 \n\t"
460 "add %%"REG_a", %2 \n\t"
461 "movq (%1), %%mm0 \n\t"
462 "movq 8(%1), %%mm4 \n\t"
463 "movq (%1, %3), %%mm1 \n\t"
464 "movq 8(%1, %3), %%mm5 \n\t"
465 "movq %%mm0, (%2) \n\t"
466 "movq %%mm4, 8(%2) \n\t"
467 "movq %%mm1, (%2, %3) \n\t"
468 "movq %%mm5, 8(%2, %3) \n\t"
469 "add %%"REG_a", %1 \n\t"
470 "add %%"REG_a", %2 \n\t"
471 "subl $4, %0 \n\t"
472 "jnz 1b \n\t"
473 : "+g"(h), "+r" (pixels), "+r" (block)
474 : "r"((long)line_size)
475 : "%"REG_a, "memory"
476 );
477 }
478
479 static void clear_blocks_mmx(DCTELEM *blocks)
480 {
481 __asm __volatile(
482 "pxor %%mm7, %%mm7 \n\t"
483 "mov $-128*6, %%"REG_a" \n\t"
484 "1: \n\t"
485 "movq %%mm7, (%0, %%"REG_a") \n\t"
486 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
487 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
488 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
489 "add $32, %%"REG_a" \n\t"
490 " js 1b \n\t"
491 : : "r" (((uint8_t *)blocks)+128*6)
492 : "%"REG_a
493 );
494 }
495
496 #ifdef CONFIG_ENCODERS
497 static int pix_sum16_mmx(uint8_t * pix, int line_size){
498 const int h=16;
499 int sum;
500 long index= -line_size*h;
501
502 __asm __volatile(
503 "pxor %%mm7, %%mm7 \n\t"
504 "pxor %%mm6, %%mm6 \n\t"
505 "1: \n\t"
506 "movq (%2, %1), %%mm0 \n\t"
507 "movq (%2, %1), %%mm1 \n\t"
508 "movq 8(%2, %1), %%mm2 \n\t"
509 "movq 8(%2, %1), %%mm3 \n\t"
510 "punpcklbw %%mm7, %%mm0 \n\t"
511 "punpckhbw %%mm7, %%mm1 \n\t"
512 "punpcklbw %%mm7, %%mm2 \n\t"
513 "punpckhbw %%mm7, %%mm3 \n\t"
514 "paddw %%mm0, %%mm1 \n\t"
515 "paddw %%mm2, %%mm3 \n\t"
516 "paddw %%mm1, %%mm3 \n\t"
517 "paddw %%mm3, %%mm6 \n\t"
518 "add %3, %1 \n\t"
519 " js 1b \n\t"
520 "movq %%mm6, %%mm5 \n\t"
521 "psrlq $32, %%mm6 \n\t"
522 "paddw %%mm5, %%mm6 \n\t"
523 "movq %%mm6, %%mm5 \n\t"
524 "psrlq $16, %%mm6 \n\t"
525 "paddw %%mm5, %%mm6 \n\t"
526 "movd %%mm6, %0 \n\t"
527 "andl $0xFFFF, %0 \n\t"
528 : "=&r" (sum), "+r" (index)
529 : "r" (pix - index), "r" ((long)line_size)
530 );
531
532 return sum;
533 }
534 #endif //CONFIG_ENCODERS
535
536 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
537 long i=0;
538 asm volatile(
539 "1: \n\t"
540 "movq (%1, %0), %%mm0 \n\t"
541 "movq (%2, %0), %%mm1 \n\t"
542 "paddb %%mm0, %%mm1 \n\t"
543 "movq %%mm1, (%2, %0) \n\t"
544 "movq 8(%1, %0), %%mm0 \n\t"
545 "movq 8(%2, %0), %%mm1 \n\t"
546 "paddb %%mm0, %%mm1 \n\t"
547 "movq %%mm1, 8(%2, %0) \n\t"
548 "add $16, %0 \n\t"
549 "cmp %3, %0 \n\t"
550 " jb 1b \n\t"
551 : "+r" (i)
552 : "r"(src), "r"(dst), "r"((long)w-15)
553 );
554 for(; i<w; i++)
555 dst[i+0] += src[i+0];
556 }
557
558 #define H263_LOOP_FILTER \
559 "pxor %%mm7, %%mm7 \n\t"\
560 "movq %0, %%mm0 \n\t"\
561 "movq %0, %%mm1 \n\t"\
562 "movq %3, %%mm2 \n\t"\
563 "movq %3, %%mm3 \n\t"\
564 "punpcklbw %%mm7, %%mm0 \n\t"\
565 "punpckhbw %%mm7, %%mm1 \n\t"\
566 "punpcklbw %%mm7, %%mm2 \n\t"\
567 "punpckhbw %%mm7, %%mm3 \n\t"\
568 "psubw %%mm2, %%mm0 \n\t"\
569 "psubw %%mm3, %%mm1 \n\t"\
570 "movq %1, %%mm2 \n\t"\
571 "movq %1, %%mm3 \n\t"\
572 "movq %2, %%mm4 \n\t"\
573 "movq %2, %%mm5 \n\t"\
574 "punpcklbw %%mm7, %%mm2 \n\t"\
575 "punpckhbw %%mm7, %%mm3 \n\t"\
576 "punpcklbw %%mm7, %%mm4 \n\t"\
577 "punpckhbw %%mm7, %%mm5 \n\t"\
578 "psubw %%mm2, %%mm4 \n\t"\
579 "psubw %%mm3, %%mm5 \n\t"\
580 "psllw $2, %%mm4 \n\t"\
581 "psllw $2, %%mm5 \n\t"\
582 "paddw %%mm0, %%mm4 \n\t"\
583 "paddw %%mm1, %%mm5 \n\t"\
584 "pxor %%mm6, %%mm6 \n\t"\
585 "pcmpgtw %%mm4, %%mm6 \n\t"\
586 "pcmpgtw %%mm5, %%mm7 \n\t"\
587 "pxor %%mm6, %%mm4 \n\t"\
588 "pxor %%mm7, %%mm5 \n\t"\
589 "psubw %%mm6, %%mm4 \n\t"\
590 "psubw %%mm7, %%mm5 \n\t"\
591 "psrlw $3, %%mm4 \n\t"\
592 "psrlw $3, %%mm5 \n\t"\
593 "packuswb %%mm5, %%mm4 \n\t"\
594 "packsswb %%mm7, %%mm6 \n\t"\
595 "pxor %%mm7, %%mm7 \n\t"\
596 "movd %4, %%mm2 \n\t"\
597 "punpcklbw %%mm2, %%mm2 \n\t"\
598 "punpcklbw %%mm2, %%mm2 \n\t"\
599 "punpcklbw %%mm2, %%mm2 \n\t"\
600 "psubusb %%mm4, %%mm2 \n\t"\
601 "movq %%mm2, %%mm3 \n\t"\
602 "psubusb %%mm4, %%mm3 \n\t"\
603 "psubb %%mm3, %%mm2 \n\t"\
604 "movq %1, %%mm3 \n\t"\
605 "movq %2, %%mm4 \n\t"\
606 "pxor %%mm6, %%mm3 \n\t"\
607 "pxor %%mm6, %%mm4 \n\t"\
608 "paddusb %%mm2, %%mm3 \n\t"\
609 "psubusb %%mm2, %%mm4 \n\t"\
610 "pxor %%mm6, %%mm3 \n\t"\
611 "pxor %%mm6, %%mm4 \n\t"\
612 "paddusb %%mm2, %%mm2 \n\t"\
613 "packsswb %%mm1, %%mm0 \n\t"\
614 "pcmpgtb %%mm0, %%mm7 \n\t"\
615 "pxor %%mm7, %%mm0 \n\t"\
616 "psubb %%mm7, %%mm0 \n\t"\
617 "movq %%mm0, %%mm1 \n\t"\
618 "psubusb %%mm2, %%mm0 \n\t"\
619 "psubb %%mm0, %%mm1 \n\t"\
620 "pand %5, %%mm1 \n\t"\
621 "psrlw $2, %%mm1 \n\t"\
622 "pxor %%mm7, %%mm1 \n\t"\
623 "psubb %%mm7, %%mm1 \n\t"\
624 "movq %0, %%mm5 \n\t"\
625 "movq %3, %%mm6 \n\t"\
626 "psubb %%mm1, %%mm5 \n\t"\
627 "paddb %%mm1, %%mm6 \n\t"
628
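/* Rough scalar outline of what H263_LOOP_FILTER computes across the block edge
   (p0,p1 | p2,p3 are the two pixels on each side, strength comes from
   ff_h263_loop_filter_strength):
       d  = (p0 - p3 + 4*(p2 - p1)) / 8;
       d1 = sign(d) * MIN(|d|, MAX(0, 2*strength - |d|));
       p1 += d1;  p2 -= d1;                          // clamped to 0..255
       d2 = clip((p0 - p3) / 4, -|d1|/2, |d1|/2);
       p0 -= d2;  p3 += d2;
   This is only a sketch; the scalar C loop filter is the authoritative reference. */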
629 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
630 if(ENABLE_ANY_H263) {
631 const int strength= ff_h263_loop_filter_strength[qscale];
632
633 asm volatile(
634
635 H263_LOOP_FILTER
636
637 "movq %%mm3, %1 \n\t"
638 "movq %%mm4, %2 \n\t"
639 "movq %%mm5, %0 \n\t"
640 "movq %%mm6, %3 \n\t"
641 : "+m" (*(uint64_t*)(src - 2*stride)),
642 "+m" (*(uint64_t*)(src - 1*stride)),
643 "+m" (*(uint64_t*)(src + 0*stride)),
644 "+m" (*(uint64_t*)(src + 1*stride))
645 : "g" (2*strength), "m"(ff_pb_FC)
646 );
647 }
648 }
649
650 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
651 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
652 "movd %4, %%mm0 \n\t"
653 "movd %5, %%mm1 \n\t"
654 "movd %6, %%mm2 \n\t"
655 "movd %7, %%mm3 \n\t"
656 "punpcklbw %%mm1, %%mm0 \n\t"
657 "punpcklbw %%mm3, %%mm2 \n\t"
658 "movq %%mm0, %%mm1 \n\t"
659 "punpcklwd %%mm2, %%mm0 \n\t"
660 "punpckhwd %%mm2, %%mm1 \n\t"
661 "movd %%mm0, %0 \n\t"
662 "punpckhdq %%mm0, %%mm0 \n\t"
663 "movd %%mm0, %1 \n\t"
664 "movd %%mm1, %2 \n\t"
665 "punpckhdq %%mm1, %%mm1 \n\t"
666 "movd %%mm1, %3 \n\t"
667
668 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
669 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
670 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
671 "=m" (*(uint32_t*)(dst + 3*dst_stride))
672 : "m" (*(uint32_t*)(src + 0*src_stride)),
673 "m" (*(uint32_t*)(src + 1*src_stride)),
674 "m" (*(uint32_t*)(src + 2*src_stride)),
675 "m" (*(uint32_t*)(src + 3*src_stride))
676 );
677 }
678
679 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
680 if(ENABLE_ANY_H263) {
681 const int strength= ff_h263_loop_filter_strength[qscale];
682 uint64_t temp[4] __attribute__ ((aligned(8)));
683 uint8_t *btemp= (uint8_t*)temp;
684
685 src -= 2;
686
687 transpose4x4(btemp , src , 8, stride);
688 transpose4x4(btemp+4, src + 4*stride, 8, stride);
689 asm volatile(
690 H263_LOOP_FILTER // 5 3 4 6
691
692 : "+m" (temp[0]),
693 "+m" (temp[1]),
694 "+m" (temp[2]),
695 "+m" (temp[3])
696 : "g" (2*strength), "m"(ff_pb_FC)
697 );
698
699 asm volatile(
700 "movq %%mm5, %%mm1 \n\t"
701 "movq %%mm4, %%mm0 \n\t"
702 "punpcklbw %%mm3, %%mm5 \n\t"
703 "punpcklbw %%mm6, %%mm4 \n\t"
704 "punpckhbw %%mm3, %%mm1 \n\t"
705 "punpckhbw %%mm6, %%mm0 \n\t"
706 "movq %%mm5, %%mm3 \n\t"
707 "movq %%mm1, %%mm6 \n\t"
708 "punpcklwd %%mm4, %%mm5 \n\t"
709 "punpcklwd %%mm0, %%mm1 \n\t"
710 "punpckhwd %%mm4, %%mm3 \n\t"
711 "punpckhwd %%mm0, %%mm6 \n\t"
712 "movd %%mm5, (%0) \n\t"
713 "punpckhdq %%mm5, %%mm5 \n\t"
714 "movd %%mm5, (%0,%2) \n\t"
715 "movd %%mm3, (%0,%2,2) \n\t"
716 "punpckhdq %%mm3, %%mm3 \n\t"
717 "movd %%mm3, (%0,%3) \n\t"
718 "movd %%mm1, (%1) \n\t"
719 "punpckhdq %%mm1, %%mm1 \n\t"
720 "movd %%mm1, (%1,%2) \n\t"
721 "movd %%mm6, (%1,%2,2) \n\t"
722 "punpckhdq %%mm6, %%mm6 \n\t"
723 "movd %%mm6, (%1,%3) \n\t"
724 :: "r" (src),
725 "r" (src + 4*stride),
726 "r" ((long) stride ),
727 "r" ((long)(3*stride))
728 );
729 }
730 }
731
732 #ifdef CONFIG_ENCODERS
733 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
734 int tmp;
735 asm volatile (
736 "movl $16,%%ecx\n"
737 "pxor %%mm0,%%mm0\n"
738 "pxor %%mm7,%%mm7\n"
739 "1:\n"
740 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
741 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
742
743 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
744
745 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
746 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
747
748 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
749 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
750 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
751
752 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
753 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
754
755 "pmaddwd %%mm3,%%mm3\n"
756 "pmaddwd %%mm4,%%mm4\n"
757
758 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
759 pix2^2+pix3^2+pix6^2+pix7^2) */
760 "paddd %%mm3,%%mm4\n"
761 "paddd %%mm2,%%mm7\n"
762
763 "add %2, %0\n"
764 "paddd %%mm4,%%mm7\n"
765 "dec %%ecx\n"
766 "jnz 1b\n"
767
768 "movq %%mm7,%%mm1\n"
769 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
770 "paddd %%mm7,%%mm1\n"
771 "movd %%mm1,%1\n"
772 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
773 return tmp;
774 }
775
776 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
777 int tmp;
778 asm volatile (
779 "movl %4,%%ecx\n"
780 "shr $1,%%ecx\n"
781 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
782 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
783 "1:\n"
784 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
785 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
786 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
787 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
788
789 /* todo: mm1-mm2, mm3-mm4 */
790 /* algo: subtract mm1 from mm2 with saturation and vice versa */
791 /* OR the results to get absolute difference */
792 "movq %%mm1,%%mm5\n"
793 "movq %%mm3,%%mm6\n"
794 "psubusb %%mm2,%%mm1\n"
795 "psubusb %%mm4,%%mm3\n"
796 "psubusb %%mm5,%%mm2\n"
797 "psubusb %%mm6,%%mm4\n"
798
799 "por %%mm1,%%mm2\n"
800 "por %%mm3,%%mm4\n"
801
802 /* now convert to 16-bit vectors so we can square them */
803 "movq %%mm2,%%mm1\n"
804 "movq %%mm4,%%mm3\n"
805
806 "punpckhbw %%mm0,%%mm2\n"
807 "punpckhbw %%mm0,%%mm4\n"
808 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
809 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
810
811 "pmaddwd %%mm2,%%mm2\n"
812 "pmaddwd %%mm4,%%mm4\n"
813 "pmaddwd %%mm1,%%mm1\n"
814 "pmaddwd %%mm3,%%mm3\n"
815
816 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
817 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
818
819 "paddd %%mm2,%%mm1\n"
820 "paddd %%mm4,%%mm3\n"
821 "paddd %%mm1,%%mm7\n"
822 "paddd %%mm3,%%mm7\n"
823
824 "decl %%ecx\n"
825 "jnz 1b\n"
826
827 "movq %%mm7,%%mm1\n"
828 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
829 "paddd %%mm7,%%mm1\n"
830 "movd %%mm1,%2\n"
831 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
832 : "r" ((long)line_size) , "m" (h)
833 : "%ecx");
834 return tmp;
835 }
836
837 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
838 int tmp;
839 asm volatile (
840 "movl %4,%%ecx\n"
841 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
842 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
843 "1:\n"
844 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
845 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
846 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
847 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
848
849 /* todo: mm1-mm2, mm3-mm4 */
850 /* algo: subtract mm1 from mm2 with saturation and vice versa */
851 /* OR the results to get absolute difference */
852 "movq %%mm1,%%mm5\n"
853 "movq %%mm3,%%mm6\n"
854 "psubusb %%mm2,%%mm1\n"
855 "psubusb %%mm4,%%mm3\n"
856 "psubusb %%mm5,%%mm2\n"
857 "psubusb %%mm6,%%mm4\n"
858
859 "por %%mm1,%%mm2\n"
860 "por %%mm3,%%mm4\n"
861
862 /* now convert to 16-bit vectors so we can square them */
863 "movq %%mm2,%%mm1\n"
864 "movq %%mm4,%%mm3\n"
865
866 "punpckhbw %%mm0,%%mm2\n"
867 "punpckhbw %%mm0,%%mm4\n"
868 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
869 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
870
871 "pmaddwd %%mm2,%%mm2\n"
872 "pmaddwd %%mm4,%%mm4\n"
873 "pmaddwd %%mm1,%%mm1\n"
874 "pmaddwd %%mm3,%%mm3\n"
875
876 "add %3,%0\n"
877 "add %3,%1\n"
878
879 "paddd %%mm2,%%mm1\n"
880 "paddd %%mm4,%%mm3\n"
881 "paddd %%mm1,%%mm7\n"
882 "paddd %%mm3,%%mm7\n"
883
884 "decl %%ecx\n"
885 "jnz 1b\n"
886
887 "movq %%mm7,%%mm1\n"
888 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
889 "paddd %%mm7,%%mm1\n"
890 "movd %%mm1,%2\n"
891 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
892 : "r" ((long)line_size) , "m" (h)
893 : "%ecx");
894 return tmp;
895 }
896
897 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
898 int tmp;
899 asm volatile (
900 "shr $1,%2\n"
901 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
902 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
903 "1:\n"
904 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
905 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
906 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
907 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
908
909 /* todo: mm1-mm2, mm3-mm4 */
910 /* algo: subtract mm1 from mm2 with saturation and vice versa */
911 /* OR the results to get absolute difference */
912 "movdqa %%xmm1,%%xmm5\n"
913 "movdqa %%xmm3,%%xmm6\n"
914 "psubusb %%xmm2,%%xmm1\n"
915 "psubusb %%xmm4,%%xmm3\n"
916 "psubusb %%xmm5,%%xmm2\n"
917 "psubusb %%xmm6,%%xmm4\n"
918
919 "por %%xmm1,%%xmm2\n"
920 "por %%xmm3,%%xmm4\n"
921
922 /* now convert to 16-bit vectors so we can square them */
923 "movdqa %%xmm2,%%xmm1\n"
924 "movdqa %%xmm4,%%xmm3\n"
925
926 "punpckhbw %%xmm0,%%xmm2\n"
927 "punpckhbw %%xmm0,%%xmm4\n"
928 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
929 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
930
931 "pmaddwd %%xmm2,%%xmm2\n"
932 "pmaddwd %%xmm4,%%xmm4\n"
933 "pmaddwd %%xmm1,%%xmm1\n"
934 "pmaddwd %%xmm3,%%xmm3\n"
935
936 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
937 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
938
939 "paddd %%xmm2,%%xmm1\n"
940 "paddd %%xmm4,%%xmm3\n"
941 "paddd %%xmm1,%%xmm7\n"
942 "paddd %%xmm3,%%xmm7\n"
943
944 "decl %2\n"
945 "jnz 1b\n"
946
947 "movdqa %%xmm7,%%xmm1\n"
948 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
949 "paddd %%xmm1,%%xmm7\n"
950 "movdqa %%xmm7,%%xmm1\n"
951 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
952 "paddd %%xmm1,%%xmm7\n"
953 "movd %%xmm7,%3\n"
954 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
955 : "r" ((long)line_size));
956 return tmp;
957 }
958
959 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
960 int tmp;
961 asm volatile (
962 "movl %3,%%ecx\n"
963 "pxor %%mm7,%%mm7\n"
964 "pxor %%mm6,%%mm6\n"
965
966 "movq (%0),%%mm0\n"
967 "movq %%mm0, %%mm1\n"
968 "psllq $8, %%mm0\n"
969 "psrlq $8, %%mm1\n"
970 "psrlq $8, %%mm0\n"
971 "movq %%mm0, %%mm2\n"
972 "movq %%mm1, %%mm3\n"
973 "punpcklbw %%mm7,%%mm0\n"
974 "punpcklbw %%mm7,%%mm1\n"
975 "punpckhbw %%mm7,%%mm2\n"
976 "punpckhbw %%mm7,%%mm3\n"
977 "psubw %%mm1, %%mm0\n"
978 "psubw %%mm3, %%mm2\n"
979
980 "add %2,%0\n"
981
982 "movq (%0),%%mm4\n"
983 "movq %%mm4, %%mm1\n"
984 "psllq $8, %%mm4\n"
985 "psrlq $8, %%mm1\n"
986 "psrlq $8, %%mm4\n"
987 "movq %%mm4, %%mm5\n"
988 "movq %%mm1, %%mm3\n"
989 "punpcklbw %%mm7,%%mm4\n"
990 "punpcklbw %%mm7,%%mm1\n"
991 "punpckhbw %%mm7,%%mm5\n"
992 "punpckhbw %%mm7,%%mm3\n"
993 "psubw %%mm1, %%mm4\n"
994 "psubw %%mm3, %%mm5\n"
995 "psubw %%mm4, %%mm0\n"
996 "psubw %%mm5, %%mm2\n"
997 "pxor %%mm3, %%mm3\n"
998 "pxor %%mm1, %%mm1\n"
999 "pcmpgtw %%mm0, %%mm3\n\t"
1000 "pcmpgtw %%mm2, %%mm1\n\t"
1001 "pxor %%mm3, %%mm0\n"
1002 "pxor %%mm1, %%mm2\n"
1003 "psubw %%mm3, %%mm0\n"
1004 "psubw %%mm1, %%mm2\n"
1005 "paddw %%mm0, %%mm2\n"
1006 "paddw %%mm2, %%mm6\n"
1007
1008 "add %2,%0\n"
1009 "1:\n"
1010
1011 "movq (%0),%%mm0\n"
1012 "movq %%mm0, %%mm1\n"
1013 "psllq $8, %%mm0\n"
1014 "psrlq $8, %%mm1\n"
1015 "psrlq $8, %%mm0\n"
1016 "movq %%mm0, %%mm2\n"
1017 "movq %%mm1, %%mm3\n"
1018 "punpcklbw %%mm7,%%mm0\n"
1019 "punpcklbw %%mm7,%%mm1\n"
1020 "punpckhbw %%mm7,%%mm2\n"
1021 "punpckhbw %%mm7,%%mm3\n"
1022 "psubw %%mm1, %%mm0\n"
1023 "psubw %%mm3, %%mm2\n"
1024 "psubw %%mm0, %%mm4\n"
1025 "psubw %%mm2, %%mm5\n"
1026 "pxor %%mm3, %%mm3\n"
1027 "pxor %%mm1, %%mm1\n"
1028 "pcmpgtw %%mm4, %%mm3\n\t"
1029 "pcmpgtw %%mm5, %%mm1\n\t"
1030 "pxor %%mm3, %%mm4\n"
1031 "pxor %%mm1, %%mm5\n"
1032 "psubw %%mm3, %%mm4\n"
1033 "psubw %%mm1, %%mm5\n"
1034 "paddw %%mm4, %%mm5\n"
1035 "paddw %%mm5, %%mm6\n"
1036
1037 "add %2,%0\n"
1038
1039 "movq (%0),%%mm4\n"
1040 "movq %%mm4, %%mm1\n"
1041 "psllq $8, %%mm4\n"
1042 "psrlq $8, %%mm1\n"
1043 "psrlq $8, %%mm4\n"
1044 "movq %%mm4, %%mm5\n"
1045 "movq %%mm1, %%mm3\n"
1046 "punpcklbw %%mm7,%%mm4\n"
1047 "punpcklbw %%mm7,%%mm1\n"
1048 "punpckhbw %%mm7,%%mm5\n"
1049 "punpckhbw %%mm7,%%mm3\n"
1050 "psubw %%mm1, %%mm4\n"
1051 "psubw %%mm3, %%mm5\n"
1052 "psubw %%mm4, %%mm0\n"
1053 "psubw %%mm5, %%mm2\n"
1054 "pxor %%mm3, %%mm3\n"
1055 "pxor %%mm1, %%mm1\n"
1056 "pcmpgtw %%mm0, %%mm3\n\t"
1057 "pcmpgtw %%mm2, %%mm1\n\t"
1058 "pxor %%mm3, %%mm0\n"
1059 "pxor %%mm1, %%mm2\n"
1060 "psubw %%mm3, %%mm0\n"
1061 "psubw %%mm1, %%mm2\n"
1062 "paddw %%mm0, %%mm2\n"
1063 "paddw %%mm2, %%mm6\n"
1064
1065 "add %2,%0\n"
1066 "subl $2, %%ecx\n"
1067 " jnz 1b\n"
1068
1069 "movq %%mm6, %%mm0\n"
1070 "punpcklwd %%mm7,%%mm0\n"
1071 "punpckhwd %%mm7,%%mm6\n"
1072 "paddd %%mm0, %%mm6\n"
1073
1074 "movq %%mm6,%%mm0\n"
1075 "psrlq $32, %%mm6\n"
1076 "paddd %%mm6,%%mm0\n"
1077 "movd %%mm0,%1\n"
1078 : "+r" (pix1), "=r"(tmp)
1079 : "r" ((long)line_size) , "g" (h-2)
1080 : "%ecx");
1081 return tmp;
1082 }
1083
1084 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1085 int tmp;
1086 uint8_t * pix= pix1;
1087 asm volatile (
1088 "movl %3,%%ecx\n"
1089 "pxor %%mm7,%%mm7\n"
1090 "pxor %%mm6,%%mm6\n"
1091
1092 "movq (%0),%%mm0\n"
1093 "movq 1(%0),%%mm1\n"
1094 "movq %%mm0, %%mm2\n"
1095 "movq %%mm1, %%mm3\n"
1096 "punpcklbw %%mm7,%%mm0\n"
1097 "punpcklbw %%mm7,%%mm1\n"
1098 "punpckhbw %%mm7,%%mm2\n"
1099 "punpckhbw %%mm7,%%mm3\n"
1100 "psubw %%mm1, %%mm0\n"
1101 "psubw %%mm3, %%mm2\n"
1102
1103 "add %2,%0\n"
1104
1105 "movq (%0),%%mm4\n"
1106 "movq 1(%0),%%mm1\n"
1107 "movq %%mm4, %%mm5\n"
1108 "movq %%mm1, %%mm3\n"
1109 "punpcklbw %%mm7,%%mm4\n"
1110 "punpcklbw %%mm7,%%mm1\n"
1111 "punpckhbw %%mm7,%%mm5\n"
1112 "punpckhbw %%mm7,%%mm3\n"
1113 "psubw %%mm1, %%mm4\n"
1114 "psubw %%mm3, %%mm5\n"
1115 "psubw %%mm4, %%mm0\n"
1116 "psubw %%mm5, %%mm2\n"
1117 "pxor %%mm3, %%mm3\n"
1118 "pxor %%mm1, %%mm1\n"
1119 "pcmpgtw %%mm0, %%mm3\n\t"
1120 "pcmpgtw %%mm2, %%mm1\n\t"
1121 "pxor %%mm3, %%mm0\n"
1122 "pxor %%mm1, %%mm2\n"
1123 "psubw %%mm3, %%mm0\n"
1124 "psubw %%mm1, %%mm2\n"
1125 "paddw %%mm0, %%mm2\n"
1126 "paddw %%mm2, %%mm6\n"
1127
1128 "add %2,%0\n"
1129 "1:\n"
1130
1131 "movq (%0),%%mm0\n"
1132 "movq 1(%0),%%mm1\n"
1133 "movq %%mm0, %%mm2\n"
1134 "movq %%mm1, %%mm3\n"
1135 "punpcklbw %%mm7,%%mm0\n"
1136 "punpcklbw %%mm7,%%mm1\n"
1137 "punpckhbw %%mm7,%%mm2\n"
1138 "punpckhbw %%mm7,%%mm3\n"
1139 "psubw %%mm1, %%mm0\n"
1140 "psubw %%mm3, %%mm2\n"
1141 "psubw %%mm0, %%mm4\n"
1142 "psubw %%mm2, %%mm5\n"
1143 "pxor %%mm3, %%mm3\n"
1144 "pxor %%mm1, %%mm1\n"
1145 "pcmpgtw %%mm4, %%mm3\n\t"
1146 "pcmpgtw %%mm5, %%mm1\n\t"
1147 "pxor %%mm3, %%mm4\n"
1148 "pxor %%mm1, %%mm5\n"
1149 "psubw %%mm3, %%mm4\n"
1150 "psubw %%mm1, %%mm5\n"
1151 "paddw %%mm4, %%mm5\n"
1152 "paddw %%mm5, %%mm6\n"
1153
1154 "add %2,%0\n"
1155
1156 "movq (%0),%%mm4\n"
1157 "movq 1(%0),%%mm1\n"
1158 "movq %%mm4, %%mm5\n"
1159 "movq %%mm1, %%mm3\n"
1160 "punpcklbw %%mm7,%%mm4\n"
1161 "punpcklbw %%mm7,%%mm1\n"
1162 "punpckhbw %%mm7,%%mm5\n"
1163 "punpckhbw %%mm7,%%mm3\n"
1164 "psubw %%mm1, %%mm4\n"
1165 "psubw %%mm3, %%mm5\n"
1166 "psubw %%mm4, %%mm0\n"
1167 "psubw %%mm5, %%mm2\n"
1168 "pxor %%mm3, %%mm3\n"
1169 "pxor %%mm1, %%mm1\n"
1170 "pcmpgtw %%mm0, %%mm3\n\t"
1171 "pcmpgtw %%mm2, %%mm1\n\t"
1172 "pxor %%mm3, %%mm0\n"
1173 "pxor %%mm1, %%mm2\n"
1174 "psubw %%mm3, %%mm0\n"
1175 "psubw %%mm1, %%mm2\n"
1176 "paddw %%mm0, %%mm2\n"
1177 "paddw %%mm2, %%mm6\n"
1178
1179 "add %2,%0\n"
1180 "subl $2, %%ecx\n"
1181 " jnz 1b\n"
1182
1183 "movq %%mm6, %%mm0\n"
1184 "punpcklwd %%mm7,%%mm0\n"
1185 "punpckhwd %%mm7,%%mm6\n"
1186 "paddd %%mm0, %%mm6\n"
1187
1188 "movq %%mm6,%%mm0\n"
1189 "psrlq $32, %%mm6\n"
1190 "paddd %%mm6,%%mm0\n"
1191 "movd %%mm0,%1\n"
1192 : "+r" (pix1), "=r"(tmp)
1193 : "r" ((long)line_size) , "g" (h-2)
1194 : "%ecx");
1195 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1196 }
1197
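/* NSSE: SSE plus a penalty for changing the amount of high-frequency detail, so that
   encodings which smooth away noise/texture score worse. Roughly:
       score = sse(pix1, pix2) + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
   where hf_noise*() above sums absolute second-order pixel differences. */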
1198 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1199 MpegEncContext *c = p;
1200 int score1, score2;
1201
1202 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1203 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1204 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1205
1206 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1207 else return score1 + FFABS(score2)*8;
1208 }
1209
1210 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1211 MpegEncContext *c = p;
1212 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1213 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1214
1215 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1216 else return score1 + FFABS(score2)*8;
1217 }
1218
1219 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1220 int tmp;
1221
1222 assert( (((int)pix) & 7) == 0);
1223 assert((line_size &7) ==0);
1224
1225 #define SUM(in0, in1, out0, out1) \
1226 "movq (%0), %%mm2\n"\
1227 "movq 8(%0), %%mm3\n"\
1228 "add %2,%0\n"\
1229 "movq %%mm2, " #out0 "\n"\
1230 "movq %%mm3, " #out1 "\n"\
1231 "psubusb " #in0 ", %%mm2\n"\
1232 "psubusb " #in1 ", %%mm3\n"\
1233 "psubusb " #out0 ", " #in0 "\n"\
1234 "psubusb " #out1 ", " #in1 "\n"\
1235 "por %%mm2, " #in0 "\n"\
1236 "por %%mm3, " #in1 "\n"\
1237 "movq " #in0 ", %%mm2\n"\
1238 "movq " #in1 ", %%mm3\n"\
1239 "punpcklbw %%mm7, " #in0 "\n"\
1240 "punpcklbw %%mm7, " #in1 "\n"\
1241 "punpckhbw %%mm7, %%mm2\n"\
1242 "punpckhbw %%mm7, %%mm3\n"\
1243 "paddw " #in1 ", " #in0 "\n"\
1244 "paddw %%mm3, %%mm2\n"\
1245 "paddw %%mm2, " #in0 "\n"\
1246 "paddw " #in0 ", %%mm6\n"
1247
1248
1249 asm volatile (
1250 "movl %3,%%ecx\n"
1251 "pxor %%mm6,%%mm6\n"
1252 "pxor %%mm7,%%mm7\n"
1253 "movq (%0),%%mm0\n"
1254 "movq 8(%0),%%mm1\n"
1255 "add %2,%0\n"
1256 "subl $2, %%ecx\n"
1257 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1258 "1:\n"
1259
1260 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1261
1262 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1263
1264 "subl $2, %%ecx\n"
1265 "jnz 1b\n"
1266
1267 "movq %%mm6,%%mm0\n"
1268 "psrlq $32, %%mm6\n"
1269 "paddw %%mm6,%%mm0\n"
1270 "movq %%mm0,%%mm6\n"
1271 "psrlq $16, %%mm0\n"
1272 "paddw %%mm6,%%mm0\n"
1273 "movd %%mm0,%1\n"
1274 : "+r" (pix), "=r"(tmp)
1275 : "r" ((long)line_size) , "m" (h)
1276 : "%ecx");
1277 return tmp & 0xFFFF;
1278 }
1279 #undef SUM
1280
1281 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1282 int tmp;
1283
1284 assert( (((int)pix) & 7) == 0);
1285 assert((line_size &7) ==0);
1286
1287 #define SUM(in0, in1, out0, out1) \
1288 "movq (%0), " #out0 "\n"\
1289 "movq 8(%0), " #out1 "\n"\
1290 "add %2,%0\n"\
1291 "psadbw " #out0 ", " #in0 "\n"\
1292 "psadbw " #out1 ", " #in1 "\n"\
1293 "paddw " #in1 ", " #in0 "\n"\
1294 "paddw " #in0 ", %%mm6\n"
1295
1296 asm volatile (
1297 "movl %3,%%ecx\n"
1298 "pxor %%mm6,%%mm6\n"
1299 "pxor %%mm7,%%mm7\n"
1300 "movq (%0),%%mm0\n"
1301 "movq 8(%0),%%mm1\n"
1302 "add %2,%0\n"
1303 "subl $2, %%ecx\n"
1304 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1305 "1:\n"
1306
1307 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1308
1309 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1310
1311 "subl $2, %%ecx\n"
1312 "jnz 1b\n"
1313
1314 "movd %%mm6,%1\n"
1315 : "+r" (pix), "=r"(tmp)
1316 : "r" ((long)line_size) , "m" (h)
1317 : "%ecx");
1318 return tmp;
1319 }
1320 #undef SUM
1321
1322 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1323 int tmp;
1324
1325 assert( (((int)pix1) & 7) == 0);
1326 assert( (((int)pix2) & 7) == 0);
1327 assert((line_size &7) ==0);
1328
1329 #define SUM(in0, in1, out0, out1) \
1330 "movq (%0),%%mm2\n"\
1331 "movq (%1)," #out0 "\n"\
1332 "movq 8(%0),%%mm3\n"\
1333 "movq 8(%1)," #out1 "\n"\
1334 "add %3,%0\n"\
1335 "add %3,%1\n"\
1336 "psubb " #out0 ", %%mm2\n"\
1337 "psubb " #out1 ", %%mm3\n"\
1338 "pxor %%mm7, %%mm2\n"\
1339 "pxor %%mm7, %%mm3\n"\
1340 "movq %%mm2, " #out0 "\n"\
1341 "movq %%mm3, " #out1 "\n"\
1342 "psubusb " #in0 ", %%mm2\n"\
1343 "psubusb " #in1 ", %%mm3\n"\
1344 "psubusb " #out0 ", " #in0 "\n"\
1345 "psubusb " #out1 ", " #in1 "\n"\
1346 "por %%mm2, " #in0 "\n"\
1347 "por %%mm3, " #in1 "\n"\
1348 "movq " #in0 ", %%mm2\n"\
1349 "movq " #in1 ", %%mm3\n"\
1350 "punpcklbw %%mm7, " #in0 "\n"\
1351 "punpcklbw %%mm7, " #in1 "\n"\
1352 "punpckhbw %%mm7, %%mm2\n"\
1353 "punpckhbw %%mm7, %%mm3\n"\
1354 "paddw " #in1 ", " #in0 "\n"\
1355 "paddw %%mm3, %%mm2\n"\
1356 "paddw %%mm2, " #in0 "\n"\
1357 "paddw " #in0 ", %%mm6\n"
1358
1359
1360 asm volatile (
1361 "movl %4,%%ecx\n"
1362 "pxor %%mm6,%%mm6\n"
1363 "pcmpeqw %%mm7,%%mm7\n"
1364 "psllw $15, %%mm7\n"
1365 "packsswb %%mm7, %%mm7\n"
1366 "movq (%0),%%mm0\n"
1367 "movq (%1),%%mm2\n"
1368 "movq 8(%0),%%mm1\n"
1369 "movq 8(%1),%%mm3\n"
1370 "add %3,%0\n"
1371 "add %3,%1\n"
1372 "subl $2, %%ecx\n"
1373 "psubb %%mm2, %%mm0\n"
1374 "psubb %%mm3, %%mm1\n"
1375 "pxor %%mm7, %%mm0\n"
1376 "pxor %%mm7, %%mm1\n"
1377 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1378 "1:\n"
1379
1380 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1381
1382 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1383
1384 "subl $2, %%ecx\n"
1385 "jnz 1b\n"
1386
1387 "movq %%mm6,%%mm0\n"
1388 "psrlq $32, %%mm6\n"
1389 "paddw %%mm6,%%mm0\n"
1390 "movq %%mm0,%%mm6\n"
1391 "psrlq $16, %%mm0\n"
1392 "paddw %%mm6,%%mm0\n"
1393 "movd %%mm0,%2\n"
1394 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1395 : "r" ((long)line_size) , "m" (h)
1396 : "%ecx");
1397 return tmp & 0x7FFF;
1398 }
1399 #undef SUM
1400
1401 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1402 int tmp;
1403
1404 assert( (((int)pix1) & 7) == 0);
1405 assert( (((int)pix2) & 7) == 0);
1406 assert((line_size &7) ==0);
1407
1408 #define SUM(in0, in1, out0, out1) \
1409 "movq (%0)," #out0 "\n"\
1410 "movq (%1),%%mm2\n"\
1411 "movq 8(%0)," #out1 "\n"\
1412 "movq 8(%1),%%mm3\n"\
1413 "add %3,%0\n"\
1414 "add %3,%1\n"\
1415 "psubb %%mm2, " #out0 "\n"\
1416 "psubb %%mm3, " #out1 "\n"\
1417 "pxor %%mm7, " #out0 "\n"\
1418 "pxor %%mm7, " #out1 "\n"\
1419 "psadbw " #out0 ", " #in0 "\n"\
1420 "psadbw " #out1 ", " #in1 "\n"\
1421 "paddw " #in1 ", " #in0 "\n"\
1422 "paddw " #in0 ", %%mm6\n"
1423
1424 asm volatile (
1425 "movl %4,%%ecx\n"
1426 "pxor %%mm6,%%mm6\n"
1427 "pcmpeqw %%mm7,%%mm7\n"
1428 "psllw $15, %%mm7\n"
1429 "packsswb %%mm7, %%mm7\n"
1430 "movq (%0),%%mm0\n"
1431 "movq (%1),%%mm2\n"
1432 "movq 8(%0),%%mm1\n"
1433 "movq 8(%1),%%mm3\n"
1434 "add %3,%0\n"
1435 "add %3,%1\n"
1436 "subl $2, %%ecx\n"
1437 "psubb %%mm2, %%mm0\n"
1438 "psubb %%mm3, %%mm1\n"
1439 "pxor %%mm7, %%mm0\n"
1440 "pxor %%mm7, %%mm1\n"
1441 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1442 "1:\n"
1443
1444 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1445
1446 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1447
1448 "subl $2, %%ecx\n"
1449 "jnz 1b\n"
1450
1451 "movd %%mm6,%2\n"
1452 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1453 : "r" ((long)line_size) , "m" (h)
1454 : "%ecx");
1455 return tmp;
1456 }
1457 #undef SUM
1458
1459 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1460 long i=0;
1461 asm volatile(
1462 "1: \n\t"
1463 "movq (%2, %0), %%mm0 \n\t"
1464 "movq (%1, %0), %%mm1 \n\t"
1465 "psubb %%mm0, %%mm1 \n\t"
1466 "movq %%mm1, (%3, %0) \n\t"
1467 "movq 8(%2, %0), %%mm0 \n\t"
1468 "movq 8(%1, %0), %%mm1 \n\t"
1469 "psubb %%mm0, %%mm1 \n\t"
1470 "movq %%mm1, 8(%3, %0) \n\t"
1471 "add $16, %0 \n\t"
1472 "cmp %4, %0 \n\t"
1473 " jb 1b \n\t"
1474 : "+r" (i)
1475 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1476 );
1477 for(; i<w; i++)
1478 dst[i+0] = src1[i+0]-src2[i+0];
1479 }
1480
1481 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1482 long i=0;
1483 uint8_t l, lt;
1484
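    // HuffYUV median prediction: pred = mid_pred(L, T, L + T - LT) per byte, done
    // branchlessly below as max(min(max(T, L), L + T - LT), min(T, L)); the asm stores
    // src2 - pred, and dst[0] is fixed up afterwards from *left / *left_top.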
1485 asm volatile(
1486 "1: \n\t"
1487 "movq -1(%1, %0), %%mm0 \n\t" // LT
1488 "movq (%1, %0), %%mm1 \n\t" // T
1489 "movq -1(%2, %0), %%mm2 \n\t" // L
1490 "movq (%2, %0), %%mm3 \n\t" // X
1491 "movq %%mm2, %%mm4 \n\t" // L
1492 "psubb %%mm0, %%mm2 \n\t"
1493 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1494 "movq %%mm4, %%mm5 \n\t" // L
1495 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1496 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1497 "pminub %%mm2, %%mm4 \n\t"
1498 "pmaxub %%mm1, %%mm4 \n\t"
1499 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1500 "movq %%mm3, (%3, %0) \n\t"
1501 "add $8, %0 \n\t"
1502 "cmp %4, %0 \n\t"
1503 " jb 1b \n\t"
1504 : "+r" (i)
1505 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1506 );
1507
1508 l= *left;
1509 lt= *left_top;
1510
1511 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1512
1513 *left_top= src1[w-1];
1514 *left = src2[w-1];
1515 }
1516
1517 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1518 "mov"#m" "#p1", "#a" \n\t"\
1519 "mov"#m" "#p2", "#t" \n\t"\
1520 "punpcklbw "#a", "#t" \n\t"\
1521 "punpcklbw "#a", "#a" \n\t"\
1522 "psubw "#t", "#a" \n\t"\
1523
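// DIFF_PIXELS_1 yields (p1 - p2) as signed words without needing a zero register:
// both operands get p1 in their high byte (p1:p1 vs p1:p2), so the word subtraction
// leaves exactly p1 - p2 in each lane.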
1524 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1525 uint8_t *p1b=p1, *p2b=p2;\
1526 asm volatile(\
1527 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1528 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1529 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1530 "add %4, %1 \n\t"\
1531 "add %4, %2 \n\t"\
1532 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1533 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1534 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1535 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1536 "mov"#m1" "#mm"0, %0 \n\t"\
1537 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1538 "mov"#m1" %0, "#mm"0 \n\t"\
1539 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1540 : "r"((long)stride), "r"((long)stride*3)\
1541 );\
1542 }
1543 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
1544
1545 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1546 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1547
1548 #ifdef ARCH_X86_64
1549 // permutes 01234567 -> 05736421
1550 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1551 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1552 SBUTTERFLY(c,d,b,wd,dqa)\
1553 SBUTTERFLY(e,f,d,wd,dqa)\
1554 SBUTTERFLY(g,h,f,wd,dqa)\
1555 SBUTTERFLY(a,c,h,dq,dqa)\
1556 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1557 SBUTTERFLY(e,g,b,dq,dqa)\
1558 SBUTTERFLY(d,f,g,dq,dqa)\
1559 SBUTTERFLY(a,e,f,qdq,dqa)\
1560 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1561 SBUTTERFLY(h,b,d,qdq,dqa)\
1562 SBUTTERFLY(c,g,b,qdq,dqa)\
1563 "movdqa %%xmm8, "#g" \n\t"
1564 #else
1565 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1566 "movdqa "#h", "#t" \n\t"\
1567 SBUTTERFLY(a,b,h,wd,dqa)\
1568 "movdqa "#h", 16"#t" \n\t"\
1569 "movdqa "#t", "#h" \n\t"\
1570 SBUTTERFLY(c,d,b,wd,dqa)\
1571 SBUTTERFLY(e,f,d,wd,dqa)\
1572 SBUTTERFLY(g,h,f,wd,dqa)\
1573 SBUTTERFLY(a,c,h,dq,dqa)\
1574 "movdqa "#h", "#t" \n\t"\
1575 "movdqa 16"#t", "#h" \n\t"\
1576 SBUTTERFLY(h,b,c,dq,dqa)\
1577 SBUTTERFLY(e,g,b,dq,dqa)\
1578 SBUTTERFLY(d,f,g,dq,dqa)\
1579 SBUTTERFLY(a,e,f,qdq,dqa)\
1580 SBUTTERFLY(h,d,e,qdq,dqa)\
1581 "movdqa "#h", 16"#t" \n\t"\
1582 "movdqa "#t", "#h" \n\t"\
1583 SBUTTERFLY(h,b,d,qdq,dqa)\
1584 SBUTTERFLY(c,g,b,qdq,dqa)\
1585 "movdqa 16"#t", "#g" \n\t"
1586 #endif
1587
1588 #define LBUTTERFLY2(a1,b1,a2,b2)\
1589 "paddw " #b1 ", " #a1 " \n\t"\
1590 "paddw " #b2 ", " #a2 " \n\t"\
1591 "paddw " #b1 ", " #b1 " \n\t"\
1592 "paddw " #b2 ", " #b2 " \n\t"\
1593 "psubw " #a1 ", " #b1 " \n\t"\
1594 "psubw " #a2 ", " #b2 " \n\t"
1595
1596 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1597 LBUTTERFLY2(m0, m1, m2, m3)\
1598 LBUTTERFLY2(m4, m5, m6, m7)\
1599 LBUTTERFLY2(m0, m2, m1, m3)\
1600 LBUTTERFLY2(m4, m6, m5, m7)\
1601 LBUTTERFLY2(m0, m4, m1, m5)\
1602 LBUTTERFLY2(m2, m6, m3, m7)\
1603
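// LBUTTERFLY2 performs two butterflies at once: (a,b) -> (a+b, b-a), doubling b first
// so no spare register is needed. HADAMARD8 chains three rounds of such butterflies
// across 8 registers, i.e. an 8-point Hadamard transform along the register dimension;
// together with a transpose this gives the 8x8 2-D transform used by hadamard8_diff.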
1604 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1605
1606 #define MMABS_MMX(a,z)\
1607 "pxor " #z ", " #z " \n\t"\
1608 "pcmpgtw " #a ", " #z " \n\t"\
1609 "pxor " #z ", " #a " \n\t"\
1610 "psubw " #z ", " #a " \n\t"
1611
1612 #define MMABS_MMX2(a,z)\
1613 "pxor " #z ", " #z " \n\t"\
1614 "psubw " #a ", " #z " \n\t"\
1615 "pmaxsw " #z ", " #a " \n\t"
1616
1617 #define MMABS_SSSE3(a,z)\
1618 "pabsw " #a ", " #a " \n\t"
1619
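// Three ways to take the per-word absolute value, by CPU generation:
//   MMX:    z = (a < 0) ? -1 : 0;  a = (a ^ z) - z;
//   MMX2:   a = max(a, -a)         via pmaxsw
//   SSSE3:  a single pabsw instruction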
1620 #define MMABS_SUM(a,z, sum)\
1621 MMABS(a,z)\
1622 "paddusw " #a ", " #sum " \n\t"
1623
1624 #define MMABS_SUM_8x8_NOSPILL\
1625 MMABS(%%xmm0, %%xmm8)\
1626 MMABS(%%xmm1, %%xmm9)\
1627 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1628 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1629 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1630 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1631 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1632 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1633 "paddusw %%xmm1, %%xmm0 \n\t"
1634
1635 #ifdef ARCH_X86_64
1636 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1637 #else
1638 #define MMABS_SUM_8x8_SSE2\
1639 "movdqa %%xmm7, (%1) \n\t"\
1640 MMABS(%%xmm0, %%xmm7)\
1641 MMABS(%%xmm1, %%xmm7)\
1642 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1643 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1644 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1645 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1646 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1647 "movdqa (%1), %%xmm2 \n\t"\
1648 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1649 "paddusw %%xmm1, %%xmm0 \n\t"
1650 #endif
1651
1652 #define LOAD4(o, a, b, c, d)\
1653 "movq "#o"(%1), "#a" \n\t"\
1654 "movq "#o"+8(%1), "#b" \n\t"\
1655 "movq "#o"+16(%1), "#c" \n\t"\
1656 "movq "#o"+24(%1), "#d" \n\t"\
1657
1658 #define STORE4(o, a, b, c, d)\
1659 "movq "#a", "#o"(%1) \n\t"\
1660 "movq "#b", "#o"+8(%1) \n\t"\
1661 "movq "#c", "#o"+16(%1) \n\t"\
1662 "movq "#d", "#o"+24(%1) \n\t"\
1663
1664 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1665 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1666 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1667 #define HSUM_MMX(a, t, dst)\
1668 "movq "#a", "#t" \n\t"\
1669 "psrlq $32, "#a" \n\t"\
1670 "paddusw "#t", "#a" \n\t"\
1671 "movq "#a", "#t" \n\t"\
1672 "psrlq $16, "#a" \n\t"\
1673 "paddusw "#t", "#a" \n\t"\
1674 "movd "#a", "#dst" \n\t"\
1675
1676 #define HSUM_MMX2(a, t, dst)\
1677 "pshufw $0x0E, "#a", "#t" \n\t"\
1678 "paddusw "#t", "#a" \n\t"\
1679 "pshufw $0x01, "#a", "#t" \n\t"\
1680 "paddusw "#t", "#a" \n\t"\
1681 "movd "#a", "#dst" \n\t"\
1682
1683 #define HSUM_SSE2(a, t, dst)\
1684 "movhlps "#a", "#t" \n\t"\
1685 "paddusw "#t", "#a" \n\t"\
1686 "pshuflw $0x0E, "#a", "#t" \n\t"\
1687 "paddusw "#t", "#a" \n\t"\
1688 "pshuflw $0x01, "#a", "#t" \n\t"\
1689 "paddusw "#t", "#a" \n\t"\
1690 "movd "#a", "#dst" \n\t"\
1691
1692 #define HADAMARD8_DIFF_MMX(cpu) \
1693 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1694 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1695 int sum;\
1696 \
1697 assert(h==8);\
1698 \
1699 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1700 \
1701 asm volatile(\
1702 HADAMARD48\
1703 \
1704 "movq %%mm7, 96(%1) \n\t"\
1705 \
1706 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1707 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1708 \
1709 "movq 96(%1), %%mm7 \n\t"\
1710 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1711 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1712 \
1713 : "=r" (sum)\
1714 : "r"(temp)\
1715 );\
1716 \
1717 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1718 \
1719 asm volatile(\
1720 HADAMARD48\
1721 \
1722 "movq %%mm7, 96(%1) \n\t"\
1723 \
1724 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1725 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1726 \
1727 "movq 96(%1), %%mm7 \n\t"\
1728 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1729 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1730 "movq %%mm6, %%mm7 \n\t"\
1731 "movq %%mm0, %%mm6 \n\t"\
1732 \
1733 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1734 \
1735 HADAMARD48\
1736 "movq %%mm7, 64(%1) \n\t"\
1737 MMABS(%%mm0, %%mm7)\
1738 MMABS(%%mm1, %%mm7)\
1739 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1740 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1741 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1742 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1743 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1744 "movq 64(%1), %%mm2 \n\t"\
1745 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1746 "paddusw %%mm1, %%mm0 \n\t"\
1747 "movq %%mm0, 64(%1) \n\t"\
1748 \
1749 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1750 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1751 \
1752 HADAMARD48\
1753 "movq %%mm7, (%1) \n\t"\
1754 MMABS(%%mm0, %%mm7)\
1755 MMABS(%%mm1, %%mm7)\
1756 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1757 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1758 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1759 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1760 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1761 "movq (%1), %%mm2 \n\t"\
1762 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1763 "paddusw 64(%1), %%mm0 \n\t"\
1764 "paddusw %%mm1, %%mm0 \n\t"\
1765 \
1766 HSUM(%%mm0, %%mm1, %0)\
1767 \
1768 : "=r" (sum)\
1769 : "r"(temp)\
1770 );\
1771 return sum&0xFFFF;\
1772 }\
1773 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1774
1775 #define HADAMARD8_DIFF_SSE2(cpu) \
1776 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1777 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1778 int sum;\
1779 \
1780 assert(h==8);\
1781 \
1782 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1783 \
1784 asm volatile(\
1785 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1786 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1787 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1788 MMABS_SUM_8x8\
1789 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1790 : "=r" (sum)\
1791 : "r"(temp)\
1792 );\
1793 return sum&0xFFFF;\
1794 }\
1795 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1796
1797 #define MMABS(a,z) MMABS_MMX(a,z)
1798 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1799 HADAMARD8_DIFF_MMX(mmx)
1800 #undef MMABS
1801 #undef HSUM
1802
1803 #define MMABS(a,z) MMABS_MMX2(a,z)
1804 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1805 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1806 HADAMARD8_DIFF_MMX(mmx2)
1807 HADAMARD8_DIFF_SSE2(sse2)
1808 #undef MMABS
1809 #undef MMABS_SUM_8x8
1810 #undef HSUM
1811
1812 #ifdef HAVE_SSSE3
1813 #define MMABS(a,z) MMABS_SSSE3(a,z)
1814 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1815 HADAMARD8_DIFF_SSE2(ssse3)
1816 #undef MMABS
1817 #undef MMABS_SUM_8x8
1818 #endif
1819
1820 #define DCT_SAD4(m,mm,o)\
1821 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1822 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1823 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1824 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1825 MMABS_SUM(mm##2, mm##6, mm##0)\
1826 MMABS_SUM(mm##3, mm##7, mm##1)\
1827 MMABS_SUM(mm##4, mm##6, mm##0)\
1828 MMABS_SUM(mm##5, mm##7, mm##1)\
1829
1830 #define DCT_SAD_MMX\
1831 "pxor %%mm0, %%mm0 \n\t"\
1832 "pxor %%mm1, %%mm1 \n\t"\
1833 DCT_SAD4(q, %%mm, 0)\
1834 DCT_SAD4(q, %%mm, 8)\
1835 DCT_SAD4(q, %%mm, 64)\
1836 DCT_SAD4(q, %%mm, 72)\
1837 "paddusw %%mm1, %%mm0 \n\t"\
1838 HSUM(%%mm0, %%mm1, %0)
1839
1840 #define DCT_SAD_SSE2\
1841 "pxor %%xmm0, %%xmm0 \n\t"\
1842 "pxor %%xmm1, %%xmm1 \n\t"\
1843 DCT_SAD4(dqa, %%xmm, 0)\
1844 DCT_SAD4(dqa, %%xmm, 64)\
1845 "paddusw %%xmm1, %%xmm0 \n\t"\
1846 HSUM(%%xmm0, %%xmm1, %0)
1847
1848 #define DCT_SAD_FUNC(cpu) \
1849 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1850 int sum;\
1851 asm volatile(\
1852 DCT_SAD\
1853 :"=r"(sum)\
1854 :"r"(block)\
1855 );\
1856 return sum&0xFFFF;\
1857 }
1858
1859 #define DCT_SAD DCT_SAD_MMX
1860 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1861 #define MMABS(a,z) MMABS_MMX(a,z)
1862 DCT_SAD_FUNC(mmx)
1863 #undef MMABS
1864 #undef HSUM
1865
1866 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1867 #define MMABS(a,z) MMABS_MMX2(a,z)
1868 DCT_SAD_FUNC(mmx2)
1869 #undef HSUM
1870 #undef DCT_SAD
1871
1872 #define DCT_SAD DCT_SAD_SSE2
1873 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1874 DCT_SAD_FUNC(sse2)
1875 #undef MMABS
1876
1877 #ifdef HAVE_SSSE3
1878 #define MMABS(a,z) MMABS_SSSE3(a,z)
1879 DCT_SAD_FUNC(ssse3)
1880 #undef MMABS
1881 #endif
1882 #undef HSUM
1883 #undef DCT_SAD
1884
1885 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1886 int sum;
1887 long i=size;
1888 asm volatile(
1889 "pxor %%mm4, %%mm4 \n"
1890 "1: \n"
1891 "sub $8, %0 \n"
1892 "movq (%2,%0), %%mm2 \n"
1893 "movq (%3,%0,2), %%mm0 \n"
1894 "movq 8(%3,%0,2), %%mm1 \n"
1895 "punpckhbw %%mm2, %%mm3 \n"
1896 "punpcklbw %%mm2, %%mm2 \n"
1897 "psraw $8, %%mm3 \n"
1898 "psraw $8, %%mm2 \n"
1899 "psubw %%mm3, %%mm1 \n"
1900 "psubw %%mm2, %%mm0 \n"
1901 "pmaddwd %%mm1, %%mm1 \n"
1902 "pmaddwd %%mm0, %%mm0 \n"
1903 "paddd %%mm1, %%mm4 \n"
1904 "paddd %%mm0, %%mm4 \n"
1905 "jg 1b \n"
1906 "movq %%mm4, %%mm3 \n"
1907 "psrlq $32, %%mm3 \n"
1908 "paddd %%mm3, %%mm4 \n"
1909 "movd %%mm4, %1 \n"
1910 :"+r"(i), "=r"(sum)
1911 :"r"(pix1), "r"(pix2)
1912 );
1913 return sum;
1914 }
1915
1916 #endif //CONFIG_ENCODERS
1917
1918 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1919 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1920
1921 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1922 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1923 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1924 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1925 "movq "#in7", " #m3 " \n\t" /* d */\
1926 "movq "#in0", %%mm5 \n\t" /* D */\
1927 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1928 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1929 "movq "#in1", %%mm5 \n\t" /* C */\
1930 "movq "#in2", %%mm6 \n\t" /* B */\
1931 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1932 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1933 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1934 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1935 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1936 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1937 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1938 "psraw $5, %%mm5 \n\t"\
1939 "packuswb %%mm5, %%mm5 \n\t"\
1940 OP(%%mm5, out, %%mm7, d)
1941
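/* QPEL_V_LOW emits one output row of the MPEG-4 quarter-pel 8-tap half-sample filter
   (-1, 3, -6, 20, 20, -6, 3, -1)/32: with x1..x4 being the sums of the symmetric tap
   pairs (innermost to outermost),
       out = clip_uint8((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
   where rnd is the rounding constant (smaller in the no_rnd variants). */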
1942 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1943 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1944 uint64_t temp;\
1945 \
1946 asm volatile(\
1947 "pxor %%mm7, %%mm7 \n\t"\
1948 "1: \n\t"\
1949 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1950 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1951 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1952 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1953 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1954 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1955 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1956 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1957 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1958 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1959 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1960 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1961 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1962 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1963 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1964 "paddw %%mm3, %%mm5 \n\t" /* b */\
1965 "paddw %%mm2, %%mm6 \n\t" /* c */\
1966 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1967 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1968 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1969 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1970 "paddw %%mm4, %%mm0 \n\t" /* a */\
1971 "paddw %%mm1, %%mm5 \n\t" /* d */\
1972 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1973 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1974 "paddw %6, %%mm6 \n\t"\
1975 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1976 "psraw $5, %%mm0 \n\t"\
1977 "movq %%mm0, %5 \n\t"\
1978 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1979 \
1980 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1981 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1982 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1983 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1984 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1985 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1986 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1987 "paddw %%mm0, %%mm2 \n\t" /* b */\
1988 "paddw %%mm5, %%mm3 \n\t" /* c */\
1989 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1990 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1991 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1992 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1993 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1994 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1995 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1996 "paddw %%mm2, %%mm1 \n\t" /* a */\
1997 "paddw %%mm6, %%mm4 \n\t" /* d */\
1998 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1999 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
2000 "paddw %6, %%mm1 \n\t"\
2001 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
2002 "psraw $5, %%mm3 \n\t"\
2003 "movq %5, %%mm1 \n\t"\
2004 "packuswb %%mm3, %%mm1 \n\t"\
2005 OP_MMX2(%%mm1, (%1),%%mm4, q)\
2006 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2007 \
2008 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
2009 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2010 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2011 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2012 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2013 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2014 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2015 "paddw %%mm1, %%mm5 \n\t" /* b */\
2016 "paddw %%mm4, %%mm0 \n\t" /* c */\
2017 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2018 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2019 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2020 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2021 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2022 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2023 "paddw %%mm3, %%mm2 \n\t" /* d */\
2024 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2025 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2026 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2027 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2028 "paddw %%mm2, %%mm6 \n\t" /* a */\
2029 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2030 "paddw %6, %%mm0 \n\t"\
2031 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2032 "psraw $5, %%mm0 \n\t"\
2033 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ, mm7=0 */\
2034 \
2035 "paddw %%mm5, %%mm3 \n\t" /* a */\
2036 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2037 "paddw %%mm4, %%mm6 \n\t" /* b */\
2038 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2039 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2040 "paddw %%mm1, %%mm4 \n\t" /* c */\
2041 "paddw %%mm2, %%mm5 \n\t" /* d */\
2042 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2043 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2044 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2045 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2046 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2047 "paddw %6, %%mm4 \n\t"\
2048 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2049 "psraw $5, %%mm4 \n\t"\
2050 "packuswb %%mm4, %%mm0 \n\t"\
2051 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2052 \
2053 "add %3, %0 \n\t"\
2054 "add %4, %1 \n\t"\
2055 "decl %2 \n\t"\
2056 " jnz 1b \n\t"\
2057 : "+a"(src), "+c"(dst), "+m"(h)\
2058 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2059 : "memory"\
2060 );\
2061 }\
2062 \
2063 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2064 int i;\
2065 int16_t temp[16];\
2066 /* quick HACK, XXX FIXME MUST be optimized */\
2067 for(i=0; i<h; i++)\
2068 {\
2069 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2070 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2071 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2072 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2073 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2074 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2075 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2076 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2077 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2078 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2079 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2080 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2081 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2082 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2083 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2084 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2085 asm volatile(\
2086 "movq (%0), %%mm0 \n\t"\
2087 "movq 8(%0), %%mm1 \n\t"\
2088 "paddw %2, %%mm0 \n\t"\
2089 "paddw %2, %%mm1 \n\t"\
2090 "psraw $5, %%mm0 \n\t"\
2091 "psraw $5, %%mm1 \n\t"\
2092 "packuswb %%mm1, %%mm0 \n\t"\
2093 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2094 "movq 16(%0), %%mm0 \n\t"\
2095 "movq 24(%0), %%mm1 \n\t"\
2096 "paddw %2, %%mm0 \n\t"\
2097 "paddw %2, %%mm1 \n\t"\
2098 "psraw $5, %%mm0 \n\t"\
2099 "psraw $5, %%mm1 \n\t"\
2100 "packuswb %%mm1, %%mm0 \n\t"\
2101 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2102 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2103 : "memory"\
2104 );\
2105 dst+=dstStride;\
2106 src+=srcStride;\
2107 }\
2108 }\
2109 \
2110 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2111 uint64_t temp;\
2112 \
2113 asm volatile(\
2114 "pxor %%mm7, %%mm7 \n\t"\
2115 "1: \n\t"\
2116 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2117 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2118 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2119 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2120 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2121 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2122 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2123 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2124 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2125 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2126 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2127 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2128 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2129 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2130 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2131 "paddw %%mm3, %%mm5 \n\t" /* b */\
2132 "paddw %%mm2, %%mm6 \n\t" /* c */\
2133 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2134 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2135 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2136 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2137 "paddw %%mm4, %%mm0 \n\t" /* a */\
2138 "paddw %%mm1, %%mm5 \n\t" /* d */\
2139 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2140 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2141 "paddw %6, %%mm6 \n\t"\
2142 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2143 "psraw $5, %%mm0 \n\t"\
2144 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2145 \
2146 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2147 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2148 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2149 "paddw %%mm5, %%mm1 \n\t" /* a */\
2150 "paddw %%mm6, %%mm2 \n\t" /* b */\
2151 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2152 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2153 "paddw %%mm6, %%mm3 \n\t" /* c */\
2154 "paddw %%mm5, %%mm4 \n\t" /* d */\
2155 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2156 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2157 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2158 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2159 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2160 "paddw %6, %%mm1 \n\t"\
2161 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2162 "psraw $5, %%mm3 \n\t"\
2163 "packuswb %%mm3, %%mm0 \n\t"\
2164 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2165 \
2166 "add %3, %0 \n\t"\
2167 "add %4, %1 \n\t"\
2168 "decl %2 \n\t"\
2169 " jnz 1b \n\t"\
2170 : "+a"(src), "+c"(dst), "+m"(h)\
2171 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2172 : "memory"\
2173 );\
2174 }\
2175 \
2176 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2177 int i;\
2178 int16_t temp[8];\
2179 /* quick HACK, XXX FIXME MUST be optimized */\
2180 for(i=0; i<h; i++)\
2181 {\
2182 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2183 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2184 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2185 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2186 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2187 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2188 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2189 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2190 asm volatile(\
2191 "movq (%0), %%mm0 \n\t"\
2192 "movq 8(%0), %%mm1 \n\t"\
2193 "paddw %2, %%mm0 \n\t"\
2194 "paddw %2, %%mm1 \n\t"\
2195 "psraw $5, %%mm0 \n\t"\
2196 "psraw $5, %%mm1 \n\t"\
2197 "packuswb %%mm1, %%mm0 \n\t"\
2198 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2199 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2200 :"memory"\
2201 );\
2202 dst+=dstStride;\
2203 src+=srcStride;\
2204 }\
2205 }
2206
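/* QPEL_OP builds the vertical lowpass primitives plus the full set of
   OPNAME ## qpel{8,16}_mcXY functions.  The two-digit XY suffix is the
   quarter-pel position; each function combines the h/v lowpass filters and
   pixels*_l2 averaging needed for that position. */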
2207 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2208 \
2209 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2210 uint64_t temp[17*4];\
2211 uint64_t *temp_ptr= temp;\
2212 int count= 17;\
2213 \
2214 /*FIXME unroll */\
2215 asm volatile(\
2216 "pxor %%mm7, %%mm7 \n\t"\
2217 "1: \n\t"\
2218 "movq (%0), %%mm0 \n\t"\
2219 "movq (%0), %%mm1 \n\t"\
2220 "movq 8(%0), %%mm2 \n\t"\
2221 "movq 8(%0), %%mm3 \n\t"\
2222 "punpcklbw %%mm7, %%mm0 \n\t"\
2223 "punpckhbw %%mm7, %%mm1 \n\t"\
2224 "punpcklbw %%mm7, %%mm2 \n\t"\
2225 "punpckhbw %%mm7, %%mm3 \n\t"\
2226 "movq %%mm0, (%1) \n\t"\
2227 "movq %%mm1, 17*8(%1) \n\t"\
2228 "movq %%mm2, 2*17*8(%1) \n\t"\
2229 "movq %%mm3, 3*17*8(%1) \n\t"\
2230 "add $8, %1 \n\t"\
2231 "add %3, %0 \n\t"\
2232 "decl %2 \n\t"\
2233 " jnz 1b \n\t"\
2234 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2235 : "r" ((long)srcStride)\
2236 : "memory"\
2237 );\
2238 \
2239 temp_ptr= temp;\
2240 count=4;\
2241 \
2242 /*FIXME reorder for speed */\
2243 asm volatile(\
2244 /*"pxor %%mm7, %%mm7 \n\t"*/\
2245 "1: \n\t"\
2246 "movq (%0), %%mm0 \n\t"\
2247 "movq 8(%0), %%mm1 \n\t"\
2248 "movq 16(%0), %%mm2 \n\t"\
2249 "movq 24(%0), %%mm3 \n\t"\
2250 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2251 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2252 "add %4, %1 \n\t"\
2253 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2254 \
2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2256 "add %4, %1 \n\t"\
2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2259 "add %4, %1 \n\t"\
2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2262 "add %4, %1 \n\t"\
2263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2264 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2265 "add %4, %1 \n\t"\
2266 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2267 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2268 "add %4, %1 \n\t"\
2269 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2270 \
2271 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2272 "add %4, %1 \n\t" \
2273 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2274 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2275 \
2276 "add $136, %0 \n\t"\
2277 "add %6, %1 \n\t"\
2278 "decl %2 \n\t"\
2279 " jnz 1b \n\t"\
2280 \
2281 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2282 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2283 :"memory"\
2284 );\
2285 }\
2286 \
2287 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2288 uint64_t temp[9*2];\
2289 uint64_t *temp_ptr= temp;\
2290 int count= 9;\
2291 \
2292 /*FIXME unroll */\
2293 asm volatile(\
2294 "pxor %%mm7, %%mm7 \n\t"\
2295 "1: \n\t"\
2296 "movq (%0), %%mm0 \n\t"\
2297 "movq (%0), %%mm1 \n\t"\
2298 "punpcklbw %%mm7, %%mm0 \n\t"\
2299 "punpckhbw %%mm7, %%mm1 \n\t"\
2300 "movq %%mm0, (%1) \n\t"\
2301 "movq %%mm1, 9*8(%1) \n\t"\
2302 "add $8, %1 \n\t"\
2303 "add %3, %0 \n\t"\
2304 "decl %2 \n\t"\
2305 " jnz 1b \n\t"\
2306 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2307 : "r" ((long)srcStride)\
2308 : "memory"\
2309 );\
2310 \
2311 temp_ptr= temp;\
2312 count=2;\
2313 \
2314 /*FIXME reorder for speed */\
2315 asm volatile(\
2316 /*"pxor %%mm7, %%mm7 \n\t"*/\
2317 "1: \n\t"\
2318 "movq (%0), %%mm0 \n\t"\
2319 "movq 8(%0), %%mm1 \n\t"\
2320 "movq 16(%0), %%mm2 \n\t"\
2321 "movq 24(%0), %%mm3 \n\t"\
2322 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2323 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2324 "add %4, %1 \n\t"\
2325 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2326 \
2327 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2328 "add %4, %1 \n\t"\
2329 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2330 \
2331 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2332 "add %4, %1 \n\t"\
2333 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2334 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2335 \
2336 "add $72, %0 \n\t"\
2337 "add %6, %1 \n\t"\
2338 "decl %2 \n\t"\
2339 " jnz 1b \n\t"\
2340 \
2341 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2342 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2343 : "memory"\
2344 );\
2345 }\
2346 \
2347 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2348 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2349 }\
2350 \
2351 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2352 uint64_t temp[8];\
2353 uint8_t * const half= (uint8_t*)temp;\
2354 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2355 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2356 }\
2357 \
2358 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2359 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2360 }\
2361 \
2362 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2363 uint64_t temp[8];\
2364 uint8_t * const half= (uint8_t*)temp;\
2365 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2366 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2367 }\
2368 \
2369 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2370 uint64_t temp[8];\
2371 uint8_t * const half= (uint8_t*)temp;\
2372 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2373 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2374 }\
2375 \
2376 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2377 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2378 }\
2379 \
2380 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2381 uint64_t temp[8];\
2382 uint8_t * const half= (uint8_t*)temp;\
2383 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2384 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2385 }\
2386 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2387 uint64_t half[8 + 9];\
2388 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2389 uint8_t * const halfHV= ((uint8_t*)half);\
2390 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2391 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2392 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2393 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2394 }\
2395 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2396 uint64_t half[8 + 9];\
2397 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2398 uint8_t * const halfHV= ((uint8_t*)half);\
2399 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2400 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2401 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2402 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2403 }\
2404 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2405 uint64_t half[8 + 9];\
2406 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2407 uint8_t * const halfHV= ((uint8_t*)half);\
2408 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2409 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2410 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2411 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2412 }\
2413 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2414 uint64_t half[8 + 9];\
2415 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2416 uint8_t * const halfHV= ((uint8_t*)half);\
2417 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2418 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2419 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2420 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2421 }\
2422 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2423 uint64_t half[8 + 9];\
2424 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2425 uint8_t * const halfHV= ((uint8_t*)half);\
2426 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2427 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2428 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2429 }\
2430 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2431 uint64_t half[8 + 9];\
2432 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2433 uint8_t * const halfHV= ((uint8_t*)half);\
2434 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2435 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2436 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2437 }\
2438 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2439 uint64_t half[8 + 9];\
2440 uint8_t * const halfH= ((uint8_t*)half);\
2441 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2442 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2443 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2444 }\
2445 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2446 uint64_t half[8 + 9];\
2447 uint8_t * const halfH= ((uint8_t*)half);\
2448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2449 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2451 }\
2452 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2453 uint64_t half[9];\
2454 uint8_t * const halfH= ((uint8_t*)half);\
2455 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2456 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2457 }\
2458 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2459 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2460 }\
2461 \
2462 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2463 uint64_t temp[32];\
2464 uint8_t * const half= (uint8_t*)temp;\
2465 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2466 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2467 }\
2468 \
2469 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2470 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2471 }\
2472 \
2473 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2474 uint64_t temp[32];\
2475 uint8_t * const half= (uint8_t*)temp;\
2476 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2477 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2478 }\
2479 \
2480 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2481 uint64_t temp[32];\
2482 uint8_t * const half= (uint8_t*)temp;\
2483 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2484 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2485 }\
2486 \
2487 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2488 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2489 }\
2490 \
2491 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2492 uint64_t temp[32];\
2493 uint8_t * const half= (uint8_t*)temp;\
2494 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2495 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2496 }\
2497 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2498 uint64_t half[16*2 + 17*2];\
2499 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2500 uint8_t * const halfHV= ((uint8_t*)half);\
2501 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2502 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2503 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2504 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2505 }\
2506 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2507 uint64_t half[16*2 + 17*2];\
2508 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2509 uint8_t * const halfHV= ((uint8_t*)half);\
2510 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2511 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2512 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2513 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2514 }\
2515 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2516 uint64_t half[16*2 + 17*2];\
2517 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2518 uint8_t * const halfHV= ((uint8_t*)half);\
2519 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2520 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2521 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2522 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2523 }\
2524 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2525 uint64_t half[16*2 + 17*2];\
2526 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2527 uint8_t * const halfHV= ((uint8_t*)half);\
2528 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2529 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2530 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2531 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2532 }\
2533 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2534 uint64_t half[16*2 + 17*2];\
2535 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2536 uint8_t * const halfHV= ((uint8_t*)half);\
2537 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2538 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2539 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2540 }\
2541 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2542 uint64_t half[16*2 + 17*2];\
2543 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2544 uint8_t * const halfHV= ((uint8_t*)half);\
2545 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2546 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2547 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2548 }\
2549 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2550 uint64_t half[17*2];\
2551 uint8_t * const halfH= ((uint8_t*)half);\
2552 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2553 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2554 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2555 }\
2556 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2557 uint64_t half[17*2];\
2558 uint8_t * const halfH= ((uint8_t*)half);\
2559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2560 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2562 }\
2563 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2564 uint64_t half[17*2];\
2565 uint8_t * const halfH= ((uint8_t*)half);\
2566 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2567 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2568 }
2569
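/* store operators plugged into the macros above: PUT_OP writes the result
   directly, the AVG variants average it with the existing destination using
   pavgusb (3DNow!) or pavgb (MMX2). */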
2570 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
2571 #define AVG_3DNOW_OP(a,b,temp, size) \
2572 "mov" #size " " #b ", " #temp " \n\t"\
2573 "pavgusb " #temp ", " #a " \n\t"\
2574 "mov" #size " " #a ", " #b " \n\t"
2575 #define AVG_MMX2_OP(a,b,temp, size) \
2576 "mov" #size " " #b ", " #temp " \n\t"\
2577 "pavgb " #temp ", " #a " \n\t"\
2578 "mov" #size " " #a ", " #b " \n\t"
2579
2580 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2581 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2582 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2583 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2584 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2585 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2586 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2587 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2588 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2589
2590 /***********************************/
2591 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
2592
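/* The 2-tap "qpel" functions below approximate each quarter-pel position
   cheaply: full- and half-pel positions reuse the plain pixels/hpel copies
   (QPEL_2TAP_XY), while the remaining positions go through the *_l3 helpers,
   which blend shifted copies of the source selected by S0/S1/S2. */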
2593 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2594 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2595 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2596 }
2597 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2598 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2599 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2600 }
2601
2602 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2603 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2604 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2605 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2606 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2607 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2608 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2609 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2610 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2611 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2612 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2613 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2614 }\
2615 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2616 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2617 }\
2618 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2619 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2620 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2621 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2622 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2623 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2624 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2625 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2626
2627 QPEL_2TAP(put_, 16, mmx2)
2628 QPEL_2TAP(avg_, 16, mmx2)
2629 QPEL_2TAP(put_, 8, mmx2)
2630 QPEL_2TAP(avg_, 8, mmx2)
2631 QPEL_2TAP(put_, 16, 3dnow)
2632 QPEL_2TAP(avg_, 16, 3dnow)
2633 QPEL_2TAP(put_, 8, 3dnow)
2634 QPEL_2TAP(avg_, 8, 3dnow)
2635
2636
2637 #if 0
2638 static void just_return() { return; }
2639 #endif
2640
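/* helper to install the put / put_no_rnd / avg variants of one qpel function
   into the DSPContext at once. */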
2641 #define SET_QPEL_FUNC(postfix1, postfix2) \
2642 c->put_ ## postfix1 = put_ ## postfix2;\
2643 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2644 c->avg_ ## postfix1 = avg_ ## postfix2;
2645
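/* MMX global motion compensation for an 8-pixel-wide block of height h:
   walks the block with the fixed-point affine parameters (ox,oy,dxx,dxy,
   dyx,dyy) and does a bilinear interpolation per pixel.  Blocks whose
   fullpel offset is not constant, or whose sub-pel motion needs more than
   16 bits, fall back to the C implementation ff_gmc_c(). */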
2646 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2647 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2648 const int w = 8;
2649 const int ix = ox>>(16+shift);
2650 const int iy = oy>>(16+shift);
2651 const int oxs = ox>>4;
2652 const int oys = oy>>4;
2653 const int dxxs = dxx>>4;
2654 const int dxys = dxy>>4;
2655 const int dyxs = dyx>>4;
2656 const int dyys = dyy>>4;
2657 const uint16_t r4[4] = {r,r,r,r};
2658 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2659 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2660 const uint64_t shift2 = 2*shift;
2661 uint8_t edge_buf[(h+1)*stride];
2662 int x, y;
2663
2664 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2665 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2666 const int dxh = dxy*(h-1);
2667 const int dyw = dyx*(w-1);
2668 if( // non-constant fullpel offset (3% of blocks)
2669 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2670 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2671 // uses more than 16 bits of subpel mv (only at huge resolution)
2672 || (dxx|dxy|dyx|dyy)&15 )
2673 {
2674 //FIXME could still use mmx for some of the rows
2675 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2676 return;
2677 }
2678
2679 src += ix + iy*stride;
2680 if( (unsigned)ix >= width-w ||
2681 (unsigned)iy >= height-h )
2682 {
2683 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2684 src = edge_buf;
2685 }
2686
2687 asm volatile(
2688 "movd %0, %%mm6 \n\t"
2689 "pxor %%mm7, %%mm7 \n\t"
2690 "punpcklwd %%mm6, %%mm6 \n\t"
2691 "punpcklwd %%mm6, %%mm6 \n\t"
2692 :: "r"(1<<shift)
2693 );
2694
2695 for(x=0; x<w; x+=4){
2696 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2697 oxs - dxys + dxxs*(x+1),
2698 oxs - dxys + dxxs*(x+2),
2699 oxs - dxys + dxxs*(x+3) };
2700 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2701 oys - dyys + dyxs*(x+1),
2702 oys - dyys + dyxs*(x+2),
2703 oys - dyys + dyxs*(x+3) };
2704
2705 for(y=0; y<h; y++){
2706 asm volatile(
2707 "movq %0, %%mm4 \n\t"
2708 "movq %1, %%mm5 \n\t"
2709 "paddw %2, %%mm4 \n\t"
2710 "paddw %3, %%mm5 \n\t"
2711 "movq %%mm4, %0 \n\t"
2712 "movq %%mm5, %1 \n\t"
2713 "psrlw $12, %%mm4 \n\t"
2714 "psrlw $12, %%mm5 \n\t"
2715 : "+m"(*dx4), "+m"(*dy4)
2716 : "m"(*dxy4), "m"(*dyy4)
2717 );
2718
2719 asm volatile(
2720 "movq %%mm6, %%mm2 \n\t"
2721 "movq %%mm6, %%mm1 \n\t"
2722 "psubw %%mm4, %%mm2 \n\t"
2723 "psubw %%mm5, %%mm1 \n\t"
2724 "movq %%mm2, %%mm0 \n\t"
2725 "movq %%mm4, %%mm3 \n\t"
2726 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2727 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2728 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2729 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2730
2731 "movd %4, %%mm5 \n\t"
2732 "movd %3, %%mm4 \n\t"
2733 "punpcklbw %%mm7, %%mm5 \n\t"
2734 "punpcklbw %%mm7, %%mm4 \n\t"
2735 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2736 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2737
2738 "movd %2, %%mm5 \n\t"
2739 "movd %1, %%mm4 \n\t"
2740 "punpcklbw %%mm7, %%mm5 \n\t"
2741 "punpcklbw %%mm7, %%mm4 \n\t"
2742 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2743 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2744 "paddw %5, %%mm1 \n\t"
2745 "paddw %%mm3, %%mm2 \n\t"
2746 "paddw %%mm1, %%mm0 \n\t"
2747 "paddw %%mm2, %%mm0 \n\t"
2748
2749 "psrlw %6, %%mm0 \n\t"
2750 "packuswb %%mm0, %%mm0 \n\t"
2751 "movd %%mm0, %0 \n\t"
2752
2753 : "=m"(dst[x+y*stride])
2754 : "m"(src[0]), "m"(src[1]),
2755 "m"(src[stride]), "m"(src[stride+1]),
2756 "m"(*r4), "m"(shift2)
2757 );
2758 src += stride;
2759 }
2760 src += 4-h*stride;
2761 }
2762 }
2763
2764 #ifdef CONFIG_ENCODERS
2765
2766 #define PHADDD(a, t)\
2767 "movq "#a", "#t" \n\t"\
2768 "psrlq $32, "#a" \n\t"\
2769 "paddd "#t", "#a" \n\t"
2770 /*
2771 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2772 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2773 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
2774 */
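/* PMULHRW(x,y,s,o) scales two registers of 16-bit values by s.  The MMX
   version emulates rounding with pmulhw + add + shift, while the 3DNow! and
   SSSE3 versions use pmulhrw/pmulhrsw directly; the resulting difference in
   the effective right shift is presumably what SCALE_OFFSET (1 / 0 / -1
   below) compensates for inside dsputil_mmx_qns.h. */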
2775 #define PMULHRW(x, y, s, o)\
2776 "pmulhw " #s ", "#x " \n\t"\
2777 "pmulhw " #s ", "#y " \n\t"\
2778 "paddw " #o ", "#x " \n\t"\
2779 "paddw " #o ", "#y " \n\t"\
2780 "psraw $1, "#x " \n\t"\
2781 "psraw $1, "#y " \n\t"
2782 #define DEF(x) x ## _mmx
2783 #define SET_RND MOVQ_WONE
2784 #define SCALE_OFFSET 1
2785
2786 #include "dsputil_mmx_qns.h"
2787
2788 #undef DEF
2789 #undef SET_RND
2790 #undef SCALE_OFFSET
2791 #undef PMULHRW
2792
2793 #define DEF(x) x ## _3dnow
2794 #define SET_RND(x)
2795 #define SCALE_OFFSET 0
2796 #define PMULHRW(x, y, s, o)\
2797 "pmulhrw " #s ", "#x " \n\t"\
2798 "pmulhrw " #s ", "#y " \n\t"
2799
2800 #include "dsputil_mmx_qns.h"
2801
2802 #undef DEF
2803 #undef SET_RND
2804 #undef SCALE_OFFSET
2805 #undef PMULHRW
2806
2807 #ifdef HAVE_SSSE3
2808 #undef PHADDD
2809 #define DEF(x) x ## _ssse3
2810 #define SET_RND(x)
2811 #define SCALE_OFFSET -1
2812 #define PHADDD(a, t)\
2813 "pshufw $0x0E, "#a", "#t" \n\t"\
2814 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2815 #define PMULHRW(x, y, s, o)\
2816 "pmulhrsw " #s ", "#x " \n\t"\
2817 "pmulhrsw " #s ", "#y " \n\t"
2818
2819 #include "dsputil_mmx_qns.h"
2820
2821 #undef DEF
2822 #undef SET_RND
2823 #undef SCALE_OFFSET
2824 #undef PMULHRW
2825 #undef PHADDD
2826 #endif //HAVE_SSSE3
2827
2828 #endif /* CONFIG_ENCODERS */
2829
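/* PREFETCH(name, op) generates a helper that issues one software prefetch
   (prefetcht0 for MMX2, 3DNow! prefetch otherwise) per row of an h-row,
   stride-spaced block. */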
2830 #define PREFETCH(name, op) \
2831 static void name(void *mem, int stride, int h){\
2832 const uint8_t *p= mem;\
2833 do{\
2834 asm volatile(#op" %0" :: "m"(*p));\
2835 p+= stride;\
2836 }while(--h);\
2837 }
2838 PREFETCH(prefetch_mmx2, prefetcht0)
2839 PREFETCH(prefetch_3dnow, prefetch)
2840 #undef PREFETCH
2841
2842 #include "h264dsp_mmx.c"
2843
2844 /* CAVS specific */
2845 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2846
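/* mc00 is the fullpel position, so these wrappers reduce to plain copies or
   averages of the block. */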
2847 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2848 put_pixels8_mmx(dst, src, stride, 8);
2849 }
2850 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2851 avg_pixels8_mmx(dst, src, stride, 8);
2852 }
2853 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2854 put_pixels16_mmx(dst, src, stride, 16);
2855 }
2856 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2857 avg_pixels16_mmx(dst, src, stride, 16);
2858 }
2859
2860 /* FLAC specific */
2861 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
2862 double *autoc);
2863
2864 /* VC1 specific */
2865 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
2866
2867 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2868 put_pixels8_mmx(dst, src, stride, 8);
2869 }
2870
2871 /* external functions, from idct_mmx.c */
2872 void ff_mmx_idct(DCTELEM *block);
2873 void ff_mmxext_idct(DCTELEM *block);
2874
2875 /* XXX: these functions should be removed ASAP once all IDCTs are
2876 converted */
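/* thin wrappers pairing each external IDCT with the clamped put/add pixel
   routines, so they can be plugged into DSPContext.idct_put / idct_add. */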
2877 #ifdef CONFIG_GPL
2878 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2879 {
2880 ff_mmx_idct (block);
2881 put_pixels_clamped_mmx(block, dest, line_size);
2882 }
2883 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2884 {
2885 ff_mmx_idct (block);
2886 add_pixels_clamped_mmx(block, dest, line_size);
2887 }
2888 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2889 {
2890 ff_mmxext_idct (block);
2891 put_pixels_clamped_mmx(block, dest, line_size);
2892 }
2893 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2894 {
2895 ff_mmxext_idct (block);
2896 add_pixels_clamped_mmx(block, dest, line_size);
2897 }
2898 #endif
2899 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2900 {
2901 ff_idct_xvid_mmx (block);
2902 put_pixels_clamped_mmx(block, dest, line_size);
2903 }
2904 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2905 {
2906 ff_idct_xvid_mmx (block);
2907 add_pixels_clamped_mmx(block, dest, line_size);
2908 }
2909 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2910 {
2911 ff_idct_xvid_mmx2 (block);
2912 put_pixels_clamped_mmx(block, dest, line_size);
2913 }
2914 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2915 {
2916 ff_idct_xvid_mmx2 (block);
2917 add_pixels_clamped_mmx(block, dest, line_size);
2918 }
2919
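/* branchless 3DNow! version of Vorbis inverse channel coupling: rebuilds the
   two residue vectors from (magnitude, angle) pairs, using pfcmpge sign masks
   instead of the per-sample branches of the C reference. */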
2920 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2921 {
2922 int i;
2923 asm volatile("pxor %%mm7, %%mm7":);
2924 for(i=0; i<blocksize; i+=2) {
2925 asm volatile(
2926 "movq %0, %%mm0 \n\t"
2927 "movq %1, %%mm1 \n\t"
2928 "movq %%mm0, %%mm2 \n\t"
2929 "movq %%mm1, %%mm3 \n\t"
2930 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2931 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2932 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2933 "pxor %%mm2, %%mm1 \n\t"
2934 "movq %%mm3, %%mm4 \n\t"
2935 "pand %%mm1, %%mm3 \n\t"
2936 "pandn %%mm1, %%mm4 \n\t"
2937 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2938 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2939 "movq %%mm3, %1 \n\t"
2940 "movq %%mm0, %0 \n\t"
2941 :"+m"(mag[i]), "+m"(ang[i])
2942 ::"memory"
2943 );
2944 }
2945 asm volatile("femms");
2946 }
2947 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2948 {
2949 int i;
2950
2951 asm volatile(
2952 "movaps %0, %%xmm5 \n\t"
2953 ::"m"(ff_pdw_80000000[0])
2954 );
2955 for(i=0; i<blocksize; i+=4) {
2956 asm volatile(
2957 "movaps %0, %%xmm0 \n\t"
2958 "movaps %1, %%xmm1 \n\t"
2959 "xorps %%xmm2, %%xmm2 \n\t"
2960 "xorps %%xmm3, %%xmm3 \n\t"
2961 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0