1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25 #include "../dsputil.h"
26 #include "../simple_idct.h"
27 #include "../mpegvideo.h"
28 #include "x86_cpu.h"
29 #include "mmx.h"
30
31 //#undef NDEBUG
32 //#include <assert.h>
33
34 extern void ff_idct_xvid_mmx(short *block);
35 extern void ff_idct_xvid_mmx2(short *block);
36
37 int mm_flags; /* multimedia extension flags */
38
39 /* pixel operations */
40 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
41 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
42 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
43
44 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
45 {0x8000000080000000ULL, 0x8000000080000000ULL};
46
47 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
48 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
49 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
50 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
51 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
52 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
53 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
54 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
55 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
56
57 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
58 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
59 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
60 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
61 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
62 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
63 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
64
65 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
66 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
67
68 #define MOVQ_WONE(regd) \
69 __asm __volatile ( \
70 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
71 "psrlw $15, %%" #regd ::)
72
73 #define MOVQ_BFE(regd) \
74 __asm __volatile ( \
75 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
76 "paddb %%" #regd ", %%" #regd " \n\t" ::)
77
78 #ifndef PIC
79 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
80 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
81 #else
82 // for shared libraries it is better to access constants this way
83 // pcmpeqd -> -1
84 #define MOVQ_BONE(regd) \
85 __asm __volatile ( \
86 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
87 "psrlw $15, %%" #regd " \n\t" \
88 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
89
90 #define MOVQ_WTWO(regd) \
91 __asm __volatile ( \
92 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
93 "psrlw $15, %%" #regd " \n\t" \
94 "psllw $1, %%" #regd " \n\t"::)
95
96 #endif
97
98 // using regr as temporary and for the output result
99 // first argument is unmodified and second is trashed
100 // regfe is supposed to contain 0xfefefefefefefefe
101 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
102 "movq " #rega ", " #regr " \n\t"\
103 "pand " #regb ", " #regr " \n\t"\
104 "pxor " #rega ", " #regb " \n\t"\
105 "pand " #regfe "," #regb " \n\t"\
106 "psrlq $1, " #regb " \n\t"\
107 "paddb " #regb ", " #regr " \n\t"
108
109 #define PAVGB_MMX(rega, regb, regr, regfe) \
110 "movq " #rega ", " #regr " \n\t"\
111 "por " #regb ", " #regr " \n\t"\
112 "pxor " #rega ", " #regb " \n\t"\
113 "pand " #regfe "," #regb " \n\t"\
114 "psrlq $1, " #regb " \n\t"\
115 "psubb " #regb ", " #regr " \n\t"
116
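/* Plain-C reference sketch of the branch-free byte averaging used by
 * PAVGB_MMX / PAVGB_MMX_NO_RND above (kept under #if 0, reference only;
 * helper names are illustrative). It relies on
 *   a + b == (a ^ b) + 2*(a & b) == 2*(a | b) - (a ^ b),
 * so the average never needs a 9-bit intermediate. The 0xFE mask (regfe)
 * stops bits from leaking between bytes when the per-byte ">> 1" is emulated
 * with a 64-bit psrlq. */
#if 0
static inline int avg2_rnd_ref   (int a, int b){ return (a | b) - (((a ^ b) & 0xFE) >> 1); } /* == (a+b+1)>>1 */
static inline int avg2_no_rnd_ref(int a, int b){ return (a & b) + (((a ^ b) & 0xFE) >> 1); } /* == (a+b)  >>1 */
#endif
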
117 // mm6 is supposed to contain 0xfefefefefefefefe
118 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
119 "movq " #rega ", " #regr " \n\t"\
120 "movq " #regc ", " #regp " \n\t"\
121 "pand " #regb ", " #regr " \n\t"\
122 "pand " #regd ", " #regp " \n\t"\
123 "pxor " #rega ", " #regb " \n\t"\
124 "pxor " #regc ", " #regd " \n\t"\
125 "pand %%mm6, " #regb " \n\t"\
126 "pand %%mm6, " #regd " \n\t"\
127 "psrlq $1, " #regb " \n\t"\
128 "psrlq $1, " #regd " \n\t"\
129 "paddb " #regb ", " #regr " \n\t"\
130 "paddb " #regd ", " #regp " \n\t"
131
132 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
133 "movq " #rega ", " #regr " \n\t"\
134 "movq " #regc ", " #regp " \n\t"\
135 "por " #regb ", " #regr " \n\t"\
136 "por " #regd ", " #regp " \n\t"\
137 "pxor " #rega ", " #regb " \n\t"\
138 "pxor " #regc ", " #regd " \n\t"\
139 "pand %%mm6, " #regb " \n\t"\
140 "pand %%mm6, " #regd " \n\t"\
141 "psrlq $1, " #regd " \n\t"\
142 "psrlq $1, " #regb " \n\t"\
143 "psubb " #regb ", " #regr " \n\t"\
144 "psubb " #regd ", " #regp " \n\t"
145
146 /***********************************/
147 /* MMX no rounding */
148 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
149 #define SET_RND MOVQ_WONE
150 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
151 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
152
153 #include "dsputil_mmx_rnd.h"
154
155 #undef DEF
156 #undef SET_RND
157 #undef PAVGBP
158 #undef PAVGB
159 /***********************************/
160 /* MMX rounding */
161
162 #define DEF(x, y) x ## _ ## y ##_mmx
163 #define SET_RND MOVQ_WTWO
164 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
165 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
166
167 #include "dsputil_mmx_rnd.h"
168
169 #undef DEF
170 #undef SET_RND
171 #undef PAVGBP
172 #undef PAVGB
173
174 /***********************************/
175 /* 3Dnow specific */
176
177 #define DEF(x) x ## _3dnow
178 /* for Athlons, PAVGUSB is preferred */
179 #define PAVGB "pavgusb"
180
181 #include "dsputil_mmx_avg.h"
182
183 #undef DEF
184 #undef PAVGB
185
186 /***********************************/
187 /* MMX2 specific */
188
189 #define DEF(x) x ## _mmx2
190
191 /* Introduced only in the MMX2 instruction set */
192 #define PAVGB "pavgb"
193
194 #include "dsputil_mmx_avg.h"
195
196 #undef DEF
197 #undef PAVGB
198
199 #define SBUTTERFLY(a,b,t,n,m)\
200 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
201 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
202 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
203
204 #define TRANSPOSE4(a,b,c,d,t)\
205 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
206 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
207 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
208 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
209
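/* What SBUTTERFLY/TRANSPOSE4 build out of interleave steps, as plain C
 * (reference sketch only): a 4x4 transpose of 16-bit words. Per the register
 * comments above, the four transposed rows end up in a, d, t and c. */
#if 0
static void transpose4x4_w_ref(int16_t dst[4][4], const int16_t src[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            dst[j][i] = src[i][j];
}
#endif
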
210 /***********************************/
211 /* standard MMX */
212
213 #ifdef CONFIG_ENCODERS
214 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
215 {
216 asm volatile(
217 "mov $-128, %%"REG_a" \n\t"
218 "pxor %%mm7, %%mm7 \n\t"
219 ASMALIGN(4)
220 "1: \n\t"
221 "movq (%0), %%mm0 \n\t"
222 "movq (%0, %2), %%mm2 \n\t"
223 "movq %%mm0, %%mm1 \n\t"
224 "movq %%mm2, %%mm3 \n\t"
225 "punpcklbw %%mm7, %%mm0 \n\t"
226 "punpckhbw %%mm7, %%mm1 \n\t"
227 "punpcklbw %%mm7, %%mm2 \n\t"
228 "punpckhbw %%mm7, %%mm3 \n\t"
229 "movq %%mm0, (%1, %%"REG_a") \n\t"
230 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
231 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
232 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
233 "add %3, %0 \n\t"
234 "add $32, %%"REG_a" \n\t"
235 "js 1b \n\t"
236 : "+r" (pixels)
237 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
238 : "%"REG_a
239 );
240 }
241
242 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
243 {
244 asm volatile(
245 "pxor %%mm7, %%mm7 \n\t"
246 "mov $-128, %%"REG_a" \n\t"
247 ASMALIGN(4)
248 "1: \n\t"
249 "movq (%0), %%mm0 \n\t"
250 "movq (%1), %%mm2 \n\t"
251 "movq %%mm0, %%mm1 \n\t"
252 "movq %%mm2, %%mm3 \n\t"
253 "punpcklbw %%mm7, %%mm0 \n\t"
254 "punpckhbw %%mm7, %%mm1 \n\t"
255 "punpcklbw %%mm7, %%mm2 \n\t"
256 "punpckhbw %%mm7, %%mm3 \n\t"
257 "psubw %%mm2, %%mm0 \n\t"
258 "psubw %%mm3, %%mm1 \n\t"
259 "movq %%mm0, (%2, %%"REG_a") \n\t"
260 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
261 "add %3, %0 \n\t"
262 "add %3, %1 \n\t"
263 "add $16, %%"REG_a" \n\t"
264 "jnz 1b \n\t"
265 : "+r" (s1), "+r" (s2)
266 : "r" (block+64), "r" ((long)stride)
267 : "%"REG_a
268 );
269 }
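
/* Scalar equivalents of the two helpers above (reference only, names are
 * illustrative): get_pixels widens an 8x8 block of 8-bit pixels to 16-bit
 * DCTELEMs, diff_pixels stores the 16-bit difference of two such blocks. */
#if 0
static void get_pixels_ref(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            block[i*8 + j] = pixels[i*line_size + j];
}

static void diff_pixels_ref(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
}
#endif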
270 #endif //CONFIG_ENCODERS
271
272 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
273 {
274 const DCTELEM *p;
275 uint8_t *pix;
276
277 /* read the pixels */
278 p = block;
279 pix = pixels;
280 /* unrolled loop */
281 __asm __volatile(
282 "movq %3, %%mm0 \n\t"
283 "movq 8%3, %%mm1 \n\t"
284 "movq 16%3, %%mm2 \n\t"
285 "movq 24%3, %%mm3 \n\t"
286 "movq 32%3, %%mm4 \n\t"
287 "movq 40%3, %%mm5 \n\t"
288 "movq 48%3, %%mm6 \n\t"
289 "movq 56%3, %%mm7 \n\t"
290 "packuswb %%mm1, %%mm0 \n\t"
291 "packuswb %%mm3, %%mm2 \n\t"
292 "packuswb %%mm5, %%mm4 \n\t"
293 "packuswb %%mm7, %%mm6 \n\t"
294 "movq %%mm0, (%0) \n\t"
295 "movq %%mm2, (%0, %1) \n\t"
296 "movq %%mm4, (%0, %1, 2) \n\t"
297 "movq %%mm6, (%0, %2) \n\t"
298 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
299 :"memory");
300 pix += line_size*4;
301 p += 32;
302
303 // if this were an exact copy of the code above,
304 // the compiler would generate some very strange code,
305 // thus we use an "r" constraint for the block pointer here
306 __asm __volatile(
307 "movq (%3), %%mm0 \n\t"
308 "movq 8(%3), %%mm1 \n\t"
309 "movq 16(%3), %%mm2 \n\t"
310 "movq 24(%3), %%mm3 \n\t"
311 "movq 32(%3), %%mm4 \n\t"
312 "movq 40(%3), %%mm5 \n\t"
313 "movq 48(%3), %%mm6 \n\t"
314 "movq 56(%3), %%mm7 \n\t"
315 "packuswb %%mm1, %%mm0 \n\t"
316 "packuswb %%mm3, %%mm2 \n\t"
317 "packuswb %%mm5, %%mm4 \n\t"
318 "packuswb %%mm7, %%mm6 \n\t"
319 "movq %%mm0, (%0) \n\t"
320 "movq %%mm2, (%0, %1) \n\t"
321 "movq %%mm4, (%0, %1, 2) \n\t"
322 "movq %%mm6, (%0, %2) \n\t"
323 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
324 :"memory");
325 }
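
/* Scalar equivalent (reference only): every 16-bit coefficient is clipped to
 * the 0..255 range and stored; the MMX version gets the clipping for free
 * from packuswb. */
#if 0
static void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            pixels[i*line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}
#endif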
326
327 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
328 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
329
330 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
331 {
332 int i;
333
334 movq_m2r(*vector128, mm1);
335 for (i = 0; i < 8; i++) {
336 movq_m2r(*(block), mm0);
337 packsswb_m2r(*(block + 4), mm0);
338 block += 8;
339 paddb_r2r(mm1, mm0);
340 movq_r2m(mm0, *pixels);
341 pixels += line_size;
342 }
343 }
344
345 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
346 {
347 const DCTELEM *p;
348 uint8_t *pix;
349 int i;
350
351 /* read the pixels */
352 p = block;
353 pix = pixels;
354 MOVQ_ZERO(mm7);
355 i = 4;
356 do {
357 __asm __volatile(
358 "movq (%2), %%mm0 \n\t"
359 "movq 8(%2), %%mm1 \n\t"
360 "movq 16(%2), %%mm2 \n\t"
361 "movq 24(%2), %%mm3 \n\t"
362 "movq %0, %%mm4 \n\t"
363 "movq %1, %%mm6 \n\t"
364 "movq %%mm4, %%mm5 \n\t"
365 "punpcklbw %%mm7, %%mm4 \n\t"
366 "punpckhbw %%mm7, %%mm5 \n\t"
367 "paddsw %%mm4, %%mm0 \n\t"
368 "paddsw %%mm5, %%mm1 \n\t"
369 "movq %%mm6, %%mm5 \n\t"
370 "punpcklbw %%mm7, %%mm6 \n\t"
371 "punpckhbw %%mm7, %%mm5 \n\t"
372 "paddsw %%mm6, %%mm2 \n\t"
373 "paddsw %%mm5, %%mm3 \n\t"
374 "packuswb %%mm1, %%mm0 \n\t"
375 "packuswb %%mm3, %%mm2 \n\t"
376 "movq %%mm0, %0 \n\t"
377 "movq %%mm2, %1 \n\t"
378 :"+m"(*pix), "+m"(*(pix+line_size))
379 :"r"(p)
380 :"memory");
381 pix += line_size*2;
382 p += 16;
383 } while (--i);
384 }
385
386 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
387 {
388 __asm __volatile(
389 "lea (%3, %3), %%"REG_a" \n\t"
390 ASMALIGN(3)
391 "1: \n\t"
392 "movd (%1), %%mm0 \n\t"
393 "movd (%1, %3), %%mm1 \n\t"
394 "movd %%mm0, (%2) \n\t"
395 "movd %%mm1, (%2, %3) \n\t"
396 "add %%"REG_a", %1 \n\t"
397 "add %%"REG_a", %2 \n\t"
398 "movd (%1), %%mm0 \n\t"
399 "movd (%1, %3), %%mm1 \n\t"
400 "movd %%mm0, (%2) \n\t"
401 "movd %%mm1, (%2, %3) \n\t"
402 "add %%"REG_a", %1 \n\t"
403 "add %%"REG_a", %2 \n\t"
404 "subl $4, %0 \n\t"
405 "jnz 1b \n\t"
406 : "+g"(h), "+r" (pixels), "+r" (block)
407 : "r"((long)line_size)
408 : "%"REG_a, "memory"
409 );
410 }
411
412 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
413 {
414 __asm __volatile(
415 "lea (%3, %3), %%"REG_a" \n\t"
416 ASMALIGN(3)
417 "1: \n\t"
418 "movq (%1), %%mm0 \n\t"
419 "movq (%1, %3), %%mm1 \n\t"
420 "movq %%mm0, (%2) \n\t"
421 "movq %%mm1, (%2, %3) \n\t"
422 "add %%"REG_a", %1 \n\t"
423 "add %%"REG_a", %2 \n\t"
424 "movq (%1), %%mm0 \n\t"
425 "movq (%1, %3), %%mm1 \n\t"
426 "movq %%mm0, (%2) \n\t"
427 "movq %%mm1, (%2, %3) \n\t"
428 "add %%"REG_a", %1 \n\t"
429 "add %%"REG_a", %2 \n\t"
430 "subl $4, %0 \n\t"
431 "jnz 1b \n\t"
432 : "+g"(h), "+r" (pixels), "+r" (block)
433 : "r"((long)line_size)
434 : "%"REG_a, "memory"
435 );
436 }
437
438 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
439 {
440 __asm __volatile(
441 "lea (%3, %3), %%"REG_a" \n\t"
442 ASMALIGN(3)
443 "1: \n\t"
444 "movq (%1), %%mm0 \n\t"
445 "movq 8(%1), %%mm4 \n\t"
446 "movq (%1, %3), %%mm1 \n\t"
447 "movq 8(%1, %3), %%mm5 \n\t"
448 "movq %%mm0, (%2) \n\t"
449 "movq %%mm4, 8(%2) \n\t"
450 "movq %%mm1, (%2, %3) \n\t"
451 "movq %%mm5, 8(%2, %3) \n\t"
452 "add %%"REG_a", %1 \n\t"
453 "add %%"REG_a", %2 \n\t"
454 "movq (%1), %%mm0 \n\t"
455 "movq 8(%1), %%mm4 \n\t"
456 "movq (%1, %3), %%mm1 \n\t"
457 "movq 8(%1, %3), %%mm5 \n\t"
458 "movq %%mm0, (%2) \n\t"
459 "movq %%mm4, 8(%2) \n\t"
460 "movq %%mm1, (%2, %3) \n\t"
461 "movq %%mm5, 8(%2, %3) \n\t"
462 "add %%"REG_a", %1 \n\t"
463 "add %%"REG_a", %2 \n\t"
464 "subl $4, %0 \n\t"
465 "jnz 1b \n\t"
466 : "+g"(h), "+r" (pixels), "+r" (block)
467 : "r"((long)line_size)
468 : "%"REG_a, "memory"
469 );
470 }
471
472 static void clear_blocks_mmx(DCTELEM *blocks)
473 {
474 __asm __volatile(
475 "pxor %%mm7, %%mm7 \n\t"
476 "mov $-128*6, %%"REG_a" \n\t"
477 "1: \n\t"
478 "movq %%mm7, (%0, %%"REG_a") \n\t"
479 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
480 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
481 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
482 "add $32, %%"REG_a" \n\t"
483 " js 1b \n\t"
484 : : "r" (((uint8_t *)blocks)+128*6)
485 : "%"REG_a
486 );
487 }
488
489 #ifdef CONFIG_ENCODERS
490 static int pix_sum16_mmx(uint8_t * pix, int line_size){
491 const int h=16;
492 int sum;
493 long index= -line_size*h;
494
495 __asm __volatile(
496 "pxor %%mm7, %%mm7 \n\t"
497 "pxor %%mm6, %%mm6 \n\t"
498 "1: \n\t"
499 "movq (%2, %1), %%mm0 \n\t"
500 "movq (%2, %1), %%mm1 \n\t"
501 "movq 8(%2, %1), %%mm2 \n\t"
502 "movq 8(%2, %1), %%mm3 \n\t"
503 "punpcklbw %%mm7, %%mm0 \n\t"
504 "punpckhbw %%mm7, %%mm1 \n\t"
505 "punpcklbw %%mm7, %%mm2 \n\t"
506 "punpckhbw %%mm7, %%mm3 \n\t"
507 "paddw %%mm0, %%mm1 \n\t"
508 "paddw %%mm2, %%mm3 \n\t"
509 "paddw %%mm1, %%mm3 \n\t"
510 "paddw %%mm3, %%mm6 \n\t"
511 "add %3, %1 \n\t"
512 " js 1b \n\t"
513 "movq %%mm6, %%mm5 \n\t"
514 "psrlq $32, %%mm6 \n\t"
515 "paddw %%mm5, %%mm6 \n\t"
516 "movq %%mm6, %%mm5 \n\t"
517 "psrlq $16, %%mm6 \n\t"
518 "paddw %%mm5, %%mm6 \n\t"
519 "movd %%mm6, %0 \n\t"
520 "andl $0xFFFF, %0 \n\t"
521 : "=&r" (sum), "+r" (index)
522 : "r" (pix - index), "r" ((long)line_size)
523 );
524
525 return sum;
526 }
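
/* Scalar equivalent (reference only): the sum of all pixels of a 16x16 block.
 * 16*16*255 = 65280 fits in 16 bits, which is why the 16-bit word accumulator
 * and the final "andl $0xFFFF" above are sufficient. */
#if 0
static int pix_sum16_ref(uint8_t *pix, int line_size)
{
    int x, y, sum = 0;
    for (y = 0; y < 16; y++)
        for (x = 0; x < 16; x++)
            sum += pix[y*line_size + x];
    return sum;
}
#endif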
527 #endif //CONFIG_ENCODERS
528
529 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
530 long i=0;
531 asm volatile(
532 "1: \n\t"
533 "movq (%1, %0), %%mm0 \n\t"
534 "movq (%2, %0), %%mm1 \n\t"
535 "paddb %%mm0, %%mm1 \n\t"
536 "movq %%mm1, (%2, %0) \n\t"
537 "movq 8(%1, %0), %%mm0 \n\t"
538 "movq 8(%2, %0), %%mm1 \n\t"
539 "paddb %%mm0, %%mm1 \n\t"
540 "movq %%mm1, 8(%2, %0) \n\t"
541 "add $16, %0 \n\t"
542 "cmp %3, %0 \n\t"
543 " jb 1b \n\t"
544 : "+r" (i)
545 : "r"(src), "r"(dst), "r"((long)w-15)
546 );
547 for(; i<w; i++)
548 dst[i+0] += src[i+0];
549 }
550
551 #define H263_LOOP_FILTER \
552 "pxor %%mm7, %%mm7 \n\t"\
553 "movq %0, %%mm0 \n\t"\
554 "movq %0, %%mm1 \n\t"\
555 "movq %3, %%mm2 \n\t"\
556 "movq %3, %%mm3 \n\t"\
557 "punpcklbw %%mm7, %%mm0 \n\t"\
558 "punpckhbw %%mm7, %%mm1 \n\t"\
559 "punpcklbw %%mm7, %%mm2 \n\t"\
560 "punpckhbw %%mm7, %%mm3 \n\t"\
561 "psubw %%mm2, %%mm0 \n\t"\
562 "psubw %%mm3, %%mm1 \n\t"\
563 "movq %1, %%mm2 \n\t"\
564 "movq %1, %%mm3 \n\t"\
565 "movq %2, %%mm4 \n\t"\
566 "movq %2, %%mm5 \n\t"\
567 "punpcklbw %%mm7, %%mm2 \n\t"\
568 "punpckhbw %%mm7, %%mm3 \n\t"\
569 "punpcklbw %%mm7, %%mm4 \n\t"\
570 "punpckhbw %%mm7, %%mm5 \n\t"\
571 "psubw %%mm2, %%mm4 \n\t"\
572 "psubw %%mm3, %%mm5 \n\t"\
573 "psllw $2, %%mm4 \n\t"\
574 "psllw $2, %%mm5 \n\t"\
575 "paddw %%mm0, %%mm4 \n\t"\
576 "paddw %%mm1, %%mm5 \n\t"\
577 "pxor %%mm6, %%mm6 \n\t"\
578 "pcmpgtw %%mm4, %%mm6 \n\t"\
579 "pcmpgtw %%mm5, %%mm7 \n\t"\
580 "pxor %%mm6, %%mm4 \n\t"\
581 "pxor %%mm7, %%mm5 \n\t"\
582 "psubw %%mm6, %%mm4 \n\t"\
583 "psubw %%mm7, %%mm5 \n\t"\
584 "psrlw $3, %%mm4 \n\t"\
585 "psrlw $3, %%mm5 \n\t"\
586 "packuswb %%mm5, %%mm4 \n\t"\
587 "packsswb %%mm7, %%mm6 \n\t"\
588 "pxor %%mm7, %%mm7 \n\t"\
589 "movd %4, %%mm2 \n\t"\
590 "punpcklbw %%mm2, %%mm2 \n\t"\
591 "punpcklbw %%mm2, %%mm2 \n\t"\
592 "punpcklbw %%mm2, %%mm2 \n\t"\
593 "psubusb %%mm4, %%mm2 \n\t"\
594 "movq %%mm2, %%mm3 \n\t"\
595 "psubusb %%mm4, %%mm3 \n\t"\
596 "psubb %%mm3, %%mm2 \n\t"\
597 "movq %1, %%mm3 \n\t"\
598 "movq %2, %%mm4 \n\t"\
599 "pxor %%mm6, %%mm3 \n\t"\
600 "pxor %%mm6, %%mm4 \n\t"\
601 "paddusb %%mm2, %%mm3 \n\t"\
602 "psubusb %%mm2, %%mm4 \n\t"\
603 "pxor %%mm6, %%mm3 \n\t"\
604 "pxor %%mm6, %%mm4 \n\t"\
605 "paddusb %%mm2, %%mm2 \n\t"\
606 "packsswb %%mm1, %%mm0 \n\t"\
607 "pcmpgtb %%mm0, %%mm7 \n\t"\
608 "pxor %%mm7, %%mm0 \n\t"\
609 "psubb %%mm7, %%mm0 \n\t"\
610 "movq %%mm0, %%mm1 \n\t"\
611 "psubusb %%mm2, %%mm0 \n\t"\
612 "psubb %%mm0, %%mm1 \n\t"\
613 "pand %5, %%mm1 \n\t"\
614 "psrlw $2, %%mm1 \n\t"\
615 "pxor %%mm7, %%mm1 \n\t"\
616 "psubb %%mm7, %%mm1 \n\t"\
617 "movq %0, %%mm5 \n\t"\
618 "movq %3, %%mm6 \n\t"\
619 "psubb %%mm1, %%mm5 \n\t"\
620 "paddb %%mm1, %%mm6 \n\t"
621
622 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
623 const int strength= ff_h263_loop_filter_strength[qscale];
624
625 asm volatile(
626
627 H263_LOOP_FILTER
628
629 "movq %%mm3, %1 \n\t"
630 "movq %%mm4, %2 \n\t"
631 "movq %%mm5, %0 \n\t"
632 "movq %%mm6, %3 \n\t"
633 : "+m" (*(uint64_t*)(src - 2*stride)),
634 "+m" (*(uint64_t*)(src - 1*stride)),
635 "+m" (*(uint64_t*)(src + 0*stride)),
636 "+m" (*(uint64_t*)(src + 1*stride))
637 : "g" (2*strength), "m"(ff_pb_FC)
638 );
639 }
640
641 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
642 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
643 "movd %4, %%mm0 \n\t"
644 "movd %5, %%mm1 \n\t"
645 "movd %6, %%mm2 \n\t"
646 "movd %7, %%mm3 \n\t"
647 "punpcklbw %%mm1, %%mm0 \n\t"
648 "punpcklbw %%mm3, %%mm2 \n\t"
649 "movq %%mm0, %%mm1 \n\t"
650 "punpcklwd %%mm2, %%mm0 \n\t"
651 "punpckhwd %%mm2, %%mm1 \n\t"
652 "movd %%mm0, %0 \n\t"
653 "punpckhdq %%mm0, %%mm0 \n\t"
654 "movd %%mm0, %1 \n\t"
655 "movd %%mm1, %2 \n\t"
656 "punpckhdq %%mm1, %%mm1 \n\t"
657 "movd %%mm1, %3 \n\t"
658
659 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
660 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
661 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
662 "=m" (*(uint32_t*)(dst + 3*dst_stride))
663 : "m" (*(uint32_t*)(src + 0*src_stride)),
664 "m" (*(uint32_t*)(src + 1*src_stride)),
665 "m" (*(uint32_t*)(src + 2*src_stride)),
666 "m" (*(uint32_t*)(src + 3*src_stride))
667 );
668 }
669
670 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
671 const int strength= ff_h263_loop_filter_strength[qscale];
672 uint64_t temp[4] __attribute__ ((aligned(8)));
673 uint8_t *btemp= (uint8_t*)temp;
674
675 src -= 2;
676
677 transpose4x4(btemp , src , 8, stride);
678 transpose4x4(btemp+4, src + 4*stride, 8, stride);
679 asm volatile(
680 H263_LOOP_FILTER // 5 3 4 6
681
682 : "+m" (temp[0]),
683 "+m" (temp[1]),
684 "+m" (temp[2]),
685 "+m" (temp[3])
686 : "g" (2*strength), "m"(ff_pb_FC)
687 );
688
689 asm volatile(
690 "movq %%mm5, %%mm1 \n\t"
691 "movq %%mm4, %%mm0 \n\t"
692 "punpcklbw %%mm3, %%mm5 \n\t"
693 "punpcklbw %%mm6, %%mm4 \n\t"
694 "punpckhbw %%mm3, %%mm1 \n\t"
695 "punpckhbw %%mm6, %%mm0 \n\t"
696 "movq %%mm5, %%mm3 \n\t"
697 "movq %%mm1, %%mm6 \n\t"
698 "punpcklwd %%mm4, %%mm5 \n\t"
699 "punpcklwd %%mm0, %%mm1 \n\t"
700 "punpckhwd %%mm4, %%mm3 \n\t"
701 "punpckhwd %%mm0, %%mm6 \n\t"
702 "movd %%mm5, (%0) \n\t"
703 "punpckhdq %%mm5, %%mm5 \n\t"
704 "movd %%mm5, (%0,%2) \n\t"
705 "movd %%mm3, (%0,%2,2) \n\t"
706 "punpckhdq %%mm3, %%mm3 \n\t"
707 "movd %%mm3, (%0,%3) \n\t"
708 "movd %%mm1, (%1) \n\t"
709 "punpckhdq %%mm1, %%mm1 \n\t"
710 "movd %%mm1, (%1,%2) \n\t"
711 "movd %%mm6, (%1,%2,2) \n\t"
712 "punpckhdq %%mm6, %%mm6 \n\t"
713 "movd %%mm6, (%1,%3) \n\t"
714 :: "r" (src),
715 "r" (src + 4*stride),
716 "r" ((long) stride ),
717 "r" ((long)(3*stride))
718 );
719 }
720
721 #ifdef CONFIG_ENCODERS
722 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
723 int tmp;
724 asm volatile (
725 "movl $16,%%ecx\n"
726 "pxor %%mm0,%%mm0\n"
727 "pxor %%mm7,%%mm7\n"
728 "1:\n"
729 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
730 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
731
732 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
733
734 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
735 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
736
737 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
738 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
739 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
740
741 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
742 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
743
744 "pmaddwd %%mm3,%%mm3\n"
745 "pmaddwd %%mm4,%%mm4\n"
746
747 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
748 pix2^2+pix3^2+pix6^2+pix7^2) */
749 "paddd %%mm3,%%mm4\n"
750 "paddd %%mm2,%%mm7\n"
751
752 "add %2, %0\n"
753 "paddd %%mm4,%%mm7\n"
754 "dec %%ecx\n"
755 "jnz 1b\n"
756
757 "movq %%mm7,%%mm1\n"
758 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
759 "paddd %%mm7,%%mm1\n"
760 "movd %%mm1,%1\n"
761 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
762 return tmp;
763 }
764
765 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
766 int tmp;
767 asm volatile (
768 "movl %4,%%ecx\n"
769 "shr $1,%%ecx\n"
770 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
771 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
772 "1:\n"
773 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
774 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
775 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
776 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
777
778 /* todo: mm1-mm2, mm3-mm4 */
779 /* algo: subtract mm1 from mm2 with saturation and vice versa */
780 /* OR the results to get absolute difference */
781 "movq %%mm1,%%mm5\n"
782 "movq %%mm3,%%mm6\n"
783 "psubusb %%mm2,%%mm1\n"
784 "psubusb %%mm4,%%mm3\n"
785 "psubusb %%mm5,%%mm2\n"
786 "psubusb %%mm6,%%mm4\n"
787
788 "por %%mm1,%%mm2\n"
789 "por %%mm3,%%mm4\n"
790
791 /* now convert to 16-bit vectors so we can square them */
792 "movq %%mm2,%%mm1\n"
793 "movq %%mm4,%%mm3\n"
794
795 "punpckhbw %%mm0,%%mm2\n"
796 "punpckhbw %%mm0,%%mm4\n"
797 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
798 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
799
800 "pmaddwd %%mm2,%%mm2\n"
801 "pmaddwd %%mm4,%%mm4\n"
802 "pmaddwd %%mm1,%%mm1\n"
803 "pmaddwd %%mm3,%%mm3\n"
804
805 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
806 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
807
808 "paddd %%mm2,%%mm1\n"
809 "paddd %%mm4,%%mm3\n"
810 "paddd %%mm1,%%mm7\n"
811 "paddd %%mm3,%%mm7\n"
812
813 "decl %%ecx\n"
814 "jnz 1b\n"
815
816 "movq %%mm7,%%mm1\n"
817 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
818 "paddd %%mm7,%%mm1\n"
819 "movd %%mm1,%2\n"
820 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
821 : "r" ((long)line_size) , "m" (h)
822 : "%ecx");
823 return tmp;
824 }
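
/* Scalar equivalent of sse8/sse16 (reference only): the sum of squared
 * differences over a w x h block. The asm obtains |a-b| from two saturated
 * subtractions ORed together, then squares and accumulates with pmaddwd. */
#if 0
static int sse_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++)
        for (x = 0; x < w; x++) {
            int d = pix1[y*line_size + x] - pix2[y*line_size + x];
            sum += d*d;
        }
    return sum;
}
#endif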
825
826 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
827 int tmp;
828 asm volatile (
829 "movl %4,%%ecx\n"
830 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
831 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
832 "1:\n"
833 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
834 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
835 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
836 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
837
838 /* todo: mm1-mm2, mm3-mm4 */
839 /* algo: subtract mm1 from mm2 with saturation and vice versa */
840 /* OR the results to get absolute difference */
841 "movq %%mm1,%%mm5\n"
842 "movq %%mm3,%%mm6\n"
843 "psubusb %%mm2,%%mm1\n"
844 "psubusb %%mm4,%%mm3\n"
845 "psubusb %%mm5,%%mm2\n"
846 "psubusb %%mm6,%%mm4\n"
847
848 "por %%mm1,%%mm2\n"
849 "por %%mm3,%%mm4\n"
850
851 /* now convert to 16-bit vectors so we can square them */
852 "movq %%mm2,%%mm1\n"
853 "movq %%mm4,%%mm3\n"
854
855 "punpckhbw %%mm0,%%mm2\n"
856 "punpckhbw %%mm0,%%mm4\n"
857 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
858 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
859
860 "pmaddwd %%mm2,%%mm2\n"
861 "pmaddwd %%mm4,%%mm4\n"
862 "pmaddwd %%mm1,%%mm1\n"
863 "pmaddwd %%mm3,%%mm3\n"
864
865 "add %3,%0\n"
866 "add %3,%1\n"
867
868 "paddd %%mm2,%%mm1\n"
869 "paddd %%mm4,%%mm3\n"
870 "paddd %%mm1,%%mm7\n"
871 "paddd %%mm3,%%mm7\n"
872
873 "decl %%ecx\n"
874 "jnz 1b\n"
875
876 "movq %%mm7,%%mm1\n"
877 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
878 "paddd %%mm7,%%mm1\n"
879 "movd %%mm1,%2\n"
880 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
881 : "r" ((long)line_size) , "m" (h)
882 : "%ecx");
883 return tmp;
884 }
885
886 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
887 int tmp;
888 asm volatile (
889 "shr $1,%2\n"
890 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
891 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
892 "1:\n"
893 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
894 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
895 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
896 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
897
898 /* todo: mm1-mm2, mm3-mm4 */
899 /* algo: subtract mm1 from mm2 with saturation and vice versa */
900 /* OR the results to get absolute difference */
901 "movdqa %%xmm1,%%xmm5\n"
902 "movdqa %%xmm3,%%xmm6\n"
903 "psubusb %%xmm2,%%xmm1\n"
904 "psubusb %%xmm4,%%xmm3\n"
905 "psubusb %%xmm5,%%xmm2\n"
906 "psubusb %%xmm6,%%xmm4\n"
907
908 "por %%xmm1,%%xmm2\n"
909 "por %%xmm3,%%xmm4\n"
910
911 /* now convert to 16-bit vectors so we can square them */
912 "movdqa %%xmm2,%%xmm1\n"
913 "movdqa %%xmm4,%%xmm3\n"
914
915 "punpckhbw %%xmm0,%%xmm2\n"
916 "punpckhbw %%xmm0,%%xmm4\n"
917 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
918 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
919
920 "pmaddwd %%xmm2,%%xmm2\n"
921 "pmaddwd %%xmm4,%%xmm4\n"
922 "pmaddwd %%xmm1,%%xmm1\n"
923 "pmaddwd %%xmm3,%%xmm3\n"
924
925 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
926 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
927
928 "paddd %%xmm2,%%xmm1\n"
929 "paddd %%xmm4,%%xmm3\n"
930 "paddd %%xmm1,%%xmm7\n"
931 "paddd %%xmm3,%%xmm7\n"
932
933 "decl %2\n"
934 "jnz 1b\n"
935
936 "movdqa %%xmm7,%%xmm1\n"
937 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
938 "paddd %%xmm1,%%xmm7\n"
939 "movdqa %%xmm7,%%xmm1\n"
940 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
941 "paddd %%xmm1,%%xmm7\n"
942 "movd %%xmm7,%3\n"
943 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
944 : "r" ((long)line_size));
945 return tmp;
946 }
947
948 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
949 int tmp;
950 asm volatile (
951 "movl %3,%%ecx\n"
952 "pxor %%mm7,%%mm7\n"
953 "pxor %%mm6,%%mm6\n"
954
955 "movq (%0),%%mm0\n"
956 "movq %%mm0, %%mm1\n"
957 "psllq $8, %%mm0\n"
958 "psrlq $8, %%mm1\n"
959 "psrlq $8, %%mm0\n"
960 "movq %%mm0, %%mm2\n"
961 "movq %%mm1, %%mm3\n"
962 "punpcklbw %%mm7,%%mm0\n"
963 "punpcklbw %%mm7,%%mm1\n"
964 "punpckhbw %%mm7,%%mm2\n"
965 "punpckhbw %%mm7,%%mm3\n"
966 "psubw %%mm1, %%mm0\n"
967 "psubw %%mm3, %%mm2\n"
968
969 "add %2,%0\n"
970
971 "movq (%0),%%mm4\n"
972 "movq %%mm4, %%mm1\n"
973 "psllq $8, %%mm4\n"
974 "psrlq $8, %%mm1\n"
975 "psrlq $8, %%mm4\n"
976 "movq %%mm4, %%mm5\n"
977 "movq %%mm1, %%mm3\n"
978 "punpcklbw %%mm7,%%mm4\n"
979 "punpcklbw %%mm7,%%mm1\n"
980 "punpckhbw %%mm7,%%mm5\n"
981 "punpckhbw %%mm7,%%mm3\n"
982 "psubw %%mm1, %%mm4\n"
983 "psubw %%mm3, %%mm5\n"
984 "psubw %%mm4, %%mm0\n"
985 "psubw %%mm5, %%mm2\n"
986 "pxor %%mm3, %%mm3\n"
987 "pxor %%mm1, %%mm1\n"
988 "pcmpgtw %%mm0, %%mm3\n\t"
989 "pcmpgtw %%mm2, %%mm1\n\t"
990 "pxor %%mm3, %%mm0\n"
991 "pxor %%mm1, %%mm2\n"
992 "psubw %%mm3, %%mm0\n"
993 "psubw %%mm1, %%mm2\n"
994 "paddw %%mm0, %%mm2\n"
995 "paddw %%mm2, %%mm6\n"
996
997 "add %2,%0\n"
998 "1:\n"
999
1000 "movq (%0),%%mm0\n"
1001 "movq %%mm0, %%mm1\n"
1002 "psllq $8, %%mm0\n"
1003 "psrlq $8, %%mm1\n"
1004 "psrlq $8, %%mm0\n"
1005 "movq %%mm0, %%mm2\n"
1006 "movq %%mm1, %%mm3\n"
1007 "punpcklbw %%mm7,%%mm0\n"
1008 "punpcklbw %%mm7,%%mm1\n"
1009 "punpckhbw %%mm7,%%mm2\n"
1010 "punpckhbw %%mm7,%%mm3\n"
1011 "psubw %%mm1, %%mm0\n"
1012 "psubw %%mm3, %%mm2\n"
1013 "psubw %%mm0, %%mm4\n"
1014 "psubw %%mm2, %%mm5\n"
1015 "pxor %%mm3, %%mm3\n"
1016 "pxor %%mm1, %%mm1\n"
1017 "pcmpgtw %%mm4, %%mm3\n\t"
1018 "pcmpgtw %%mm5, %%mm1\n\t"
1019 "pxor %%mm3, %%mm4\n"
1020 "pxor %%mm1, %%mm5\n"
1021 "psubw %%mm3, %%mm4\n"
1022 "psubw %%mm1, %%mm5\n"
1023 "paddw %%mm4, %%mm5\n"
1024 "paddw %%mm5, %%mm6\n"
1025
1026 "add %2,%0\n"
1027
1028 "movq (%0),%%mm4\n"
1029 "movq %%mm4, %%mm1\n"
1030 "psllq $8, %%mm4\n"
1031 "psrlq $8, %%mm1\n"
1032 "psrlq $8, %%mm4\n"
1033 "movq %%mm4, %%mm5\n"
1034 "movq %%mm1, %%mm3\n"
1035 "punpcklbw %%mm7,%%mm4\n"
1036 "punpcklbw %%mm7,%%mm1\n"
1037 "punpckhbw %%mm7,%%mm5\n"
1038 "punpckhbw %%mm7,%%mm3\n"
1039 "psubw %%mm1, %%mm4\n"
1040 "psubw %%mm3, %%mm5\n"
1041 "psubw %%mm4, %%mm0\n"
1042 "psubw %%mm5, %%mm2\n"
1043 "pxor %%mm3, %%mm3\n"
1044 "pxor %%mm1, %%mm1\n"
1045 "pcmpgtw %%mm0, %%mm3\n\t"
1046 "pcmpgtw %%mm2, %%mm1\n\t"
1047 "pxor %%mm3, %%mm0\n"
1048 "pxor %%mm1, %%mm2\n"
1049 "psubw %%mm3, %%mm0\n"
1050 "psubw %%mm1, %%mm2\n"
1051 "paddw %%mm0, %%mm2\n"
1052 "paddw %%mm2, %%mm6\n"
1053
1054 "add %2,%0\n"
1055 "subl $2, %%ecx\n"
1056 " jnz 1b\n"
1057
1058 "movq %%mm6, %%mm0\n"
1059 "punpcklwd %%mm7,%%mm0\n"
1060 "punpckhwd %%mm7,%%mm6\n"
1061 "paddd %%mm0, %%mm6\n"
1062
1063 "movq %%mm6,%%mm0\n"
1064 "psrlq $32, %%mm6\n"
1065 "paddd %%mm6,%%mm0\n"
1066 "movd %%mm0,%1\n"
1067 : "+r" (pix1), "=r"(tmp)
1068 : "r" ((long)line_size) , "g" (h-2)
1069 : "%ecx");
1070 return tmp;
1071 }
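
/* Roughly what hf_noise8 measures, in scalar form (sketch only, modulo edge
 * handling): the sum of absolute changes of the horizontal gradient between
 * adjacent rows, i.e. how much the high-frequency content varies vertically.
 * Only the 7 gradients available from an 8-byte row are used. */
#if 0
static int hf_noise8_ref(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++)
        for (x = 0; x < 7; x++) {
            int g0 = pix[ y   *line_size + x] - pix[ y   *line_size + x + 1];
            int g1 = pix[(y+1)*line_size + x] - pix[(y+1)*line_size + x + 1];
            sum += FFABS(g0 - g1);
        }
    return sum;
}
#endif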
1072
1073 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1074 int tmp;
1075 uint8_t * pix= pix1;
1076 asm volatile (
1077 "movl %3,%%ecx\n"
1078 "pxor %%mm7,%%mm7\n"
1079 "pxor %%mm6,%%mm6\n"
1080
1081 "movq (%0),%%mm0\n"
1082 "movq 1(%0),%%mm1\n"
1083 "movq %%mm0, %%mm2\n"
1084 "movq %%mm1, %%mm3\n"
1085 "punpcklbw %%mm7,%%mm0\n"
1086 "punpcklbw %%mm7,%%mm1\n"
1087 "punpckhbw %%mm7,%%mm2\n"
1088 "punpckhbw %%mm7,%%mm3\n"
1089 "psubw %%mm1, %%mm0\n"
1090 "psubw %%mm3, %%mm2\n"
1091
1092 "add %2,%0\n"
1093
1094 "movq (%0),%%mm4\n"
1095 "movq 1(%0),%%mm1\n"
1096 "movq %%mm4, %%mm5\n"
1097 "movq %%mm1, %%mm3\n"
1098 "punpcklbw %%mm7,%%mm4\n"
1099 "punpcklbw %%mm7,%%mm1\n"
1100 "punpckhbw %%mm7,%%mm5\n"
1101 "punpckhbw %%mm7,%%mm3\n"
1102 "psubw %%mm1, %%mm4\n"
1103 "psubw %%mm3, %%mm5\n"
1104 "psubw %%mm4, %%mm0\n"
1105 "psubw %%mm5, %%mm2\n"
1106 "pxor %%mm3, %%mm3\n"
1107 "pxor %%mm1, %%mm1\n"
1108 "pcmpgtw %%mm0, %%mm3\n\t"
1109 "pcmpgtw %%mm2, %%mm1\n\t"
1110 "pxor %%mm3, %%mm0\n"
1111 "pxor %%mm1, %%mm2\n"
1112 "psubw %%mm3, %%mm0\n"
1113 "psubw %%mm1, %%mm2\n"
1114 "paddw %%mm0, %%mm2\n"
1115 "paddw %%mm2, %%mm6\n"
1116
1117 "add %2,%0\n"
1118 "1:\n"
1119
1120 "movq (%0),%%mm0\n"
1121 "movq 1(%0),%%mm1\n"
1122 "movq %%mm0, %%mm2\n"
1123 "movq %%mm1, %%mm3\n"
1124 "punpcklbw %%mm7,%%mm0\n"
1125 "punpcklbw %%mm7,%%mm1\n"
1126 "punpckhbw %%mm7,%%mm2\n"
1127 "punpckhbw %%mm7,%%mm3\n"
1128 "psubw %%mm1, %%mm0\n"
1129 "psubw %%mm3, %%mm2\n"
1130 "psubw %%mm0, %%mm4\n"
1131 "psubw %%mm2, %%mm5\n"
1132 "pxor %%mm3, %%mm3\n"
1133 "pxor %%mm1, %%mm1\n"
1134 "pcmpgtw %%mm4, %%mm3\n\t"
1135 "pcmpgtw %%mm5, %%mm1\n\t"
1136 "pxor %%mm3, %%mm4\n"
1137 "pxor %%mm1, %%mm5\n"
1138 "psubw %%mm3, %%mm4\n"
1139 "psubw %%mm1, %%mm5\n"
1140 "paddw %%mm4, %%mm5\n"
1141 "paddw %%mm5, %%mm6\n"
1142
1143 "add %2,%0\n"
1144
1145 "movq (%0),%%mm4\n"
1146 "movq 1(%0),%%mm1\n"
1147 "movq %%mm4, %%mm5\n"
1148 "movq %%mm1, %%mm3\n"
1149 "punpcklbw %%mm7,%%mm4\n"
1150 "punpcklbw %%mm7,%%mm1\n"
1151 "punpckhbw %%mm7,%%mm5\n"
1152 "punpckhbw %%mm7,%%mm3\n"
1153 "psubw %%mm1, %%mm4\n"
1154 "psubw %%mm3, %%mm5\n"
1155 "psubw %%mm4, %%mm0\n"
1156 "psubw %%mm5, %%mm2\n"
1157 "pxor %%mm3, %%mm3\n"
1158 "pxor %%mm1, %%mm1\n"
1159 "pcmpgtw %%mm0, %%mm3\n\t"
1160 "pcmpgtw %%mm2, %%mm1\n\t"
1161 "pxor %%mm3, %%mm0\n"
1162 "pxor %%mm1, %%mm2\n"
1163 "psubw %%mm3, %%mm0\n"
1164 "psubw %%mm1, %%mm2\n"
1165 "paddw %%mm0, %%mm2\n"
1166 "paddw %%mm2, %%mm6\n"
1167
1168 "add %2,%0\n"
1169 "subl $2, %%ecx\n"
1170 " jnz 1b\n"
1171
1172 "movq %%mm6, %%mm0\n"
1173 "punpcklwd %%mm7,%%mm0\n"
1174 "punpckhwd %%mm7,%%mm6\n"
1175 "paddd %%mm0, %%mm6\n"
1176
1177 "movq %%mm6,%%mm0\n"
1178 "psrlq $32, %%mm6\n"
1179 "paddd %%mm6,%%mm0\n"
1180 "movd %%mm0,%1\n"
1181 : "+r" (pix1), "=r"(tmp)
1182 : "r" ((long)line_size) , "g" (h-2)
1183 : "%ecx");
1184 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1185 }
1186
1187 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1188 MpegEncContext *c = p;
1189 int score1, score2;
1190
1191 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1192 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1193 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1194
1195 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1196 else return score1 + FFABS(score2)*8;
1197 }
1198
1199 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1200 MpegEncContext *c = p;
1201 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1202 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1203
1204 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1205 else return score1 + FFABS(score2)*8;
1206 }
1207
1208 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1209 int tmp;
1210
1211 assert( (((int)pix) & 7) == 0);
1212 assert((line_size &7) ==0);
1213
1214 #define SUM(in0, in1, out0, out1) \
1215 "movq (%0), %%mm2\n"\
1216 "movq 8(%0), %%mm3\n"\
1217 "add %2,%0\n"\
1218 "movq %%mm2, " #out0 "\n"\
1219 "movq %%mm3, " #out1 "\n"\
1220 "psubusb " #in0 ", %%mm2\n"\
1221 "psubusb " #in1 ", %%mm3\n"\
1222 "psubusb " #out0 ", " #in0 "\n"\
1223 "psubusb " #out1 ", " #in1 "\n"\
1224 "por %%mm2, " #in0 "\n"\
1225 "por %%mm3, " #in1 "\n"\
1226 "movq " #in0 ", %%mm2\n"\
1227 "movq " #in1 ", %%mm3\n"\
1228 "punpcklbw %%mm7, " #in0 "\n"\
1229 "punpcklbw %%mm7, " #in1 "\n"\
1230 "punpckhbw %%mm7, %%mm2\n"\
1231 "punpckhbw %%mm7, %%mm3\n"\
1232 "paddw " #in1 ", " #in0 "\n"\
1233 "paddw %%mm3, %%mm2\n"\
1234 "paddw %%mm2, " #in0 "\n"\
1235 "paddw " #in0 ", %%mm6\n"
1236
1237
1238 asm volatile (
1239 "movl %3,%%ecx\n"
1240 "pxor %%mm6,%%mm6\n"
1241 "pxor %%mm7,%%mm7\n"
1242 "movq (%0),%%mm0\n"
1243 "movq 8(%0),%%mm1\n"
1244 "add %2,%0\n"
1245 "subl $2, %%ecx\n"
1246 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1247 "1:\n"
1248
1249 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1250
1251 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1252
1253 "subl $2, %%ecx\n"
1254 "jnz 1b\n"
1255
1256 "movq %%mm6,%%mm0\n"
1257 "psrlq $32, %%mm6\n"
1258 "paddw %%mm6,%%mm0\n"
1259 "movq %%mm0,%%mm6\n"
1260 "psrlq $16, %%mm0\n"
1261 "paddw %%mm6,%%mm0\n"
1262 "movd %%mm0,%1\n"
1263 : "+r" (pix), "=r"(tmp)
1264 : "r" ((long)line_size) , "m" (h)
1265 : "%ecx");
1266 return tmp & 0xFFFF;
1267 }
1268 #undef SUM
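
/* Scalar equivalent of vsad_intra16 (reference only): the sum of absolute
 * differences between vertically adjacent pixels of a 16-wide block. The MMX
 * version accumulates in 16-bit lanes, hence the final "& 0xFFFF"; the MMX2
 * variant below uses psadbw and needs no mask. */
#if 0
static int vsad_intra16_ref(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 1; y < h; y++)
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[y*line_size + x] - pix[(y-1)*line_size + x]);
    return sum;
}
#endif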
1269
1270 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1271 int tmp;
1272
1273 assert( (((int)pix) & 7) == 0);
1274 assert((line_size &7) ==0);
1275
1276 #define SUM(in0, in1, out0, out1) \
1277 "movq (%0), " #out0 "\n"\
1278 "movq 8(%0), " #out1 "\n"\
1279 "add %2,%0\n"\
1280 "psadbw " #out0 ", " #in0 "\n"\
1281 "psadbw " #out1 ", " #in1 "\n"\
1282 "paddw " #in1 ", " #in0 "\n"\
1283 "paddw " #in0 ", %%mm6\n"
1284
1285 asm volatile (
1286 "movl %3,%%ecx\n"
1287 "pxor %%mm6,%%mm6\n"
1288 "pxor %%mm7,%%mm7\n"
1289 "movq (%0),%%mm0\n"
1290 "movq 8(%0),%%mm1\n"
1291 "add %2,%0\n"
1292 "subl $2, %%ecx\n"
1293 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1294 "1:\n"
1295
1296 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1297
1298 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1299
1300 "subl $2, %%ecx\n"
1301 "jnz 1b\n"
1302
1303 "movd %%mm6,%1\n"
1304 : "+r" (pix), "=r"(tmp)
1305 : "r" ((long)line_size) , "m" (h)
1306 : "%ecx");
1307 return tmp;
1308 }
1309 #undef SUM
1310
1311 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1312 int tmp;
1313
1314 assert( (((int)pix1) & 7) == 0);
1315 assert( (((int)pix2) & 7) == 0);
1316 assert((line_size &7) ==0);
1317
1318 #define SUM(in0, in1, out0, out1) \
1319 "movq (%0),%%mm2\n"\
1320 "movq (%1)," #out0 "\n"\
1321 "movq 8(%0),%%mm3\n"\
1322 "movq 8(%1)," #out1 "\n"\
1323 "add %3,%0\n"\
1324 "add %3,%1\n"\
1325 "psubb " #out0 ", %%mm2\n"\
1326 "psubb " #out1 ", %%mm3\n"\
1327 "pxor %%mm7, %%mm2\n"\
1328 "pxor %%mm7, %%mm3\n"\
1329 "movq %%mm2, " #out0 "\n"\
1330 "movq %%mm3, " #out1 "\n"\
1331 "psubusb " #in0 ", %%mm2\n"\
1332 "psubusb " #in1 ", %%mm3\n"\
1333 "psubusb " #out0 ", " #in0 "\n"\
1334 "psubusb " #out1 ", " #in1 "\n"\
1335 "por %%mm2, " #in0 "\n"\
1336 "por %%mm3, " #in1 "\n"\
1337 "movq " #in0 ", %%mm2\n"\
1338 "movq " #in1 ", %%mm3\n"\
1339 "punpcklbw %%mm7, " #in0 "\n"\
1340 "punpcklbw %%mm7, " #in1 "\n"\
1341 "punpckhbw %%mm7, %%mm2\n"\
1342 "punpckhbw %%mm7, %%mm3\n"\
1343 "paddw " #in1 ", " #in0 "\n"\
1344 "paddw %%mm3, %%mm2\n"\
1345 "paddw %%mm2, " #in0 "\n"\
1346 "paddw " #in0 ", %%mm6\n"
1347
1348
1349 asm volatile (
1350 "movl %4,%%ecx\n"
1351 "pxor %%mm6,%%mm6\n"
1352 "pcmpeqw %%mm7,%%mm7\n"
1353 "psllw $15, %%mm7\n"
1354 "packsswb %%mm7, %%mm7\n"
1355 "movq (%0),%%mm0\n"
1356 "movq (%1),%%mm2\n"
1357 "movq 8(%0),%%mm1\n"
1358 "movq 8(%1),%%mm3\n"
1359 "add %3,%0\n"
1360 "add %3,%1\n"
1361 "subl $2, %%ecx\n"
1362 "psubb %%mm2, %%mm0\n"
1363 "psubb %%mm3, %%mm1\n"
1364 "pxor %%mm7, %%mm0\n"
1365 "pxor %%mm7, %%mm1\n"
1366 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1367 "1:\n"
1368
1369 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1370
1371 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1372
1373 "subl $2, %%ecx\n"
1374 "jnz 1b\n"
1375
1376 "movq %%mm6,%%mm0\n"
1377 "psrlq $32, %%mm6\n"
1378 "paddw %%mm6,%%mm0\n"
1379 "movq %%mm0,%%mm6\n"
1380 "psrlq $16, %%mm0\n"
1381 "paddw %%mm6,%%mm0\n"
1382 "movd %%mm0,%2\n"
1383 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1384 : "r" ((long)line_size) , "m" (h)
1385 : "%ecx");
1386 return tmp & 0x7FFF;
1387 }
1388 #undef SUM
1389
1390 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1391 int tmp;
1392
1393 assert( (((int)pix1) & 7) == 0);
1394 assert( (((int)pix2) & 7) == 0);
1395 assert((line_size &7) ==0);
1396
1397 #define SUM(in0, in1, out0, out1) \
1398 "movq (%0)," #out0 "\n"\
1399 "movq (%1),%%mm2\n"\
1400 "movq 8(%0)," #out1 "\n"\
1401 "movq 8(%1),%%mm3\n"\
1402 "add %3,%0\n"\
1403 "add %3,%1\n"\
1404 "psubb %%mm2, " #out0 "\n"\
1405 "psubb %%mm3, " #out1 "\n"\
1406 "pxor %%mm7, " #out0 "\n"\
1407 "pxor %%mm7, " #out1 "\n"\
1408 "psadbw " #out0 ", " #in0 "\n"\
1409 "psadbw " #out1 ", " #in1 "\n"\
1410 "paddw " #in1 ", " #in0 "\n"\
1411 "paddw " #in0 ", %%mm6\n"
1412
1413 asm volatile (
1414 "movl %4,%%ecx\n"
1415 "pxor %%mm6,%%mm6\n"
1416 "pcmpeqw %%mm7,%%mm7\n"
1417 "psllw $15, %%mm7\n"
1418 "packsswb %%mm7, %%mm7\n"
1419 "movq (%0),%%mm0\n"
1420 "movq (%1),%%mm2\n"
1421 "movq 8(%0),%%mm1\n"
1422 "movq 8(%1),%%mm3\n"
1423 "add %3,%0\n"
1424 "add %3,%1\n"
1425 "subl $2, %%ecx\n"
1426 "psubb %%mm2, %%mm0\n"
1427 "psubb %%mm3, %%mm1\n"
1428 "pxor %%mm7, %%mm0\n"
1429 "pxor %%mm7, %%mm1\n"
1430 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1431 "1:\n"
1432
1433 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1434
1435 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1436
1437 "subl $2, %%ecx\n"
1438 "jnz 1b\n"
1439
1440 "movd %%mm6,%2\n"
1441 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1442 : "r" ((long)line_size) , "m" (h)
1443 : "%ecx");
1444 return tmp;
1445 }
1446 #undef SUM
1447
1448 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1449 long i=0;
1450 asm volatile(
1451 "1: \n\t"
1452 "movq (%2, %0), %%mm0 \n\t"
1453 "movq (%1, %0), %%mm1 \n\t"
1454 "psubb %%mm0, %%mm1 \n\t"
1455 "movq %%mm1, (%3, %0) \n\t"
1456 "movq 8(%2, %0), %%mm0 \n\t"
1457 "movq 8(%1, %0), %%mm1 \n\t"
1458 "psubb %%mm0, %%mm1 \n\t"
1459 "movq %%mm1, 8(%3, %0) \n\t"
1460 "add $16, %0 \n\t"
1461 "cmp %4, %0 \n\t"
1462 " jb 1b \n\t"
1463 : "+r" (i)
1464 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1465 );
1466 for(; i<w; i++)
1467 dst[i+0] = src1[i+0]-src2[i+0];
1468 }
1469
1470 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1471 long i=0;
1472 uint8_t l, lt;
1473
1474 asm volatile(
1475 "1: \n\t"
1476 "movq -1(%1, %0), %%mm0 \n\t" // LT
1477 "movq (%1, %0), %%mm1 \n\t" // T
1478 "movq -1(%2, %0), %%mm2 \n\t" // L
1479 "movq (%2, %0), %%mm3 \n\t" // X
1480 "movq %%mm2, %%mm4 \n\t" // L
1481 "psubb %%mm0, %%mm2 \n\t"
1482 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1483 "movq %%mm4, %%mm5 \n\t" // L
1484 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1485 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1486 "pminub %%mm2, %%mm4 \n\t"
1487 "pmaxub %%mm1, %%mm4 \n\t"
1488 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1489 "movq %%mm3, (%3, %0) \n\t"
1490 "add $8, %0 \n\t"
1491 "cmp %4, %0 \n\t"
1492 " jb 1b \n\t"
1493 : "+r" (i)
1494 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1495 );
1496
1497 l= *left;
1498 lt= *left_top;
1499
1500 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1501
1502 *left_top= src1[w-1];
1503 *left = src2[w-1];
1504 }
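
/* Scalar sketch (reference only) of the HuffYUV median prediction subtracted
 * above: each sample is predicted from its left (L), top (T) and top-left
 * (LT) neighbours as median(L, T, L+T-LT), and the residual is stored. */
#if 0
static void sub_hfyu_median_prediction_ref(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int i;
    /* dst[0] additionally uses the left/left_top values carried over from the
     * previous call, as in the C tail of the function above */
    for (i = 1; i < w; i++) {
        int L  = src2[i-1];
        int T  = src1[i];
        int LT = src1[i-1];
        dst[i] = src2[i] - mid_pred(L, T, (L + T - LT) & 0xFF);
    }
}
#endif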
1505
1506 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1507 "mov"#m" "#p1", "#a" \n\t"\
1508 "mov"#m" "#p2", "#t" \n\t"\
1509 "punpcklbw "#a", "#t" \n\t"\
1510 "punpcklbw "#a", "#a" \n\t"\
1511 "psubw "#t", "#a" \n\t"\
1512
1513 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1514 uint8_t *p1b=p1, *p2b=p2;\
1515 asm volatile(\
1516 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1517 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1518 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1519 "add %4, %1 \n\t"\
1520 "add %4, %2 \n\t"\
1521 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1522 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1523 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1524 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1525 "mov"#m1" "#mm"0, %0 \n\t"\
1526 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1527 "mov"#m1" %0, "#mm"0 \n\t"\
1528 : "=m"(temp), "+r"(p1b), "+r"(p2b)\
1529 : "r"((long)stride), "r"((long)stride*3)\
1530 );\
1531 }
1532
1533 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1534 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1535
1536 #ifdef ARCH_X86_64
1537 // permutes 01234567 -> 05736421
1538 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1539 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1540 SBUTTERFLY(c,d,b,wd,dqa)\
1541 SBUTTERFLY(e,f,d,wd,dqa)\
1542 SBUTTERFLY(g,h,f,wd,dqa)\
1543 SBUTTERFLY(a,c,h,dq,dqa)\
1544 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1545 SBUTTERFLY(e,g,b,dq,dqa)\
1546 SBUTTERFLY(d,f,g,dq,dqa)\
1547 SBUTTERFLY(a,e,f,qdq,dqa)\
1548 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1549 SBUTTERFLY(h,b,d,qdq,dqa)\
1550 SBUTTERFLY(c,g,b,qdq,dqa)\
1551 "movdqa %%xmm8, "#g" \n\t"
1552 #else
1553 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1554 "movdqa "#h", "#t" \n\t"\
1555 SBUTTERFLY(a,b,h,wd,dqa)\
1556 "movdqa "#h", 16"#t" \n\t"\
1557 "movdqa "#t", "#h" \n\t"\
1558 SBUTTERFLY(c,d,b,wd,dqa)\
1559 SBUTTERFLY(e,f,d,wd,dqa)\
1560 SBUTTERFLY(g,h,f,wd,dqa)\
1561 SBUTTERFLY(a,c,h,dq,dqa)\
1562 "movdqa "#h", "#t" \n\t"\
1563 "movdqa 16"#t", "#h" \n\t"\
1564 SBUTTERFLY(h,b,c,dq,dqa)\
1565 SBUTTERFLY(e,g,b,dq,dqa)\
1566 SBUTTERFLY(d,f,g,dq,dqa)\
1567 SBUTTERFLY(a,e,f,qdq,dqa)\
1568 SBUTTERFLY(h,d,e,qdq,dqa)\
1569 "movdqa "#h", 16"#t" \n\t"\
1570 "movdqa "#t", "#h" \n\t"\
1571 SBUTTERFLY(h,b,d,qdq,dqa)\
1572 SBUTTERFLY(c,g,b,qdq,dqa)\
1573 "movdqa 16"#t", "#g" \n\t"
1574 #endif
1575
1576 #define LBUTTERFLY2(a1,b1,a2,b2)\
1577 "paddw " #b1 ", " #a1 " \n\t"\
1578 "paddw " #b2 ", " #a2 " \n\t"\
1579 "paddw " #b1 ", " #b1 " \n\t"\
1580 "paddw " #b2 ", " #b2 " \n\t"\
1581 "psubw " #a1 ", " #b1 " \n\t"\
1582 "psubw " #a2 ", " #b2 " \n\t"
1583
1584 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1585 LBUTTERFLY2(m0, m1, m2, m3)\
1586 LBUTTERFLY2(m4, m5, m6, m7)\
1587 LBUTTERFLY2(m0, m2, m1, m3)\
1588 LBUTTERFLY2(m4, m6, m5, m7)\
1589 LBUTTERFLY2(m0, m4, m1, m5)\
1590 LBUTTERFLY2(m2, m6, m3, m7)\
1591
1592 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1593
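/* The butterfly above maps (a, b) to (a + b, b - a); three such stages over 8
 * values give an 8-point Hadamard transform (up to per-lane signs, which do
 * not matter since only absolute values are summed afterwards). Plain-C
 * sketch, reference only: */
#if 0
static void hadamard8_1d_ref(int16_t v[8])
{
    int step, i, j;
    for (step = 1; step < 8; step <<= 1)          /* 3 butterfly stages */
        for (i = 0; i < 8; i += 2*step)
            for (j = i; j < i + step; j++) {
                int a = v[j], b = v[j + step];
                v[j]        = a + b;
                v[j + step] = b - a;
            }
}
#endif
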
1594 #define MMABS_MMX(a,z)\
1595 "pxor " #z ", " #z " \n\t"\
1596 "pcmpgtw " #a ", " #z " \n\t"\
1597 "pxor " #z ", " #a " \n\t"\
1598 "psubw " #z ", " #a " \n\t"
1599
1600 #define MMABS_MMX2(a,z)\
1601 "pxor " #z ", " #z " \n\t"\
1602 "psubw " #a ", " #z " \n\t"\
1603 "pmaxsw " #z ", " #a " \n\t"
1604
1605 #define MMABS_SSSE3(a,z)\
1606 "pabsw " #a ", " #a " \n\t"
1607
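/* What the MMABS_* variants compute, as plain C (reference only): a
 * branch-free absolute value of a signed 16-bit lane.
 *   MMX  : m = (x < 0) ? -1 : 0 (pcmpgtw), then |x| = (x ^ m) - m
 *   MMX2 : |x| = max(x, -x)      (pmaxsw)
 *   SSSE3: a single pabsw */
#if 0
static inline int mmabs_ref(int x)
{
    int m = x >> 31;        /* sign mask: all ones if x is negative */
    return (x ^ m) - m;
}
#endif
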
1608 #define MMABS_SUM(a,z, sum)\
1609 MMABS(a,z)\
1610 "paddusw " #a ", " #sum " \n\t"
1611
1612 #define MMABS_SUM_8x8_NOSPILL\
1613 MMABS(%%xmm0, %%xmm8)\
1614 MMABS(%%xmm1, %%xmm9)\
1615 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1616 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1617 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1618 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1619 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1620 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1621 "paddusw %%xmm1, %%xmm0 \n\t"
1622
1623 #ifdef ARCH_X86_64
1624 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1625 #else
1626 #define MMABS_SUM_8x8_SSE2\
1627 "movdqa %%xmm7, (%1) \n\t"\
1628 MMABS(%%xmm0, %%xmm7)\
1629 MMABS(%%xmm1, %%xmm7)\
1630 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1631 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1632 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1633 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1634 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1635 "movdqa (%1), %%xmm2 \n\t"\
1636 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1637 "paddusw %%xmm1, %%xmm0 \n\t"
1638 #endif
1639
1640 #define LOAD4(o, a, b, c, d)\
1641 "movq "#o"(%1), "#a" \n\t"\
1642 "movq "#o"+8(%1), "#b" \n\t"\
1643 "movq "#o"+16(%1), "#c" \n\t"\
1644 "movq "#o"+24(%1), "#d" \n\t"\
1645
1646 #define STORE4(o, a, b, c, d)\
1647 "movq "#a", "#o"(%1) \n\t"\
1648 "movq "#b", "#o"+8(%1) \n\t"\
1649 "movq "#c", "#o"+16(%1) \n\t"\
1650 "movq "#d", "#o"+24(%1) \n\t"\
1651
1652 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1653 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1654 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1655 #define HSUM_MMX(a, t, dst)\
1656 "movq "#a", "#t" \n\t"\
1657 "psrlq $32, "#a" \n\t"\
1658 "paddusw "#t", "#a" \n\t"\
1659 "movq "#a", "#t" \n\t"\
1660 "psrlq $16, "#a" \n\t"\
1661 "paddusw "#t", "#a" \n\t"\
1662 "movd "#a", "#dst" \n\t"\
1663
1664 #define HSUM_MMX2(a, t, dst)\
1665 "pshufw $0x0E, "#a", "#t" \n\t"\
1666 "paddusw "#t", "#a" \n\t"\
1667 "pshufw $0x01, "#a", "#t" \n\t"\
1668 "paddusw "#t", "#a" \n\t"\
1669 "movd "#a", "#dst" \n\t"\
1670
1671 #define HSUM_SSE2(a, t, dst)\
1672 "movhlps "#a", "#t" \n\t"\
1673 "paddusw "#t", "#a" \n\t"\
1674 "pshuflw $0x0E, "#a", "#t" \n\t"\
1675 "paddusw "#t", "#a" \n\t"\
1676 "pshuflw $0x01, "#a", "#t" \n\t"\
1677 "paddusw "#t", "#a" \n\t"\
1678 "movd "#a", "#dst" \n\t"\
1679
1680 #define HADAMARD8_DIFF_MMX(cpu) \
1681 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1682 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1683 int sum;\
1684 \
1685 assert(h==8);\
1686 \
1687 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1688 \
1689 asm volatile(\
1690 HADAMARD48\
1691 \
1692 "movq %%mm7, 96(%1) \n\t"\
1693 \
1694 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1695 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1696 \
1697 "movq 96(%1), %%mm7 \n\t"\
1698 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1699 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1700 \
1701 : "=r" (sum)\
1702 : "r"(temp)\
1703 );\
1704 \
1705 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1706 \
1707 asm volatile(\
1708 HADAMARD48\
1709 \
1710 "movq %%mm7, 96(%1) \n\t"\
1711 \
1712 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1713 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1714 \
1715 "movq 96(%1), %%mm7 \n\t"\
1716 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1717 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1718 "movq %%mm6, %%mm7 \n\t"\
1719 "movq %%mm0, %%mm6 \n\t"\
1720 \
1721 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1722 \
1723 HADAMARD48\
1724 "movq %%mm7, 64(%1) \n\t"\
1725 MMABS(%%mm0, %%mm7)\
1726 MMABS(%%mm1, %%mm7)\
1727 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1728 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1729 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1730 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1731 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1732 "movq 64(%1), %%mm2 \n\t"\
1733 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1734 "paddusw %%mm1, %%mm0 \n\t"\
1735 "movq %%mm0, 64(%1) \n\t"\
1736 \
1737 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1738 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1739 \
1740 HADAMARD48\
1741 "movq %%mm7, (%1) \n\t"\
1742 MMABS(%%mm0, %%mm7)\
1743 MMABS(%%mm1, %%mm7)\
1744 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1745 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1746 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1747 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1748 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1749 "movq (%1), %%mm2 \n\t"\
1750 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1751 "paddusw 64(%1), %%mm0 \n\t"\
1752 "paddusw %%mm1, %%mm0 \n\t"\
1753 \
1754 HSUM(%%mm0, %%mm1, %0)\
1755 \
1756 : "=r" (sum)\
1757 : "r"(temp)\
1758 );\
1759 return sum&0xFFFF;\
1760 }\
1761 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1762
1763 #define HADAMARD8_DIFF_SSE2(cpu) \
1764 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1765 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1766 int sum;\
1767 \
1768 assert(h==8);\
1769 \
1770 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1771 \
1772 asm volatile(\
1773 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1774 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1775 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1776 MMABS_SUM_8x8\
1777 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1778 : "=r" (sum)\
1779 : "r"(temp)\
1780 );\
1781 return sum&0xFFFF;\
1782 }\
1783 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1784
1785 #define MMABS(a,z) MMABS_MMX(a,z)
1786 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1787 HADAMARD8_DIFF_MMX(mmx)
1788 #undef MMABS
1789 #undef HSUM
1790
1791 #define MMABS(a,z) MMABS_MMX2(a,z)
1792 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1793 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1794 HADAMARD8_DIFF_MMX(mmx2)
1795 HADAMARD8_DIFF_SSE2(sse2)
1796 #undef MMABS
1797 #undef MMABS_SUM_8x8
1798 #undef HSUM
1799
1800 #ifdef HAVE_SSSE3
1801 #define MMABS(a,z) MMABS_SSSE3(a,z)
1802 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1803 HADAMARD8_DIFF_SSE2(ssse3)
1804 #undef MMABS
1805 #undef MMABS_SUM_8x8
1806 #endif
1807
1808 #define DCT_SAD4(m,mm,o)\
1809 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1810 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1811 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1812 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1813 MMABS_SUM(mm##2, mm##6, mm##0)\
1814 MMABS_SUM(mm##3, mm##7, mm##1)\
1815 MMABS_SUM(mm##4, mm##6, mm##0)\
1816 MMABS_SUM(mm##5, mm##7, mm##1)\
1817
1818 #define DCT_SAD_MMX\
1819 "pxor %%mm0, %%mm0 \n\t"\
1820 "pxor %%mm1, %%mm1 \n\t"\
1821 DCT_SAD4(q, %%mm, 0)\
1822 DCT_SAD4(q, %%mm, 8)\
1823 DCT_SAD4(q, %%mm, 64)\
1824 DCT_SAD4(q, %%mm, 72)\
1825 "paddusw %%mm1, %%mm0 \n\t"\
1826 HSUM(%%mm0, %%mm1, %0)
1827
1828 #define DCT_SAD_SSE2\
1829 "pxor %%xmm0, %%xmm0 \n\t"\
1830 "pxor %%xmm1, %%xmm1 \n\t"\
1831 DCT_SAD4(dqa, %%xmm, 0)\
1832 DCT_SAD4(dqa, %%xmm, 64)\
1833 "paddusw %%xmm1, %%xmm0 \n\t"\
1834 HSUM(%%xmm0, %%xmm1, %0)
1835
1836 #define DCT_SAD_FUNC(cpu) \
1837 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1838 int sum;\
1839 asm volatile(\
1840 DCT_SAD\
1841 :"=r"(sum)\
1842 :"r"(block)\
1843 );\
1844 return sum&0xFFFF;\
1845 }
1846
1847 #define DCT_SAD DCT_SAD_MMX
1848 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1849 #define MMABS(a,z) MMABS_MMX(a,z)
1850 DCT_SAD_FUNC(mmx)
1851 #undef MMABS
1852 #undef HSUM
1853
1854 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1855 #define MMABS(a,z) MMABS_MMX2(a,z)
1856 DCT_SAD_FUNC(mmx2)
1857 #undef HSUM
1858 #undef DCT_SAD
1859
1860 #define DCT_SAD DCT_SAD_SSE2
1861 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1862 DCT_SAD_FUNC(sse2)
1863 #undef MMABS
1864
1865 #ifdef HAVE_SSSE3
1866 #define MMABS(a,z) MMABS_SSSE3(a,z)
1867 DCT_SAD_FUNC(ssse3)
1868 #undef MMABS
1869 #endif
1870 #undef HSUM
1871 #undef DCT_SAD
1872
1873 static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
1874 int sum;
1875 long i=size;
1876 asm volatile(
1877 "pxor %%mm4, %%mm4 \n"
1878 "1: \n"
1879 "sub $8, %0 \n"
1880 "movq (%2,%0), %%mm2 \n"
1881 "movq (%3,%0,2), %%mm0 \n"
1882 "movq 8(%3,%0,2), %%mm1 \n"
1883 "punpckhbw %%mm2, %%mm3 \n"
1884 "punpcklbw %%mm2, %%mm2 \n"
1885 "psraw $8, %%mm3 \n"
1886 "psraw $8, %%mm2 \n"
1887 "psubw %%mm3, %%mm1 \n"
1888 "psubw %%mm2, %%mm0 \n"
1889 "pmaddwd %%mm1, %%mm1 \n"
1890 "pmaddwd %%mm0, %%mm0 \n"
1891 "paddd %%mm1, %%mm4 \n"
1892 "paddd %%mm0, %%mm4 \n"
1893 "jg 1b \n"
1894 "movq %%mm4, %%mm3 \n"
1895 "psrlq $32, %%mm3 \n"
1896 "paddd %%mm3, %%mm4 \n"
1897 "movd %%mm4, %1 \n"
1898 :"+r"(i), "=r"(sum)
1899 :"r"(pix1), "r"(pix2)
1900 );
1901 return sum;
1902 }
1903
1904 #endif //CONFIG_ENCODERS
1905
1906 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1907 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1908
1909 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1910 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1911 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1912 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1913 "movq "#in7", " #m3 " \n\t" /* d */\
1914 "movq "#in0", %%mm5 \n\t" /* D */\
1915 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1916 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1917 "movq "#in1", %%mm5 \n\t" /* C */\
1918 "movq "#in2", %%mm6 \n\t" /* B */\
1919 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1920 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1921 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1922 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1923 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1924 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1925 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1926 "psraw $5, %%mm5 \n\t"\
1927 "packuswb %%mm5, %%mm5 \n\t"\
1928 OP(%%mm5, out, %%mm7, d)
1929
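/* What QPEL_V_LOW above (and the horizontal qpel code that follows) compute
 * per output sample, as plain C (sketch only): the MPEG-4 quarter-pel 8-tap
 * half-sample filter (-1, 3, -6, 20, 20, -6, 3, -1)/32, i.e.
 *   (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5
 * with x1..x4 the symmetric tap sums named in the comments and a rounding
 * constant (16 for the rounding variants) added before the shift. */
#if 0
static inline uint8_t qpel_lowpass_sample_ref(const uint8_t *s, int rounder)
{
    int v = 20*(s[0] + s[1]) - 6*(s[-1] + s[2]) + 3*(s[-2] + s[3]) - (s[-3] + s[4]);
    v = (v + rounder) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : v;   /* clip to 8 bits (packuswb) */
}
#endif
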
1930 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1931 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1932 uint64_t temp;\
1933 \
1934 asm volatile(\
1935 "pxor %%mm7, %%mm7 \n\t"\
1936 "1: \n\t"\
1937 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1938 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1939 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1940 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1941 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1942 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1943 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1944 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1945 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1946 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1947 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1948 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1949 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1950 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1951 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1952 "paddw %%mm3, %%mm5 \n\t" /* b */\
1953 "paddw %%mm2, %%mm6 \n\t" /* c */\
1954 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1955 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1956 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1957 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1958 "paddw %%mm4, %%mm0 \n\t" /* a */\
1959 "paddw %%mm1, %%mm5 \n\t" /* d */\
1960 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1961 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1962 "paddw %6, %%mm6 \n\t"\
1963 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1964 "psraw $5, %%mm0 \n\t"\
1965 "movq %%mm0, %5 \n\t"\
1966 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1967 \
1968 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1969 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1970 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1971 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1972 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1973 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1974 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1975 "paddw %%mm0, %%mm2 \n\t" /* b */\
1976 "paddw %%mm5, %%mm3 \n\t" /* c */\
1977 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1978 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1979 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1980 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1981 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1982 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1983 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1984 "paddw %%mm2, %%mm1 \n\t" /* a */\
1985 "paddw %%mm6, %%mm4 \n\t" /* d */\
1986 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1987 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1988 "paddw %6, %%mm1 \n\t"\
1989 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1990 "psraw $5, %%mm3 \n\t"\
1991 "movq %5, %%mm1 \n\t"\
1992 "packuswb %%mm3, %%mm1 \n\t"\
1993 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1994 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1995 \
1996 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1997 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1998 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1999 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2000 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2001 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2002 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2003 "paddw %%mm1, %%mm5 \n\t" /* b */\
2004 "paddw %%mm4, %%mm0 \n\t" /* c */\
2005 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2006 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2007 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2008 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2009 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2010 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2011 "paddw %%mm3, %%mm2 \n\t" /* d */\
2012 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2013 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2014 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2015 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2016 "paddw %%mm2, %%mm6 \n\t" /* a */\
2017 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2018 "paddw %6, %%mm0 \n\t"\
2019 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2020 "psraw $5, %%mm0 \n\t"\
2021 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2022 \
2023 "paddw %%mm5, %%mm3 \n\t" /* a */\
2024 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2025 "paddw %%mm4, %%mm6 \n\t" /* b */\
2026 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2027 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2028 "paddw %%mm1, %%mm4 \n\t" /* c */\
2029 "paddw %%mm2, %%mm5 \n\t" /* d */\
2030 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2031 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2032 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2033 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2034 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2035 "paddw %6, %%mm4 \n\t"\
2036 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2037 "psraw $5, %%mm4 \n\t"\
2038 "packuswb %%mm4, %%mm0 \n\t"\
2039 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2040 \
2041 "add %3, %0 \n\t"\
2042 "add %4, %1 \n\t"\
2043 "decl %2 \n\t"\
2044 " jnz 1b \n\t"\
2045 : "+a"(src), "+c"(dst), "+m"(h)\
2046 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2047 : "memory"\
2048 );\
2049 }\
2050 \
2051 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2052 int i;\
2053 int16_t temp[16];\
2054 /* quick HACK, XXX FIXME MUST be optimized */\
2055 for(i=0; i<h; i++)\
2056 {\
2057 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2058 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2059 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2060 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2061 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2062 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2063 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2064 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2065 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2066 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2067 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2068 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2069 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2070 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2071 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2072 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2073 asm volatile(\
2074 "movq (%0), %%mm0 \n\t"\
2075 "movq 8(%0), %%mm1 \n\t"\
2076 "paddw %2, %%mm0 \n\t"\
2077 "paddw %2, %%mm1 \n\t"\
2078 "psraw $5, %%mm0 \n\t"\
2079 "psraw $5, %%mm1 \n\t"\
2080 "packuswb %%mm1, %%mm0 \n\t"\
2081 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2082 "movq 16(%0), %%mm0 \n\t"\
2083 "movq 24(%0), %%mm1 \n\t"\
2084 "paddw %2, %%mm0 \n\t"\
2085 "paddw %2, %%mm1 \n\t"\
2086 "psraw $5, %%mm0 \n\t"\
2087 "psraw $5, %%mm1 \n\t"\
2088 "packuswb %%mm1, %%mm0 \n\t"\
2089 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2090 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2091 : "memory"\
2092 );\
2093 dst+=dstStride;\
2094 src+=srcStride;\
2095 }\
2096 }\
2097 \
2098 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2099 uint64_t temp;\
2100 \
2101 asm volatile(\
2102 "pxor %%mm7, %%mm7 \n\t"\
2103 "1: \n\t"\
2104 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2105 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2106 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2107 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2108 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2109 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2110 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2111 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2112 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2113 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2114 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2115 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2116 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2117 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2118 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2119 "paddw %%mm3, %%mm5 \n\t" /* b */\
2120 "paddw %%mm2, %%mm6 \n\t" /* c */\
2121 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2122 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2123 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2124 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2125 "paddw %%mm4, %%mm0 \n\t" /* a */\
2126 "paddw %%mm1, %%mm5 \n\t" /* d */\
2127 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2128 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2129 "paddw %6, %%mm6 \n\t"\
2130 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2131 "psraw $5, %%mm0 \n\t"\
2132 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2133 \
2134 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2135 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2136 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2137 "paddw %%mm5, %%mm1 \n\t" /* a */\
2138 "paddw %%mm6, %%mm2 \n\t" /* b */\
2139 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2140 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2141 "paddw %%mm6, %%mm3 \n\t" /* c */\
2142 "paddw %%mm5, %%mm4 \n\t" /* d */\
2143 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2144 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2145 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2146 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2147 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2148 "paddw %6, %%mm1 \n\t"\
2149 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2150 "psraw $5, %%mm3 \n\t"\
2151 "packuswb %%mm3, %%mm0 \n\t"\
2152 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2153 \
2154 "add %3, %0 \n\t"\
2155 "add %4, %1 \n\t"\
2156 "decl %2 \n\t"\
2157 " jnz 1b \n\t"\
2158 : "+a"(src), "+c"(dst), "+m"(h)\
2159 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2160 : "memory"\
2161 );\
2162 }\
2163 \
2164 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2165 int i;\
2166 int16_t temp[8];\
2167 /* quick HACK, XXX FIXME MUST be optimized */\
2168 for(i=0; i<h; i++)\
2169 {\
2170 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2171 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2172 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2173 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2174 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2175 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2176 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2177 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2178 asm volatile(\
2179 "movq (%0), %%mm0 \n\t"\
2180 "movq 8(%0), %%mm1 \n\t"\
2181 "paddw %2, %%mm0 \n\t"\
2182 "paddw %2, %%mm1 \n\t"\
2183 "psraw $5, %%mm0 \n\t"\
2184 "psraw $5, %%mm1 \n\t"\
2185 "packuswb %%mm1, %%mm0 \n\t"\
2186 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2187 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2188 :"memory"\
2189 );\
2190 dst+=dstStride;\
2191 src+=srcStride;\
2192 }\
2193 }
2194
2195 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2196 \
2197 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2198 uint64_t temp[17*4];\
2199 uint64_t *temp_ptr= temp;\
2200 int count= 17;\
2201 \
2202 /*FIXME unroll */\
2203 asm volatile(\
2204 "pxor %%mm7, %%mm7 \n\t"\
2205 "1: \n\t"\
2206 "movq (%0), %%mm0 \n\t"\
2207 "movq (%0), %%mm1 \n\t"\
2208 "movq 8(%0), %%mm2 \n\t"\
2209 "movq 8(%0), %%mm3 \n\t"\
2210 "punpcklbw %%mm7, %%mm0 \n\t"\
2211 "punpckhbw %%mm7, %%mm1 \n\t"\
2212 "punpcklbw %%mm7, %%mm2 \n\t"\
2213 "punpckhbw %%mm7, %%mm3 \n\t"\
2214 "movq %%mm0, (%1) \n\t"\
2215 "movq %%mm1, 17*8(%1) \n\t"\
2216 "movq %%mm2, 2*17*8(%1) \n\t"\
2217 "movq %%mm3, 3*17*8(%1) \n\t"\
2218 "add $8, %1 \n\t"\
2219 "add %3, %0 \n\t"\
2220 "decl %2 \n\t"\
2221 " jnz 1b \n\t"\
2222 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2223 : "r" ((long)srcStride)\
2224 : "memory"\
2225 );\
2226 \
2227 temp_ptr= temp;\
2228 count=4;\
2229 \
2230 /*FIXME reorder for speed */\
2231 asm volatile(\
2232 /*"pxor %%mm7, %%mm7 \n\t"*/\
2233 "1: \n\t"\
2234 "movq (%0), %%mm0 \n\t"\
2235 "movq 8(%0), %%mm1 \n\t"\
2236 "movq 16(%0), %%mm2 \n\t"\
2237 "movq 24(%0), %%mm3 \n\t"\
2238 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2239 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2240 "add %4, %1 \n\t"\
2241 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2242 \
2243 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2244 "add %4, %1 \n\t"\
2245 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2246 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2247 "add %4, %1 \n\t"\
2248 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2249 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2250 "add %4, %1 \n\t"\
2251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2253 "add %4, %1 \n\t"\
2254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2256 "add %4, %1 \n\t"\
2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2258 \
2259 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2260 "add %4, %1 \n\t" \
2261 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2262 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2263 \
2264 "add $136, %0 \n\t"\
2265 "add %6, %1 \n\t"\
2266 "decl %2 \n\t"\
2267 " jnz 1b \n\t"\
2268 \
2269 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2270 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2271 :"memory"\
2272 );\
2273 }\
2274 \
2275 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2276 uint64_t temp[9*2];\
2277 uint64_t *temp_ptr= temp;\
2278 int count= 9;\
2279 \
2280 /*FIXME unroll */\
2281 asm volatile(\
2282 "pxor %%mm7, %%mm7 \n\t"\
2283 "1: \n\t"\
2284 "movq (%0), %%mm0 \n\t"\
2285 "movq (%0), %%mm1 \n\t"\
2286 "punpcklbw %%mm7, %%mm0 \n\t"\
2287 "punpckhbw %%mm7, %%mm1 \n\t"\
2288 "movq %%mm0, (%1) \n\t"\
2289 "movq %%mm1, 9*8(%1) \n\t"\
2290 "add $8, %1 \n\t"\
2291 "add %3, %0 \n\t"\
2292 "decl %2 \n\t"\
2293 " jnz 1b \n\t"\
2294 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2295 : "r" ((long)srcStride)\
2296 : "memory"\
2297 );\
2298 \
2299 temp_ptr= temp;\
2300 count=2;\
2301 \
2302 /*FIXME reorder for speed */\
2303 asm volatile(\
2304 /*"pxor %%mm7, %%mm7 \n\t"*/\
2305 "1: \n\t"\
2306 "movq (%0), %%mm0 \n\t"\
2307 "movq 8(%0), %%mm1 \n\t"\
2308 "movq 16(%0), %%mm2 \n\t"\
2309 "movq 24(%0), %%mm3 \n\t"\
2310 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2311 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2312 "add %4, %1 \n\t"\
2313 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2314 \
2315 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2316 "add %4, %1 \n\t"\
2317 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2318 \
2319 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2320 "add %4, %1 \n\t"\
2321 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2322 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2323 \
2324 "add $72, %0 \n\t"\
2325 "add %6, %1 \n\t"\
2326 "decl %2 \n\t"\
2327 " jnz 1b \n\t"\
2328 \
2329 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2330 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2331 : "memory"\
2332 );\
2333 }\
2334 \
2335 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2336 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2337 }\
2338 \
2339 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2340 uint64_t temp[8];\
2341 uint8_t * const half= (uint8_t*)temp;\
2342 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2343 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2344 }\
2345 \
2346 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2347 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2348 }\
2349 \
2350 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2351 uint64_t temp[8];\
2352 uint8_t * const half= (uint8_t*)temp;\
2353 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2354 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2355 }\
2356 \
2357 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2358 uint64_t temp[8];\
2359 uint8_t * const half= (uint8_t*)temp;\
2360 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2361 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2362 }\
2363 \
2364 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2365 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2366 }\
2367 \
2368 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2369 uint64_t temp[8];\
2370 uint8_t * const half= (uint8_t*)temp;\
2371 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2372 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2373 }\
2374 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2375 uint64_t half[8 + 9];\
2376 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2377 uint8_t * const halfHV= ((uint8_t*)half);\
2378 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2379 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2380 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2381 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2382 }\
2383 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2384 uint64_t half[8 + 9];\
2385 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2386 uint8_t * const halfHV= ((uint8_t*)half);\
2387 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2388 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2389 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2390 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2391 }\
2392 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2393 uint64_t half[8 + 9];\
2394 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2395 uint8_t * const halfHV= ((uint8_t*)half);\
2396 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2397 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2398 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2399 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2400 }\
2401 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2402 uint64_t half[8 + 9];\
2403 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2404 uint8_t * const halfHV= ((uint8_t*)half);\
2405 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2406 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2407 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2408 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2409 }\
2410 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2411 uint64_t half[8 + 9];\
2412 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2413 uint8_t * const halfHV= ((uint8_t*)half);\
2414 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2415 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2416 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2417 }\
2418 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2419 uint64_t half[8 + 9];\
2420 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2421 uint8_t * const halfHV= ((uint8_t*)half);\
2422 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2423 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2424 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2425 }\
2426 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2427 uint64_t half[8 + 9];\
2428 uint8_t * const halfH= ((uint8_t*)half);\
2429 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2430 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2431 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2432 }\
2433 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2434 uint64_t half[8 + 9];\
2435 uint8_t * const halfH= ((uint8_t*)half);\
2436 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2437 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2438 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2439 }\
2440 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2441 uint64_t half[9];\
2442 uint8_t * const halfH= ((uint8_t*)half);\
2443 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2445 }\
2446 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2447 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2448 }\
2449 \
2450 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2451 uint64_t temp[32];\
2452 uint8_t * const half= (uint8_t*)temp;\
2453 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2454 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2455 }\
2456 \
2457 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2458 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2459 }\
2460 \
2461 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2462 uint64_t temp[32];\
2463 uint8_t * const half= (uint8_t*)temp;\
2464 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2465 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2466 }\
2467 \
2468 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2469 uint64_t temp[32];\
2470 uint8_t * const half= (uint8_t*)temp;\
2471 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2472 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2473 }\
2474 \
2475 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2476 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2477 }\
2478 \
2479 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2480 uint64_t temp[32];\
2481 uint8_t * const half= (uint8_t*)temp;\
2482 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2483 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2484 }\
2485 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2486 uint64_t half[16*2 + 17*2];\
2487 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2488 uint8_t * const halfHV= ((uint8_t*)half);\
2489 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2490 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2491 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2492 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2493 }\
2494 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2495 uint64_t half[16*2 + 17*2];\
2496 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2497 uint8_t * const halfHV= ((uint8_t*)half);\
2498 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2499 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2500 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2501 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2502 }\
2503 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2504 uint64_t half[16*2 + 17*2];\
2505 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2506 uint8_t * const halfHV= ((uint8_t*)half);\
2507 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2508 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2509 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2510 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2511 }\
2512 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2513 uint64_t half[16*2 + 17*2];\
2514 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2515 uint8_t * const halfHV= ((uint8_t*)half);\
2516 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2517 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2518 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2519 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2520 }\
2521 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2522 uint64_t half[16*2 + 17*2];\
2523 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2524 uint8_t * const halfHV= ((uint8_t*)half);\
2525 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2526 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2527 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2528 }\
2529 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2530 uint64_t half[16*2 + 17*2];\
2531 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2532 uint8_t * const halfHV= ((uint8_t*)half);\
2533 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2534 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2535 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2536 }\
2537 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2538 uint64_t half[17*2];\
2539 uint8_t * const halfH= ((uint8_t*)half);\
2540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2541 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2542 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2543 }\
2544 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2545 uint64_t half[17*2];\
2546 uint8_t * const halfH= ((uint8_t*)half);\
2547 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2548 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2549 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2550 }\
2551 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2552 uint64_t half[17*2];\
2553 uint8_t * const halfH= ((uint8_t*)half);\
2554 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2556 }
2557
2558 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
2559 #define AVG_3DNOW_OP(a,b,temp, size) \
2560 "mov" #size " " #b ", " #temp " \n\t"\
2561 "pavgusb " #temp ", " #a " \n\t"\
2562 "mov" #size " " #a ", " #b " \n\t"
2563 #define AVG_MMX2_OP(a,b,temp, size) \
2564 "mov" #size " " #b ", " #temp " \n\t"\
2565 "pavgb " #temp ", " #a " \n\t"\
2566 "mov" #size " " #a ", " #b " \n\t"
2567
2568 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2569 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2570 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2571 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2572 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2573 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2574 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2575 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2576 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2577
2578 /***********************************/
2579 /* bilinear qpel: not compliant with any spec, only for use with -lavdopts fast */
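/* Each quarter-pel position is approximated here with a 2- or 3-point average
   of neighbouring pixels (the put/avg_pixels*_x2/_y2/_xy2 helpers and the
   *_l3 variants) instead of the full 8-tap MPEG-4 filter above, trading
   accuracy for speed. */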
2580
2581 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2582 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2583 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2584 }
2585 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2586 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2587 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2588 }
2589
2590 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2591 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2592 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2593 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2594 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2595 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2596 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2597 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2598 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2599 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2600 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2601 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2602 }\
2603 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2604 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2605 }\
2606 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2607 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2608 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2609 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2610 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2611 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2614
2615 QPEL_2TAP(put_, 16, mmx2)
2616 QPEL_2TAP(avg_, 16, mmx2)
2617 QPEL_2TAP(put_, 8, mmx2)
2618 QPEL_2TAP(avg_, 8, mmx2)
2619 QPEL_2TAP(put_, 16, 3dnow)
2620 QPEL_2TAP(avg_, 16, 3dnow)
2621 QPEL_2TAP(put_, 8, 3dnow)
2622 QPEL_2TAP(avg_, 8, 3dnow)
2623
2624
2625 #if 0
2626 static void just_return() { return; }
2627 #endif
2628
2629 #define SET_QPEL_FUNC(postfix1, postfix2) \
2630 c->put_ ## postfix1 = put_ ## postfix2;\
2631 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2632 c->avg_ ## postfix1 = avg_ ## postfix2;
2633
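/* Global motion compensation for one 8x8 block using an affine vector field
 * (ox,oy,dxx,dxy,dyx,dyy in fixed point). Each output pixel is the bilinear
 * blend  src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy) + src[0,1]*(s-dx)*dy +
 * src[1,1]*dx*dy  (rounded by r and shifted), with s = 1<<shift. Blocks with
 * a non-constant fullpel offset or with more than 16 bits of subpel motion
 * fall back to ff_gmc_c(). */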
2634 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2635 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2636 const int w = 8;
2637 const int ix = ox>>(16+shift);
2638 const int iy = oy>>(16+shift);
2639 const int oxs = ox>>4;
2640 const int oys = oy>>4;
2641 const int dxxs = dxx>>4;
2642 const int dxys = dxy>>4;
2643 const int dyxs = dyx>>4;
2644 const int dyys = dyy>>4;
2645 const uint16_t r4[4] = {r,r,r,r};
2646 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2647 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2648 const uint64_t shift2 = 2*shift;
2649 uint8_t edge_buf[(h+1)*stride];
2650 int x, y;
2651
2652 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2653 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2654 const int dxh = dxy*(h-1);
2655 const int dyw = dyx*(w-1);
2656 if( // non-constant fullpel offset (3% of blocks)
2657 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2658 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2659 // uses more than 16 bits of subpel mv (only at huge resolution)
2660 || (dxx|dxy|dyx|dyy)&15 )
2661 {
2662 //FIXME could still use mmx for some of the rows
2663 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2664 return;
2665 }
2666
2667 src += ix + iy*stride;
2668 if( (unsigned)ix >= width-w ||
2669 (unsigned)iy >= height-h )
2670 {
2671 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2672 src = edge_buf;
2673 }
2674
2675 asm volatile(
2676 "movd %0, %%mm6 \n\t"
2677 "pxor %%mm7, %%mm7 \n\t"
2678 "punpcklwd %%mm6, %%mm6 \n\t"
2679 "punpcklwd %%mm6, %%mm6 \n\t"
2680 :: "r"(1<<shift)
2681 );
2682
2683 for(x=0; x<w; x+=4){
2684 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2685 oxs - dxys + dxxs*(x+1),
2686 oxs - dxys + dxxs*(x+2),
2687 oxs - dxys + dxxs*(x+3) };
2688 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2689 oys - dyys + dyxs*(x+1),
2690 oys - dyys + dyxs*(x+2),
2691 oys - dyys + dyxs*(x+3) };
2692
2693 for(y=0; y<h; y++){
2694 asm volatile(
2695 "movq %0, %%mm4 \n\t"
2696 "movq %1, %%mm5 \n\t"
2697 "paddw %2, %%mm4 \n\t"
2698 "paddw %3, %%mm5 \n\t"
2699 "movq %%mm4, %0 \n\t"
2700 "movq %%mm5, %1 \n\t"
2701 "psrlw $12, %%mm4 \n\t"
2702 "psrlw $12, %%mm5 \n\t"
2703 : "+m"(*dx4), "+m"(*dy4)
2704 : "m"(*dxy4), "m"(*dyy4)
2705 );
2706
2707 asm volatile(
2708 "movq %%mm6, %%mm2 \n\t"
2709 "movq %%mm6, %%mm1 \n\t"
2710 "psubw %%mm4, %%mm2 \n\t"
2711 "psubw %%mm5, %%mm1 \n\t"
2712 "movq %%mm2, %%mm0 \n\t"
2713 "movq %%mm4, %%mm3 \n\t"
2714 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2715 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2716 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2717 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2718
2719 "movd %4, %%mm5 \n\t"
2720 "movd %3, %%mm4 \n\t"
2721 "punpcklbw %%mm7, %%mm5 \n\t"
2722 "punpcklbw %%mm7, %%mm4 \n\t"
2723 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2724 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2725
2726 "movd %2, %%mm5 \n\t"
2727 "movd %1, %%mm4 \n\t"
2728 "punpcklbw %%mm7, %%mm5 \n\t"
2729 "punpcklbw %%mm7, %%mm4 \n\t"
2730 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2731 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2732 "paddw %5, %%mm1 \n\t"
2733 "paddw %%mm3, %%mm2 \n\t"
2734 "paddw %%mm1, %%mm0 \n\t"
2735 "paddw %%mm2, %%mm0 \n\t"
2736
2737 "psrlw %6, %%mm0 \n\t"
2738 "packuswb %%mm0, %%mm0 \n\t"
2739 "movd %%mm0, %0 \n\t"
2740
2741 : "=m"(dst[x+y*stride])
2742 : "m"(src[0]), "m"(src[1]),
2743 "m"(src[stride]), "m"(src[stride+1]),
2744 "m"(*r4), "m"(shift2)
2745 );
2746 src += stride;
2747 }
2748 src += 4-h*stride;
2749 }
2750 }
2751
2752 #ifdef CONFIG_ENCODERS
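/* Encoder-only helpers for the MPEG-4 rate-distortion coefficient search:
 * try_8x8basis_mmx() returns a weighted squared-error score for adding
 * 'scale' times a basis function to the current residual, and
 * add_8x8basis_mmx() actually adds it to rem[] (the scalar loop below is the
 * fallback when |scale| is too large for the 16-bit fixed-point path); see
 * the corresponding C implementations for the exact reference behaviour. */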
2753 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2754 long i=0;
2755
2756 assert(FFABS(scale) < 256);
2757 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2758
2759 asm volatile(
2760 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2761 "psrlw $15, %%mm6 \n\t" // 1w
2762 "pxor %%mm7, %%mm7 \n\t"
2763 "movd %4, %%mm5 \n\t"
2764 "punpcklwd %%mm5, %%mm5 \n\t"
2765 "punpcklwd %%mm5, %%mm5 \n\t"
2766 "1: \n\t"
2767 "movq (%1, %0), %%mm0 \n\t"
2768 "movq 8(%1, %0), %%mm1 \n\t"
2769 "pmulhw %%mm5, %%mm0 \n\t"
2770 "pmulhw %%mm5, %%mm1 \n\t"
2771 "paddw %%mm6, %%mm0 \n\t"
2772 "paddw %%mm6, %%mm1 \n\t"
2773 "psraw $1, %%mm0 \n\t"
2774 "psraw $1, %%mm1 \n\t"
2775 "paddw (%2, %0), %%mm0 \n\t"
2776 "paddw 8(%2, %0), %%mm1 \n\t"
2777 "psraw $6, %%mm0 \n\t"
2778 "psraw $6, %%mm1 \n\t"
2779 "pmullw (%3, %0), %%mm0 \n\t"
2780 "pmullw 8(%3, %0), %%mm1 \n\t"
2781 "pmaddwd %%mm0, %%mm0 \n\t"
2782 "pmaddwd %%mm1, %%mm1 \n\t"
2783 "paddd %%mm1, %%mm0 \n\t"
2784 "psrld $4, %%mm0 \n\t"
2785 "paddd %%mm0, %%mm7 \n\t"
2786 "add $16, %0 \n\t"
2787 "cmp $128, %0 \n\t" //FIXME optimize & bench
2788 " jb 1b \n\t"
2789 "movq %%mm7, %%mm6 \n\t"
2790 "psrlq $32, %%mm7 \n\t"
2791 "paddd %%mm6, %%mm7 \n\t"
2792 "psrld $2, %%mm7 \n\t"
2793 "movd %%mm7, %0 \n\t"
2794
2795 : "+r" (i)
2796 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
2797 );
2798 return i;
2799 }
2800
2801 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
2802 long i=0;
2803
2804 if(FFABS(scale) < 256){
2805 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2806 asm volatile(
2807 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2808 "psrlw $15, %%mm6 \n\t" // 1w
2809 "movd %3, %%mm5 \n\t"
2810 "punpcklwd %%mm5, %%mm5 \n\t"
2811 "punpcklwd %%mm5, %%mm5 \n\t"
2812 "1: \n\t"
2813 "movq (%1, %0), %%mm0 \n\t"
2814 "movq 8(%1, %0), %%mm1 \n\t"
2815 "pmulhw %%mm5, %%mm0 \n\t"
2816 "pmulhw %%mm5, %%mm1 \n\t"
2817 "paddw %%mm6, %%mm0 \n\t"
2818 "paddw %%mm6, %%mm1 \n\t"
2819 "psraw $1, %%mm0 \n\t"
2820 "psraw $1, %%mm1 \n\t"
2821 "paddw (%2, %0), %%mm0 \n\t"
2822 "paddw 8(%2, %0), %%mm1 \n\t"
2823 "movq %%mm0, (%2, %0) \n\t"
2824 "movq %%mm1, 8(%2, %0) \n\t"
2825 "add $16, %0 \n\t"
2826 "cmp $128, %0 \n\t" //FIXME optimize & bench
2827 " jb 1b \n\t"
2828
2829 : "+r" (i)
2830 : "r"(basis), "r"(rem), "g"(scale)
2831 );
2832 }else{
2833 for(i=0; i<8*8; i++){
2834 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2835 }
2836 }
2837 }
2838 #endif /* CONFIG_ENCODERS */
2839
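/* Generate trivial prefetch helpers: one prefetch per row of the block,
 * using prefetcht0 for the mmx2 variant and the 3DNow! prefetch instruction
 * for the 3dnow variant. */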
2840 #define PREFETCH(name, op) \
2841 static void name(void *mem, int stride, int h){\
2842 const uint8_t *p= mem;\
2843 do{\
2844 asm volatile(#op" %0" :: "m"(*p));\
2845 p+= stride;\
2846 }while(--h);\
2847 }
2848 PREFETCH(prefetch_mmx2, prefetcht0)
2849 PREFETCH(prefetch_3dnow, prefetch)
2850 #undef PREFETCH
2851
2852 #include "h264dsp_mmx.c"
2853
2854 /* AVS specific */
2855 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2856
2857 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2858 put_pixels8_mmx(dst, src, stride, 8);
2859 }
2860 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2861 avg_pixels8_mmx(dst, src, stride, 8);
2862 }
2863 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2864 put_pixels16_mmx(dst, src, stride, 16);
2865 }
2866 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2867 avg_pixels16_mmx(dst, src, stride, 16);
2868 }
2869
2870 /* external functions, from idct_mmx.c */
2871 void ff_mmx_idct(DCTELEM *block);
2872 void ff_mmxext_idct(DCTELEM *block);
2873
2874 void ff_vp3_idct_sse2(int16_t *input_data);
2875 void ff_vp3_idct_mmx(int16_t *data);
2876 void ff_vp3_dsp_init_mmx(void);
2877
2878 /* XXX: these functions should be removed as soon as all IDCTs are
2879 converted */
2880 #ifdef CONFIG_GPL
2881 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2882 {
2883 ff_mmx_idct (block);
2884 put_pixels_clamped_mmx(block, dest, line_size);
2885 }
2886 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2887 {
2888 ff_mmx_idct (block);
2889 add_pixels_clamped_mmx(block, dest, line_size);
2890 }
2891 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2892 {
2893 ff_mmxext_idct (block);
2894 put_pixels_clamped_mmx(block, dest, line_size);
2895 }
2896 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2897 {
2898 ff_mmxext_idct (block);
2899 add_pixels_clamped_mmx(block, dest, line_size);
2900 }
2901 #endif
2902 static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
2903 {
2904 ff_vp3_idct_sse2(block);
2905 put_signed_pixels_clamped_mmx(block, dest, line_size);
2906 }
2907 static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
2908 {
2909 ff_vp3_idct_sse2(block);
2910 add_pixels_clamped_mmx(block, dest, line_size);
2911 }
2912 static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2913 {
2914 ff_vp3_idct_mmx(block);
2915 put_signed_pixels_clamped_mmx(block, dest, line_size);
2916 }
2917 static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2918 {
2919 ff_vp3_idct_mmx(block);
2920 add_pixels_clamped_mmx(block, dest, line_size);
2921 }
2922 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2923 {
2924 ff_idct_xvid_mmx (block);
2925 put_pixels_clamped_mmx(block, dest, line_size);
2926 }
2927 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2928 {
2929 ff_idct_xvid_mmx (block);
2930 add_pixels_clamped_mmx(block, dest, line_size);
2931 }
2932 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2933 {
2934 ff_idct_xvid_mmx2 (block);
2935 put_pixels_clamped_mmx(block, dest, line_size);
2936 }
2937 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2938 {
2939 ff_idct_xvid_mmx2 (block);
2940 add_pixels_clamped_mmx(block, dest, line_size);
2941 }
2942
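/* Branch-free SIMD implementations of the Vorbis magnitude/angle decoupling.
 * The scalar logic being vectorized is roughly the following (sketch only;
 * behaviour for values of exactly 0.0 may differ slightly):
 *
 *     if(mag[i] > 0.0){
 *         if(ang[i] > 0.0){ ang[i] = mag[i] - ang[i]; }
 *         else            { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
 *     }else{
 *         if(ang[i] > 0.0){ ang[i] += mag[i]; }
 *         else            { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
 *     }
 */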
2943 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2944 {
2945 int i;
2946 asm volatile("pxor %%mm7, %%mm7":);
2947 for(i=0; i<blocksize; i+=2) {
2948 asm volatile(
2949 "movq %0, %%mm0 \n\t"
2950 "movq %1, %%mm1 \n\t"
2951 "movq %%mm0, %%mm2 \n\t"
2952 "movq %%mm1, %%mm3 \n\t"
2953 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2954 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2955 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2956 "pxor %%mm2, %%mm1 \n\t"
2957 "movq %%mm3, %%mm4 \n\t"
2958 "pand %%mm1, %%mm3 \n\t"
2959 "pandn %%mm1, %%mm4 \n\t"
2960 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2961 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2962 "movq %%mm3, %1 \n\t"
2963 "movq %%mm0, %0 \n\t"
2964 :"+m"(mag[i]), "+m"(ang[i])
2965 ::"memory"
2966 );
2967 }
2968 asm volatile("femms");
2969 }
2970 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2971 {
2972 int i;
2973
2974 asm volatile(
2975 "movaps %0, %%xmm5 \n\t"
2976 ::"m"(ff_pdw_80000000[0])
2977 );
2978 for(i=0; i<blocksize; i+=4) {
2979 asm volatile(
2980 "movaps %0, %%xmm0 \n\t"
2981 "movaps %1, %%xmm1 \n\t"
2982 "xorps %%xmm2, %%xmm2 \n\t"
2983 "xorps %%xmm3, %%xmm3 \n\t"
2984 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2985 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2986 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2987 "xorps %%xmm2, %%xmm1 \n\t"
2988 "movaps %%xmm3, %%xmm4 \n\t"
2989 "andps %%xmm1, %%xmm3 \n\t"
2990 "andnps %%xmm1, %%xmm4 \n\t"
2991 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2992 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2993 "movaps %%xmm3, %1 \n\t"
2994 "movaps %%xmm0, %0 \n\t"
2995 :"+m"(mag[i]), "+m"(ang[i])
2996 ::"memory"
2997 );
2998 }
2999 }
3000
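/* dst[i] *= src[i] for i = 0..len-1, processed 4 (3DNow!) or 8 (SSE) floats
 * per iteration from the end of the arrays towards the start; len is assumed
 * to be a multiple of the unroll factor and, for the SSE version, the
 * pointers 16-byte aligned. */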
3001 static void vector_fmul_3dnow(float *dst, const float *src, int len){
3002 long i = (len-4)*4;
3003 asm volatile(
3004 "1: \n\t"
3005 "movq (%1,%0), %%mm0 \n\t"
3006 "movq 8(%1,%0), %%mm1 \n\t"
3007 "pfmul (%2,%0), %%mm0 \n\t"
3008 "pfmul 8(%2,%0), %%mm1 \n\t"
3009 "movq %%mm0, (%1,%0) \n\t"
3010 "movq %%mm1, 8(%1,%0) \n\t"
3011 "sub $16, %0 \n\t"
3012 "jge 1b \n\t"
3013 "femms \n\t"
3014 :"+r"(i)
3015 :"r"(dst), "r"(src)
3016 :"memory"
3017 );
3018 }
3019 static void vector_fmul_sse(float *dst, const float *src, int len){
3020 long i = (len-8)*4;
3021 asm volatile(
3022 "1: \n\t"
3023 "movaps (%1,%0), %%xmm0 \n\t"
3024 "movaps 16(%1,%0), %%xmm1 \n\t"
3025 "mulps (%2,%0), %%xmm0 \n\t"
3026 "mulps 16(%2,%0), %%xmm1 \n\t"
3027 "movaps %%xmm0, (%1,%0) \n\t"
3028 "movaps %%xmm1, 16(%1,%0) \n\t"
3029 "sub $32, %0 \n\t"
3030 "jge 1b \n\t"
3031 :"+r"(i)
3032 :"r"(dst), "r"(src)
3033 :"memory"
3034 );
3035 }
3036
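/* dst[i] = src0[i] * src1[len-1-i]: the second operand is traversed in
 * reverse, with pswapd (3DNow! ext) resp. shufps $0x1b (SSE) reversing the
 * element order within each vector. */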
3037 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
3038 long i = len*4-16;
3039 asm volatile(
3040 "1: \n\t"
3041 "pswapd 8(%1), %%mm0 \n\t"
3042 "pswapd (%1), %%mm1 \n\t"
3043 "pfmul (%3,%0), %%mm0 \n\t"
3044 "pfmul 8(%3,%0), %%mm1 \n\t"
3045 "movq %%mm0, (%2,%0) \n\t"
3046 "movq %%mm1, 8(%2,%0) \n\t"
3047 "add $16, %1 \n\t"
3048 "sub $16, %0 \n\t"
3049 "jge 1b \n\t"
3050 :"+r"(i), "+r"(src1)
3051 :"r"(dst), "r"(src0)
3052 );
3053 asm volatile("femms");
3054 }
3055 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3056 long i = len*4-32;
3057 asm volatile(
3058 "1: \n\t"
3059 "movaps 16(%1), %%xmm0 \n\t"
3060 "movaps (%1), %%xmm1 \n\t"
3061 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
3062 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3063 "mulps (%3,%0), %%xmm0 \n\t"
3064 "mulps 16(%3,%0), %%xmm1 \n\t"
3065 "movaps %%xmm0, (%2,%0) \n\t"
3066 "movaps %%xmm1, 16(%2,%0) \n\t"
3067 "add $32, %1 \n\t"
3068 "sub $32, %0 \n\t"
3069 "jge 1b \n\t"
3070 :"+r"(i), "+r"(src1)
3071 :"r"(dst), "r"(src0)
3072 );
3073 }
3074
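/* Multiply-accumulate dst[i*step] = src0[i]*src1[i] + src2[i]. Only the
 * common cases (step 1 or 2 with src3 == 0) are handled with SIMD; anything
 * else is passed straight to ff_vector_fmul_add_add_c(). */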
3075 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3076 const float *src2, int src3, int len, int step){
3077 long i = (len-4)*4;
3078 if(step == 2 && src3 == 0){
3079 dst += (len-4)*2;
3080 asm volatile(
3081 "1: \n\t"
3082 "movq (%2,%0), %%mm0 \n\t"
3083 "movq 8(%2,%0), %%mm1 \n\t"
3084 "pfmul (%3,%0), %%mm0 \n\t"
3085 "pfmul 8(%3,%0), %%mm1 \n\t"
3086 "pfadd (%4,%0), %%mm0 \n\t"
3087 "pfadd 8(%4,%0), %%mm1 \n\t"
3088 "movd %%mm0, (%1) \n\t"
3089 "movd %%mm1, 16(%1) \n\t"
3090 "psrlq $32, %%mm0 \n\t"
3091 "psrlq $32, %%mm1 \n\t"
3092 "movd %%mm0, 8(%1) \n\t"
3093 "movd %%mm1, 24(%1) \n\t"
3094 "sub $32, %1 \n\t"
3095 "sub $16, %0 \n\t"
3096 "jge 1b \n\t"
3097 :"+r"(i), "+r"(dst)
3098 :"r"(src0), "r"(src1), "r"(src2)
3099 :"memory"
3100 );
3101 }
3102 else if(step == 1 && src3 == 0){
3103 asm volatile(
3104 "1: \n\t"
3105 "movq (%2,%0), %%mm0 \n\t"
3106 "movq 8(%2,%0), %%mm1 \n\t"
3107 "pfmul (%3,%0), %%mm0 \n\t"
3108 "pfmul 8(%3,%0), %%mm1 \n\t"
3109 "pfadd (%4,%0), %%mm0 \n\t"
3110 "pfadd 8(%4,%0), %%mm1 \n\t"
3111 "movq %%mm0, (%1,%0) \n\t"
3112 "movq %%mm1, 8(%1,%0) \n\t"
3113 "sub $16, %0 \n\t"
3114 "jge 1b \n\t"
3115 :"+r"(i)
3116 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3117 :"memory"
3118 );
3119 }
3120 else
3121 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3122 asm volatile("femms");
3123 }
3124 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3125 const float *src2, int src3, int len, int step){
3126 long i = (len-8)*4;
3127 if(step == 2 && src3 == 0){
3128 dst += (len-8)*2;
3129 asm volatile(
3130 "1: \n\t"
3131 "movaps (%2,%0), %%xmm0 \n\t"
3132 "movaps 16(%2,%0), %%xmm1 \n\t"
3133 "mulps (%3,%0), %%xmm0 \n\t"
3134 "mulps 16(%3,%0), %%xmm1 \n\t"
3135 "addps (%4,%0), %%xmm0 \n\t"
3136 "addps 16(%4,%0), %%xmm1 \n\t"
3137 "movss %%xmm0, (%1) \n\t"
3138 "movss %%xmm1, 32(%1) \n\t"
3139 "movhlps %%xmm0, %%xmm2 \n\t"
3140 "movhlps %%xmm1, %%xmm3 \n\t"
3141 "movss %%xmm2, 16(%1) \n\t"
3142 "movss %%xmm3, 48(%1) \n\t"
3143 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3144 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3145 "movss %%xmm0, 8(%1) \n\t"
3146 "movss %%xmm1, 40(%1) \n\t"
3147 "movhlps %%xmm0, %%xmm2 \n\t"
3148 "movhlps %%xmm1, %%xmm3 \n\t"
3149 "movss %%xmm2, 24(%1) \n\t"
3150 "movss %%xmm3, 56(%1) \n\t"
3151 "sub $64, %1 \n\t"
3152 "sub $32, %0 \n\t"
3153 "jge 1b \n\t"
3154 :"+r"(i), "+r"(dst)
3155 :"r"(src0), "r"(src1), "r"(src2)
3156 :"memory"
3157 );
3158 }
3159 else if(step == 1 && src3 == 0){
3160 asm volatile(
3161 "1: \n\t"
3162 "movaps (%2,%0), %%xmm0 \n\t"
3163 "movaps 16(%2,%0), %%xmm1 \n\t"
3164 "mulps (%3,%0), %%xmm0 \n\t"
3165 "mulps 16(%3,%0), %%xmm1 \n\t"
3166 "addps (%4,%0), %%xmm0 \n\t"
3167 "addps 16(%4,%0), %%xmm1 \n\t"
3168 "movaps %%xmm0, (%1,%0) \n\t"
3169 "movaps %%xmm1, 16(%1,%0) \n\t"
3170 "sub $32, %0 \n\t"
3171 "jge 1b \n\t"
3172 :"+r"(i)
3173 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3174 :"memory"
3175 );
3176 }
3177 else
3178 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3179 }
3180
3181 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3182 // not bit-exact: pf2id uses different rounding than C and SSE
3183 int i;
3184 for(i=0; i<len; i+=4) {
3185 asm volatile(
3186 "pf2id %1, %%mm0 \n\t"
3187 "pf2id %2, %%mm1 \n\t"
3188 "packssdw %%mm1, %%mm0 \n\t"
3189 "movq %%mm0, %0 \n\t"
3190 :"=m"(dst[i])
3191 :"m"(src[i]), "m"(src[i+2])
3192 );
3193 }
3194 asm volatile("femms");
3195 }
3196 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3197 int i;
3198 for(i=0; i<len; i+=4) {
3199 asm volatile(
3200 "cvtps2pi %1, %%mm0 \n\t"
3201 "cvtps2pi %2, %%mm1 \n\t"
3202 "packssdw %%mm1, %%mm0 \n\t"
3203 "movq %%mm0, %0 \n\t"
3204 :"=m"(dst[i])
3205 :"m"(src[i]), "m"(src[i+2])
3206 );
3207 }
3208 asm volatile("emms");
3209 }
3210
3211 #ifdef CONFIG_SNOW_DECODER
3212 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
3213 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3214 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3215 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3216 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3217 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3218 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3219 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3220 #endif
3221
3222 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3223 {
3224 mm_flags = mm_support();
3225
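/* avctx->dsp_mask lets the application override the detected CPU
   capabilities: with FF_MM_FORCE set the given flags are forced on,
   otherwise they are masked out. */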
3226 if (avctx->dsp_mask) {
3227 if (avctx->dsp_mask & FF_MM_FORCE)
3228 mm_flags |= (avctx->dsp_mask & 0xffff);
3229 else
3230 mm_flags &= ~(avctx->dsp_mask & 0xffff);
3231 }
3232
3233 #if 0
3234 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3235 if (mm_flags & MM_MMX)
3236 av_log(avctx, AV_LOG_INFO, " mmx");
3237 if (mm_flags & MM_MMXEXT)
3238 av_log(avctx, AV_LOG_INFO, " mmxext");
3239 if (mm_flags & MM_3DNOW)
3240 av_log(avctx, AV_LOG_INFO, " 3dnow");
3241 if (mm_flags & MM_SSE)
3242 av_log(avctx, AV_LOG_INFO, " sse");
3243 if (mm_flags & MM_SSE2)
3244 av_log(avctx, AV_LOG_INFO, " sse2");
3245 av_log(avctx, AV_LOG_INFO, "\n");
3246 #endif
3247
3248 if (mm_flags & MM_MMX) {
3249 const int idct_algo= avctx->idct_algo;
3250
3251 #ifdef CONFIG_ENCODERS
3252 const int dct_algo = avctx->dct_algo;
3253 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3254 if(mm_flags & MM_SSE2){
3255 c->fdct = ff_fdct_sse2;
3256 }else if(mm_flags & MM_MMXEXT){
3257 c->fdct = ff_fdct_mmx2;
3258 }else{
3259 c->fdct = ff_fdct_mmx;
3260 }
3261 }
3262 #endif //CONFIG_ENCODERS
3263 if(avctx->lowres==0){
3264 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3265 c->idct_put= ff_simple_idct_put_mmx;
3266 c->idct_add= ff_simple_idct_add_mmx;
3267 c->idct = ff_simple_idct_mmx;
3268 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3269 #ifdef CONFIG_GPL
3270 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3271 if(mm_flags & MM_MMXEXT){
3272 c->idct_put= ff_libmpeg2mmx2_idct_put;
3273 c->idct_add= ff_libmpeg2mmx2_idct_add;
3274 c->idct = ff_mmxext_idct;
3275 }else{
3276 c->idct_put= ff_libmpeg2mmx_idct_put;
3277 c->idct_add= ff_libmpeg2mmx_idct_add;
3278 c->idct = ff_mmx_idct;
3279 }
3280 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3281 #endif
3282 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
3283 idct_algo==FF_IDCT_VP3 &&
3284 avctx->codec->id!=CODEC_ID_THEORA &&
3285 !(avctx->flags & CODEC_FLAG_BITEXACT)){
3286 if(mm_flags & MM_SSE2){
3287 c->idct_put= ff_vp3_idct_put_sse2;
3288 c->idct_add= ff_vp3_idct_add_sse2;
3289 c->idct = ff_vp3_idct_sse2;
3290 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3291 }else{
3292 ff_vp3_dsp_init_mmx();
3293 c->idct_put= ff_vp3_idct_put_mmx;
3294 c->idct_add= ff_vp3_idct_add_mmx;
3295 c->idct = ff_vp3_idct_mmx;
3296 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3297 }
3298 }else if(idct_algo==FF_IDCT_CAVS){
3299 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3300 }else if(idct_algo==FF_IDCT_XVIDMMX){
3301 if(mm_flags & MM_MMXEXT){
3302 c->idct_put= ff_idct_xvid_mmx2_put;
3303 c->idct_add= ff_idct_xvid_mmx2_add;
3304 c->idct = ff_idct_xvid_mmx2;
3305 }else{
3306 c->idct_put= ff_idct_xvid_mmx_put;
3307 c->idct_add= ff_idct_xvid_mmx_add;
3308 c->idct = ff_idct_xvid_mmx;
3309 }
3310 }
3311 }
3312
3313 #ifdef CONFIG_ENCODERS
3314 c->get_pixels = get_pixels_mmx;
3315 c->diff_pixels = diff_pixels_mmx;
3316 #endif //CONFIG_ENCODERS
3317 c->put_pixels_clamped = put_pixels_clamped_mmx;
3318 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3319 c->add_pixels_clamped = add_pixels_clamped_mmx;
3320 c->clear_blocks = clear_blocks_mmx;
3321 #ifdef CONFIG_ENCODERS
3322 c->pix_sum = pix_sum16_mmx;
3323 #endif //CONFIG_ENCODERS
3324
3325 c->put_pixels_tab[0][0] = put_pixels16_mmx;
3326 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
3327 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
3328 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
3329
3330 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
3331 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
3332 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
3333 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
3334
3335 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
3336 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
3337 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
3338 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
3339
3340 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
3341 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
3342 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
3343 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
3344
3345 c->put_pixels_tab[1][0] = put_pixels8_mmx;
3346 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
3347 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
3348 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
3349
3350 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
3351 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
3352 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
3353 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
3354
3355 c->avg_pixels_tab[1][