__asm __volatile -> asm volatile, improves code consistency and works
libavcodec/i386/dsputil_mmx.c
1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25 #include "dsputil.h"
26 #include "dsputil_mmx.h"
27 #include "simple_idct.h"
28 #include "mpegvideo.h"
29 #include "x86_cpu.h"
30 #include "mmx.h"
31 #include "vp3dsp_mmx.h"
32 #include "vp3dsp_sse2.h"
33 #include "h263.h"
34
35 //#undef NDEBUG
36 //#include <assert.h>
37
38 extern void ff_idct_xvid_mmx(short *block);
39 extern void ff_idct_xvid_mmx2(short *block);
40
41 int mm_flags; /* multimedia extension flags */
42
43 /* pixel operations */
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
46
47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
49
50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
52 DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
55 DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
62 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
63
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
70
71 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
73
74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
76
77 #define MOVQ_WONE(regd) \
78 asm volatile ( \
79 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
80 "psrlw $15, %%" #regd ::)
81
82 #define MOVQ_BFE(regd) \
83 asm volatile ( \
84 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
85 "paddb %%" #regd ", %%" #regd " \n\t" ::)
86
87 #ifndef PIC
88 #define MOVQ_BONE(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
89 #define MOVQ_WTWO(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
90 #else
91 // for shared libraries it is better to access constants this way
92 // pcmpeqd -> -1
93 #define MOVQ_BONE(regd) \
94 asm volatile ( \
95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
96 "psrlw $15, %%" #regd " \n\t" \
97 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
98
99 #define MOVQ_WTWO(regd) \
100 asm volatile ( \
101 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
102 "psrlw $15, %%" #regd " \n\t" \
103 "psllw $1, %%" #regd " \n\t"::)
104
105 #endif
106
107 // using regr as temporary and for the output result
108 // first argument is unmodified and the second is trashed
109 // regfe is supposed to contain 0xfefefefefefefefe
110 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
111 "movq " #rega ", " #regr " \n\t"\
112 "pand " #regb ", " #regr " \n\t"\
113 "pxor " #rega ", " #regb " \n\t"\
114 "pand " #regfe "," #regb " \n\t"\
115 "psrlq $1, " #regb " \n\t"\
116 "paddb " #regb ", " #regr " \n\t"
117
118 #define PAVGB_MMX(rega, regb, regr, regfe) \
119 "movq " #rega ", " #regr " \n\t"\
120 "por " #regb ", " #regr " \n\t"\
121 "pxor " #rega ", " #regb " \n\t"\
122 "pand " #regfe "," #regb " \n\t"\
123 "psrlq $1, " #regb " \n\t"\
124 "psubb " #regb ", " #regr " \n\t"
125
126 // mm6 is supposed to contain 0xfefefefefefefefe
127 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
128 "movq " #rega ", " #regr " \n\t"\
129 "movq " #regc ", " #regp " \n\t"\
130 "pand " #regb ", " #regr " \n\t"\
131 "pand " #regd ", " #regp " \n\t"\
132 "pxor " #rega ", " #regb " \n\t"\
133 "pxor " #regc ", " #regd " \n\t"\
134 "pand %%mm6, " #regb " \n\t"\
135 "pand %%mm6, " #regd " \n\t"\
136 "psrlq $1, " #regb " \n\t"\
137 "psrlq $1, " #regd " \n\t"\
138 "paddb " #regb ", " #regr " \n\t"\
139 "paddb " #regd ", " #regp " \n\t"
140
141 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
142 "movq " #rega ", " #regr " \n\t"\
143 "movq " #regc ", " #regp " \n\t"\
144 "por " #regb ", " #regr " \n\t"\
145 "por " #regd ", " #regp " \n\t"\
146 "pxor " #rega ", " #regb " \n\t"\
147 "pxor " #regc ", " #regd " \n\t"\
148 "pand %%mm6, " #regb " \n\t"\
149 "pand %%mm6, " #regd " \n\t"\
150 "psrlq $1, " #regd " \n\t"\
151 "psrlq $1, " #regb " \n\t"\
152 "psubb " #regb ", " #regr " \n\t"\
153 "psubb " #regd ", " #regp " \n\t"
154
155 /***********************************/
156 /* MMX no rounding */
157 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
158 #define SET_RND MOVQ_WONE
159 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
160 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
161
162 #include "dsputil_mmx_rnd.h"
163
164 #undef DEF
165 #undef SET_RND
166 #undef PAVGBP
167 #undef PAVGB
168 /***********************************/
169 /* MMX rounding */
170
171 #define DEF(x, y) x ## _ ## y ##_mmx
172 #define SET_RND MOVQ_WTWO
173 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
174 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
175
176 #include "dsputil_mmx_rnd.h"
177
178 #undef DEF
179 #undef SET_RND
180 #undef PAVGBP
181 #undef PAVGB
182
183 /***********************************/
184 /* 3Dnow specific */
185
186 #define DEF(x) x ## _3dnow
187 #define PAVGB "pavgusb"
188
189 #include "dsputil_mmx_avg.h"
190
191 #undef DEF
192 #undef PAVGB
193
194 /***********************************/
195 /* MMX2 specific */
196
197 #define DEF(x) x ## _mmx2
198
199 /* pavgb was introduced only with the MMX2 instruction set */
200 #define PAVGB "pavgb"
201
202 #include "dsputil_mmx_avg.h"
203
204 #undef DEF
205 #undef PAVGB
206
207 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
208 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
209 #define put_pixels16_mmx2 put_pixels16_mmx
210 #define put_pixels8_mmx2 put_pixels8_mmx
211 #define put_pixels4_mmx2 put_pixels4_mmx
212 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
213 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
214 #define put_pixels16_3dnow put_pixels16_mmx
215 #define put_pixels8_3dnow put_pixels8_mmx
216 #define put_pixels4_3dnow put_pixels4_mmx
217 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
218 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
219
220 /***********************************/
221 /* standard MMX */
222
223 #ifdef CONFIG_ENCODERS
224 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
225 {
226 asm volatile(
227 "mov $-128, %%"REG_a" \n\t"
228 "pxor %%mm7, %%mm7 \n\t"
229 ASMALIGN(4)
230 "1: \n\t"
231 "movq (%0), %%mm0 \n\t"
232 "movq (%0, %2), %%mm2 \n\t"
233 "movq %%mm0, %%mm1 \n\t"
234 "movq %%mm2, %%mm3 \n\t"
235 "punpcklbw %%mm7, %%mm0 \n\t"
236 "punpckhbw %%mm7, %%mm1 \n\t"
237 "punpcklbw %%mm7, %%mm2 \n\t"
238 "punpckhbw %%mm7, %%mm3 \n\t"
239 "movq %%mm0, (%1, %%"REG_a") \n\t"
240 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
241 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
242 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
243 "add %3, %0 \n\t"
244 "add $32, %%"REG_a" \n\t"
245 "js 1b \n\t"
246 : "+r" (pixels)
247 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
248 : "%"REG_a
249 );
250 }
251
252 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
253 {
254 asm volatile(
255 "pxor %%mm7, %%mm7 \n\t"
256 "mov $-128, %%"REG_a" \n\t"
257 ASMALIGN(4)
258 "1: \n\t"
259 "movq (%0), %%mm0 \n\t"
260 "movq (%1), %%mm2 \n\t"
261 "movq %%mm0, %%mm1 \n\t"
262 "movq %%mm2, %%mm3 \n\t"
263 "punpcklbw %%mm7, %%mm0 \n\t"
264 "punpckhbw %%mm7, %%mm1 \n\t"
265 "punpcklbw %%mm7, %%mm2 \n\t"
266 "punpckhbw %%mm7, %%mm3 \n\t"
267 "psubw %%mm2, %%mm0 \n\t"
268 "psubw %%mm3, %%mm1 \n\t"
269 "movq %%mm0, (%2, %%"REG_a") \n\t"
270 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
271 "add %3, %0 \n\t"
272 "add %3, %1 \n\t"
273 "add $16, %%"REG_a" \n\t"
274 "jnz 1b \n\t"
275 : "+r" (s1), "+r" (s2)
276 : "r" (block+64), "r" ((long)stride)
277 : "%"REG_a
278 );
279 }
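
/* For reference, a scalar sketch of the two routines above (illustrative
 * only, not referenced by the code): get_pixels widens an 8x8 block of
 * bytes into 16-bit DCTELEMs, diff_pixels stores the 8x8 difference of
 * two byte blocks. */
static void get_pixels_c_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[y*8 + x] = pixels[x];
        pixels += line_size;
    }
}

static void diff_pixels_c_sketch(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[y*8 + x] = s1[x] - s2[x];
        s1 += stride;
        s2 += stride;
    }
}
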
280 #endif //CONFIG_ENCODERS
281
282 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
283 {
284 const DCTELEM *p;
285 uint8_t *pix;
286
287 /* read the pixels */
288 p = block;
289 pix = pixels;
290 /* unrolled loop */
291 asm volatile(
292 "movq %3, %%mm0 \n\t"
293 "movq 8%3, %%mm1 \n\t"
294 "movq 16%3, %%mm2 \n\t"
295 "movq 24%3, %%mm3 \n\t"
296 "movq 32%3, %%mm4 \n\t"
297 "movq 40%3, %%mm5 \n\t"
298 "movq 48%3, %%mm6 \n\t"
299 "movq 56%3, %%mm7 \n\t"
300 "packuswb %%mm1, %%mm0 \n\t"
301 "packuswb %%mm3, %%mm2 \n\t"
302 "packuswb %%mm5, %%mm4 \n\t"
303 "packuswb %%mm7, %%mm6 \n\t"
304 "movq %%mm0, (%0) \n\t"
305 "movq %%mm2, (%0, %1) \n\t"
306 "movq %%mm4, (%0, %1, 2) \n\t"
307 "movq %%mm6, (%0, %2) \n\t"
308 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
309 :"memory");
310 pix += line_size*4;
311 p += 32;
312
313 // if this were an exact copy of the code above, the compiler
314 // would generate some very strange code,
315 // thus the "r" constraint is used here instead of "m"
316 asm volatile(
317 "movq (%3), %%mm0 \n\t"
318 "movq 8(%3), %%mm1 \n\t"
319 "movq 16(%3), %%mm2 \n\t"
320 "movq 24(%3), %%mm3 \n\t"
321 "movq 32(%3), %%mm4 \n\t"
322 "movq 40(%3), %%mm5 \n\t"
323 "movq 48(%3), %%mm6 \n\t"
324 "movq 56(%3), %%mm7 \n\t"
325 "packuswb %%mm1, %%mm0 \n\t"
326 "packuswb %%mm3, %%mm2 \n\t"
327 "packuswb %%mm5, %%mm4 \n\t"
328 "packuswb %%mm7, %%mm6 \n\t"
329 "movq %%mm0, (%0) \n\t"
330 "movq %%mm2, (%0, %1) \n\t"
331 "movq %%mm4, (%0, %1, 2) \n\t"
332 "movq %%mm6, (%0, %2) \n\t"
333 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
334 :"memory");
335 }
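
/* Scalar view of put_pixels_clamped_mmx (illustrative sketch, not
 * referenced by the code): packuswb performs the unsigned saturating
 * narrowing, so each DCTELEM is clipped to 0..255 before being stored. */
static void put_pixels_clamped_c_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = block[y*8 + x];
            pixels[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}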
336
337 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
338 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
339
340 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
341 {
342 int i;
343
344 movq_m2r(*vector128, mm1);
345 for (i = 0; i < 8; i++) {
346 movq_m2r(*(block), mm0);
347 packsswb_m2r(*(block + 4), mm0);
348 block += 8;
349 paddb_r2r(mm1, mm0);
350 movq_r2m(mm0, *pixels);
351 pixels += line_size;
352 }
353 }
354
355 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
356 {
357 const DCTELEM *p;
358 uint8_t *pix;
359 int i;
360
361 /* read the pixels */
362 p = block;
363 pix = pixels;
364 MOVQ_ZERO(mm7);
365 i = 4;
366 do {
367 asm volatile(
368 "movq (%2), %%mm0 \n\t"
369 "movq 8(%2), %%mm1 \n\t"
370 "movq 16(%2), %%mm2 \n\t"
371 "movq 24(%2), %%mm3 \n\t"
372 "movq %0, %%mm4 \n\t"
373 "movq %1, %%mm6 \n\t"
374 "movq %%mm4, %%mm5 \n\t"
375 "punpcklbw %%mm7, %%mm4 \n\t"
376 "punpckhbw %%mm7, %%mm5 \n\t"
377 "paddsw %%mm4, %%mm0 \n\t"
378 "paddsw %%mm5, %%mm1 \n\t"
379 "movq %%mm6, %%mm5 \n\t"
380 "punpcklbw %%mm7, %%mm6 \n\t"
381 "punpckhbw %%mm7, %%mm5 \n\t"
382 "paddsw %%mm6, %%mm2 \n\t"
383 "paddsw %%mm5, %%mm3 \n\t"
384 "packuswb %%mm1, %%mm0 \n\t"
385 "packuswb %%mm3, %%mm2 \n\t"
386 "movq %%mm0, %0 \n\t"
387 "movq %%mm2, %1 \n\t"
388 :"+m"(*pix), "+m"(*(pix+line_size))
389 :"r"(p)
390 :"memory");
391 pix += line_size*2;
392 p += 16;
393 } while (--i);
394 }
395
396 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
397 {
398 asm volatile(
399 "lea (%3, %3), %%"REG_a" \n\t"
400 ASMALIGN(3)
401 "1: \n\t"
402 "movd (%1), %%mm0 \n\t"
403 "movd (%1, %3), %%mm1 \n\t"
404 "movd %%mm0, (%2) \n\t"
405 "movd %%mm1, (%2, %3) \n\t"
406 "add %%"REG_a", %1 \n\t"
407 "add %%"REG_a", %2 \n\t"
408 "movd (%1), %%mm0 \n\t"
409 "movd (%1, %3), %%mm1 \n\t"
410 "movd %%mm0, (%2) \n\t"
411 "movd %%mm1, (%2, %3) \n\t"
412 "add %%"REG_a", %1 \n\t"
413 "add %%"REG_a", %2 \n\t"
414 "subl $4, %0 \n\t"
415 "jnz 1b \n\t"
416 : "+g"(h), "+r" (pixels), "+r" (block)
417 : "r"((long)line_size)
418 : "%"REG_a, "memory"
419 );
420 }
421
422 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
423 {
424 asm volatile(
425 "lea (%3, %3), %%"REG_a" \n\t"
426 ASMALIGN(3)
427 "1: \n\t"
428 "movq (%1), %%mm0 \n\t"
429 "movq (%1, %3), %%mm1 \n\t"
430 "movq %%mm0, (%2) \n\t"
431 "movq %%mm1, (%2, %3) \n\t"
432 "add %%"REG_a", %1 \n\t"
433 "add %%"REG_a", %2 \n\t"
434 "movq (%1), %%mm0 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq %%mm0, (%2) \n\t"
437 "movq %%mm1, (%2, %3) \n\t"
438 "add %%"REG_a", %1 \n\t"
439 "add %%"REG_a", %2 \n\t"
440 "subl $4, %0 \n\t"
441 "jnz 1b \n\t"
442 : "+g"(h), "+r" (pixels), "+r" (block)
443 : "r"((long)line_size)
444 : "%"REG_a, "memory"
445 );
446 }
447
448 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
449 {
450 asm volatile(
451 "lea (%3, %3), %%"REG_a" \n\t"
452 ASMALIGN(3)
453 "1: \n\t"
454 "movq (%1), %%mm0 \n\t"
455 "movq 8(%1), %%mm4 \n\t"
456 "movq (%1, %3), %%mm1 \n\t"
457 "movq 8(%1, %3), %%mm5 \n\t"
458 "movq %%mm0, (%2) \n\t"
459 "movq %%mm4, 8(%2) \n\t"
460 "movq %%mm1, (%2, %3) \n\t"
461 "movq %%mm5, 8(%2, %3) \n\t"
462 "add %%"REG_a", %1 \n\t"
463 "add %%"REG_a", %2 \n\t"
464 "movq (%1), %%mm0 \n\t"
465 "movq 8(%1), %%mm4 \n\t"
466 "movq (%1, %3), %%mm1 \n\t"
467 "movq 8(%1, %3), %%mm5 \n\t"
468 "movq %%mm0, (%2) \n\t"
469 "movq %%mm4, 8(%2) \n\t"
470 "movq %%mm1, (%2, %3) \n\t"
471 "movq %%mm5, 8(%2, %3) \n\t"
472 "add %%"REG_a", %1 \n\t"
473 "add %%"REG_a", %2 \n\t"
474 "subl $4, %0 \n\t"
475 "jnz 1b \n\t"
476 : "+g"(h), "+r" (pixels), "+r" (block)
477 : "r"((long)line_size)
478 : "%"REG_a, "memory"
479 );
480 }
481
482 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
483 {
484 asm volatile(
485 "1: \n\t"
486 "movdqu (%1), %%xmm0 \n\t"
487 "movdqu (%1,%3), %%xmm1 \n\t"
488 "movdqu (%1,%3,2), %%xmm2 \n\t"
489 "movdqu (%1,%4), %%xmm3 \n\t"
490 "movdqa %%xmm0, (%2) \n\t"
491 "movdqa %%xmm1, (%2,%3) \n\t"
492 "movdqa %%xmm2, (%2,%3,2) \n\t"
493 "movdqa %%xmm3, (%2,%4) \n\t"
494 "subl $4, %0 \n\t"
495 "lea (%1,%3,4), %1 \n\t"
496 "lea (%2,%3,4), %2 \n\t"
497 "jnz 1b \n\t"
498 : "+g"(h), "+r" (pixels), "+r" (block)
499 : "r"((long)line_size), "r"(3L*line_size)
500 : "memory"
501 );
502 }
503
504 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
505 {
506 asm volatile(
507 "1: \n\t"
508 "movdqu (%1), %%xmm0 \n\t"
509 "movdqu (%1,%3), %%xmm1 \n\t"
510 "movdqu (%1,%3,2), %%xmm2 \n\t"
511 "movdqu (%1,%4), %%xmm3 \n\t"
512 "pavgb (%2), %%xmm0 \n\t"
513 "pavgb (%2,%3), %%xmm1 \n\t"
514 "pavgb (%2,%3,2), %%xmm2 \n\t"
515 "pavgb (%2,%4), %%xmm3 \n\t"
516 "movdqa %%xmm0, (%2) \n\t"
517 "movdqa %%xmm1, (%2,%3) \n\t"
518 "movdqa %%xmm2, (%2,%3,2) \n\t"
519 "movdqa %%xmm3, (%2,%4) \n\t"
520 "subl $4, %0 \n\t"
521 "lea (%1,%3,4), %1 \n\t"
522 "lea (%2,%3,4), %2 \n\t"
523 "jnz 1b \n\t"
524 : "+g"(h), "+r" (pixels), "+r" (block)
525 : "r"((long)line_size), "r"(3L*line_size)
526 : "memory"
527 );
528 }
529
530 static void clear_blocks_mmx(DCTELEM *blocks)
531 {
532 asm volatile(
533 "pxor %%mm7, %%mm7 \n\t"
534 "mov $-128*6, %%"REG_a" \n\t"
535 "1: \n\t"
536 "movq %%mm7, (%0, %%"REG_a") \n\t"
537 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
538 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
539 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
540 "add $32, %%"REG_a" \n\t"
541 " js 1b \n\t"
542 : : "r" (((uint8_t *)blocks)+128*6)
543 : "%"REG_a
544 );
545 }
546
547 #ifdef CONFIG_ENCODERS
548 static int pix_sum16_mmx(uint8_t * pix, int line_size){
549 const int h=16;
550 int sum;
551 long index= -line_size*h;
552
553 asm volatile(
554 "pxor %%mm7, %%mm7 \n\t"
555 "pxor %%mm6, %%mm6 \n\t"
556 "1: \n\t"
557 "movq (%2, %1), %%mm0 \n\t"
558 "movq (%2, %1), %%mm1 \n\t"
559 "movq 8(%2, %1), %%mm2 \n\t"
560 "movq 8(%2, %1), %%mm3 \n\t"
561 "punpcklbw %%mm7, %%mm0 \n\t"
562 "punpckhbw %%mm7, %%mm1 \n\t"
563 "punpcklbw %%mm7, %%mm2 \n\t"
564 "punpckhbw %%mm7, %%mm3 \n\t"
565 "paddw %%mm0, %%mm1 \n\t"
566 "paddw %%mm2, %%mm3 \n\t"
567 "paddw %%mm1, %%mm3 \n\t"
568 "paddw %%mm3, %%mm6 \n\t"
569 "add %3, %1 \n\t"
570 " js 1b \n\t"
571 "movq %%mm6, %%mm5 \n\t"
572 "psrlq $32, %%mm6 \n\t"
573 "paddw %%mm5, %%mm6 \n\t"
574 "movq %%mm6, %%mm5 \n\t"
575 "psrlq $16, %%mm6 \n\t"
576 "paddw %%mm5, %%mm6 \n\t"
577 "movd %%mm6, %0 \n\t"
578 "andl $0xFFFF, %0 \n\t"
579 : "=&r" (sum), "+r" (index)
580 : "r" (pix - index), "r" ((long)line_size)
581 );
582
583 return sum;
584 }
585 #endif //CONFIG_ENCODERS
586
587 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
588 long i=0;
589 asm volatile(
590 "1: \n\t"
591 "movq (%1, %0), %%mm0 \n\t"
592 "movq (%2, %0), %%mm1 \n\t"
593 "paddb %%mm0, %%mm1 \n\t"
594 "movq %%mm1, (%2, %0) \n\t"
595 "movq 8(%1, %0), %%mm0 \n\t"
596 "movq 8(%2, %0), %%mm1 \n\t"
597 "paddb %%mm0, %%mm1 \n\t"
598 "movq %%mm1, 8(%2, %0) \n\t"
599 "add $16, %0 \n\t"
600 "cmp %3, %0 \n\t"
601 " jb 1b \n\t"
602 : "+r" (i)
603 : "r"(src), "r"(dst), "r"((long)w-15)
604 );
605 for(; i<w; i++)
606 dst[i+0] += src[i+0];
607 }
608
609 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
610 long i=0;
611 asm volatile(
612 "1: \n\t"
613 "movq (%2, %0), %%mm0 \n\t"
614 "movq 8(%2, %0), %%mm1 \n\t"
615 "paddb (%3, %0), %%mm0 \n\t"
616 "paddb 8(%3, %0), %%mm1 \n\t"
617 "movq %%mm0, (%1, %0) \n\t"
618 "movq %%mm1, 8(%1, %0) \n\t"
619 "add $16, %0 \n\t"
620 "cmp %4, %0 \n\t"
621 " jb 1b \n\t"
622 : "+r" (i)
623 : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
624 );
625 for(; i<w; i++)
626 dst[i] = src1[i] + src2[i];
627 }
628
629 #define H263_LOOP_FILTER \
630 "pxor %%mm7, %%mm7 \n\t"\
631 "movq %0, %%mm0 \n\t"\
632 "movq %0, %%mm1 \n\t"\
633 "movq %3, %%mm2 \n\t"\
634 "movq %3, %%mm3 \n\t"\
635 "punpcklbw %%mm7, %%mm0 \n\t"\
636 "punpckhbw %%mm7, %%mm1 \n\t"\
637 "punpcklbw %%mm7, %%mm2 \n\t"\
638 "punpckhbw %%mm7, %%mm3 \n\t"\
639 "psubw %%mm2, %%mm0 \n\t"\
640 "psubw %%mm3, %%mm1 \n\t"\
641 "movq %1, %%mm2 \n\t"\
642 "movq %1, %%mm3 \n\t"\
643 "movq %2, %%mm4 \n\t"\
644 "movq %2, %%mm5 \n\t"\
645 "punpcklbw %%mm7, %%mm2 \n\t"\
646 "punpckhbw %%mm7, %%mm3 \n\t"\
647 "punpcklbw %%mm7, %%mm4 \n\t"\
648 "punpckhbw %%mm7, %%mm5 \n\t"\
649 "psubw %%mm2, %%mm4 \n\t"\
650 "psubw %%mm3, %%mm5 \n\t"\
651 "psllw $2, %%mm4 \n\t"\
652 "psllw $2, %%mm5 \n\t"\
653 "paddw %%mm0, %%mm4 \n\t"\
654 "paddw %%mm1, %%mm5 \n\t"\
655 "pxor %%mm6, %%mm6 \n\t"\
656 "pcmpgtw %%mm4, %%mm6 \n\t"\
657 "pcmpgtw %%mm5, %%mm7 \n\t"\
658 "pxor %%mm6, %%mm4 \n\t"\
659 "pxor %%mm7, %%mm5 \n\t"\
660 "psubw %%mm6, %%mm4 \n\t"\
661 "psubw %%mm7, %%mm5 \n\t"\
662 "psrlw $3, %%mm4 \n\t"\
663 "psrlw $3, %%mm5 \n\t"\
664 "packuswb %%mm5, %%mm4 \n\t"\
665 "packsswb %%mm7, %%mm6 \n\t"\
666 "pxor %%mm7, %%mm7 \n\t"\
667 "movd %4, %%mm2 \n\t"\
668 "punpcklbw %%mm2, %%mm2 \n\t"\
669 "punpcklbw %%mm2, %%mm2 \n\t"\
670 "punpcklbw %%mm2, %%mm2 \n\t"\
671 "psubusb %%mm4, %%mm2 \n\t"\
672 "movq %%mm2, %%mm3 \n\t"\
673 "psubusb %%mm4, %%mm3 \n\t"\
674 "psubb %%mm3, %%mm2 \n\t"\
675 "movq %1, %%mm3 \n\t"\
676 "movq %2, %%mm4 \n\t"\
677 "pxor %%mm6, %%mm3 \n\t"\
678 "pxor %%mm6, %%mm4 \n\t"\
679 "paddusb %%mm2, %%mm3 \n\t"\
680 "psubusb %%mm2, %%mm4 \n\t"\
681 "pxor %%mm6, %%mm3 \n\t"\
682 "pxor %%mm6, %%mm4 \n\t"\
683 "paddusb %%mm2, %%mm2 \n\t"\
684 "packsswb %%mm1, %%mm0 \n\t"\
685 "pcmpgtb %%mm0, %%mm7 \n\t"\
686 "pxor %%mm7, %%mm0 \n\t"\
687 "psubb %%mm7, %%mm0 \n\t"\
688 "movq %%mm0, %%mm1 \n\t"\
689 "psubusb %%mm2, %%mm0 \n\t"\
690 "psubb %%mm0, %%mm1 \n\t"\
691 "pand %5, %%mm1 \n\t"\
692 "psrlw $2, %%mm1 \n\t"\
693 "pxor %%mm7, %%mm1 \n\t"\
694 "psubb %%mm7, %%mm1 \n\t"\
695 "movq %0, %%mm5 \n\t"\
696 "movq %3, %%mm6 \n\t"\
697 "psubb %%mm1, %%mm5 \n\t"\
698 "paddb %%mm1, %%mm6 \n\t"
699
700 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
701 if(ENABLE_ANY_H263) {
702 const int strength= ff_h263_loop_filter_strength[qscale];
703
704 asm volatile(
705
706 H263_LOOP_FILTER
707
708 "movq %%mm3, %1 \n\t"
709 "movq %%mm4, %2 \n\t"
710 "movq %%mm5, %0 \n\t"
711 "movq %%mm6, %3 \n\t"
712 : "+m" (*(uint64_t*)(src - 2*stride)),
713 "+m" (*(uint64_t*)(src - 1*stride)),
714 "+m" (*(uint64_t*)(src + 0*stride)),
715 "+m" (*(uint64_t*)(src + 1*stride))
716 : "g" (2*strength), "m"(ff_pb_FC)
717 );
718 }
719 }
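
/* A scalar sketch of what H263_LOOP_FILTER computes for one column of the
 * vertical edge filter, loosely following the generic C deblocking filter;
 * the clipping details here are an approximation and the MMX code works on
 * all eight columns in parallel (illustrative only, not referenced by the
 * code): */
static void h263_v_loop_filter_sketch(uint8_t *src, int stride, int strength)
{
    int x;
    for (x = 0; x < 8; x++) {
        int a = src[x - 2*stride];
        int b = src[x - 1*stride];
        int c = src[x + 0*stride];
        int d = src[x + 1*stride];
        int diff = (a - d + 4*(c - b)) / 8;
        int d1, d2, ad1;

        /* tent-shaped clipping of the correction against the filter strength */
        if      (diff < -2*strength) d1 = 0;
        else if (diff <   -strength) d1 = -2*strength - diff;
        else if (diff <    strength) d1 = diff;
        else if (diff <  2*strength) d1 = 2*strength - diff;
        else                         d1 = 0;

        b += d1;
        c -= d1;
        if (b & ~255) b = b < 0 ? 0 : 255;
        if (c & ~255) c = c < 0 ? 0 : 255;

        ad1 = (d1 < 0 ? -d1 : d1) >> 1;
        d2  = (a - d) / 4;
        if (d2 < -ad1) d2 = -ad1;
        if (d2 >  ad1) d2 =  ad1;

        src[x - 2*stride] = a - d2;
        src[x - 1*stride] = b;
        src[x + 0*stride] = c;
        src[x + 1*stride] = d + d2;
    }
}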
720
721 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
722 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
723 "movd %4, %%mm0 \n\t"
724 "movd %5, %%mm1 \n\t"
725 "movd %6, %%mm2 \n\t"
726 "movd %7, %%mm3 \n\t"
727 "punpcklbw %%mm1, %%mm0 \n\t"
728 "punpcklbw %%mm3, %%mm2 \n\t"
729 "movq %%mm0, %%mm1 \n\t"
730 "punpcklwd %%mm2, %%mm0 \n\t"
731 "punpckhwd %%mm2, %%mm1 \n\t"
732 "movd %%mm0, %0 \n\t"
733 "punpckhdq %%mm0, %%mm0 \n\t"
734 "movd %%mm0, %1 \n\t"
735 "movd %%mm1, %2 \n\t"
736 "punpckhdq %%mm1, %%mm1 \n\t"
737 "movd %%mm1, %3 \n\t"
738
739 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
740 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
741 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
742 "=m" (*(uint32_t*)(dst + 3*dst_stride))
743 : "m" (*(uint32_t*)(src + 0*src_stride)),
744 "m" (*(uint32_t*)(src + 1*src_stride)),
745 "m" (*(uint32_t*)(src + 2*src_stride)),
746 "m" (*(uint32_t*)(src + 3*src_stride))
747 );
748 }
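
/* Scalar meaning of transpose4x4 (illustrative sketch, not referenced by
 * the code): dst receives the 4x4 byte block of src with rows and columns
 * swapped. */
static void transpose4x4_c_sketch(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride)
{
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[y*dst_stride + x] = src[x*src_stride + y];
}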
749
750 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
751 if(ENABLE_ANY_H263) {
752 const int strength= ff_h263_loop_filter_strength[qscale];
753 DECLARE_ALIGNED(8, uint64_t, temp[4]);
754 uint8_t *btemp= (uint8_t*)temp;
755
756 src -= 2;
757
758 transpose4x4(btemp , src , 8, stride);
759 transpose4x4(btemp+4, src + 4*stride, 8, stride);
760 asm volatile(
761 H263_LOOP_FILTER // 5 3 4 6
762
763 : "+m" (temp[0]),
764 "+m" (temp[1]),
765 "+m" (temp[2]),
766 "+m" (temp[3])
767 : "g" (2*strength), "m"(ff_pb_FC)
768 );
769
770 asm volatile(
771 "movq %%mm5, %%mm1 \n\t"
772 "movq %%mm4, %%mm0 \n\t"
773 "punpcklbw %%mm3, %%mm5 \n\t"
774 "punpcklbw %%mm6, %%mm4 \n\t"
775 "punpckhbw %%mm3, %%mm1 \n\t"
776 "punpckhbw %%mm6, %%mm0 \n\t"
777 "movq %%mm5, %%mm3 \n\t"
778 "movq %%mm1, %%mm6 \n\t"
779 "punpcklwd %%mm4, %%mm5 \n\t"
780 "punpcklwd %%mm0, %%mm1 \n\t"
781 "punpckhwd %%mm4, %%mm3 \n\t"
782 "punpckhwd %%mm0, %%mm6 \n\t"
783 "movd %%mm5, (%0) \n\t"
784 "punpckhdq %%mm5, %%mm5 \n\t"
785 "movd %%mm5, (%0,%2) \n\t"
786 "movd %%mm3, (%0,%2,2) \n\t"
787 "punpckhdq %%mm3, %%mm3 \n\t"
788 "movd %%mm3, (%0,%3) \n\t"
789 "movd %%mm1, (%1) \n\t"
790 "punpckhdq %%mm1, %%mm1 \n\t"
791 "movd %%mm1, (%1,%2) \n\t"
792 "movd %%mm6, (%1,%2,2) \n\t"
793 "punpckhdq %%mm6, %%mm6 \n\t"
794 "movd %%mm6, (%1,%3) \n\t"
795 :: "r" (src),
796 "r" (src + 4*stride),
797 "r" ((long) stride ),
798 "r" ((long)(3*stride))
799 );
800 }
801 }
802
803 #ifdef CONFIG_ENCODERS
804 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
805 int tmp;
806 asm volatile (
807 "movl $16,%%ecx\n"
808 "pxor %%mm0,%%mm0\n"
809 "pxor %%mm7,%%mm7\n"
810 "1:\n"
811 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
812 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
813
814 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
815
816 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
817 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
818
819 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
820 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
821 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
822
823 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
824 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
825
826 "pmaddwd %%mm3,%%mm3\n"
827 "pmaddwd %%mm4,%%mm4\n"
828
829 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
830 pix2^2+pix3^2+pix6^2+pix7^2) */
831 "paddd %%mm3,%%mm4\n"
832 "paddd %%mm2,%%mm7\n"
833
834 "add %2, %0\n"
835 "paddd %%mm4,%%mm7\n"
836 "dec %%ecx\n"
837 "jnz 1b\n"
838
839 "movq %%mm7,%%mm1\n"
840 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
841 "paddd %%mm7,%%mm1\n"
842 "movd %%mm1,%1\n"
843 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
844 return tmp;
845 }
846
847 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
848 int tmp;
849 asm volatile (
850 "movl %4,%%ecx\n"
851 "shr $1,%%ecx\n"
852 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
853 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
854 "1:\n"
855 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
856 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
857 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
858 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
859
860 /* todo: mm1-mm2, mm3-mm4 */
861 /* algo: subtract mm1 from mm2 with saturation and vice versa */
862 /* OR the results to get absolute difference */
863 "movq %%mm1,%%mm5\n"
864 "movq %%mm3,%%mm6\n"
865 "psubusb %%mm2,%%mm1\n"
866 "psubusb %%mm4,%%mm3\n"
867 "psubusb %%mm5,%%mm2\n"
868 "psubusb %%mm6,%%mm4\n"
869
870 "por %%mm1,%%mm2\n"
871 "por %%mm3,%%mm4\n"
872
873 /* now convert to 16-bit vectors so we can square them */
874 "movq %%mm2,%%mm1\n"
875 "movq %%mm4,%%mm3\n"
876
877 "punpckhbw %%mm0,%%mm2\n"
878 "punpckhbw %%mm0,%%mm4\n"
879 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
880 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
881
882 "pmaddwd %%mm2,%%mm2\n"
883 "pmaddwd %%mm4,%%mm4\n"
884 "pmaddwd %%mm1,%%mm1\n"
885 "pmaddwd %%mm3,%%mm3\n"
886
887 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
888 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
889
890 "paddd %%mm2,%%mm1\n"
891 "paddd %%mm4,%%mm3\n"
892 "paddd %%mm1,%%mm7\n"
893 "paddd %%mm3,%%mm7\n"
894
895 "decl %%ecx\n"
896 "jnz 1b\n"
897
898 "movq %%mm7,%%mm1\n"
899 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
900 "paddd %%mm7,%%mm1\n"
901 "movd %%mm1,%2\n"
902 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
903 : "r" ((long)line_size) , "m" (h)
904 : "%ecx");
905 return tmp;
906 }
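
/* The psubusb/por sequence commented above computes |a-b| for unsigned
 * bytes without a dedicated instruction: one of the two saturated
 * differences max(a-b,0) and max(b-a,0) is always zero, so ORing them
 * yields the absolute difference.  A scalar sketch of sse8/sse16 (w = 8
 * or 16; illustrative only, not referenced by the code).  pix_norm1_mmx
 * above is the same reduction with pix2 implicitly zero, i.e. a plain sum
 * of squares. */
static int sse_c_sketch(const uint8_t *pix1, const uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}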
907
908 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
909 int tmp;
910 asm volatile (
911 "movl %4,%%ecx\n"
912 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
913 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
914 "1:\n"
915 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
916 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
917 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
918 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
919
920 /* todo: mm1-mm2, mm3-mm4 */
921 /* algo: subtract mm1 from mm2 with saturation and vice versa */
922 /* OR the results to get absolute difference */
923 "movq %%mm1,%%mm5\n"
924 "movq %%mm3,%%mm6\n"
925 "psubusb %%mm2,%%mm1\n"
926 "psubusb %%mm4,%%mm3\n"
927 "psubusb %%mm5,%%mm2\n"
928 "psubusb %%mm6,%%mm4\n"
929
930 "por %%mm1,%%mm2\n"
931 "por %%mm3,%%mm4\n"
932
933 /* now convert to 16-bit vectors so we can square them */
934 "movq %%mm2,%%mm1\n"
935 "movq %%mm4,%%mm3\n"
936
937 "punpckhbw %%mm0,%%mm2\n"
938 "punpckhbw %%mm0,%%mm4\n"
939 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
940 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
941
942 "pmaddwd %%mm2,%%mm2\n"
943 "pmaddwd %%mm4,%%mm4\n"
944 "pmaddwd %%mm1,%%mm1\n"
945 "pmaddwd %%mm3,%%mm3\n"
946
947 "add %3,%0\n"
948 "add %3,%1\n"
949
950 "paddd %%mm2,%%mm1\n"
951 "paddd %%mm4,%%mm3\n"
952 "paddd %%mm1,%%mm7\n"
953 "paddd %%mm3,%%mm7\n"
954
955 "decl %%ecx\n"
956 "jnz 1b\n"
957
958 "movq %%mm7,%%mm1\n"
959 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
960 "paddd %%mm7,%%mm1\n"
961 "movd %%mm1,%2\n"
962 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
963 : "r" ((long)line_size) , "m" (h)
964 : "%ecx");
965 return tmp;
966 }
967
968 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
969 int tmp;
970 asm volatile (
971 "shr $1,%2\n"
972 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
973 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
974 "1:\n"
975 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
976 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
977 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
978 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
979
980 /* todo: mm1-mm2, mm3-mm4 */
981 /* algo: subtract mm1 from mm2 with saturation and vice versa */
982 /* OR the results to get absolute difference */
983 "movdqa %%xmm1,%%xmm5\n"
984 "movdqa %%xmm3,%%xmm6\n"
985 "psubusb %%xmm2,%%xmm1\n"
986 "psubusb %%xmm4,%%xmm3\n"
987 "psubusb %%xmm5,%%xmm2\n"
988 "psubusb %%xmm6,%%xmm4\n"
989
990 "por %%xmm1,%%xmm2\n"
991 "por %%xmm3,%%xmm4\n"
992
993 /* now convert to 16-bit vectors so we can square them */
994 "movdqa %%xmm2,%%xmm1\n"
995 "movdqa %%xmm4,%%xmm3\n"
996
997 "punpckhbw %%xmm0,%%xmm2\n"
998 "punpckhbw %%xmm0,%%xmm4\n"
999 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
1000 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
1001
1002 "pmaddwd %%xmm2,%%xmm2\n"
1003 "pmaddwd %%xmm4,%%xmm4\n"
1004 "pmaddwd %%xmm1,%%xmm1\n"
1005 "pmaddwd %%xmm3,%%xmm3\n"
1006
1007 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
1008 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
1009
1010 "paddd %%xmm2,%%xmm1\n"
1011 "paddd %%xmm4,%%xmm3\n"
1012 "paddd %%xmm1,%%xmm7\n"
1013 "paddd %%xmm3,%%xmm7\n"
1014
1015 "decl %2\n"
1016 "jnz 1b\n"
1017
1018 "movdqa %%xmm7,%%xmm1\n"
1019 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
1020 "paddd %%xmm1,%%xmm7\n"
1021 "movdqa %%xmm7,%%xmm1\n"
1022 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
1023 "paddd %%xmm1,%%xmm7\n"
1024 "movd %%xmm7,%3\n"
1025 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
1026 : "r" ((long)line_size));
1027 return tmp;
1028 }
1029
1030 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
1031 int tmp;
1032 asm volatile (
1033 "movl %3,%%ecx\n"
1034 "pxor %%mm7,%%mm7\n"
1035 "pxor %%mm6,%%mm6\n"
1036
1037 "movq (%0),%%mm0\n"
1038 "movq %%mm0, %%mm1\n"
1039 "psllq $8, %%mm0\n"
1040 "psrlq $8, %%mm1\n"
1041 "psrlq $8, %%mm0\n"
1042 "movq %%mm0, %%mm2\n"
1043 "movq %%mm1, %%mm3\n"
1044 "punpcklbw %%mm7,%%mm0\n"
1045 "punpcklbw %%mm7,%%mm1\n"
1046 "punpckhbw %%mm7,%%mm2\n"
1047 "punpckhbw %%mm7,%%mm3\n"
1048 "psubw %%mm1, %%mm0\n"
1049 "psubw %%mm3, %%mm2\n"
1050
1051 "add %2,%0\n"
1052
1053 "movq (%0),%%mm4\n"
1054 "movq %%mm4, %%mm1\n"
1055 "psllq $8, %%mm4\n"
1056 "psrlq $8, %%mm1\n"
1057 "psrlq $8, %%mm4\n"
1058 "movq %%mm4, %%mm5\n"
1059 "movq %%mm1, %%mm3\n"
1060 "punpcklbw %%mm7,%%mm4\n"
1061 "punpcklbw %%mm7,%%mm1\n"
1062 "punpckhbw %%mm7,%%mm5\n"
1063 "punpckhbw %%mm7,%%mm3\n"
1064 "psubw %%mm1, %%mm4\n"
1065 "psubw %%mm3, %%mm5\n"
1066 "psubw %%mm4, %%mm0\n"
1067 "psubw %%mm5, %%mm2\n"
1068 "pxor %%mm3, %%mm3\n"
1069 "pxor %%mm1, %%mm1\n"
1070 "pcmpgtw %%mm0, %%mm3\n\t"
1071 "pcmpgtw %%mm2, %%mm1\n\t"
1072 "pxor %%mm3, %%mm0\n"
1073 "pxor %%mm1, %%mm2\n"
1074 "psubw %%mm3, %%mm0\n"
1075 "psubw %%mm1, %%mm2\n"
1076 "paddw %%mm0, %%mm2\n"
1077 "paddw %%mm2, %%mm6\n"
1078
1079 "add %2,%0\n"
1080 "1:\n"
1081
1082 "movq (%0),%%mm0\n"
1083 "movq %%mm0, %%mm1\n"
1084 "psllq $8, %%mm0\n"
1085 "psrlq $8, %%mm1\n"
1086 "psrlq $8, %%mm0\n"
1087 "movq %%mm0, %%mm2\n"
1088 "movq %%mm1, %%mm3\n"
1089 "punpcklbw %%mm7,%%mm0\n"
1090 "punpcklbw %%mm7,%%mm1\n"
1091 "punpckhbw %%mm7,%%mm2\n"
1092 "punpckhbw %%mm7,%%mm3\n"
1093 "psubw %%mm1, %%mm0\n"
1094 "psubw %%mm3, %%mm2\n"
1095 "psubw %%mm0, %%mm4\n"
1096 "psubw %%mm2, %%mm5\n"
1097 "pxor %%mm3, %%mm3\n"
1098 "pxor %%mm1, %%mm1\n"
1099 "pcmpgtw %%mm4, %%mm3\n\t"
1100 "pcmpgtw %%mm5, %%mm1\n\t"
1101 "pxor %%mm3, %%mm4\n"
1102 "pxor %%mm1, %%mm5\n"
1103 "psubw %%mm3, %%mm4\n"
1104 "psubw %%mm1, %%mm5\n"
1105 "paddw %%mm4, %%mm5\n"
1106 "paddw %%mm5, %%mm6\n"
1107
1108 "add %2,%0\n"
1109
1110 "movq (%0),%%mm4\n"
1111 "movq %%mm4, %%mm1\n"
1112 "psllq $8, %%mm4\n"
1113 "psrlq $8, %%mm1\n"
1114 "psrlq $8, %%mm4\n"
1115 "movq %%mm4, %%mm5\n"
1116 "movq %%mm1, %%mm3\n"
1117 "punpcklbw %%mm7,%%mm4\n"
1118 "punpcklbw %%mm7,%%mm1\n"
1119 "punpckhbw %%mm7,%%mm5\n"
1120 "punpckhbw %%mm7,%%mm3\n"
1121 "psubw %%mm1, %%mm4\n"
1122 "psubw %%mm3, %%mm5\n"
1123 "psubw %%mm4, %%mm0\n"
1124 "psubw %%mm5, %%mm2\n"
1125 "pxor %%mm3, %%mm3\n"
1126 "pxor %%mm1, %%mm1\n"
1127 "pcmpgtw %%mm0, %%mm3\n\t"
1128 "pcmpgtw %%mm2, %%mm1\n\t"
1129 "pxor %%mm3, %%mm0\n"
1130 "pxor %%mm1, %%mm2\n"
1131 "psubw %%mm3, %%mm0\n"
1132 "psubw %%mm1, %%mm2\n"
1133 "paddw %%mm0, %%mm2\n"
1134 "paddw %%mm2, %%mm6\n"
1135
1136 "add %2,%0\n"
1137 "subl $2, %%ecx\n"
1138 " jnz 1b\n"
1139
1140 "movq %%mm6, %%mm0\n"
1141 "punpcklwd %%mm7,%%mm0\n"
1142 "punpckhwd %%mm7,%%mm6\n"
1143 "paddd %%mm0, %%mm6\n"
1144
1145 "movq %%mm6,%%mm0\n"
1146 "psrlq $32, %%mm6\n"
1147 "paddd %%mm6,%%mm0\n"
1148 "movd %%mm0,%1\n"
1149 : "+r" (pix1), "=r"(tmp)
1150 : "r" ((long)line_size) , "g" (h-2)
1151 : "%ecx");
1152 return tmp;
1153 }
1154
1155 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1156 int tmp;
1157 uint8_t * pix= pix1;
1158 asm volatile (
1159 "movl %3,%%ecx\n"
1160 "pxor %%mm7,%%mm7\n"
1161 "pxor %%mm6,%%mm6\n"
1162
1163 "movq (%0),%%mm0\n"
1164 "movq 1(%0),%%mm1\n"
1165 "movq %%mm0, %%mm2\n"
1166 "movq %%mm1, %%mm3\n"
1167 "punpcklbw %%mm7,%%mm0\n"
1168 "punpcklbw %%mm7,%%mm1\n"
1169 "punpckhbw %%mm7,%%mm2\n"
1170 "punpckhbw %%mm7,%%mm3\n"
1171 "psubw %%mm1, %%mm0\n"
1172 "psubw %%mm3, %%mm2\n"
1173
1174 "add %2,%0\n"
1175
1176 "movq (%0),%%mm4\n"
1177 "movq 1(%0),%%mm1\n"
1178 "movq %%mm4, %%mm5\n"
1179 "movq %%mm1, %%mm3\n"
1180 "punpcklbw %%mm7,%%mm4\n"
1181 "punpcklbw %%mm7,%%mm1\n"
1182 "punpckhbw %%mm7,%%mm5\n"
1183 "punpckhbw %%mm7,%%mm3\n"
1184 "psubw %%mm1, %%mm4\n"
1185 "psubw %%mm3, %%mm5\n"
1186 "psubw %%mm4, %%mm0\n"
1187 "psubw %%mm5, %%mm2\n"
1188 "pxor %%mm3, %%mm3\n"
1189 "pxor %%mm1, %%mm1\n"
1190 "pcmpgtw %%mm0, %%mm3\n\t"
1191 "pcmpgtw %%mm2, %%mm1\n\t"
1192 "pxor %%mm3, %%mm0\n"
1193 "pxor %%mm1, %%mm2\n"
1194 "psubw %%mm3, %%mm0\n"
1195 "psubw %%mm1, %%mm2\n"
1196 "paddw %%mm0, %%mm2\n"
1197 "paddw %%mm2, %%mm6\n"
1198
1199 "add %2,%0\n"
1200 "1:\n"
1201
1202 "movq (%0),%%mm0\n"
1203 "movq 1(%0),%%mm1\n"
1204 "movq %%mm0, %%mm2\n"
1205 "movq %%mm1, %%mm3\n"
1206 "punpcklbw %%mm7,%%mm0\n"
1207 "punpcklbw %%mm7,%%mm1\n"
1208 "punpckhbw %%mm7,%%mm2\n"
1209 "punpckhbw %%mm7,%%mm3\n"
1210 "psubw %%mm1, %%mm0\n"
1211 "psubw %%mm3, %%mm2\n"
1212 "psubw %%mm0, %%mm4\n"
1213 "psubw %%mm2, %%mm5\n"
1214 "pxor %%mm3, %%mm3\n"
1215 "pxor %%mm1, %%mm1\n"
1216 "pcmpgtw %%mm4, %%mm3\n\t"
1217 "pcmpgtw %%mm5, %%mm1\n\t"
1218 "pxor %%mm3, %%mm4\n"
1219 "pxor %%mm1, %%mm5\n"
1220 "psubw %%mm3, %%mm4\n"
1221 "psubw %%mm1, %%mm5\n"
1222 "paddw %%mm4, %%mm5\n"
1223 "paddw %%mm5, %%mm6\n"
1224
1225 "add %2,%0\n"
1226
1227 "movq (%0),%%mm4\n"
1228 "movq 1(%0),%%mm1\n"
1229 "movq %%mm4, %%mm5\n"
1230 "movq %%mm1, %%mm3\n"
1231 "punpcklbw %%mm7,%%mm4\n"
1232 "punpcklbw %%mm7,%%mm1\n"
1233 "punpckhbw %%mm7,%%mm5\n"
1234 "punpckhbw %%mm7,%%mm3\n"
1235 "psubw %%mm1, %%mm4\n"
1236 "psubw %%mm3, %%mm5\n"
1237 "psubw %%mm4, %%mm0\n"
1238 "psubw %%mm5, %%mm2\n"
1239 "pxor %%mm3, %%mm3\n"
1240 "pxor %%mm1, %%mm1\n"
1241 "pcmpgtw %%mm0, %%mm3\n\t"
1242 "pcmpgtw %%mm2, %%mm1\n\t"
1243 "pxor %%mm3, %%mm0\n"
1244 "pxor %%mm1, %%mm2\n"
1245 "psubw %%mm3, %%mm0\n"
1246 "psubw %%mm1, %%mm2\n"
1247 "paddw %%mm0, %%mm2\n"
1248 "paddw %%mm2, %%mm6\n"
1249
1250 "add %2,%0\n"
1251 "subl $2, %%ecx\n"
1252 " jnz 1b\n"
1253
1254 "movq %%mm6, %%mm0\n"
1255 "punpcklwd %%mm7,%%mm0\n"
1256 "punpckhwd %%mm7,%%mm6\n"
1257 "paddd %%mm0, %%mm6\n"
1258
1259 "movq %%mm6,%%mm0\n"
1260 "psrlq $32, %%mm6\n"
1261 "paddd %%mm6,%%mm0\n"
1262 "movd %%mm0,%1\n"
1263 : "+r" (pix1), "=r"(tmp)
1264 : "r" ((long)line_size) , "g" (h-2)
1265 : "%ecx");
1266 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1267 }
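
/* A rough scalar picture of hf_noise8_mmx (illustrative only; the exact
 * boundary handling of the MMX code differs in detail): it accumulates the
 * absolute vertical change of the horizontal gradients, i.e. how much the
 * high-frequency detail differs between neighbouring lines.  hf_noise16
 * does the same over a 16-pixel-wide block, processed as two 8-pixel
 * halves, and nsse8/nsse16 below combine this with the plain SSE as
 * score = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|. */
static int hf_noise8_c_sketch(const uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int d0 = pix[ y   *line_size + x] - pix[ y   *line_size + x + 1];
            int d1 = pix[(y+1)*line_size + x] - pix[(y+1)*line_size + x + 1];
            sum += d0 > d1 ? d0 - d1 : d1 - d0;
        }
    }
    return sum;
}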
1268
1269 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1270 MpegEncContext *c = p;
1271 int score1, score2;
1272
1273 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1274 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1275 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1276
1277 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1278 else return score1 + FFABS(score2)*8;
1279 }
1280
1281 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1282 MpegEncContext *c = p;
1283 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1284 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1285
1286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1287 else return score1 + FFABS(score2)*8;
1288 }
1289
1290 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1291 int tmp;
1292
1293 assert( (((int)pix) & 7) == 0);
1294 assert((line_size &7) ==0);
1295
1296 #define SUM(in0, in1, out0, out1) \
1297 "movq (%0), %%mm2\n"\
1298 "movq 8(%0), %%mm3\n"\
1299 "add %2,%0\n"\
1300 "movq %%mm2, " #out0 "\n"\
1301 "movq %%mm3, " #out1 "\n"\
1302 "psubusb " #in0 ", %%mm2\n"\
1303 "psubusb " #in1 ", %%mm3\n"\
1304 "psubusb " #out0 ", " #in0 "\n"\
1305 "psubusb " #out1 ", " #in1 "\n"\
1306 "por %%mm2, " #in0 "\n"\
1307 "por %%mm3, " #in1 "\n"\
1308 "movq " #in0 ", %%mm2\n"\
1309 "movq " #in1 ", %%mm3\n"\
1310 "punpcklbw %%mm7, " #in0 "\n"\
1311 "punpcklbw %%mm7, " #in1 "\n"\
1312 "punpckhbw %%mm7, %%mm2\n"\
1313 "punpckhbw %%mm7, %%mm3\n"\
1314 "paddw " #in1 ", " #in0 "\n"\
1315 "paddw %%mm3, %%mm2\n"\
1316 "paddw %%mm2, " #in0 "\n"\
1317 "paddw " #in0 ", %%mm6\n"
1318
1319
1320 asm volatile (
1321 "movl %3,%%ecx\n"
1322 "pxor %%mm6,%%mm6\n"
1323 "pxor %%mm7,%%mm7\n"
1324 "movq (%0),%%mm0\n"
1325 "movq 8(%0),%%mm1\n"
1326 "add %2,%0\n"
1327 "subl $2, %%ecx\n"
1328 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1329 "1:\n"
1330
1331 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1332
1333 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1334
1335 "subl $2, %%ecx\n"
1336 "jnz 1b\n"
1337
1338 "movq %%mm6,%%mm0\n"
1339 "psrlq $32, %%mm6\n"
1340 "paddw %%mm6,%%mm0\n"
1341 "movq %%mm0,%%mm6\n"
1342 "psrlq $16, %%mm0\n"
1343 "paddw %%mm6,%%mm0\n"
1344 "movd %%mm0,%1\n"
1345 : "+r" (pix), "=r"(tmp)
1346 : "r" ((long)line_size) , "m" (h)
1347 : "%ecx");
1348 return tmp & 0xFFFF;
1349 }
1350 #undef SUM
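
/* Scalar sketch of vsad_intra16 (illustrative only, not referenced by the
 * code): the sum of absolute differences between each line of the block
 * and the line above it, a measure of vertical activity.  The vsad16_*
 * functions below apply the same measure to the difference pix1 - pix2,
 * biased into the unsigned range with the 0x80 xor. */
static int vsad_intra16_c_sketch(const uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 1; y < h; y++)
        for (x = 0; x < 16; x++) {
            int d = pix[y*line_size + x] - pix[(y-1)*line_size + x];
            sum += d < 0 ? -d : d;
        }
    return sum;
}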
1351
1352 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1353 int tmp;
1354
1355 assert( (((int)pix) & 7) == 0);
1356 assert((line_size &7) ==0);
1357
1358 #define SUM(in0, in1, out0, out1) \
1359 "movq (%0), " #out0 "\n"\
1360 "movq 8(%0), " #out1 "\n"\
1361 "add %2,%0\n"\
1362 "psadbw " #out0 ", " #in0 "\n"\
1363 "psadbw " #out1 ", " #in1 "\n"\
1364 "paddw " #in1 ", " #in0 "\n"\
1365 "paddw " #in0 ", %%mm6\n"
1366
1367 asm volatile (
1368 "movl %3,%%ecx\n"
1369 "pxor %%mm6,%%mm6\n"
1370 "pxor %%mm7,%%mm7\n"
1371 "movq (%0),%%mm0\n"
1372 "movq 8(%0),%%mm1\n"
1373 "add %2,%0\n"
1374 "subl $2, %%ecx\n"
1375 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1376 "1:\n"
1377
1378 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1379
1380 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1381
1382 "subl $2, %%ecx\n"
1383 "jnz 1b\n"
1384
1385 "movd %%mm6,%1\n"
1386 : "+r" (pix), "=r"(tmp)
1387 : "r" ((long)line_size) , "m" (h)
1388 : "%ecx");
1389 return tmp;
1390 }
1391 #undef SUM
1392
1393 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1394 int tmp;
1395
1396 assert( (((int)pix1) & 7) == 0);
1397 assert( (((int)pix2) & 7) == 0);
1398 assert((line_size &7) ==0);
1399
1400 #define SUM(in0, in1, out0, out1) \
1401 "movq (%0),%%mm2\n"\
1402 "movq (%1)," #out0 "\n"\
1403 "movq 8(%0),%%mm3\n"\
1404 "movq 8(%1)," #out1 "\n"\
1405 "add %3,%0\n"\
1406 "add %3,%1\n"\
1407 "psubb " #out0 ", %%mm2\n"\
1408 "psubb " #out1 ", %%mm3\n"\
1409 "pxor %%mm7, %%mm2\n"\
1410 "pxor %%mm7, %%mm3\n"\
1411 "movq %%mm2, " #out0 "\n"\
1412 "movq %%mm3, " #out1 "\n"\
1413 "psubusb " #in0 ", %%mm2\n"\
1414 "psubusb " #in1 ", %%mm3\n"\
1415 "psubusb " #out0 ", " #in0 "\n"\
1416 "psubusb " #out1 ", " #in1 "\n"\
1417 "por %%mm2, " #in0 "\n"\
1418 "por %%mm3, " #in1 "\n"\
1419 "movq " #in0 ", %%mm2\n"\
1420 "movq " #in1 ", %%mm3\n"\
1421 "punpcklbw %%mm7, " #in0 "\n"\
1422 "punpcklbw %%mm7, " #in1 "\n"\
1423 "punpckhbw %%mm7, %%mm2\n"\
1424 "punpckhbw %%mm7, %%mm3\n"\
1425 "paddw " #in1 ", " #in0 "\n"\
1426 "paddw %%mm3, %%mm2\n"\
1427 "paddw %%mm2, " #in0 "\n"\
1428 "paddw " #in0 ", %%mm6\n"
1429
1430
1431 asm volatile (
1432 "movl %4,%%ecx\n"
1433 "pxor %%mm6,%%mm6\n"
1434 "pcmpeqw %%mm7,%%mm7\n"
1435 "psllw $15, %%mm7\n"
1436 "packsswb %%mm7, %%mm7\n"
1437 "movq (%0),%%mm0\n"
1438 "movq (%1),%%mm2\n"
1439 "movq 8(%0),%%mm1\n"
1440 "movq 8(%1),%%mm3\n"
1441 "add %3,%0\n"
1442 "add %3,%1\n"
1443 "subl $2, %%ecx\n"
1444 "psubb %%mm2, %%mm0\n"
1445 "psubb %%mm3, %%mm1\n"
1446 "pxor %%mm7, %%mm0\n"
1447 "pxor %%mm7, %%mm1\n"
1448 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1449 "1:\n"
1450
1451 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1452
1453 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1454
1455 "subl $2, %%ecx\n"
1456 "jnz 1b\n"
1457
1458 "movq %%mm6,%%mm0\n"
1459 "psrlq $32, %%mm6\n"
1460 "paddw %%mm6,%%mm0\n"
1461 "movq %%mm0,%%mm6\n"
1462 "psrlq $16, %%mm0\n"
1463 "paddw %%mm6,%%mm0\n"
1464 "movd %%mm0,%2\n"
1465 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1466 : "r" ((long)line_size) , "m" (h)
1467 : "%ecx");
1468 return tmp & 0x7FFF;
1469 }
1470 #undef SUM
1471
1472 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1473 int tmp;
1474
1475 assert( (((int)pix1) & 7) == 0);
1476 assert( (((int)pix2) & 7) == 0);
1477 assert((line_size &7) ==0);
1478
1479 #define SUM(in0, in1, out0, out1) \
1480 "movq (%0)," #out0 "\n"\
1481 "movq (%1),%%mm2\n"\
1482 "movq 8(%0)," #out1 "\n"\
1483 "movq 8(%1),%%mm3\n"\
1484 "add %3,%0\n"\
1485 "add %3,%1\n"\
1486 "psubb %%mm2, " #out0 "\n"\
1487 "psubb %%mm3, " #out1 "\n"\
1488 "pxor %%mm7, " #out0 "\n"\
1489 "pxor %%mm7, " #out1 "\n"\
1490 "psadbw " #out0 ", " #in0 "\n"\
1491 "psadbw " #out1 ", " #in1 "\n"\
1492 "paddw " #in1 ", " #in0 "\n"\
1493 "paddw " #in0 ", %%mm6\n"
1494
1495 asm volatile (
1496 "movl %4,%%ecx\n"
1497 "pxor %%mm6,%%mm6\n"
1498 "pcmpeqw %%mm7,%%mm7\n"
1499 "psllw $15, %%mm7\n"
1500 "packsswb %%mm7, %%mm7\n"
1501 "movq (%0),%%mm0\n"
1502 "movq (%1),%%mm2\n"
1503 "movq 8(%0),%%mm1\n"
1504 "movq 8(%1),%%mm3\n"
1505 "add %3,%0\n"
1506 "add %3,%1\n"
1507 "subl $2, %%ecx\n"
1508 "psubb %%mm2, %%mm0\n"
1509 "psubb %%mm3, %%mm1\n"
1510 "pxor %%mm7, %%mm0\n"
1511 "pxor %%mm7, %%mm1\n"
1512 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1513 "1:\n"
1514
1515 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1516
1517 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1518
1519 "subl $2, %%ecx\n"
1520 "jnz 1b\n"
1521
1522 "movd %%mm6,%2\n"
1523 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1524 : "r" ((long)line_size) , "m" (h)
1525 : "%ecx");
1526 return tmp;
1527 }
1528 #undef SUM
1529
1530 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1531 long i=0;
1532 asm volatile(
1533 "1: \n\t"
1534 "movq (%2, %0), %%mm0 \n\t"
1535 "movq (%1, %0), %%mm1 \n\t"
1536 "psubb %%mm0, %%mm1 \n\t"
1537 "movq %%mm1, (%3, %0) \n\t"
1538 "movq 8(%2, %0), %%mm0 \n\t"
1539 "movq 8(%1, %0), %%mm1 \n\t"
1540 "psubb %%mm0, %%mm1 \n\t"
1541 "movq %%mm1, 8(%3, %0) \n\t"
1542 "add $16, %0 \n\t"
1543 "cmp %4, %0 \n\t"
1544 " jb 1b \n\t"
1545 : "+r" (i)
1546 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1547 );
1548 for(; i<w; i++)
1549 dst[i+0] = src1[i+0]-src2[i+0];
1550 }
1551
1552 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1553 long i=0;
1554 uint8_t l, lt;
1555
1556 asm volatile(
1557 "1: \n\t"
1558 "movq -1(%1, %0), %%mm0 \n\t" // LT
1559 "movq (%1, %0), %%mm1 \n\t" // T
1560 "movq -1(%2, %0), %%mm2 \n\t" // L
1561 "movq (%2, %0), %%mm3 \n\t" // X
1562 "movq %%mm2, %%mm4 \n\t" // L
1563 "psubb %%mm0, %%mm2 \n\t"
1564 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1565 "movq %%mm4, %%mm5 \n\t" // L
1566 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1567 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1568 "pminub %%mm2, %%mm4 \n\t"
1569 "pmaxub %%mm1, %%mm4 \n\t"
1570 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1571 "movq %%mm3, (%3, %0) \n\t"
1572 "add $8, %0 \n\t"
1573 "cmp %4, %0 \n\t"
1574 " jb 1b \n\t"
1575 : "+r" (i)
1576 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1577 );
1578
1579 l= *left;
1580 lt= *left_top;
1581
1582 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1583
1584 *left_top= src1[w-1];
1585 *left = src2[w-1];
1586 }
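
/* Scalar sketch of the median predictor implemented above (illustrative
 * only; the MMX code does the byte arithmetic modulo 256 and handles the
 * first sample separately via *left / *left_top): the prediction is the
 * median of the left neighbour L, the top neighbour T and the gradient
 * L + T - LT, and it is subtracted from the current sample. */
static void sub_median_prediction_c_sketch(uint8_t *dst, const uint8_t *top, const uint8_t *cur, int w)
{
    int i;
    for (i = 1; i < w; i++) {
        int l    = cur[i-1];                       /* L  */
        int t    = top[i];                         /* T  */
        int lt   = top[i-1];                       /* LT */
        int g    = l + t - lt;                     /* gradient predictor */
        int mn   = l < t ? l : t;
        int mx   = l < t ? t : l;
        int pred = g < mn ? mn : g > mx ? mx : g;  /* median(L, T, g) */
        dst[i]   = cur[i] - pred;
    }
}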
1587
1588 #define PAETH(cpu, abs3)\
1589 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
1590 {\
1591 long i = -bpp;\
1592 long end = w-3;\
1593 asm volatile(\
1594 "pxor %%mm7, %%mm7 \n"\
1595 "movd (%1,%0), %%mm0 \n"\
1596 "movd (%2,%0), %%mm1 \n"\
1597 "punpcklbw %%mm7, %%mm0 \n"\
1598 "punpcklbw %%mm7, %%mm1 \n"\
1599 "add %4, %0 \n"\
1600 "1: \n"\
1601 "movq %%mm1, %%mm2 \n"\
1602 "movd (%2,%0), %%mm1 \n"\
1603 "movq %%mm2, %%mm3 \n"\
1604 "punpcklbw %%mm7, %%mm1 \n"\
1605 "movq %%mm2, %%mm4 \n"\
1606 "psubw %%mm1, %%mm3 \n"\
1607 "psubw %%mm0, %%mm4 \n"\
1608 "movq %%mm3, %%mm5 \n"\
1609 "paddw %%mm4, %%mm5 \n"\
1610 abs3\
1611 "movq %%mm4, %%mm6 \n"\
1612 "pminsw %%mm5, %%mm6 \n"\
1613 "pcmpgtw %%mm6, %%mm3 \n"\
1614 "pcmpgtw %%mm5, %%mm4 \n"\
1615 "movq %%mm4, %%mm6 \n"\
1616 "pand %%mm3, %%mm4 \n"\
1617 "pandn %%mm3, %%mm6 \n"\
1618 "pandn %%mm0, %%mm3 \n"\
1619 "movd (%3,%0), %%mm0 \n"\
1620 "pand %%mm1, %%mm6 \n"\
1621 "pand %%mm4, %%mm2 \n"\
1622 "punpcklbw %%mm7, %%mm0 \n"\
1623 "movq %6, %%mm5 \n"\
1624 "paddw %%mm6, %%mm0 \n"\
1625 "paddw %%mm2, %%mm3 \n"\
1626 "paddw %%mm3, %%mm0 \n"\
1627 "pand %%mm5, %%mm0 \n"\
1628 "movq %%mm0, %%mm3 \n"\
1629 "packuswb %%mm3, %%mm3 \n"\
1630 "movd %%mm3, (%1,%0) \n"\
1631 "add %4, %0 \n"\
1632 "cmp %5, %0 \n"\
1633 "jle 1b \n"\
1634 :"+r"(i)\
1635 :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\
1636 "m"(ff_pw_255)\
1637 :"memory"\
1638 );\
1639 }
1640
1641 #define ABS3_MMX2\
1642 "psubw %%mm5, %%mm7 \n"\
1643 "pmaxsw %%mm7, %%mm5 \n"\
1644 "pxor %%mm6, %%mm6 \n"\
1645 "pxor %%mm7, %%mm7 \n"\
1646 "psubw %%mm3, %%mm6 \n"\
1647 "psubw %%mm4, %%mm7 \n"\
1648 "pmaxsw %%mm6, %%mm3 \n"\
1649 "pmaxsw %%mm7, %%mm4 \n"\
1650 "pxor %%mm7, %%mm7 \n"
1651
1652 #define ABS3_SSSE3\
1653 "pabsw %%mm3, %%mm3 \n"\
1654 "pabsw %%mm4, %%mm4 \n"\
1655 "pabsw %%mm5, %%mm5 \n"
1656
1657 PAETH(mmx2, ABS3_MMX2)
1658 #ifdef HAVE_SSSE3
1659 PAETH(ssse3, ABS3_SSSE3)
1660 #endif
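
/* Scalar form of the PNG Paeth predictor implemented by the PAETH macro
 * (illustrative sketch following the PNG specification, not referenced by
 * the code).  The macro works in 16-bit lanes so that the intermediate
 * p = left + top - topleft cannot overflow, and adds the chosen predictor
 * to each decoded byte, bpp bytes at a time. */
static int paeth_predict_sketch(int left, int top, int topleft)
{
    int p  = left + top - topleft;                       /* initial estimate */
    int pa = p > left    ? p - left    : left    - p;
    int pb = p > top     ? p - top     : top     - p;
    int pc = p > topleft ? p - topleft : topleft - p;
    if (pa <= pb && pa <= pc) return left;
    if (pb <= pc)             return top;
    return topleft;
}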
1661
1662 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1663 "mov"#m" "#p1", "#a" \n\t"\
1664 "mov"#m" "#p2", "#t" \n\t"\
1665 "punpcklbw "#a", "#t" \n\t"\
1666 "punpcklbw "#a", "#a" \n\t"\
1667 "psubw "#t", "#a" \n\t"\
1668
1669 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1670 uint8_t *p1b=p1, *p2b=p2;\
1671 asm volatile(\
1672 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1673 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1674 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1675 "add %4, %1 \n\t"\
1676 "add %4, %2 \n\t"\
1677 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1678 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1679 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1680 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1681 "mov"#m1" "#mm"0, %0 \n\t"\
1682 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1683 "mov"#m1" %0, "#mm"0 \n\t"\
1684 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1685 : "r"((long)stride), "r"((long)stride*3)\
1686 );\
1687 }
1688 // the "+m"(temp) is needed because gcc 2.95 sometimes fails to compile "=m"(temp)
1689
1690 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1691 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1692
1693 #define LBUTTERFLY2(a1,b1,a2,b2)\
1694 "paddw " #b1 ", " #a1 " \n\t"\
1695 "paddw " #b2 ", " #a2 " \n\t"\
1696 "paddw " #b1 ", " #b1 " \n\t"\
1697 "paddw " #b2 ", " #b2 " \n\t"\
1698 "psubw " #a1 ", " #b1 " \n\t"\
1699 "psubw " #a2 ", " #b2 " \n\t"
1700
1701 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1702 LBUTTERFLY2(m0, m1, m2, m3)\
1703 LBUTTERFLY2(m4, m5, m6, m7)\
1704 LBUTTERFLY2(m0, m2, m1, m3)\
1705 LBUTTERFLY2(m4, m6, m5, m7)\
1706 LBUTTERFLY2(m0, m4, m1, m5)\
1707 LBUTTERFLY2(m2, m6, m3, m7)\
1708
1709 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1710
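// LBUTTERFLY2 performs two butterflies at once: (a, b) -> (a + b, b - a).
// HADAMARD8 chains three butterfly passes (strides 1, 2 and 4), which is a
// length-8 Hadamard transform applied independently to each 16-bit lane of
// the eight registers.  A scalar sketch of one such transform (illustrative
// only; the macro may differ from this by a sign/permutation convention):
static void hadamard8_1d_sketch(int x[8])
{
    int step, i, j;
    for (step = 1; step < 8; step <<= 1)       /* three butterfly passes */
        for (i = 0; i < 8; i += 2*step)
            for (j = i; j < i + step; j++) {
                int a = x[j], b = x[j + step];
                x[j]        = a + b;
                x[j + step] = b - a;
            }
}
// hadamard8_diff_* below applies this transform to the rows and, after a
// transpose, to the columns of the 8x8 pixel-difference block and sums the
// absolute transform coefficients (a SATD-style metric).
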
1711 #define MMABS_MMX(a,z)\
1712 "pxor " #z ", " #z " \n\t"\
1713 "pcmpgtw " #a ", " #z " \n\t"\
1714 "pxor " #z ", " #a " \n\t"\
1715 "psubw " #z ", " #a " \n\t"
1716
1717 #define MMABS_MMX2(a,z)\
1718 "pxor " #z ", " #z " \n\t"\
1719 "psubw " #a ", " #z " \n\t"\
1720 "pmaxsw " #z ", " #a " \n\t"
1721
1722 #define MMABS_SSSE3(a,z)\
1723 "pabsw " #a ", " #a " \n\t"
1724
1725 #define MMABS_SUM(a,z, sum)\
1726 MMABS(a,z)\
1727 "paddusw " #a ", " #sum " \n\t"
1728
1729 #define MMABS_SUM_8x8_NOSPILL\
1730 MMABS(%%xmm0, %%xmm8)\
1731 MMABS(%%xmm1, %%xmm9)\
1732 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1733 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1734 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1735 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1736 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1737 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1738 "paddusw %%xmm1, %%xmm0 \n\t"
1739
1740 #ifdef ARCH_X86_64
1741 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1742 #else
1743 #define MMABS_SUM_8x8_SSE2\
1744 "movdqa %%xmm7, (%1) \n\t"\
1745 MMABS(%%xmm0, %%xmm7)\
1746 MMABS(%%xmm1, %%xmm7)\
1747 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1748 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1749 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1750 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1751 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1752 "movdqa (%1), %%xmm2 \n\t"\
1753 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1754 "paddusw %%xmm1, %%xmm0 \n\t"
1755 #endif
1756
1757 #define LOAD4(o, a, b, c, d)\
1758 "movq "#o"(%1), "#a" \n\t"\
1759 "movq "#o"+8(%1), "#b" \n\t"\
1760 "movq "#o"+16(%1), "#c" \n\t"\
1761 "movq "#o"+24(%1), "#d" \n\t"\
1762
1763 #define STORE4(o, a, b, c, d)\
1764 "movq "#a", "#o"(%1) \n\t"\
1765 "movq "#b", "#o"+8(%1) \n\t"\
1766 "movq "#c", "#o"+16(%1) \n\t"\
1767 "movq "#d", "#o"+24(%1) \n\t"\
1768
1769 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1770 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1771 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1772 #define HSUM_MMX(a, t, dst)\
1773 "movq "#a", "#t" \n\t"\
1774 "psrlq $32, "#a" \n\t"\
1775 "paddusw "#t", "#a" \n\t"\
1776 "movq "#a", "#t" \n\t"\
1777 "psrlq $16, "#a" \n\t"\
1778 "paddusw "#t", "#a" \n\t"\
1779 "movd "#a", "#dst" \n\t"\
1780
1781 #define HSUM_MMX2(a, t, dst)\
1782 "pshufw $0x0E, "#a", "#t" \n\t"\
1783 "paddusw "#t", "#a" \n\t"\
1784 "pshufw $0x01, "#a", "#t" \n\t"\
1785 "paddusw "#t", "#a" \n\t"\
1786 "movd "#a", "#dst" \n\t"\
1787
1788 #define HSUM_SSE2(a, t, dst)\
1789 "movhlps "#a", "#t" \n\t"\
1790 "paddusw "#t", "#a" \n\t"\
1791 "pshuflw $0x0E, "#a", "#t" \n\t"\
1792 "paddusw "#t", "#a" \n\t"\
1793 "pshuflw $0x01, "#a", "#t" \n\t"\
1794 "paddusw "#t", "#a" \n\t"\
1795 "movd "#a", "#dst" \n\t"\
1796
1797 #define HADAMARD8_DIFF_MMX(cpu) \
1798 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1799 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1800 int sum;\
1801 \
1802 assert(h==8);\
1803 \
1804 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1805 \
1806 asm volatile(\
1807 HADAMARD48\
1808 \
1809 "movq %%mm7, 96(%1) \n\t"\
1810 \
1811 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1812 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1813 \
1814 "movq 96(%1), %%mm7 \n\t"\
1815 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1816 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1817 \
1818 : "=r" (sum)\
1819 : "r"(temp)\
1820 );\
1821 \
1822 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1823 \
1824 asm volatile(\
1825 HADAMARD48\
1826 \
1827 "movq %%mm7, 96(%1) \n\t"\
1828 \
1829 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1830 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1831 \
1832 "movq 96(%1), %%mm7 \n\t"\
1833 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1834 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1835 "movq %%mm6, %%mm7 \n\t"\
1836 "movq %%mm0, %%mm6 \n\t"\
1837 \
1838 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1839 \
1840 HADAMARD48\
1841 "movq %%mm7, 64(%1) \n\t"\
1842 MMABS(%%mm0, %%mm7)\
1843 MMABS(%%mm1, %%mm7)\
1844 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1845 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1846 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1847 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1848 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1849 "movq 64(%1), %%mm2 \n\t"\
1850 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1851 "paddusw %%mm1, %%mm0 \n\t"\
1852 "movq %%mm0, 64(%1) \n\t"\
1853 \
1854 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1855 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1856 \
1857 HADAMARD48\
1858 "movq %%mm7, (%1) \n\t"\
1859 MMABS(%%mm0, %%mm7)\
1860 MMABS(%%mm1, %%mm7)\
1861 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1862 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1863 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1864 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1865 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1866 "movq (%1), %%mm2 \n\t"\
1867 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1868 "paddusw 64(%1), %%mm0 \n\t"\
1869 "paddusw %%mm1, %%mm0 \n\t"\
1870 \
1871 HSUM(%%mm0, %%mm1, %0)\
1872 \
1873 : "=r" (sum)\
1874 : "r"(temp)\
1875 );\
1876 return sum&0xFFFF;\
1877 }\
1878 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1879
1880 #define HADAMARD8_DIFF_SSE2(cpu) \
1881 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1882 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1883 int sum;\
1884 \
1885 assert(h==8);\
1886 \
1887 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1888 \
1889 asm volatile(\
1890 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1891 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1892 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1893 MMABS_SUM_8x8\
1894 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1895 : "=r" (sum)\
1896 : "r"(temp)\
1897 );\
1898 return sum&0xFFFF;\
1899 }\
1900 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1901
1902 #define MMABS(a,z) MMABS_MMX(a,z)
1903 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1904 HADAMARD8_DIFF_MMX(mmx)
1905 #undef MMABS
1906 #undef HSUM
1907
1908 #define MMABS(a,z) MMABS_MMX2(a,z)
1909 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1910 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1911 HADAMARD8_DIFF_MMX(mmx2)
1912 HADAMARD8_DIFF_SSE2(sse2)
1913 #undef MMABS
1914 #undef MMABS_SUM_8x8
1915 #undef HSUM
1916
1917 #ifdef HAVE_SSSE3
1918 #define MMABS(a,z) MMABS_SSSE3(a,z)
1919 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1920 HADAMARD8_DIFF_SSE2(ssse3)
1921 #undef MMABS
1922 #undef MMABS_SUM_8x8
1923 #endif
1924
1925 #define DCT_SAD4(m,mm,o)\
1926 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1927 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1928 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1929 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1930 MMABS_SUM(mm##2, mm##6, mm##0)\
1931 MMABS_SUM(mm##3, mm##7, mm##1)\
1932 MMABS_SUM(mm##4, mm##6, mm##0)\
1933 MMABS_SUM(mm##5, mm##7, mm##1)\
1934
1935 #define DCT_SAD_MMX\
1936 "pxor %%mm0, %%mm0 \n\t"\
1937 "pxor %%mm1, %%mm1 \n\t"\
1938 DCT_SAD4(q, %%mm, 0)\
1939 DCT_SAD4(q, %%mm, 8)\
1940 DCT_SAD4(q, %%mm, 64)\
1941 DCT_SAD4(q, %%mm, 72)\
1942 "paddusw %%mm1, %%mm0 \n\t"\
1943 HSUM(%%mm0, %%mm1, %0)
1944
1945 #define DCT_SAD_SSE2\
1946 "pxor %%xmm0, %%xmm0 \n\t"\
1947 "pxor %%xmm1, %%xmm1 \n\t"\
1948 DCT_SAD4(dqa, %%xmm, 0)\
1949 DCT_SAD4(dqa, %%xmm, 64)\
1950 "paddusw %%xmm1, %%xmm0 \n\t"\
1951 HSUM(%%xmm0, %%xmm1, %0)
1952
1953 #define DCT_SAD_FUNC(cpu) \
1954 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1955 int sum;\
1956 asm volatile(\
1957 DCT_SAD\
1958 :"=r"(sum)\
1959 :"r"(block)\
1960 );\
1961 return sum&0xFFFF;\
1962 }
1963
1964 #define DCT_SAD DCT_SAD_MMX
1965 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1966 #define MMABS(a,z) MMABS_MMX(a,z)
1967 DCT_SAD_FUNC(mmx)
1968 #undef MMABS
1969 #undef HSUM
1970
1971 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1972 #define MMABS(a,z) MMABS_MMX2(a,z)
1973 DCT_SAD_FUNC(mmx2)
1974 #undef HSUM
1975 #undef DCT_SAD
1976
1977 #define DCT_SAD DCT_SAD_SSE2
1978 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1979 DCT_SAD_FUNC(sse2)
1980 #undef MMABS
1981
1982 #ifdef HAVE_SSSE3
1983 #define MMABS(a,z) MMABS_SSSE3(a,z)
1984 DCT_SAD_FUNC(ssse3)
1985 #undef MMABS
1986 #endif
1987 #undef HSUM
1988 #undef DCT_SAD
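/* The sum_abs_dctelem_*() functions above return the sum of absolute values
 * of the 64 coefficients of one DCT block; roughly this scalar loop:
 *
 *   int i, sum = 0;
 *   for (i = 0; i < 64; i++)
 *       sum += FFABS(block[i]);
 *   return sum & 0xFFFF;   // the asm accumulates in packed 16-bit words
 */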
1989
1990 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1991 int sum;
1992 long i=size;
1993 asm volatile(
1994 "pxor %%mm4, %%mm4 \n"
1995 "1: \n"
1996 "sub $8, %0 \n"
1997 "movq (%2,%0), %%mm2 \n"
1998 "movq (%3,%0,2), %%mm0 \n"
1999 "movq 8(%3,%0,2), %%mm1 \n"
2000 "punpckhbw %%mm2, %%mm3 \n"
2001 "punpcklbw %%mm2, %%mm2 \n"
2002 "psraw $8, %%mm3 \n"
2003 "psraw $8, %%mm2 \n"
2004 "psubw %%mm3, %%mm1 \n"
2005 "psubw %%mm2, %%mm0 \n"
2006 "pmaddwd %%mm1, %%mm1 \n"
2007 "pmaddwd %%mm0, %%mm0 \n"
2008 "paddd %%mm1, %%mm4 \n"
2009 "paddd %%mm0, %%mm4 \n"
2010 "jg 1b \n"
2011 "movq %%mm4, %%mm3 \n"
2012 "psrlq $32, %%mm3 \n"
2013 "paddd %%mm3, %%mm4 \n"
2014 "movd %%mm4, %1 \n"
2015 :"+r"(i), "=r"(sum)
2016 :"r"(pix1), "r"(pix2)
2017 );
2018 return sum;
2019 }
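/* ssd_int8_vs_int16_mmx() computes the sum of squared differences between an
 * int8_t and an int16_t array, i.e. roughly:
 *
 *   int i, sum = 0;
 *   for (i = 0; i < size; i++)
 *       sum += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
 *   return sum;
 *
 * The asm consumes 8 samples per iteration, so size is expected to be a
 * multiple of 8. */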
2020
2021 #endif //CONFIG_ENCODERS
2022
2023 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2024 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
2025 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
2026 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
2027 "movq "#in7", " #m3 " \n\t" /* d */\
2028 "movq "#in0", %%mm5 \n\t" /* D */\
2029 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
2030 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
2031 "movq "#in1", %%mm5 \n\t" /* C */\
2032 "movq "#in2", %%mm6 \n\t" /* B */\
2033 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
2034 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
2035 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
2036 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
2037 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
2038 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
2039 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
2040 "psraw $5, %%mm5 \n\t"\
2041 "packuswb %%mm5, %%mm5 \n\t"\
2042 OP(%%mm5, out, %%mm7, d)
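/* QPEL_V_LOW emits one row of the MPEG-4 quarter-pel lowpass: with x1..x4
 * being the four symmetric sample pairs, it computes
 *
 *   out = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, saturated to 0..255
 *
 * i.e. the (-1, 3, -6, 20, 20, -6, 3, -1)/32 filter, matching the scalar
 * expressions in the *_3dnow fallback loops below. */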
2043
2044 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
2045 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2046 uint64_t temp;\
2047 \
2048 asm volatile(\
2049 "pxor %%mm7, %%mm7 \n\t"\
2050 "1: \n\t"\
2051 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2052 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2053 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2054 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2055 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2056 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2057 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2058 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2059 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2060 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2061 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2062 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2063 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2064 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2065 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2066 "paddw %%mm3, %%mm5 \n\t" /* b */\
2067 "paddw %%mm2, %%mm6 \n\t" /* c */\
2068 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2069 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2070 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2071 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2072 "paddw %%mm4, %%mm0 \n\t" /* a */\
2073 "paddw %%mm1, %%mm5 \n\t" /* d */\
2074 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2075 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2076 "paddw %6, %%mm6 \n\t"\
2077 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2078 "psraw $5, %%mm0 \n\t"\
2079 "movq %%mm0, %5 \n\t"\
2080 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2081 \
2082 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
2083 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
2084 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
2085 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
2086 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
2087 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
2088 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
2089 "paddw %%mm0, %%mm2 \n\t" /* b */\
2090 "paddw %%mm5, %%mm3 \n\t" /* c */\
2091 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2092 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2093 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
2094 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
2095 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
2096 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
2097 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2098 "paddw %%mm2, %%mm1 \n\t" /* a */\
2099 "paddw %%mm6, %%mm4 \n\t" /* d */\
2100 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2101 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
2102 "paddw %6, %%mm1 \n\t"\
2103 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
2104 "psraw $5, %%mm3 \n\t"\
2105 "movq %5, %%mm1 \n\t"\
2106 "packuswb %%mm3, %%mm1 \n\t"\
2107 OP_MMX2(%%mm1, (%1),%%mm4, q)\
2108 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2109 \
2110 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
2111 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2112 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2113 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2114 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2115 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2116 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2117 "paddw %%mm1, %%mm5 \n\t" /* b */\
2118 "paddw %%mm4, %%mm0 \n\t" /* c */\
2119 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2120 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2121 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2122 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2123 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2124 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2125 "paddw %%mm3, %%mm2 \n\t" /* d */\
2126 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2127 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2128 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2129 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2130 "paddw %%mm2, %%mm6 \n\t" /* a */\
2131 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2132 "paddw %6, %%mm0 \n\t"\
2133 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2134 "psraw $5, %%mm0 \n\t"\
2135 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2136 \
2137 "paddw %%mm5, %%mm3 \n\t" /* a */\
2138 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2139 "paddw %%mm4, %%mm6 \n\t" /* b */\
2140 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2141 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2142 "paddw %%mm1, %%mm4 \n\t" /* c */\
2143 "paddw %%mm2, %%mm5 \n\t" /* d */\
2144 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2145 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2146 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2147 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2148 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2149 "paddw %6, %%mm4 \n\t"\
2150 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2151 "psraw $5, %%mm4 \n\t"\
2152 "packuswb %%mm4, %%mm0 \n\t"\
2153 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2154 \
2155 "add %3, %0 \n\t"\
2156 "add %4, %1 \n\t"\
2157 "decl %2 \n\t"\
2158 " jnz 1b \n\t"\
2159 : "+a"(src), "+c"(dst), "+g"(h)\
2160 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2161 : "memory"\
2162 );\
2163 }\
2164 \
2165 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2166 int i;\
2167 int16_t temp[16];\
2168 /* quick HACK, XXX FIXME MUST be optimized */\
2169 for(i=0; i<h; i++)\
2170 {\
2171 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2172 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2173 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2174 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2175 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2176 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2177 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2178 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2179 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2180 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2181 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2182 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2183 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2184 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2185 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2186 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2187 asm volatile(\
2188 "movq (%0), %%mm0 \n\t"\
2189 "movq 8(%0), %%mm1 \n\t"\
2190 "paddw %2, %%mm0 \n\t"\
2191 "paddw %2, %%mm1 \n\t"\
2192 "psraw $5, %%mm0 \n\t"\
2193 "psraw $5, %%mm1 \n\t"\
2194 "packuswb %%mm1, %%mm0 \n\t"\
2195 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2196 "movq 16(%0), %%mm0 \n\t"\
2197 "movq 24(%0), %%mm1 \n\t"\
2198 "paddw %2, %%mm0 \n\t"\
2199 "paddw %2, %%mm1 \n\t"\
2200 "psraw $5, %%mm0 \n\t"\
2201 "psraw $5, %%mm1 \n\t"\
2202 "packuswb %%mm1, %%mm0 \n\t"\
2203 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2204 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2205 : "memory"\
2206 );\
2207 dst+=dstStride;\
2208 src+=srcStride;\
2209 }\
2210 }\
2211 \
2212 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2213 uint64_t temp;\
2214 \
2215 asm volatile(\
2216 "pxor %%mm7, %%mm7 \n\t"\
2217 "1: \n\t"\
2218 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2219 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2220 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2221 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2222 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2223 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2224 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2225 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2226 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2227 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2228 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2229 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2230 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2231 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2232 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2233 "paddw %%mm3, %%mm5 \n\t" /* b */\
2234 "paddw %%mm2, %%mm6 \n\t" /* c */\
2235 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2236 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2237 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2238 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2239 "paddw %%mm4, %%mm0 \n\t" /* a */\
2240 "paddw %%mm1, %%mm5 \n\t" /* d */\
2241 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2242 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2243 "paddw %6, %%mm6 \n\t"\
2244 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2245 "psraw $5, %%mm0 \n\t"\
2246 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2247 \
2248 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2249 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2250 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2251 "paddw %%mm5, %%mm1 \n\t" /* a */\
2252 "paddw %%mm6, %%mm2 \n\t" /* b */\
2253 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2254 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2255 "paddw %%mm6, %%mm3 \n\t" /* c */\
2256 "paddw %%mm5, %%mm4 \n\t" /* d */\
2257 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2258 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2259 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2260 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2261 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2262 "paddw %6, %%mm1 \n\t"\
2263 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2264 "psraw $5, %%mm3 \n\t"\
2265 "packuswb %%mm3, %%mm0 \n\t"\
2266 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2267 \
2268 "add %3, %0 \n\t"\
2269 "add %4, %1 \n\t"\
2270 "decl %2 \n\t"\
2271 " jnz 1b \n\t"\
2272 : "+a"(src), "+c"(dst), "+g"(h)\
2273 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2274 : "memory"\
2275 );\
2276 }\
2277 \
2278 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2279 int i;\
2280 int16_t temp[8];\
2281 /* quick HACK, XXX FIXME MUST be optimized */\
2282 for(i=0; i<h; i++)\
2283 {\
2284 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2285 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2286 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2287 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2288 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2289 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2290 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2291 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2292 asm volatile(\
2293 "movq (%0), %%mm0 \n\t"\
2294 "movq 8(%0), %%mm1 \n\t"\
2295 "paddw %2, %%mm0 \n\t"\
2296 "paddw %2, %%mm1 \n\t"\
2297 "psraw $5, %%mm0 \n\t"\
2298 "psraw $5, %%mm1 \n\t"\
2299 "packuswb %%mm1, %%mm0 \n\t"\
2300 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2301 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2302 :"memory"\
2303 );\
2304 dst+=dstStride;\
2305 src+=srcStride;\
2306 }\
2307 }
2308
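/* QPEL_OP generates the full set of qpelN_mcXY_<cpu> motion compensation
 * functions. In mcXY, X is the horizontal and Y the vertical quarter-pel
 * phase (0..3): half-pel positions come straight from the h/v lowpass
 * filters above, and the remaining quarter-pel positions are built by
 * averaging a lowpass result with the source (or with a second lowpass
 * result) via the pixels*_l2 helpers. */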
2309 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2310 \
2311 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2312 uint64_t temp[17*4];\
2313 uint64_t *temp_ptr= temp;\
2314 int count= 17;\
2315 \
2316 /*FIXME unroll */\
2317 asm volatile(\
2318 "pxor %%mm7, %%mm7 \n\t"\
2319 "1: \n\t"\
2320 "movq (%0), %%mm0 \n\t"\
2321 "movq (%0), %%mm1 \n\t"\
2322 "movq 8(%0), %%mm2 \n\t"\
2323 "movq 8(%0), %%mm3 \n\t"\
2324 "punpcklbw %%mm7, %%mm0 \n\t"\
2325 "punpckhbw %%mm7, %%mm1 \n\t"\
2326 "punpcklbw %%mm7, %%mm2 \n\t"\
2327 "punpckhbw %%mm7, %%mm3 \n\t"\
2328 "movq %%mm0, (%1) \n\t"\
2329 "movq %%mm1, 17*8(%1) \n\t"\
2330 "movq %%mm2, 2*17*8(%1) \n\t"\
2331 "movq %%mm3, 3*17*8(%1) \n\t"\
2332 "add $8, %1 \n\t"\
2333 "add %3, %0 \n\t"\
2334 "decl %2 \n\t"\
2335 " jnz 1b \n\t"\
2336 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2337 : "r" ((long)srcStride)\
2338 : "memory"\
2339 );\
2340 \
2341 temp_ptr= temp;\
2342 count=4;\
2343 \
2344 /*FIXME reorder for speed */\
2345 asm volatile(\
2346 /*"pxor %%mm7, %%mm7 \n\t"*/\
2347 "1: \n\t"\
2348 "movq (%0), %%mm0 \n\t"\
2349 "movq 8(%0), %%mm1 \n\t"\
2350 "movq 16(%0), %%mm2 \n\t"\
2351 "movq 24(%0), %%mm3 \n\t"\
2352 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2353 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2354 "add %4, %1 \n\t"\
2355 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2356 \
2357 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2358 "add %4, %1 \n\t"\
2359 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2360 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2361 "add %4, %1 \n\t"\
2362 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2363 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2364 "add %4, %1 \n\t"\
2365 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2366 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2367 "add %4, %1 \n\t"\
2368 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2369 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2370 "add %4, %1 \n\t"\
2371 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2372 \
2373 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2374 "add %4, %1 \n\t" \
2375 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2376 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2377 \
2378 "add $136, %0 \n\t"\
2379 "add %6, %1 \n\t"\
2380 "decl %2 \n\t"\
2381 " jnz 1b \n\t"\
2382 \
2383 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2384 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2385 :"memory"\
2386 );\
2387 }\
2388 \
2389 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2390 uint64_t temp[9*2];\
2391 uint64_t *temp_ptr= temp;\
2392 int count= 9;\
2393 \
2394 /*FIXME unroll */\
2395 asm volatile(\
2396 "pxor %%mm7, %%mm7 \n\t"\
2397 "1: \n\t"\
2398 "movq (%0), %%mm0 \n\t"\
2399 "movq (%0), %%mm1 \n\t"\
2400 "punpcklbw %%mm7, %%mm0 \n\t"\
2401 "punpckhbw %%mm7, %%mm1 \n\t"\
2402 "movq %%mm0, (%1) \n\t"\
2403 "movq %%mm1, 9*8(%1) \n\t"\
2404 "add $8, %1 \n\t"\
2405 "add %3, %0 \n\t"\
2406 "decl %2 \n\t"\
2407 " jnz 1b \n\t"\
2408 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2409 : "r" ((long)srcStride)\
2410 : "memory"\
2411 );\
2412 \
2413 temp_ptr= temp;\
2414 count=2;\
2415 \
2416 /*FIXME reorder for speed */\
2417 asm volatile(\
2418 /*"pxor %%mm7, %%mm7 \n\t"*/\
2419 "1: \n\t"\
2420 "movq (%0), %%mm0 \n\t"\
2421 "movq 8(%0), %%mm1 \n\t"\
2422 "movq 16(%0), %%mm2 \n\t"\
2423 "movq 24(%0), %%mm3 \n\t"\
2424 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2425 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2426 "add %4, %1 \n\t"\
2427 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2428 \
2429 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2430 "add %4, %1 \n\t"\
2431 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2432 \
2433 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2434 "add %4, %1 \n\t"\
2435 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2436 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2437 \
2438 "add $72, %0 \n\t"\
2439 "add %6, %1 \n\t"\
2440 "decl %2 \n\t"\
2441 " jnz 1b \n\t"\
2442 \
2443 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2444 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2445 : "memory"\
2446 );\
2447 }\
2448 \
2449 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2450 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
2451 }\
2452 \
2453 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2454 uint64_t temp[8];\
2455 uint8_t * const half= (uint8_t*)temp;\
2456 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2457 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2458 }\
2459 \
2460 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2461 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2462 }\
2463 \
2464 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2465 uint64_t temp[8];\
2466 uint8_t * const half= (uint8_t*)temp;\
2467 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2468 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2469 }\
2470 \
2471 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2472 uint64_t temp[8];\
2473 uint8_t * const half= (uint8_t*)temp;\
2474 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2475 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2476 }\
2477 \
2478 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2479 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2480 }\
2481 \
2482 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2483 uint64_t temp[8];\
2484 uint8_t * const half= (uint8_t*)temp;\
2485 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2486 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2487 }\
2488 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2489 uint64_t half[8 + 9];\
2490 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2491 uint8_t * const halfHV= ((uint8_t*)half);\
2492 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2493 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2494 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2495 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2496 }\
2497 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2498 uint64_t half[8 + 9];\
2499 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2500 uint8_t * const halfHV= ((uint8_t*)half);\
2501 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2502 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2503 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2504 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2505 }\
2506 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2507 uint64_t half[8 + 9];\
2508 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2509 uint8_t * const halfHV= ((uint8_t*)half);\
2510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2511 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2512 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2513 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2514 }\
2515 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2516 uint64_t half[8 + 9];\
2517 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2518 uint8_t * const halfHV= ((uint8_t*)half);\
2519 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2520 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2521 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2522 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2523 }\
2524 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2525 uint64_t half[8 + 9];\
2526 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2527 uint8_t * const halfHV= ((uint8_t*)half);\
2528 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2529 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2530 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2531 }\
2532 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2533 uint64_t half[8 + 9];\
2534 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2535 uint8_t * const halfHV= ((uint8_t*)half);\
2536 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2537 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2538 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2539 }\
2540 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2541 uint64_t half[8 + 9];\
2542 uint8_t * const halfH= ((uint8_t*)half);\
2543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2544 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2545 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2546 }\
2547 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2548 uint64_t half[8 + 9];\
2549 uint8_t * const halfH= ((uint8_t*)half);\
2550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2551 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2553 }\
2554 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2555 uint64_t half[9];\
2556 uint8_t * const halfH= ((uint8_t*)half);\
2557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2558 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2559 }\
2560 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2561 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
2562 }\
2563 \
2564 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2565 uint64_t temp[32];\
2566 uint8_t * const half= (uint8_t*)temp;\
2567 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2568 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2569 }\
2570 \
2571 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2572 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2573 }\
2574 \
2575 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2576 uint64_t temp[32];\
2577 uint8_t * const half= (uint8_t*)temp;\
2578 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2579 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2580 }\
2581 \
2582 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2583 uint64_t temp[32];\
2584 uint8_t * const half= (uint8_t*)temp;\
2585 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2586 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2587 }\
2588 \
2589 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2590 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2591 }\
2592 \
2593 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2594 uint64_t temp[32];\
2595 uint8_t * const half= (uint8_t*)temp;\
2596 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2597 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2598 }\
2599 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2600 uint64_t half[16*2 + 17*2];\
2601 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2602 uint8_t * const halfHV= ((uint8_t*)half);\
2603 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2604 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2605 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2606 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2607 }\
2608 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2609 uint64_t half[16*2 + 17*2];\
2610 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2611 uint8_t * const halfHV= ((uint8_t*)half);\
2612 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2613 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2614 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2615 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2616 }\
2617 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2618 uint64_t half[16*2 + 17*2];\
2619 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2620 uint8_t * const halfHV= ((uint8_t*)half);\
2621 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2622 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2623 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2624 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2625 }\
2626 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2627 uint64_t half[16*2 + 17*2];\
2628 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2629 uint8_t * const halfHV= ((uint8_t*)half);\
2630 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2631 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2632 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2633 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2634 }\
2635 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2636 uint64_t half[16*2 + 17*2];\
2637 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2638 uint8_t * const halfHV= ((uint8_t*)half);\
2639 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2640 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2641 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2642 }\
2643 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2644 uint64_t half[16*2 + 17*2];\
2645 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2646 uint8_t * const halfHV= ((uint8_t*)half);\
2647 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2648 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2649 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2650 }\
2651 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2652 uint64_t half[17*2];\
2653 uint8_t * const halfH= ((uint8_t*)half);\
2654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2655 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2656 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2657 }\
2658 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2659 uint64_t half[17*2];\
2660 uint8_t * const halfH= ((uint8_t*)half);\
2661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2662 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2664 }\
2665 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2666 uint64_t half[17*2];\
2667 uint8_t * const halfH= ((uint8_t*)half);\
2668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2669 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2670 }
2671
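/* OP macros plugged into QPEL_BASE/QPEL_OP: PUT_OP stores the filtered
 * result, the AVG variants average it with the existing destination
 * (pavgb on MMX2, pavgusb on 3DNow!). The ROUNDER argument selects the
 * rounding behaviour of the >>5 filters: ff_pw_16 for the rounded and
 * ff_pw_15 for the no_rnd variants. */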
2672 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
2673 #define AVG_3DNOW_OP(a,b,temp, size) \
2674 "mov" #size " " #b ", " #temp " \n\t"\
2675 "pavgusb " #temp ", " #a " \n\t"\
2676 "mov" #size " " #a ", " #b " \n\t"
2677 #define AVG_MMX2_OP(a,b,temp, size) \
2678 "mov" #size " " #b ", " #temp " \n\t"\
2679 "pavgb " #temp ", " #a " \n\t"\
2680 "mov" #size " " #a ", " #b " \n\t"
2681
2682 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2683 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2684 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2685 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2686 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2687 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2688 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2689 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2690 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2691
2692 /***********************************/
2693 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2694
2695 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2696 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2697 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2698 }
2699 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2700 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2701 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2702 }
2703
2704 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2705 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2706 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2707 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2708 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2709 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2710 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2711 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2712 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2713 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2714 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2715 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2716 }\
2717 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2718 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2719 }\
2720 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2721 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2722 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2723 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2724 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2725 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2726 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2727 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2728
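/* These wrappers approximate each quarter-pel position with a short average
 * of nearby full- and half-pel samples (QPEL_2TAP_XY reuses the plain
 * half-pel functions, QPEL_2TAP_L3 blends three source offsets) instead of
 * the 8-tap MPEG-4 filter, hence "not compliant to any spec" above. */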
2729 QPEL_2TAP(put_, 16, mmx2)
2730 QPEL_2TAP(avg_, 16, mmx2)
2731 QPEL_2TAP(put_, 8, mmx2)
2732 QPEL_2TAP(avg_, 8, mmx2)
2733 QPEL_2TAP(put_, 16, 3dnow)
2734 QPEL_2TAP(avg_, 16, 3dnow)
2735 QPEL_2TAP(put_, 8, 3dnow)
2736 QPEL_2TAP(avg_, 8, 3dnow)
2737
2738
2739 #if 0
2740 static void just_return() { return; }
2741 #endif
2742
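/* gmc_mmx(): global motion compensation for one 8x8 block. The source
 * position advances by an affine field (origin ox,oy; increments
 * dxx,dxy,dyx,dyy); the MMX path only handles the common case of a constant
 * full-pel offset with limited sub-pel precision and otherwise falls back
 * to ff_gmc_c(). Per pixel it does a bilinear blend, roughly (with
 * s = 1<<shift):
 *
 *   dst = ( src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy)
 *         + src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + r ) >> (2*shift);
 */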
2743 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2744 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2745 const int w = 8;
2746 const int ix = ox>>(16+shift);
2747 const int iy = oy>>(16+shift);
2748 const int oxs = ox>>4;
2749 const int oys = oy>>4;
2750 const int dxxs = dxx>>4;
2751 const int dxys = dxy>>4;
2752 const int dyxs = dyx>>4;
2753 const int dyys = dyy>>4;
2754 const uint16_t r4[4] = {r,r,r,r};
2755 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2756 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2757 const uint64_t shift2 = 2*shift;
2758 uint8_t edge_buf[(h+1)*stride];
2759 int x, y;
2760
2761 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2762 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2763 const int dxh = dxy*(h-1);
2764 const int dyw = dyx*(w-1);
2765 if( // non-constant fullpel offset (3% of blocks)
2766 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
2767 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
2768 // uses more than 16 bits of subpel mv (only at huge resolution)
2769 || (dxx|dxy|dyx|dyy)&15 )
2770 {
2771 //FIXME could still use mmx for some of the rows
2772 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2773 return;
2774 }
2775
2776 src += ix + iy*stride;
2777 if( (unsigned)ix >= width-w ||
2778 (unsigned)iy >= height-h )
2779 {
2780 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2781 src = edge_buf;
2782 }
2783
2784 asm volatile(
2785 "movd %0, %%mm6 \n\t"
2786 "pxor %%mm7, %%mm7 \n\t"
2787 "punpcklwd %%mm6, %%mm6 \n\t"
2788 "punpcklwd %%mm6, %%mm6 \n\t"
2789 :: "r"(1<<shift)
2790 );
2791
2792 for(x=0; x<w; x+=4){
2793 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2794 oxs - dxys + dxxs*(x+1),
2795 oxs - dxys + dxxs*(x+2),
2796 oxs - dxys + dxxs*(x+3) };
2797 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2798 oys - dyys + dyxs*(x+1),
2799 oys - dyys + dyxs*(x+2),
2800 oys - dyys + dyxs*(x+3) };
2801
2802 for(y=0; y<h; y++){
2803 asm volatile(
2804 "movq %0, %%mm4 \n\t"
2805 "movq %1, %%mm5 \n\t"
2806 "paddw %2, %%mm4 \n\t"
2807 "paddw %3, %%mm5 \n\t"
2808 "movq %%mm4, %0 \n\t"
2809 "movq %%mm5, %1 \n\t"
2810 "psrlw $12, %%mm4 \n\t"
2811 "psrlw $12, %%mm5 \n\t"
2812 : "+m"(*dx4), "+m"(*dy4)
2813 : "m"(*dxy4), "m"(*dyy4)
2814 );
2815
2816 asm volatile(
2817 "movq %%mm6, %%mm2 \n\t"
2818 "movq %%mm6, %%mm1 \n\t"
2819 "psubw %%mm4, %%mm2 \n\t"
2820 "psubw %%mm5, %%mm1 \n\t"
2821 "movq %%mm2, %%mm0 \n\t"
2822 "movq %%mm4, %%mm3 \n\t"
2823 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2824 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2825 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2826 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2827
2828 "movd %4, %%mm5 \n\t"
2829 "movd %3, %%mm4 \n\t"
2830 "punpcklbw %%mm7, %%mm5 \n\t"
2831 "punpcklbw %%mm7, %%mm4 \n\t"
2832 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2833 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2834
2835 "movd %2, %%mm5 \n\t"
2836 "movd %1, %%mm4 \n\t"
2837 "punpcklbw %%mm7, %%mm5 \n\t"
2838 "punpcklbw %%mm7, %%mm4 \n\t"
2839 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2840 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2841 "paddw %5, %%mm1 \n\t"
2842 "paddw %%mm3, %%mm2 \n\t"
2843 "paddw %%mm1, %%mm0 \n\t"
2844 "paddw %%mm2, %%mm0 \n\t"
2845
2846 "psrlw %6, %%mm0 \n\t"
2847 "packuswb %%mm0, %%mm0 \n\t"
2848 "movd %%mm0, %0 \n\t"
2849
2850 : "=m"(dst[x+y*stride])
2851 : "m"(src[0]), "m"(src[1]),
2852 "m"(src[stride]), "m"(src[stride+1]),
2853 "m"(*r4), "m"(shift2)
2854 );
2855 src += stride;
2856 }
2857 src += 4-h*stride;
2858 }
2859 }
2860
2861 #ifdef CONFIG_ENCODERS
2862
2863 #define PHADDD(a, t)\
2864 "movq "#a", "#t" \n\t"\
2865 "psrlq $32, "#a" \n\t"\
2866 "paddd "#t", "#a" \n\t"
2867 /*
2868 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2869 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2870 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
2871 */
2872 #define PMULHRW(x, y, s, o)\
2873 "pmulhw " #s ", "#x " \n\t"\
2874 "pmulhw " #s ", "#y " \n\t"\
2875 "paddw " #o ", "#x " \n\t"\
2876 "paddw " #o ", "#y " \n\t"\
2877 "psraw $1, "#x " \n\t"\
2878 "psraw $1, "#y " \n\t"
2879 #define DEF(x) x ## _mmx
2880 #define SET_RND MOVQ_WONE
2881 #define SCALE_OFFSET 1
2882
2883 #include "dsputil_mmx_qns.h"
2884
2885 #undef DEF
2886 #undef SET_RND
2887 #undef SCALE_OFFSET
2888 #undef PMULHRW
2889
2890 #define DEF(x) x ## _3dnow
2891 #define SET_RND(x)
2892 #define SCALE_OFFSET 0
2893 #define PMULHRW(x, y, s, o)\
2894 "pmulhrw " #s ", "#x " \n\t"\
2895 "pmulhrw " #s ", "#y " \n\t"
2896
2897 #include "dsputil_mmx_qns.h"
2898
2899 #undef DEF
2900 #undef SET_RND
2901 #undef SCALE_OFFSET
2902 #undef PMULHRW
2903
2904 #ifdef HAVE_SSSE3
2905 #undef PHADDD
2906 #define DEF(x) x ## _ssse3
2907 #define SET_RND(x)
2908 #define SCALE_OFFSET -1
2909 #define PHADDD(a, t)\
2910 "pshufw $0x0E, "#a", "#t" \n\t"\
2911 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2912 #define PMULHRW(x, y, s, o)\
2913 "pmulhrsw " #s ", "#x " \n\t"\
2914 "pmulhrsw " #s ", "#y " \n\t"
2915
2916 #include "dsputil_mmx_qns.h"
2917
2918 #undef DEF
2919 #undef SET_RND
2920 #undef SCALE_OFFSET
2921 #undef PMULHRW
2922 #undef PHADDD
2923 #endif //HAVE_SSSE3
2924
2925 #endif /* CONFIG_ENCODERS */
2926
2927 #define PREFETCH(name, op) \
2928 static void name(void *mem, int stride, int h){\
2929 const uint8_t *p= mem;\
2930 do{\
2931 asm volatile(#op" %0" :: "m"(*p));\
2932 p+= stride;\
2933 }while(--h);\
2934 }
2935 PREFETCH(prefetch_mmx2, prefetcht0)
2936 PREFETCH(prefetch_3dnow, prefetch)
2937 #undef PREFETCH
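/* prefetch_mmx2()/prefetch_3dnow() just walk the block issuing cache
 * prefetch hints (prefetcht0 resp. the 3DNow! prefetch instruction), one
 * line per iteration for h lines; they have no architectural side effects. */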
2938
2939 #include "h264dsp_mmx.c"
2940
2941 /* CAVS specific */
2942 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2943
2944 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2945 put_pixels8_mmx(dst, src, stride, 8);
2946 }
2947 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2948 avg_pixels8_mmx(dst, src, stride, 8);
2949 }
2950 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2951 put_pixels16_mmx(dst, src, stride, 16);
2952 }
2953 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2954 avg_pixels16_mmx(dst