Do not misuse long as the size of a register in x86.
libavcodec/i386/dsputilenc_mmx.c (libav.git)
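On LP64 targets a long matches the general-purpose register width, but 64-bit Windows is LLP64, where long stays 32 bits, so casting pointers or strides to long truncates them in inline asm. Register-sized asm operands therefore use the x86_reg type from x86_cpu.h instead. A rough sketch of that typedef (the exact header contents are assumed, not shown here):

    #ifdef ARCH_X86_64
    typedef int64_t x86_reg;
    #else
    typedef int32_t x86_reg;
    #endif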
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "mpegvideo.h"
#include "x86_cpu.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
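/* For reference, a scalar sketch of what the loop above computes (roughly
 * the C fallback in dsputil.c): widen an 8x8 block of bytes to 16-bit
 * DCTELEMs. The asm widens with punpcklbw/punpckhbw against the zeroed
 * mm7 and handles two rows per iteration, indexing the destination with
 * REG_a counting up from -128 to 0.
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[8*i + j] = pixels[i*line_size + j];
 */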

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
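/* Scalar equivalent: block[8*i + j] = s1[i*stride + j] - s2[i*stride + j]
 * over the same 8x8 area; the bytes are widened exactly as in
 * get_pixels_mmx so the subtraction cannot wrap. */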

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}
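/* pix_sum16 adds up all 256 pixels of a 16x16 block. The worst case,
 * 256*255 = 65280, still fits in 16 bits, which is why the tail can fold
 * the four word-wide partial sums with psrlq+paddw and mask the result
 * with 0xFFFF instead of widening to dwords. */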

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                            pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
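/* pix_norm1 is the sum of squares of a 16x16 block:
 *
 *     for (y = 0; y < 16; y++)
 *         for (x = 0; x < 16; x++)
 *             sum += pix[y*line_size + x] * pix[y*line_size + x];
 *
 * pmaddwd both squares the words and performs the first level of the
 * horizontal add, so only one dword fold is needed at the end. */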

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
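/* The psubusb/por sequence above (also used by the two sse16 functions
 * below) is the classic branch-free byte abs-difference: unsigned
 * saturating subtraction clamps the wrong-way result to zero, so
 *
 *     d1 = (a >= b) ? a - b : 0;   // psubusb
 *     d2 = (b >= a) ? b - a : 0;   // psubusb
 *     |a - b| == d1 | d2;          // por; one operand is always 0
 *
 * pmaddwd then squares the widened differences and adds adjacent
 * products, halving the number of accumulations. */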

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"      /* mm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"      /* mm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"      /* mm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"   /* mm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"   /* mm4 = pix2[1][0-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}
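/* The pcmpgtw/pxor/psubw triples above are the branch-free word abs that
 * the MMABS_MMX macro further down wraps:
 *
 *     m = (x < 0) ? 0xFFFF : 0;    // pcmpgtw against a zeroed register
 *     |x| = (x ^ m) - m;           // one's complement, then +1 if negative
 *
 * As far as I can tell, hf_noise8 sums |(p[x][y]-p[x+1][y]) -
 * (p[x][y+1]-p[x+1][y+1])|, the absolute second mixed derivative, as a
 * measure of high-frequency noise; hf_noise16 below does the same with a
 * 1-byte-offset load instead of the shift trick. */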

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
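/* The nsse ("noise preserving sse") comparisons combine plain SSE with
 * the difference in high-frequency noise so the encoder is not rewarded
 * for smoothing texture away:
 *
 *     score = sse(pix1, pix2)
 *           + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * with a default weight of 8 when no context is available. */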

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM
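/* psadbw computes sum(|a[i]-b[i]|) over eight byte pairs in a single
 * instruction, leaving the 16-bit total in the low word, so the MMX2 SUM
 * macro above replaces the whole unpack/abs/accumulate chain of the plain
 * MMX version with one op per 8 bytes. */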

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
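/* vsad16_mmx above (and vsad16_mmx2 below) sum |d(x,y) - d(x,y+1)| where
 * d = pix1 - pix2. The prologue builds mm7 = 0x80 in every byte
 * (pcmpeqw -> 0xFFFF, psllw $15 -> 0x8000, packsswb -> 0x80) so that the
 * wrapping psubb difference can be biased into unsigned range with pxor,
 * where the psubusb/por abs-difference trick applies. */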

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
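/* Scalar form of the prediction (mirroring the C version in dsputil.c):
 *
 *     const int pred = mid_pred(L, T, L + T - LT);  // median of three
 *     dst[i] = src2[i] - pred;
 *
 * with T = src1[i], LT = src1[i-1], L = src2[i-1]. The pmaxub/pminub
 * sequence clamps L+T-LT between min(L,T) and max(L,T), which is the
 * median; dst[0], *left and *left_top are patched up in C because the
 * asm cannot see the previous block's state. */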

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "         \n\t"\
    "paddw " #b2 ", " #a2 "         \n\t"\
    "paddw " #b1 ", " #b1 "         \n\t"\
    "paddw " #b2 ", " #b2 "         \n\t"\
    "psubw " #a1 ", " #b1 "         \n\t"\
    "psubw " #a2 ", " #b2 "         \n\t"

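/* LBUTTERFLY2 is an in-place butterfly on two register pairs: afterwards
 * each a holds a+b and each b holds b-a (paddw b,b gives 2b, then
 * psubw a,b gives 2b-(a+b) = b-a). Three such rounds over eight
 * registers, as HADAMARD8 below schedules them, give an 8-point
 * Walsh-Hadamard transform up to permutation and sign. */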
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "psubw " #a ", " #z "           \n\t"\
    "pmaxsw " #z ", " #a "          \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "           \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "       \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)            \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2            \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"         \n\t"\
    "movq "#o"+8(%1),  "#b"         \n\t"\
    "movq "#o"+16(%1), "#c"         \n\t"\
    "movq "#o"+24(%1), "#d"         \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)            \n\t"\
    "movq "#b", "#o"+8(%1)          \n\t"\
    "movq "#c", "#o"+16(%1)         \n\t"\
    "movq "#d", "#o"+24(%1)         \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                \n\t"\
    "psrlq $32, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movq "#a", "#t"                \n\t"\
    "psrlq $16, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshufw $0x01, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"             \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x0E, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x01, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
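/* hadamard8_diff_* compute a SATD-style metric: diff the two 8x8 blocks,
 * run an 8x8 Walsh-Hadamard transform (rows, transpose, columns - done in
 * two 4x8 halves above because eight MMX registers cannot hold an 8x8
 * word matrix at once), then sum the absolute transform coefficients.
 * The result is masked to 16 bits since the paddusw accumulation
 * saturates; see the FIXME above HSUM_MMX. */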

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2    \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3    \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4    \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5    \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0              \n\t"\
    "pxor %%mm1, %%mm1              \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0           \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0            \n\t"\
    "pxor %%xmm1, %%xmm1            \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0         \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    asm volatile(
        "pxor %%mm4, %%mm4              \n"
        "1:                             \n"
        "sub $8, %0                     \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"
        "jg 1b                          \n"
        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        "movd %%mm4, %1                 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
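/* ssd_int8_vs_int16 returns sum((pix1[i]-pix2[i])^2) with int8 pix1 and
 * int16 pix2. punpcklbw-with-itself / punpckhbw place each int8 in the
 * high byte of a word, and psraw $8 shifts it back with sign extension;
 * note mm3 may be read uninitialized on the first iteration, which is
 * harmless because its stale bytes land in the low half of each word and
 * are shifted out. */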

#define PHADDD(a, t)\
    "movq "#a", "#t"                \n\t"\
    "psrlq $32, "#a"                \n\t"\
    "paddd "#t", "#a"               \n\t"
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "           \n\t"\
    "pmulhw " #s ", "#y "           \n\t"\
    "paddw " #o ", "#x "            \n\t"\
    "paddw " #o ", "#y "            \n\t"\
    "psraw $1, "#x "                \n\t"\
    "psraw $1, "#y "                \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "          \n\t"\
    "pmulhrw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"       \n\t"\
    "paddd "#t", "#a"               \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "         \n\t"\
    "pmulhrsw " #s ", "#y "         \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & MM_SSE2){
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}