Simplify vsad16_mmx2().
[libav.git] / libavcodec / i386 / dsputilenc_mmx.c
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25#include "libavutil/x86_cpu.h"
26#include "libavcodec/dsputil.h"
27#include "libavcodec/mpegvideo.h"
28#include "dsputil_mmx.h"
29
30
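/* get_pixels: convert an 8x8 block of unsigned bytes into 16-bit DCTELEMs.
 * Two rows are handled per iteration: the bytes are zero-extended with
 * punpck{l,h}bw against the cleared mm7 and stored through a negative index
 * (REG_a runs from -128 up to 0 while %1 points at block+64, i.e. the end
 * of the 64-element block). */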
31static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
32{
33 asm volatile(
34 "mov $-128, %%"REG_a" \n\t"
35 "pxor %%mm7, %%mm7 \n\t"
36 ASMALIGN(4)
37 "1: \n\t"
38 "movq (%0), %%mm0 \n\t"
39 "movq (%0, %2), %%mm2 \n\t"
40 "movq %%mm0, %%mm1 \n\t"
41 "movq %%mm2, %%mm3 \n\t"
42 "punpcklbw %%mm7, %%mm0 \n\t"
43 "punpckhbw %%mm7, %%mm1 \n\t"
44 "punpcklbw %%mm7, %%mm2 \n\t"
45 "punpckhbw %%mm7, %%mm3 \n\t"
46 "movq %%mm0, (%1, %%"REG_a") \n\t"
47 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
48 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
49 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
50 "add %3, %0 \n\t"
51 "add $32, %%"REG_a" \n\t"
52 "js 1b \n\t"
53 : "+r" (pixels)
54 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
55 : "%"REG_a
56 );
57}
58
59static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
60{
61 asm volatile(
62 "pxor %%mm7, %%mm7 \n\t"
63 "mov $-128, %%"REG_a" \n\t"
64 ASMALIGN(4)
65 "1: \n\t"
66 "movq (%0), %%mm0 \n\t"
67 "movq (%1), %%mm2 \n\t"
68 "movq %%mm0, %%mm1 \n\t"
69 "movq %%mm2, %%mm3 \n\t"
70 "punpcklbw %%mm7, %%mm0 \n\t"
71 "punpckhbw %%mm7, %%mm1 \n\t"
72 "punpcklbw %%mm7, %%mm2 \n\t"
73 "punpckhbw %%mm7, %%mm3 \n\t"
74 "psubw %%mm2, %%mm0 \n\t"
75 "psubw %%mm3, %%mm1 \n\t"
76 "movq %%mm0, (%2, %%"REG_a") \n\t"
77 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
78 "add %3, %0 \n\t"
79 "add %3, %1 \n\t"
80 "add $16, %%"REG_a" \n\t"
81 "jnz 1b \n\t"
82 : "+r" (s1), "+r" (s2)
83 : "r" (block+64), "r" ((x86_reg)stride)
84 : "%"REG_a
85 );
86}
87
88static int pix_sum16_mmx(uint8_t * pix, int line_size){
89 const int h=16;
90 int sum;
91 x86_reg index= -line_size*h;
92
93 asm volatile(
94 "pxor %%mm7, %%mm7 \n\t"
95 "pxor %%mm6, %%mm6 \n\t"
96 "1: \n\t"
97 "movq (%2, %1), %%mm0 \n\t"
98 "movq (%2, %1), %%mm1 \n\t"
99 "movq 8(%2, %1), %%mm2 \n\t"
100 "movq 8(%2, %1), %%mm3 \n\t"
101 "punpcklbw %%mm7, %%mm0 \n\t"
102 "punpckhbw %%mm7, %%mm1 \n\t"
103 "punpcklbw %%mm7, %%mm2 \n\t"
104 "punpckhbw %%mm7, %%mm3 \n\t"
105 "paddw %%mm0, %%mm1 \n\t"
106 "paddw %%mm2, %%mm3 \n\t"
107 "paddw %%mm1, %%mm3 \n\t"
108 "paddw %%mm3, %%mm6 \n\t"
109 "add %3, %1 \n\t"
110 " js 1b \n\t"
111 "movq %%mm6, %%mm5 \n\t"
112 "psrlq $32, %%mm6 \n\t"
113 "paddw %%mm5, %%mm6 \n\t"
114 "movq %%mm6, %%mm5 \n\t"
115 "psrlq $16, %%mm6 \n\t"
116 "paddw %%mm5, %%mm6 \n\t"
117 "movd %%mm6, %0 \n\t"
118 "andl $0xFFFF, %0 \n\t"
119 : "=&r" (sum), "+r" (index)
120 : "r" (pix - index), "r" ((x86_reg)line_size)
121 );
122
123 return sum;
124}
125
126static int pix_norm1_mmx(uint8_t *pix, int line_size) {
127 int tmp;
128 asm volatile (
129 "movl $16,%%ecx\n"
130 "pxor %%mm0,%%mm0\n"
131 "pxor %%mm7,%%mm7\n"
132 "1:\n"
133 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
134 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
135
136 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
137
138 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
139 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
140
141 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
142 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
143 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
144
145 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
146 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
147
148 "pmaddwd %%mm3,%%mm3\n"
149 "pmaddwd %%mm4,%%mm4\n"
150
151 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
152 pix2^2+pix3^2+pix6^2+pix7^2) */
153 "paddd %%mm3,%%mm4\n"
154 "paddd %%mm2,%%mm7\n"
155
156 "add %2, %0\n"
157 "paddd %%mm4,%%mm7\n"
158 "dec %%ecx\n"
159 "jnz 1b\n"
160
161 "movq %%mm7,%%mm1\n"
162 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
163 "paddd %%mm7,%%mm1\n"
164 "movd %%mm1,%1\n"
165 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
166 return tmp;
167}
168
169static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
170 int tmp;
171 asm volatile (
172 "movl %4,%%ecx\n"
173 "shr $1,%%ecx\n"
174 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
175 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
176 "1:\n"
177 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
178 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
179 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
180 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
181
182 /* todo: mm1-mm2, mm3-mm4 */
183 /* algo: subtract mm1 from mm2 with saturation and vice versa */
184 /* OR the results to get absolute difference */
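      /* for unsigned bytes a and b, |a-b| == (a-b saturated at 0) | (b-a saturated at 0):
         one of the two saturating differences is always zero and the other
         is the true distance */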
185 "movq %%mm1,%%mm5\n"
186 "movq %%mm3,%%mm6\n"
187 "psubusb %%mm2,%%mm1\n"
188 "psubusb %%mm4,%%mm3\n"
189 "psubusb %%mm5,%%mm2\n"
190 "psubusb %%mm6,%%mm4\n"
191
192 "por %%mm1,%%mm2\n"
193 "por %%mm3,%%mm4\n"
194
195 /* now convert to 16-bit vectors so we can square them */
196 "movq %%mm2,%%mm1\n"
197 "movq %%mm4,%%mm3\n"
198
199 "punpckhbw %%mm0,%%mm2\n"
200 "punpckhbw %%mm0,%%mm4\n"
201 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
202 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
203
204 "pmaddwd %%mm2,%%mm2\n"
205 "pmaddwd %%mm4,%%mm4\n"
206 "pmaddwd %%mm1,%%mm1\n"
207 "pmaddwd %%mm3,%%mm3\n"
208
209 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
210 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
211
212 "paddd %%mm2,%%mm1\n"
213 "paddd %%mm4,%%mm3\n"
214 "paddd %%mm1,%%mm7\n"
215 "paddd %%mm3,%%mm7\n"
216
217 "decl %%ecx\n"
218 "jnz 1b\n"
219
220 "movq %%mm7,%%mm1\n"
221 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
222 "paddd %%mm7,%%mm1\n"
223 "movd %%mm1,%2\n"
224 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
225 : "r" ((x86_reg)line_size) , "m" (h)
226 : "%ecx");
227 return tmp;
228}
229
230static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
231 int tmp;
232 asm volatile (
233 "movl %4,%%ecx\n"
234 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
235 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
236 "1:\n"
237 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
238 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
239 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
240 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
241
242 /* todo: mm1-mm2, mm3-mm4 */
243 /* algo: subtract mm1 from mm2 with saturation and vice versa */
244 /* OR the results to get absolute difference */
245 "movq %%mm1,%%mm5\n"
246 "movq %%mm3,%%mm6\n"
247 "psubusb %%mm2,%%mm1\n"
248 "psubusb %%mm4,%%mm3\n"
249 "psubusb %%mm5,%%mm2\n"
250 "psubusb %%mm6,%%mm4\n"
251
252 "por %%mm1,%%mm2\n"
253 "por %%mm3,%%mm4\n"
254
255 /* now convert to 16-bit vectors so we can square them */
256 "movq %%mm2,%%mm1\n"
257 "movq %%mm4,%%mm3\n"
258
259 "punpckhbw %%mm0,%%mm2\n"
260 "punpckhbw %%mm0,%%mm4\n"
261 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
262 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
263
264 "pmaddwd %%mm2,%%mm2\n"
265 "pmaddwd %%mm4,%%mm4\n"
266 "pmaddwd %%mm1,%%mm1\n"
267 "pmaddwd %%mm3,%%mm3\n"
268
269 "add %3,%0\n"
270 "add %3,%1\n"
271
272 "paddd %%mm2,%%mm1\n"
273 "paddd %%mm4,%%mm3\n"
274 "paddd %%mm1,%%mm7\n"
275 "paddd %%mm3,%%mm7\n"
276
277 "decl %%ecx\n"
278 "jnz 1b\n"
279
280 "movq %%mm7,%%mm1\n"
281 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
282 "paddd %%mm7,%%mm1\n"
283 "movd %%mm1,%2\n"
284 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
285 : "r" ((x86_reg)line_size) , "m" (h)
286 : "%ecx");
287 return tmp;
288}
289
290static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
291 int tmp;
292 asm volatile (
293 "shr $1,%2\n"
294 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
295 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
296 "1:\n"
297 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
298 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
299 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
300 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
301
302 /* todo: mm1-mm2, mm3-mm4 */
303 /* algo: subtract mm1 from mm2 with saturation and vice versa */
304 /* OR the results to get absolute difference */
305 "movdqa %%xmm1,%%xmm5\n"
306 "movdqa %%xmm3,%%xmm6\n"
307 "psubusb %%xmm2,%%xmm1\n"
308 "psubusb %%xmm4,%%xmm3\n"
309 "psubusb %%xmm5,%%xmm2\n"
310 "psubusb %%xmm6,%%xmm4\n"
311
312 "por %%xmm1,%%xmm2\n"
313 "por %%xmm3,%%xmm4\n"
314
315 /* now convert to 16-bit vectors so we can square them */
316 "movdqa %%xmm2,%%xmm1\n"
317 "movdqa %%xmm4,%%xmm3\n"
318
319 "punpckhbw %%xmm0,%%xmm2\n"
320 "punpckhbw %%xmm0,%%xmm4\n"
321 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
322 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
323
324 "pmaddwd %%xmm2,%%xmm2\n"
325 "pmaddwd %%xmm4,%%xmm4\n"
326 "pmaddwd %%xmm1,%%xmm1\n"
327 "pmaddwd %%xmm3,%%xmm3\n"
328
329 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
330 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
331
332 "paddd %%xmm2,%%xmm1\n"
333 "paddd %%xmm4,%%xmm3\n"
334 "paddd %%xmm1,%%xmm7\n"
335 "paddd %%xmm3,%%xmm7\n"
336
337 "decl %2\n"
338 "jnz 1b\n"
339
340 "movdqa %%xmm7,%%xmm1\n"
341 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
342 "paddd %%xmm1,%%xmm7\n"
343 "movdqa %%xmm7,%%xmm1\n"
344 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
345 "paddd %%xmm1,%%xmm7\n"
346 "movd %%xmm7,%3\n"
347 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
348 : "r" ((x86_reg)line_size));
349 return tmp;
350}
351
352static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
353 int tmp;
354 asm volatile (
355 "movl %3,%%ecx\n"
356 "pxor %%mm7,%%mm7\n"
357 "pxor %%mm6,%%mm6\n"
358
359 "movq (%0),%%mm0\n"
360 "movq %%mm0, %%mm1\n"
361 "psllq $8, %%mm0\n"
362 "psrlq $8, %%mm1\n"
363 "psrlq $8, %%mm0\n"
364 "movq %%mm0, %%mm2\n"
365 "movq %%mm1, %%mm3\n"
366 "punpcklbw %%mm7,%%mm0\n"
367 "punpcklbw %%mm7,%%mm1\n"
368 "punpckhbw %%mm7,%%mm2\n"
369 "punpckhbw %%mm7,%%mm3\n"
370 "psubw %%mm1, %%mm0\n"
371 "psubw %%mm3, %%mm2\n"
372
373 "add %2,%0\n"
374
375 "movq (%0),%%mm4\n"
376 "movq %%mm4, %%mm1\n"
377 "psllq $8, %%mm4\n"
378 "psrlq $8, %%mm1\n"
379 "psrlq $8, %%mm4\n"
380 "movq %%mm4, %%mm5\n"
381 "movq %%mm1, %%mm3\n"
382 "punpcklbw %%mm7,%%mm4\n"
383 "punpcklbw %%mm7,%%mm1\n"
384 "punpckhbw %%mm7,%%mm5\n"
385 "punpckhbw %%mm7,%%mm3\n"
386 "psubw %%mm1, %%mm4\n"
387 "psubw %%mm3, %%mm5\n"
388 "psubw %%mm4, %%mm0\n"
389 "psubw %%mm5, %%mm2\n"
390 "pxor %%mm3, %%mm3\n"
391 "pxor %%mm1, %%mm1\n"
392 "pcmpgtw %%mm0, %%mm3\n\t"
393 "pcmpgtw %%mm2, %%mm1\n\t"
394 "pxor %%mm3, %%mm0\n"
395 "pxor %%mm1, %%mm2\n"
396 "psubw %%mm3, %%mm0\n"
397 "psubw %%mm1, %%mm2\n"
398 "paddw %%mm0, %%mm2\n"
399 "paddw %%mm2, %%mm6\n"
400
401 "add %2,%0\n"
402 "1:\n"
403
404 "movq (%0),%%mm0\n"
405 "movq %%mm0, %%mm1\n"
406 "psllq $8, %%mm0\n"
407 "psrlq $8, %%mm1\n"
408 "psrlq $8, %%mm0\n"
409 "movq %%mm0, %%mm2\n"
410 "movq %%mm1, %%mm3\n"
411 "punpcklbw %%mm7,%%mm0\n"
412 "punpcklbw %%mm7,%%mm1\n"
413 "punpckhbw %%mm7,%%mm2\n"
414 "punpckhbw %%mm7,%%mm3\n"
415 "psubw %%mm1, %%mm0\n"
416 "psubw %%mm3, %%mm2\n"
417 "psubw %%mm0, %%mm4\n"
418 "psubw %%mm2, %%mm5\n"
419 "pxor %%mm3, %%mm3\n"
420 "pxor %%mm1, %%mm1\n"
421 "pcmpgtw %%mm4, %%mm3\n\t"
422 "pcmpgtw %%mm5, %%mm1\n\t"
423 "pxor %%mm3, %%mm4\n"
424 "pxor %%mm1, %%mm5\n"
425 "psubw %%mm3, %%mm4\n"
426 "psubw %%mm1, %%mm5\n"
427 "paddw %%mm4, %%mm5\n"
428 "paddw %%mm5, %%mm6\n"
429
430 "add %2,%0\n"
431
432 "movq (%0),%%mm4\n"
433 "movq %%mm4, %%mm1\n"
434 "psllq $8, %%mm4\n"
435 "psrlq $8, %%mm1\n"
436 "psrlq $8, %%mm4\n"
437 "movq %%mm4, %%mm5\n"
438 "movq %%mm1, %%mm3\n"
439 "punpcklbw %%mm7,%%mm4\n"
440 "punpcklbw %%mm7,%%mm1\n"
441 "punpckhbw %%mm7,%%mm5\n"
442 "punpckhbw %%mm7,%%mm3\n"
443 "psubw %%mm1, %%mm4\n"
444 "psubw %%mm3, %%mm5\n"
445 "psubw %%mm4, %%mm0\n"
446 "psubw %%mm5, %%mm2\n"
447 "pxor %%mm3, %%mm3\n"
448 "pxor %%mm1, %%mm1\n"
449 "pcmpgtw %%mm0, %%mm3\n\t"
450 "pcmpgtw %%mm2, %%mm1\n\t"
451 "pxor %%mm3, %%mm0\n"
452 "pxor %%mm1, %%mm2\n"
453 "psubw %%mm3, %%mm0\n"
454 "psubw %%mm1, %%mm2\n"
455 "paddw %%mm0, %%mm2\n"
456 "paddw %%mm2, %%mm6\n"
457
458 "add %2,%0\n"
459 "subl $2, %%ecx\n"
460 " jnz 1b\n"
461
462 "movq %%mm6, %%mm0\n"
463 "punpcklwd %%mm7,%%mm0\n"
464 "punpckhwd %%mm7,%%mm6\n"
465 "paddd %%mm0, %%mm6\n"
466
467 "movq %%mm6,%%mm0\n"
468 "psrlq $32, %%mm6\n"
469 "paddd %%mm6,%%mm0\n"
470 "movd %%mm0,%1\n"
471 : "+r" (pix1), "=r"(tmp)
472 : "r" ((x86_reg)line_size) , "g" (h-2)
473 : "%ecx");
474 return tmp;
475}
476
477static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
478 int tmp;
479 uint8_t * pix= pix1;
480 asm volatile (
481 "movl %3,%%ecx\n"
482 "pxor %%mm7,%%mm7\n"
483 "pxor %%mm6,%%mm6\n"
484
485 "movq (%0),%%mm0\n"
486 "movq 1(%0),%%mm1\n"
487 "movq %%mm0, %%mm2\n"
488 "movq %%mm1, %%mm3\n"
489 "punpcklbw %%mm7,%%mm0\n"
490 "punpcklbw %%mm7,%%mm1\n"
491 "punpckhbw %%mm7,%%mm2\n"
492 "punpckhbw %%mm7,%%mm3\n"
493 "psubw %%mm1, %%mm0\n"
494 "psubw %%mm3, %%mm2\n"
495
496 "add %2,%0\n"
497
498 "movq (%0),%%mm4\n"
499 "movq 1(%0),%%mm1\n"
500 "movq %%mm4, %%mm5\n"
501 "movq %%mm1, %%mm3\n"
502 "punpcklbw %%mm7,%%mm4\n"
503 "punpcklbw %%mm7,%%mm1\n"
504 "punpckhbw %%mm7,%%mm5\n"
505 "punpckhbw %%mm7,%%mm3\n"
506 "psubw %%mm1, %%mm4\n"
507 "psubw %%mm3, %%mm5\n"
508 "psubw %%mm4, %%mm0\n"
509 "psubw %%mm5, %%mm2\n"
510 "pxor %%mm3, %%mm3\n"
511 "pxor %%mm1, %%mm1\n"
512 "pcmpgtw %%mm0, %%mm3\n\t"
513 "pcmpgtw %%mm2, %%mm1\n\t"
514 "pxor %%mm3, %%mm0\n"
515 "pxor %%mm1, %%mm2\n"
516 "psubw %%mm3, %%mm0\n"
517 "psubw %%mm1, %%mm2\n"
518 "paddw %%mm0, %%mm2\n"
519 "paddw %%mm2, %%mm6\n"
520
521 "add %2,%0\n"
522 "1:\n"
523
524 "movq (%0),%%mm0\n"
525 "movq 1(%0),%%mm1\n"
526 "movq %%mm0, %%mm2\n"
527 "movq %%mm1, %%mm3\n"
528 "punpcklbw %%mm7,%%mm0\n"
529 "punpcklbw %%mm7,%%mm1\n"
530 "punpckhbw %%mm7,%%mm2\n"
531 "punpckhbw %%mm7,%%mm3\n"
532 "psubw %%mm1, %%mm0\n"
533 "psubw %%mm3, %%mm2\n"
534 "psubw %%mm0, %%mm4\n"
535 "psubw %%mm2, %%mm5\n"
536 "pxor %%mm3, %%mm3\n"
537 "pxor %%mm1, %%mm1\n"
538 "pcmpgtw %%mm4, %%mm3\n\t"
539 "pcmpgtw %%mm5, %%mm1\n\t"
540 "pxor %%mm3, %%mm4\n"
541 "pxor %%mm1, %%mm5\n"
542 "psubw %%mm3, %%mm4\n"
543 "psubw %%mm1, %%mm5\n"
544 "paddw %%mm4, %%mm5\n"
545 "paddw %%mm5, %%mm6\n"
546
547 "add %2,%0\n"
548
549 "movq (%0),%%mm4\n"
550 "movq 1(%0),%%mm1\n"
551 "movq %%mm4, %%mm5\n"
552 "movq %%mm1, %%mm3\n"
553 "punpcklbw %%mm7,%%mm4\n"
554 "punpcklbw %%mm7,%%mm1\n"
555 "punpckhbw %%mm7,%%mm5\n"
556 "punpckhbw %%mm7,%%mm3\n"
557 "psubw %%mm1, %%mm4\n"
558 "psubw %%mm3, %%mm5\n"
559 "psubw %%mm4, %%mm0\n"
560 "psubw %%mm5, %%mm2\n"
561 "pxor %%mm3, %%mm3\n"
562 "pxor %%mm1, %%mm1\n"
563 "pcmpgtw %%mm0, %%mm3\n\t"
564 "pcmpgtw %%mm2, %%mm1\n\t"
565 "pxor %%mm3, %%mm0\n"
566 "pxor %%mm1, %%mm2\n"
567 "psubw %%mm3, %%mm0\n"
568 "psubw %%mm1, %%mm2\n"
569 "paddw %%mm0, %%mm2\n"
570 "paddw %%mm2, %%mm6\n"
571
572 "add %2,%0\n"
573 "subl $2, %%ecx\n"
574 " jnz 1b\n"
575
576 "movq %%mm6, %%mm0\n"
577 "punpcklwd %%mm7,%%mm0\n"
578 "punpckhwd %%mm7,%%mm6\n"
579 "paddd %%mm0, %%mm6\n"
580
581 "movq %%mm6,%%mm0\n"
582 "psrlq $32, %%mm6\n"
583 "paddd %%mm6,%%mm0\n"
584 "movd %%mm0,%1\n"
585 : "+r" (pix1), "=r"(tmp)
586 : "r" ((x86_reg)line_size) , "g" (h-2)
587 : "%ecx");
588 return tmp + hf_noise8_mmx(pix+8, line_size, h);
589}
590
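/* nsse: the plain SSE score plus a penalty proportional to how differently
 * the two blocks behave at high frequencies, measured by the hf_noise
 * functions above (sums of absolute differences between the horizontal
 * gradients of vertically adjacent rows), weighted by avctx->nsse_weight
 * (8 when no context is available). */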
591static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
592 MpegEncContext *c = p;
593 int score1, score2;
594
595 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
596 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
597 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
598
599 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
600 else return score1 + FFABS(score2)*8;
601}
602
603static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
604 MpegEncContext *c = p;
605 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
606 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
607
608 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
609 else return score1 + FFABS(score2)*8;
610}
611
612static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
613 int tmp;
614
615 assert( (((int)pix) & 7) == 0);
616 assert((line_size &7) ==0);
617
618#define SUM(in0, in1, out0, out1) \
619 "movq (%0), %%mm2\n"\
620 "movq 8(%0), %%mm3\n"\
621 "add %2,%0\n"\
622 "movq %%mm2, " #out0 "\n"\
623 "movq %%mm3, " #out1 "\n"\
624 "psubusb " #in0 ", %%mm2\n"\
625 "psubusb " #in1 ", %%mm3\n"\
626 "psubusb " #out0 ", " #in0 "\n"\
627 "psubusb " #out1 ", " #in1 "\n"\
628 "por %%mm2, " #in0 "\n"\
629 "por %%mm3, " #in1 "\n"\
630 "movq " #in0 ", %%mm2\n"\
631 "movq " #in1 ", %%mm3\n"\
632 "punpcklbw %%mm7, " #in0 "\n"\
633 "punpcklbw %%mm7, " #in1 "\n"\
634 "punpckhbw %%mm7, %%mm2\n"\
635 "punpckhbw %%mm7, %%mm3\n"\
636 "paddw " #in1 ", " #in0 "\n"\
637 "paddw %%mm3, %%mm2\n"\
638 "paddw %%mm2, " #in0 "\n"\
639 "paddw " #in0 ", %%mm6\n"
640
641
642 asm volatile (
643 "movl %3,%%ecx\n"
644 "pxor %%mm6,%%mm6\n"
645 "pxor %%mm7,%%mm7\n"
646 "movq (%0),%%mm0\n"
647 "movq 8(%0),%%mm1\n"
648 "add %2,%0\n"
649 "jmp 2f\n"
650 "1:\n"
651
652 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
653 "2:\n"
654 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
655
656 "subl $2, %%ecx\n"
657 "jnz 1b\n"
658
659 "movq %%mm6,%%mm0\n"
660 "psrlq $32, %%mm6\n"
661 "paddw %%mm6,%%mm0\n"
662 "movq %%mm0,%%mm6\n"
663 "psrlq $16, %%mm0\n"
664 "paddw %%mm6,%%mm0\n"
665 "movd %%mm0,%1\n"
666 : "+r" (pix), "=r"(tmp)
667 : "r" ((x86_reg)line_size) , "m" (h)
668 : "%ecx");
669 return tmp & 0xFFFF;
670}
671#undef SUM
672
673static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
674 int tmp;
675
676 assert( (((int)pix) & 7) == 0);
677 assert((line_size &7) ==0);
678
679#define SUM(in0, in1, out0, out1) \
680 "movq (%0), " #out0 "\n"\
681 "movq 8(%0), " #out1 "\n"\
682 "add %2,%0\n"\
683 "psadbw " #out0 ", " #in0 "\n"\
684 "psadbw " #out1 ", " #in1 "\n"\
685 "paddw " #in1 ", " #in0 "\n"\
686 "paddw " #in0 ", %%mm6\n"
687
688 asm volatile (
689 "movl %3,%%ecx\n"
690 "pxor %%mm6,%%mm6\n"
691 "pxor %%mm7,%%mm7\n"
692 "movq (%0),%%mm0\n"
693 "movq 8(%0),%%mm1\n"
694 "add %2,%0\n"
695 "jmp 2f\n"
696 "1:\n"
697
698 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
699 "2:\n"
700 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
701
702 "subl $2, %%ecx\n"
703 "jnz 1b\n"
704
705 "movd %%mm6,%1\n"
706 : "+r" (pix), "=r"(tmp)
707 : "r" ((x86_reg)line_size) , "m" (h)
708 : "%ecx");
709 return tmp;
710}
711#undef SUM
712
713static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
714 int tmp;
715
716 assert( (((int)pix1) & 7) == 0);
717 assert( (((int)pix2) & 7) == 0);
718 assert((line_size &7) ==0);
719
720#define SUM(in0, in1, out0, out1) \
721 "movq (%0),%%mm2\n"\
722 "movq (%1)," #out0 "\n"\
723 "movq 8(%0),%%mm3\n"\
724 "movq 8(%1)," #out1 "\n"\
725 "add %3,%0\n"\
726 "add %3,%1\n"\
727 "psubb " #out0 ", %%mm2\n"\
728 "psubb " #out1 ", %%mm3\n"\
729 "pxor %%mm7, %%mm2\n"\
730 "pxor %%mm7, %%mm3\n"\
731 "movq %%mm2, " #out0 "\n"\
732 "movq %%mm3, " #out1 "\n"\
733 "psubusb " #in0 ", %%mm2\n"\
734 "psubusb " #in1 ", %%mm3\n"\
735 "psubusb " #out0 ", " #in0 "\n"\
736 "psubusb " #out1 ", " #in1 "\n"\
737 "por %%mm2, " #in0 "\n"\
738 "por %%mm3, " #in1 "\n"\
739 "movq " #in0 ", %%mm2\n"\
740 "movq " #in1 ", %%mm3\n"\
741 "punpcklbw %%mm7, " #in0 "\n"\
742 "punpcklbw %%mm7, " #in1 "\n"\
743 "punpckhbw %%mm7, %%mm2\n"\
744 "punpckhbw %%mm7, %%mm3\n"\
745 "paddw " #in1 ", " #in0 "\n"\
746 "paddw %%mm3, %%mm2\n"\
747 "paddw %%mm2, " #in0 "\n"\
748 "paddw " #in0 ", %%mm6\n"
749
750
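    /* mm7 is built below as 0x80 in every byte (pcmpeqw/psllw/packsswb).
       Each row's byte-wise difference pix1-pix2 is computed with wraparound
       (psubb) and re-biased by XOR 0x80, so the unsigned abs-difference
       trick in SUM correctly measures how much the difference signal
       changes between vertically adjacent rows. */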
751 asm volatile (
752 "movl %4,%%ecx\n"
753 "pxor %%mm6,%%mm6\n"
754 "pcmpeqw %%mm7,%%mm7\n"
755 "psllw $15, %%mm7\n"
756 "packsswb %%mm7, %%mm7\n"
757 "movq (%0),%%mm0\n"
758 "movq (%1),%%mm2\n"
759 "movq 8(%0),%%mm1\n"
760 "movq 8(%1),%%mm3\n"
761 "add %3,%0\n"
762 "add %3,%1\n"
763 "psubb %%mm2, %%mm0\n"
764 "psubb %%mm3, %%mm1\n"
765 "pxor %%mm7, %%mm0\n"
766 "pxor %%mm7, %%mm1\n"
767 "jmp 2f\n"
768 "1:\n"
769
770 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
771 "2:\n"
772 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
773
774 "subl $2, %%ecx\n"
775 "jnz 1b\n"
776
777 "movq %%mm6,%%mm0\n"
778 "psrlq $32, %%mm6\n"
779 "paddw %%mm6,%%mm0\n"
780 "movq %%mm0,%%mm6\n"
781 "psrlq $16, %%mm0\n"
782 "paddw %%mm6,%%mm0\n"
783 "movd %%mm0,%2\n"
784 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
785 : "r" ((x86_reg)line_size) , "m" (h)
786 : "%ecx");
787 return tmp & 0x7FFF;
788}
789#undef SUM
790
791static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
792 int tmp;
793
794 assert( (((int)pix1) & 7) == 0);
795 assert( (((int)pix2) & 7) == 0);
796 assert((line_size &7) ==0);
797
798#define SUM(in0, in1, out0, out1) \
799 "movq (%0)," #out0 "\n"\
800 "movq (%1),%%mm2\n"\
801 "movq 8(%0)," #out1 "\n"\
802 "movq 8(%1),%%mm3\n"\
803 "add %3,%0\n"\
804 "add %3,%1\n"\
805 "psubb %%mm2, " #out0 "\n"\
806 "psubb %%mm3, " #out1 "\n"\
807 "pxor %%mm7, " #out0 "\n"\
808 "pxor %%mm7, " #out1 "\n"\
809 "psadbw " #out0 ", " #in0 "\n"\
810 "psadbw " #out1 ", " #in1 "\n"\
811 "paddw " #in1 ", " #in0 "\n"\
812 "paddw " #in0 ", %%mm6\n"
813
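    /* same 0x80 re-biasing as in vsad16_mmx() above, but psadbw produces the
       per-row absolute differences and their horizontal sum in one step, so
       no final word-wise reduction of mm6 is needed */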
814 asm volatile (
815 "movl %4,%%ecx\n"
816 "pxor %%mm6,%%mm6\n"
817 "pcmpeqw %%mm7,%%mm7\n"
818 "psllw $15, %%mm7\n"
819 "packsswb %%mm7, %%mm7\n"
820 "movq (%0),%%mm0\n"
821 "movq (%1),%%mm2\n"
822 "movq 8(%0),%%mm1\n"
823 "movq 8(%1),%%mm3\n"
824 "add %3,%0\n"
825 "add %3,%1\n"
826 "psubb %%mm2, %%mm0\n"
827 "psubb %%mm3, %%mm1\n"
828 "pxor %%mm7, %%mm0\n"
829 "pxor %%mm7, %%mm1\n"
830 "jmp 2f\n"
831 "1:\n"
832
833 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
834 "2:\n"
835 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
836
837 "subl $2, %%ecx\n"
838 "jnz 1b\n"
839
840 "movd %%mm6,%2\n"
841 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
842 : "r" ((x86_reg)line_size) , "m" (h)
843 : "%ecx");
844 return tmp;
845}
846#undef SUM
847
848static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
849 x86_reg i=0;
850 asm volatile(
851 "1: \n\t"
852 "movq (%2, %0), %%mm0 \n\t"
853 "movq (%1, %0), %%mm1 \n\t"
854 "psubb %%mm0, %%mm1 \n\t"
855 "movq %%mm1, (%3, %0) \n\t"
856 "movq 8(%2, %0), %%mm0 \n\t"
857 "movq 8(%1, %0), %%mm1 \n\t"
858 "psubb %%mm0, %%mm1 \n\t"
859 "movq %%mm1, 8(%3, %0) \n\t"
860 "add $16, %0 \n\t"
861 "cmp %4, %0 \n\t"
862 " jb 1b \n\t"
863 : "+r" (i)
864 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
865 );
866 for(; i<w; i++)
867 dst[i+0] = src1[i+0]-src2[i+0];
868}
869
870static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
871 x86_reg i=0;
872 uint8_t l, lt;
873
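    /* per byte: pred = mid_pred(L, T, L+T-LT), i.e. the gradient L+T-LT
       clamped between min(L,T) and max(L,T), and dst = X - pred; dst[0] is
       fixed up afterwards with the carried-in *left / *left_top values */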
874 asm volatile(
875 "1: \n\t"
876 "movq -1(%1, %0), %%mm0 \n\t" // LT
877 "movq (%1, %0), %%mm1 \n\t" // T
878 "movq -1(%2, %0), %%mm2 \n\t" // L
879 "movq (%2, %0), %%mm3 \n\t" // X
880 "movq %%mm2, %%mm4 \n\t" // L
881 "psubb %%mm0, %%mm2 \n\t"
882 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
883 "movq %%mm4, %%mm5 \n\t" // L
884 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
885 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
886 "pminub %%mm2, %%mm4 \n\t"
887 "pmaxub %%mm1, %%mm4 \n\t"
888 "psubb %%mm4, %%mm3 \n\t" // dst - pred
889 "movq %%mm3, (%3, %0) \n\t"
890 "add $8, %0 \n\t"
891 "cmp %4, %0 \n\t"
892 " jb 1b \n\t"
893 : "+r" (i)
894 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
895 );
896
897 l= *left;
898 lt= *left_top;
899
900 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
901
902 *left_top= src1[w-1];
903 *left = src2[w-1];
904}
905
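/* DIFF_PIXELS_1 expands p1-p2 to signed words without needing a zero
 * register: after the unpacks each word of t is (p1_i<<8)+p2_i and each word
 * of a is (p1_i<<8)+p1_i, so the psubw leaves exactly p1_i-p2_i in a. */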
906#define DIFF_PIXELS_1(m,a,t,p1,p2)\
907 "mov"#m" "#p1", "#a" \n\t"\
908 "mov"#m" "#p2", "#t" \n\t"\
909 "punpcklbw "#a", "#t" \n\t"\
910 "punpcklbw "#a", "#a" \n\t"\
911 "psubw "#t", "#a" \n\t"\
912
913#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
914 uint8_t *p1b=p1, *p2b=p2;\
915 asm volatile(\
916 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
917 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
918 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
919 "add %4, %1 \n\t"\
920 "add %4, %2 \n\t"\
921 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
922 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
923 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
924 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
925 "mov"#m1" "#mm"0, %0 \n\t"\
926 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
927 "mov"#m1" %0, "#mm"0 \n\t"\
928 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
929 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
930 );\
931}
932 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
933
934#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
935#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
936
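/* LBUTTERFLY2 performs two butterflies at once, (a,b) -> (a+b, b-a), by
 * doubling b and subtracting the new a so that no extra register is needed.
 * Three passes of it in HADAMARD8 amount to an 8-point Walsh-Hadamard
 * transform across the eight registers, applied to every 16-bit lane in
 * parallel. */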
937#define LBUTTERFLY2(a1,b1,a2,b2)\
938 "paddw " #b1 ", " #a1 " \n\t"\
939 "paddw " #b2 ", " #a2 " \n\t"\
940 "paddw " #b1 ", " #b1 " \n\t"\
941 "paddw " #b2 ", " #b2 " \n\t"\
942 "psubw " #a1 ", " #b1 " \n\t"\
943 "psubw " #a2 ", " #b2 " \n\t"
944
945#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
946 LBUTTERFLY2(m0, m1, m2, m3)\
947 LBUTTERFLY2(m4, m5, m6, m7)\
948 LBUTTERFLY2(m0, m2, m1, m3)\
949 LBUTTERFLY2(m4, m6, m5, m7)\
950 LBUTTERFLY2(m0, m4, m1, m5)\
951 LBUTTERFLY2(m2, m6, m3, m7)\
952
953#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
954
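/* MMABS_MMX: z = per-word sign mask of a (pcmpgtw against z = 0), then
 * a = (a^z)-z = |a|.  MMABS_MMX2 computes max(a, 0-a) with pmaxsw instead,
 * and MMABS_SSSE3 is a single pabsw. */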
955#define MMABS_MMX(a,z)\
956 "pxor " #z ", " #z " \n\t"\
957 "pcmpgtw " #a ", " #z " \n\t"\
958 "pxor " #z ", " #a " \n\t"\
959 "psubw " #z ", " #a " \n\t"
960
961#define MMABS_MMX2(a,z)\
962 "pxor " #z ", " #z " \n\t"\
963 "psubw " #a ", " #z " \n\t"\
964 "pmaxsw " #z ", " #a " \n\t"
965
966#define MMABS_SSSE3(a,z)\
967 "pabsw " #a ", " #a " \n\t"
968
969#define MMABS_SUM(a,z, sum)\
970 MMABS(a,z)\
971 "paddusw " #a ", " #sum " \n\t"
972
973#define MMABS_SUM_8x8_NOSPILL\
974 MMABS(%%xmm0, %%xmm8)\
975 MMABS(%%xmm1, %%xmm9)\
976 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
977 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
978 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
979 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
980 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
981 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
982 "paddusw %%xmm1, %%xmm0 \n\t"
983
984#ifdef ARCH_X86_64
985#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
986#else
987#define MMABS_SUM_8x8_SSE2\
988 "movdqa %%xmm7, (%1) \n\t"\
989 MMABS(%%xmm0, %%xmm7)\
990 MMABS(%%xmm1, %%xmm7)\
991 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
992 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
993 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
994 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
995 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
996 "movdqa (%1), %%xmm2 \n\t"\
997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
998 "paddusw %%xmm1, %%xmm0 \n\t"
999#endif
1000
1001#define LOAD4(o, a, b, c, d)\
1002 "movq "#o"(%1), "#a" \n\t"\
1003 "movq "#o"+8(%1), "#b" \n\t"\
1004 "movq "#o"+16(%1), "#c" \n\t"\
1005 "movq "#o"+24(%1), "#d" \n\t"\
1006
1007#define STORE4(o, a, b, c, d)\
1008 "movq "#a", "#o"(%1) \n\t"\
1009 "movq "#b", "#o"+8(%1) \n\t"\
1010 "movq "#c", "#o"+16(%1) \n\t"\
1011 "movq "#d", "#o"+24(%1) \n\t"\
1012
1013/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1014 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1015 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
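/* HSUM_* horizontally add the four (MMX) or eight (SSE2) words of a into the
 * scalar dst using saturating adds (hence the 64k cap noted above); t is a
 * scratch register. */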
1016#define HSUM_MMX(a, t, dst)\
1017 "movq "#a", "#t" \n\t"\
1018 "psrlq $32, "#a" \n\t"\
1019 "paddusw "#t", "#a" \n\t"\
1020 "movq "#a", "#t" \n\t"\
1021 "psrlq $16, "#a" \n\t"\
1022 "paddusw "#t", "#a" \n\t"\
1023 "movd "#a", "#dst" \n\t"\
1024
1025#define HSUM_MMX2(a, t, dst)\
1026 "pshufw $0x0E, "#a", "#t" \n\t"\
1027 "paddusw "#t", "#a" \n\t"\
1028 "pshufw $0x01, "#a", "#t" \n\t"\
1029 "paddusw "#t", "#a" \n\t"\
1030 "movd "#a", "#dst" \n\t"\
1031
1032#define HSUM_SSE2(a, t, dst)\
1033 "movhlps "#a", "#t" \n\t"\
1034 "paddusw "#t", "#a" \n\t"\
1035 "pshuflw $0x0E, "#a", "#t" \n\t"\
1036 "paddusw "#t", "#a" \n\t"\
1037 "pshuflw $0x01, "#a", "#t" \n\t"\
1038 "paddusw "#t", "#a" \n\t"\
1039 "movd "#a", "#dst" \n\t"\
1040
1041#define HADAMARD8_DIFF_MMX(cpu) \
1042static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1043 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1044 int sum;\
1045\
1046 assert(h==8);\
1047\
1048 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1049\
1050 asm volatile(\
1051 HADAMARD48\
1052\
1053 "movq %%mm7, 96(%1) \n\t"\
1054\
1055 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1056 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1057\
1058 "movq 96(%1), %%mm7 \n\t"\
1059 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1060 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1061\
1062 : "=r" (sum)\
1063 : "r"(temp)\
1064 );\
1065\
1066 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1067\
1068 asm volatile(\
1069 HADAMARD48\
1070\
1071 "movq %%mm7, 96(%1) \n\t"\
1072\
1073 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1074 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1075\
1076 "movq 96(%1), %%mm7 \n\t"\
1077 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1078 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1079 "movq %%mm6, %%mm7 \n\t"\
1080 "movq %%mm0, %%mm6 \n\t"\
1081\
1082 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1083\
1084 HADAMARD48\
1085 "movq %%mm7, 64(%1) \n\t"\
1086 MMABS(%%mm0, %%mm7)\
1087 MMABS(%%mm1, %%mm7)\
1088 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1089 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1090 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1091 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1092 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1093 "movq 64(%1), %%mm2 \n\t"\
1094 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1095 "paddusw %%mm1, %%mm0 \n\t"\
1096 "movq %%mm0, 64(%1) \n\t"\
1097\
1098 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1099 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1100\
1101 HADAMARD48\
1102 "movq %%mm7, (%1) \n\t"\
1103 MMABS(%%mm0, %%mm7)\
1104 MMABS(%%mm1, %%mm7)\
1105 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1106 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1107 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1108 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1109 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1110 "movq (%1), %%mm2 \n\t"\
1111 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1112 "paddusw 64(%1), %%mm0 \n\t"\
1113 "paddusw %%mm1, %%mm0 \n\t"\
1114\
1115 HSUM(%%mm0, %%mm1, %0)\
1116\
1117 : "=r" (sum)\
1118 : "r"(temp)\
1119 );\
1120 return sum&0xFFFF;\
1121}\
1122WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1123
1124#define HADAMARD8_DIFF_SSE2(cpu) \
1125static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1126 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1127 int sum;\
1128\
1129 assert(h==8);\
1130\
1131 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1132\
1133 asm volatile(\
1134 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1135 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1136 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1137 MMABS_SUM_8x8\
1138 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1139 : "=r" (sum)\
1140 : "r"(temp)\
1141 );\
1142 return sum&0xFFFF;\
1143}\
1144WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1145
1146#define MMABS(a,z) MMABS_MMX(a,z)
1147#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1148HADAMARD8_DIFF_MMX(mmx)
1149#undef MMABS
1150#undef HSUM
1151
1152#define MMABS(a,z) MMABS_MMX2(a,z)
1153#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1154#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1155HADAMARD8_DIFF_MMX(mmx2)
1156HADAMARD8_DIFF_SSE2(sse2)
1157#undef MMABS
1158#undef MMABS_SUM_8x8
1159#undef HSUM
1160
1161#ifdef HAVE_SSSE3
1162#define MMABS(a,z) MMABS_SSSE3(a,z)
1163#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1164HADAMARD8_DIFF_SSE2(ssse3)
1165#undef MMABS
1166#undef MMABS_SUM_8x8
1167#endif
1168
1169#define DCT_SAD4(m,mm,o)\
1170 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1171 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1172 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1173 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1174 MMABS_SUM(mm##2, mm##6, mm##0)\
1175 MMABS_SUM(mm##3, mm##7, mm##1)\
1176 MMABS_SUM(mm##4, mm##6, mm##0)\
1177 MMABS_SUM(mm##5, mm##7, mm##1)\
1178
1179#define DCT_SAD_MMX\
1180 "pxor %%mm0, %%mm0 \n\t"\
1181 "pxor %%mm1, %%mm1 \n\t"\
1182 DCT_SAD4(q, %%mm, 0)\
1183 DCT_SAD4(q, %%mm, 8)\
1184 DCT_SAD4(q, %%mm, 64)\
1185 DCT_SAD4(q, %%mm, 72)\
1186 "paddusw %%mm1, %%mm0 \n\t"\
1187 HSUM(%%mm0, %%mm1, %0)
1188
1189#define DCT_SAD_SSE2\
1190 "pxor %%xmm0, %%xmm0 \n\t"\
1191 "pxor %%xmm1, %%xmm1 \n\t"\
1192 DCT_SAD4(dqa, %%xmm, 0)\
1193 DCT_SAD4(dqa, %%xmm, 64)\
1194 "paddusw %%xmm1, %%xmm0 \n\t"\
1195 HSUM(%%xmm0, %%xmm1, %0)
1196
1197#define DCT_SAD_FUNC(cpu) \
1198static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1199 int sum;\
1200 asm volatile(\
1201 DCT_SAD\
1202 :"=r"(sum)\
1203 :"r"(block)\
1204 );\
1205 return sum&0xFFFF;\
1206}
1207
1208#define DCT_SAD DCT_SAD_MMX
1209#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1210#define MMABS(a,z) MMABS_MMX(a,z)
1211DCT_SAD_FUNC(mmx)
1212#undef MMABS
1213#undef HSUM
1214
1215#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1216#define MMABS(a,z) MMABS_MMX2(a,z)
1217DCT_SAD_FUNC(mmx2)
1218#undef HSUM
1219#undef DCT_SAD
1220
1221#define DCT_SAD DCT_SAD_SSE2
1222#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1223DCT_SAD_FUNC(sse2)
1224#undef MMABS
1225
1226#ifdef HAVE_SSSE3
1227#define MMABS(a,z) MMABS_SSSE3(a,z)
1228DCT_SAD_FUNC(ssse3)
1229#undef MMABS
1230#endif
1231#undef HSUM
1232#undef DCT_SAD
1233
1234static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1235 int sum;
1236 x86_reg i=size;
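    /* the int8 values are sign-extended to words by unpacking them into the
       high byte of each word (the low byte is don't-care) and shifting right
       arithmetically by 8; the squared differences against the int16 values
       are then accumulated with pmaddwd */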
1237 asm volatile(
1238 "pxor %%mm4, %%mm4 \n"
1239 "1: \n"
1240 "sub $8, %0 \n"
1241 "movq (%2,%0), %%mm2 \n"
1242 "movq (%3,%0,2), %%mm0 \n"
1243 "movq 8(%3,%0,2), %%mm1 \n"
1244 "punpckhbw %%mm2, %%mm3 \n"
1245 "punpcklbw %%mm2, %%mm2 \n"
1246 "psraw $8, %%mm3 \n"
1247 "psraw $8, %%mm2 \n"
1248 "psubw %%mm3, %%mm1 \n"
1249 "psubw %%mm2, %%mm0 \n"
1250 "pmaddwd %%mm1, %%mm1 \n"
1251 "pmaddwd %%mm0, %%mm0 \n"
1252 "paddd %%mm1, %%mm4 \n"
1253 "paddd %%mm0, %%mm4 \n"
1254 "jg 1b \n"
1255 "movq %%mm4, %%mm3 \n"
1256 "psrlq $32, %%mm3 \n"
1257 "paddd %%mm3, %%mm4 \n"
1258 "movd %%mm4, %1 \n"
1259 :"+r"(i), "=r"(sum)
1260 :"r"(pix1), "r"(pix2)
1261 );
1262 return sum;
1263}
1264
1265#define PHADDD(a, t)\
1266 "movq "#a", "#t" \n\t"\
1267 "psrlq $32, "#a" \n\t"\
1268 "paddd "#t", "#a" \n\t"
1269/*
1270 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
1271 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
1272 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
1273 */
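/* e.g. with x = y = 0x4000 (0.5 in Q15), pmulhrsw yields
   (0x4000*0x4000 + 0x4000) >> 15 = 0x2000. */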
1274#define PMULHRW(x, y, s, o)\
1275 "pmulhw " #s ", "#x " \n\t"\
1276 "pmulhw " #s ", "#y " \n\t"\
1277 "paddw " #o ", "#x " \n\t"\
1278 "paddw " #o ", "#y " \n\t"\
1279 "psraw $1, "#x " \n\t"\
1280 "psraw $1, "#y " \n\t"
1281#define DEF(x) x ## _mmx
1282#define SET_RND MOVQ_WONE
1283#define SCALE_OFFSET 1
1284
1285#include "dsputil_mmx_qns.h"
1286
1287#undef DEF
1288#undef SET_RND
1289#undef SCALE_OFFSET
1290#undef PMULHRW
1291
1292#define DEF(x) x ## _3dnow
1293#define SET_RND(x)
1294#define SCALE_OFFSET 0
1295#define PMULHRW(x, y, s, o)\
1296 "pmulhrw " #s ", "#x " \n\t"\
1297 "pmulhrw " #s ", "#y " \n\t"
1298
1299#include "dsputil_mmx_qns.h"
1300
1301#undef DEF
1302#undef SET_RND
1303#undef SCALE_OFFSET
1304#undef PMULHRW
1305
1306#ifdef HAVE_SSSE3
1307#undef PHADDD
1308#define DEF(x) x ## _ssse3
1309#define SET_RND(x)
1310#define SCALE_OFFSET -1
1311#define PHADDD(a, t)\
1312 "pshufw $0x0E, "#a", "#t" \n\t"\
1313 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
1314#define PMULHRW(x, y, s, o)\
1315 "pmulhrsw " #s ", "#x " \n\t"\
1316 "pmulhrsw " #s ", "#y " \n\t"
1317
1318#include "dsputil_mmx_qns.h"
1319
1320#undef DEF
1321#undef SET_RND
1322#undef SCALE_OFFSET
1323#undef PMULHRW
1324#undef PHADDD
1325#endif //HAVE_SSSE3
1326
1327
1328/* FLAC specific */
1329void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
1330 double *autoc);
1331
1332
1333void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
1334{
1335 if (mm_flags & MM_MMX) {
1336 const int dct_algo = avctx->dct_algo;
1337 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1338 if(mm_flags & MM_SSE2){
1339 c->fdct = ff_fdct_sse2;
1340 }else if(mm_flags & MM_MMXEXT){
1341 c->fdct = ff_fdct_mmx2;
1342 }else{
1343 c->fdct = ff_fdct_mmx;
1344 }
1345 }
1346
1347 c->get_pixels = get_pixels_mmx;
1348 c->diff_pixels = diff_pixels_mmx;
1349 c->pix_sum = pix_sum16_mmx;
1350
1351 c->diff_bytes= diff_bytes_mmx;
1352 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1353
1354 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1355 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1356
1357 c->pix_norm1 = pix_norm1_mmx;
1358 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
1359 c->sse[1] = sse8_mmx;
1360 c->vsad[4]= vsad_intra16_mmx;
1361
1362 c->nsse[0] = nsse16_mmx;
1363 c->nsse[1] = nsse8_mmx;
1364 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1365 c->vsad[0] = vsad16_mmx;
1366 }
1367
1368 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1369 c->try_8x8basis= try_8x8basis_mmx;
1370 }
1371 c->add_8x8basis= add_8x8basis_mmx;
1372
1373 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
1374
1375
1376 if (mm_flags & MM_MMXEXT) {
1377 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1378 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1379 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1380 c->vsad[4]= vsad_intra16_mmx2;
1381
1382 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1383 c->vsad[0] = vsad16_mmx2;
1384 }
1385
1386 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1387 }
1388
1389 if(mm_flags & MM_SSE2){
1390 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1391 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
1392 c->hadamard8_diff[1]= hadamard8_diff_sse2;
1393 if (ENABLE_FLAC_ENCODER)
1394 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
1395 }
1396
1397#ifdef HAVE_SSSE3
1398 if(mm_flags & MM_SSSE3){
1399 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1400 c->try_8x8basis= try_8x8basis_ssse3;
1401 }
1402 c->add_8x8basis= add_8x8basis_ssse3;
1403 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1404 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
1405 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
1406 }
1407#endif
1408
1409 if(mm_flags & MM_3DNOW){
1410 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1411 c->try_8x8basis= try_8x8basis_3dnow;
1412 }
1413 c->add_8x8basis= add_8x8basis_3dnow;
1414 }
1415 }
1416
1417 dsputil_init_pix_mmx(c, avctx);
1418}