/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"
7d650cb5 24int mm_flags; /* multimedia extension flags */
eb4b3dd3 25/* FIXME use them in static form */
ba6802de
MN
26int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
27int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
31int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
32int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
36int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
37int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
41int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
42int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
1457ab52
MN
46int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
47int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
48int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
49int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
50
/* pixel operations */
/* 64-bit constants used by the averaging code below, kept 8-byte aligned
 * so they can be loaded with a single movq */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* clear an MMX register: regd ^= regd */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* load 0x0001000100010001 into regd without a memory reference:
 * pcmpeqd gives all-ones, psrlw $15 leaves one bit per word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* load 0xfefefefefefefefe into regd: all-ones bytes doubled (0xff+0xff -> 0xfe) */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* byte-wise average without rounding: regr = (rega + regb) >> 1,
 * computed as (a & b) + (((a ^ b) & 0xfe) >> 1) to avoid overflow */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* byte-wise average with rounding: regr = (rega + regb + 1) >> 1,
 * computed as (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* paired variant: averages (rega,regb)->regr and (regc,regd)->regp,
 * no rounding; trashes regb and regd */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* paired variant with rounding; trashes regb and regd */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
91abb473
ZK
137/***********************************/
138/* MMX no rounding */
139#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 140#define SET_RND MOVQ_WONE
6aa6ea8e 141#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 142#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 143
91abb473
ZK
144#include "dsputil_mmx_rnd.h"
145
146#undef DEF
fca0f0e5 147#undef SET_RND
6aa6ea8e 148#undef PAVGBP
39825f31 149#undef PAVGB
91abb473
ZK
150/***********************************/
151/* MMX rounding */
152
153#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 154#define SET_RND MOVQ_WTWO
6aa6ea8e 155#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 156#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 157
91abb473
ZK
158#include "dsputil_mmx_rnd.h"
159
160#undef DEF
fca0f0e5 161#undef SET_RND
6aa6ea8e 162#undef PAVGBP
39825f31 163#undef PAVGB
a7bd8797 164
de6d9b64
FB
165/***********************************/
166/* 3Dnow specific */
167
168#define DEF(x) x ## _3dnow
169/* for Athlons PAVGUSB is prefered */
170#define PAVGB "pavgusb"
171
172#include "dsputil_mmx_avg.h"
173
174#undef DEF
175#undef PAVGB
176
177/***********************************/
178/* MMX2 specific */
179
607dce96 180#define DEF(x) x ## _mmx2
de6d9b64
FB
181
182/* Introduced only in MMX2 set */
183#define PAVGB "pavgb"
184
185#include "dsputil_mmx_avg.h"
186
187#undef DEF
188#undef PAVGB
189
190/***********************************/
191/* standard MMX */
192
193static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
194{
607dce96
MN
195 asm volatile(
196 "movl $-128, %%eax \n\t"
197 "pxor %%mm7, %%mm7 \n\t"
198 ".balign 16 \n\t"
199 "1: \n\t"
200 "movq (%0), %%mm0 \n\t"
201 "movq (%0, %2), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t"
204 "punpcklbw %%mm7, %%mm0 \n\t"
205 "punpckhbw %%mm7, %%mm1 \n\t"
206 "punpcklbw %%mm7, %%mm2 \n\t"
207 "punpckhbw %%mm7, %%mm3 \n\t"
208 "movq %%mm0, (%1, %%eax)\n\t"
209 "movq %%mm1, 8(%1, %%eax)\n\t"
210 "movq %%mm2, 16(%1, %%eax)\n\t"
211 "movq %%mm3, 24(%1, %%eax)\n\t"
212 "addl %3, %0 \n\t"
213 "addl $32, %%eax \n\t"
214 "js 1b \n\t"
215 : "+r" (pixels)
216 : "r" (block+64), "r" (line_size), "r" (line_size*2)
217 : "%eax"
218 );
de6d9b64
FB
219}
220
1457ab52 221static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
9dbcbd92
MN
222{
223 asm volatile(
607dce96 224 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 225 "movl $-128, %%eax \n\t"
607dce96 226 ".balign 16 \n\t"
9dbcbd92
MN
227 "1: \n\t"
228 "movq (%0), %%mm0 \n\t"
229 "movq (%1), %%mm2 \n\t"
230 "movq %%mm0, %%mm1 \n\t"
231 "movq %%mm2, %%mm3 \n\t"
232 "punpcklbw %%mm7, %%mm0 \n\t"
233 "punpckhbw %%mm7, %%mm1 \n\t"
234 "punpcklbw %%mm7, %%mm2 \n\t"
235 "punpckhbw %%mm7, %%mm3 \n\t"
236 "psubw %%mm2, %%mm0 \n\t"
237 "psubw %%mm3, %%mm1 \n\t"
238 "movq %%mm0, (%2, %%eax)\n\t"
239 "movq %%mm1, 8(%2, %%eax)\n\t"
240 "addl %3, %0 \n\t"
241 "addl %3, %1 \n\t"
242 "addl $16, %%eax \n\t"
243 "jnz 1b \n\t"
244 : "+r" (s1), "+r" (s2)
245 : "r" (block+64), "r" (stride)
246 : "%eax"
247 );
248}
249
eb4b3dd3 250void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
251{
252 const DCTELEM *p;
253 UINT8 *pix;
de6d9b64
FB
254
255 /* read the pixels */
256 p = block;
257 pix = pixels;
d6a4c0b1 258 /* unrolled loop */
de6d9b64 259 __asm __volatile(
a822a479
NK
260 "movq %3, %%mm0\n\t"
261 "movq 8%3, %%mm1\n\t"
262 "movq 16%3, %%mm2\n\t"
263 "movq 24%3, %%mm3\n\t"
264 "movq 32%3, %%mm4\n\t"
265 "movq 40%3, %%mm5\n\t"
266 "movq 48%3, %%mm6\n\t"
267 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
268 "packuswb %%mm1, %%mm0\n\t"
269 "packuswb %%mm3, %%mm2\n\t"
270 "packuswb %%mm5, %%mm4\n\t"
271 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
272 "movq %%mm0, (%0)\n\t"
273 "movq %%mm2, (%0, %1)\n\t"
274 "movq %%mm4, (%0, %1, 2)\n\t"
275 "movq %%mm6, (%0, %2)\n\t"
276 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
277 :"memory");
278 pix += line_size*4;
279 p += 32;
d6a4c0b1
ZK
280
281 // if here would be an exact copy of the code above
282 // compiler would generate some very strange code
283 // thus using "r"
284 __asm __volatile(
285 "movq (%3), %%mm0\n\t"
286 "movq 8(%3), %%mm1\n\t"
287 "movq 16(%3), %%mm2\n\t"
288 "movq 24(%3), %%mm3\n\t"
289 "movq 32(%3), %%mm4\n\t"
290 "movq 40(%3), %%mm5\n\t"
291 "movq 48(%3), %%mm6\n\t"
292 "movq 56(%3), %%mm7\n\t"
293 "packuswb %%mm1, %%mm0\n\t"
294 "packuswb %%mm3, %%mm2\n\t"
295 "packuswb %%mm5, %%mm4\n\t"
296 "packuswb %%mm7, %%mm6\n\t"
297 "movq %%mm0, (%0)\n\t"
298 "movq %%mm2, (%0, %1)\n\t"
299 "movq %%mm4, (%0, %1, 2)\n\t"
300 "movq %%mm6, (%0, %2)\n\t"
301 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
302 :"memory");
de6d9b64
FB
303}
304
eb4b3dd3 305void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
306{
307 const DCTELEM *p;
308 UINT8 *pix;
309 int i;
310
311 /* read the pixels */
312 p = block;
313 pix = pixels;
d6a4c0b1
ZK
314 MOVQ_ZERO(mm7);
315 i = 4;
cd8e5f96 316 do {
de6d9b64 317 __asm __volatile(
cd8e5f96
ZK
318 "movq (%2), %%mm0\n\t"
319 "movq 8(%2), %%mm1\n\t"
320 "movq 16(%2), %%mm2\n\t"
321 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
322 "movq %0, %%mm4\n\t"
323 "movq %1, %%mm6\n\t"
324 "movq %%mm4, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm4\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm4, %%mm0\n\t"
328 "paddsw %%mm5, %%mm1\n\t"
329 "movq %%mm6, %%mm5\n\t"
330 "punpcklbw %%mm7, %%mm6\n\t"
331 "punpckhbw %%mm7, %%mm5\n\t"
332 "paddsw %%mm6, %%mm2\n\t"
333 "paddsw %%mm5, %%mm3\n\t"
334 "packuswb %%mm1, %%mm0\n\t"
335 "packuswb %%mm3, %%mm2\n\t"
336 "movq %%mm0, %0\n\t"
337 "movq %%mm2, %1\n\t"
a822a479 338 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 339 :"r"(p)
de6d9b64
FB
340 :"memory");
341 pix += line_size*2;
342 p += 16;
cd8e5f96 343 } while (--i);
de6d9b64
FB
344}
345
b3184779 346static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
de6d9b64 347{
39825f31 348 __asm __volatile(
31ddcf98 349 "lea (%3, %3), %%eax \n\t"
52af45ad 350 ".balign 8 \n\t"
31ddcf98
ZK
351 "1: \n\t"
352 "movq (%1), %%mm0 \n\t"
353 "movq (%1, %3), %%mm1 \n\t"
354 "movq %%mm0, (%2) \n\t"
355 "movq %%mm1, (%2, %3) \n\t"
356 "addl %%eax, %1 \n\t"
357 "addl %%eax, %2 \n\t"
358 "movq (%1), %%mm0 \n\t"
359 "movq (%1, %3), %%mm1 \n\t"
360 "movq %%mm0, (%2) \n\t"
361 "movq %%mm1, (%2, %3) \n\t"
362 "addl %%eax, %1 \n\t"
363 "addl %%eax, %2 \n\t"
364 "subl $4, %0 \n\t"
365 "jnz 1b \n\t"
366 : "+g"(h), "+r" (pixels), "+r" (block)
367 : "r"(line_size)
368 : "%eax", "memory"
369 );
de6d9b64
FB
370}
371
b3184779
MN
372static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
373{
374 __asm __volatile(
375 "lea (%3, %3), %%eax \n\t"
376 ".balign 8 \n\t"
377 "1: \n\t"
378 "movq (%1), %%mm0 \n\t"
379 "movq 8(%1), %%mm4 \n\t"
380 "movq (%1, %3), %%mm1 \n\t"
381 "movq 8(%1, %3), %%mm5 \n\t"
382 "movq %%mm0, (%2) \n\t"
383 "movq %%mm4, 8(%2) \n\t"
384 "movq %%mm1, (%2, %3) \n\t"
385 "movq %%mm5, 8(%2, %3) \n\t"
386 "addl %%eax, %1 \n\t"
387 "addl %%eax, %2 \n\t"
388 "movq (%1), %%mm0 \n\t"
389 "movq 8(%1), %%mm4 \n\t"
390 "movq (%1, %3), %%mm1 \n\t"
391 "movq 8(%1, %3), %%mm5 \n\t"
392 "movq %%mm0, (%2) \n\t"
393 "movq %%mm4, 8(%2) \n\t"
394 "movq %%mm1, (%2, %3) \n\t"
395 "movq %%mm5, 8(%2, %3) \n\t"
396 "addl %%eax, %1 \n\t"
397 "addl %%eax, %2 \n\t"
398 "subl $4, %0 \n\t"
399 "jnz 1b \n\t"
400 : "+g"(h), "+r" (pixels), "+r" (block)
401 : "r"(line_size)
402 : "%eax", "memory"
403 );
404}
405
649c00c9
MN
406static void clear_blocks_mmx(DCTELEM *blocks)
407{
39825f31 408 __asm __volatile(
649c00c9
MN
409 "pxor %%mm7, %%mm7 \n\t"
410 "movl $-128*6, %%eax \n\t"
411 "1: \n\t"
412 "movq %%mm7, (%0, %%eax) \n\t"
413 "movq %%mm7, 8(%0, %%eax) \n\t"
414 "movq %%mm7, 16(%0, %%eax) \n\t"
415 "movq %%mm7, 24(%0, %%eax) \n\t"
416 "addl $32, %%eax \n\t"
417 " js 1b \n\t"
418 : : "r" (((int)blocks)+128*6)
419 : "%eax"
420 );
421}
422
084c726b
MN
423static int pix_sum16_mmx(UINT8 * pix, int line_size){
424 const int h=16;
425 int sum;
426 int index= -line_size*h;
427
428 __asm __volatile(
429 "pxor %%mm7, %%mm7 \n\t"
430 "pxor %%mm6, %%mm6 \n\t"
431 "1: \n\t"
432 "movq (%2, %1), %%mm0 \n\t"
433 "movq (%2, %1), %%mm1 \n\t"
434 "movq 8(%2, %1), %%mm2 \n\t"
435 "movq 8(%2, %1), %%mm3 \n\t"
436 "punpcklbw %%mm7, %%mm0 \n\t"
437 "punpckhbw %%mm7, %%mm1 \n\t"
438 "punpcklbw %%mm7, %%mm2 \n\t"
439 "punpckhbw %%mm7, %%mm3 \n\t"
440 "paddw %%mm0, %%mm1 \n\t"
441 "paddw %%mm2, %%mm3 \n\t"
442 "paddw %%mm1, %%mm3 \n\t"
443 "paddw %%mm3, %%mm6 \n\t"
444 "addl %3, %1 \n\t"
445 " js 1b \n\t"
446 "movq %%mm6, %%mm5 \n\t"
447 "psrlq $32, %%mm6 \n\t"
448 "paddw %%mm5, %%mm6 \n\t"
449 "movq %%mm6, %%mm5 \n\t"
450 "psrlq $16, %%mm6 \n\t"
451 "paddw %%mm5, %%mm6 \n\t"
452 "movd %%mm6, %0 \n\t"
453 "andl $0xFFFF, %0 \n\t"
454 : "=&r" (sum), "+r" (index)
455 : "r" (pix - index), "r" (line_size)
456 );
457
458 return sum;
459}
460
11f18faf
MN
461static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
462 int i=0;
463 asm volatile(
464 "1: \n\t"
465 "movq (%1, %0), %%mm0 \n\t"
466 "movq (%2, %0), %%mm1 \n\t"
467 "paddb %%mm0, %%mm1 \n\t"
468 "movq %%mm1, (%2, %0) \n\t"
469 "movq 8(%1, %0), %%mm0 \n\t"
470 "movq 8(%2, %0), %%mm1 \n\t"
471 "paddb %%mm0, %%mm1 \n\t"
472 "movq %%mm1, 8(%2, %0) \n\t"
473 "addl $16, %0 \n\t"
474 "cmpl %3, %0 \n\t"
475 " jb 1b \n\t"
476 : "+r" (i)
477 : "r"(src), "r"(dst), "r"(w-15)
478 );
479 for(; i<w; i++)
480 dst[i+0] += src[i+0];
481}
482
483static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
484 int i=0;
485 asm volatile(
486 "1: \n\t"
487 "movq (%2, %0), %%mm0 \n\t"
488 "movq (%1, %0), %%mm1 \n\t"
489 "psubb %%mm0, %%mm1 \n\t"
490 "movq %%mm1, (%3, %0) \n\t"
491 "movq 8(%2, %0), %%mm0 \n\t"
492 "movq 8(%1, %0), %%mm1 \n\t"
493 "psubb %%mm0, %%mm1 \n\t"
494 "movq %%mm1, 8(%3, %0) \n\t"
495 "addl $16, %0 \n\t"
496 "cmpl %4, %0 \n\t"
497 " jb 1b \n\t"
498 : "+r" (i)
499 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
500 );
501 for(; i<w; i++)
502 dst[i+0] = src1[i+0]-src2[i+0];
503}
/* Butterfly step for the Hadamard transform: (a,b) -> (a+b, a-b),
 * expressed so only paddw/psubw are needed: a+=b; b+=b; b-=a. */
#define LBUTTERFLY(a,b)\
    "paddw " #b ", " #a "           \n\t"\
    "paddw " #b ", " #b "           \n\t"\
    "psubw " #a ", " #b "           \n\t"

/* Three rounds of butterflies over mm0..mm7: an 8-point Hadamard
 * transform across the eight registers (4 words each). */
#define HADAMARD48\
        LBUTTERFLY(%%mm0, %%mm1)\
        LBUTTERFLY(%%mm2, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm5)\
        LBUTTERFLY(%%mm6, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm2)\
        LBUTTERFLY(%%mm1, %%mm3)\
        LBUTTERFLY(%%mm4, %%mm6)\
        LBUTTERFLY(%%mm5, %%mm7)\
        \
        LBUTTERFLY(%%mm0, %%mm4)\
        LBUTTERFLY(%%mm1, %%mm5)\
        LBUTTERFLY(%%mm2, %%mm6)\
        LBUTTERFLY(%%mm3, %%mm7)

/* a = |a| word-wise, using z as scratch (sign mask trick) */
#define MMABS(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

/* sum += |a| with unsigned saturation; z is scratch */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"\
    "paddusw " #a ", " #sum "       \n\t"


/* interleave low/high halves of a and b into a and t */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "            \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

/* transpose a 4x4 word matrix held in a,b,c,d; t is scratch */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* load/store four rows (16-byte row stride) of the temp buffer at %1 */
#define LOAD4(o, a, b, c, d)\
        "movq "#o"(%1), " #a "      \n\t"\
        "movq "#o"+16(%1), " #b "   \n\t"\
        "movq "#o"+32(%1), " #c "   \n\t"\
        "movq "#o"+48(%1), " #d "   \n\t"

#define STORE4(o, a, b, c, d)\
        "movq "#a", "#o"(%1)        \n\t"\
        "movq "#b", "#o"+16(%1)     \n\t"\
        "movq "#c", "#o"+32(%1)     \n\t"\
        "movq "#d", "#o"+48(%1)     \n\t"\
562static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
563 uint64_t temp[16] __align8;
564 int sum=0;
565
566 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
11f18faf 567
1457ab52
MN
568 asm volatile(
569 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
570 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
571
572 HADAMARD48
573
574 "movq %%mm7, 112(%1) \n\t"
575
576 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
577 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
578
579 "movq 112(%1), %%mm7 \n\t"
580 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
581 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
582
583 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
584 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
585
586 HADAMARD48
587
588 "movq %%mm7, 120(%1) \n\t"
589
590 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
591 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
592
593 "movq 120(%1), %%mm7 \n\t"
594 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
595 "movq %%mm7, %%mm5 \n\t"//FIXME remove
596 "movq %%mm6, %%mm7 \n\t"
597 "movq %%mm0, %%mm6 \n\t"
598// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
599
600 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
601// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
602
603 HADAMARD48
604 "movq %%mm7, 64(%1) \n\t"
605 MMABS(%%mm0, %%mm7)
606 MMABS_SUM(%%mm1, %%mm7, %%mm0)
607 MMABS_SUM(%%mm2, %%mm7, %%mm0)
608 MMABS_SUM(%%mm3, %%mm7, %%mm0)
609 MMABS_SUM(%%mm4, %%mm7, %%mm0)
610 MMABS_SUM(%%mm5, %%mm7, %%mm0)
611 MMABS_SUM(%%mm6, %%mm7, %%mm0)
612 "movq 64(%1), %%mm1 \n\t"
613 MMABS_SUM(%%mm1, %%mm7, %%mm0)
614 "movq %%mm0, 64(%1) \n\t"
615
616 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
617 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
618
619 HADAMARD48
620 "movq %%mm7, (%1) \n\t"
621 MMABS(%%mm0, %%mm7)
622 MMABS_SUM(%%mm1, %%mm7, %%mm0)
623 MMABS_SUM(%%mm2, %%mm7, %%mm0)
624 MMABS_SUM(%%mm3, %%mm7, %%mm0)
625 MMABS_SUM(%%mm4, %%mm7, %%mm0)
626 MMABS_SUM(%%mm5, %%mm7, %%mm0)
627 MMABS_SUM(%%mm6, %%mm7, %%mm0)
628 "movq (%1), %%mm1 \n\t"
629 MMABS_SUM(%%mm1, %%mm7, %%mm0)
630 "movq 64(%1), %%mm1 \n\t"
631 MMABS_SUM(%%mm1, %%mm7, %%mm0)
632
633 "movq %%mm0, %%mm1 \n\t"
634 "psrlq $32, %%mm0 \n\t"
635 "paddusw %%mm1, %%mm0 \n\t"
636 "movq %%mm0, %%mm1 \n\t"
637 "psrlq $16, %%mm0 \n\t"
638 "paddusw %%mm1, %%mm0 \n\t"
639 "movd %%mm0, %0 \n\t"
640
641 : "=r" (sum)
642 : "r"(temp)
643 );
644 return sum&0xFFFF;
645}
646
647WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
11f18faf 648
#if 0
/* no-op stub: assigned to the function pointers below (also under #if 0)
 * to measure the overhead of the surrounding code when speed testing */
static void just_return() { return; }
#endif
eb4b3dd3 653void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
654{
655 mm_flags = mm_support();
1565dabc
LB
656#if 0
657 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 658 if (mm_flags & MM_MMX)
1565dabc 659 fprintf(stderr, " mmx");
de6d9b64 660 if (mm_flags & MM_MMXEXT)
1565dabc 661 fprintf(stderr, " mmxext");
de6d9b64 662 if (mm_flags & MM_3DNOW)
1565dabc 663 fprintf(stderr, " 3dnow");
de6d9b64 664 if (mm_flags & MM_SSE)
1565dabc 665 fprintf(stderr, " sse");
de6d9b64 666 if (mm_flags & MM_SSE2)
1565dabc
LB
667 fprintf(stderr, " sse2");
668 fprintf(stderr, "\n");
de6d9b64
FB
669#endif
670
671 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
672 c->get_pixels = get_pixels_mmx;
673 c->diff_pixels = diff_pixels_mmx;
674 c->put_pixels_clamped = put_pixels_clamped_mmx;
675 c->add_pixels_clamped = add_pixels_clamped_mmx;
676 c->clear_blocks = clear_blocks_mmx;
677 c->pix_sum = pix_sum16_mmx;
678
679 c->pix_abs16x16 = pix_abs16x16_mmx;
680 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
681 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
682 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
683 c->pix_abs8x8 = pix_abs8x8_mmx;
684 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
685 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
686 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
687
688 c->put_pixels_tab[0][0] = put_pixels16_mmx;
689 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
690 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
691 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
692
693 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
694 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
695 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
696 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
697
698 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
699 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
700 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
701 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
702
703 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
704 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
705 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
706 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
707
708 c->put_pixels_tab[1][0] = put_pixels8_mmx;
709 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
710 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
711 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
712
713 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
714 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
715 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
716 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
717
718 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
719 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
720 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
721 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
722
723 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
724 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
725 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
726 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
11f18faf
MN
727
728 c->add_bytes= add_bytes_mmx;
729 c->diff_bytes= diff_bytes_mmx;
1457ab52
MN
730
731 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
732 c->hadamard8_diff[1]= hadamard8_diff_mmx;
733
734 c->sad[0]= sad16x16_mmx;
735 c->sad[1]= sad8x8_mmx;
736
de6d9b64 737 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
738 c->pix_abs16x16 = pix_abs16x16_mmx2;
739 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
740 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
741 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
742
743 c->pix_abs8x8 = pix_abs8x8_mmx2;
744 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
745 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
746 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
747
1457ab52
MN
748 c->sad[0]= sad16x16_mmx2;
749 c->sad[1]= sad8x8_mmx2;
750
eb4b3dd3
ZK
751 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
752 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
753 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
754 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
755
756 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
757 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
758 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
759 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
760
761 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
762 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
763 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
764 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
765
766 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
767 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
768 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
769 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
de6d9b64 770 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
771 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
772 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
773 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
774 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
775
776 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
777 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
778 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
779 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
780
781 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
782 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
783 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
784 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
785
786 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
787 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
788 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
789 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
de6d9b64
FB
790 }
791 }
d6a4c0b1
ZK
792
793#if 0
794 // for speed testing
795 get_pixels = just_return;
796 put_pixels_clamped = just_return;
797 add_pixels_clamped = just_return;
798
799 pix_abs16x16 = just_return;
800 pix_abs16x16_x2 = just_return;
801 pix_abs16x16_y2 = just_return;
802 pix_abs16x16_xy2 = just_return;
803
804 put_pixels_tab[0] = just_return;
805 put_pixels_tab[1] = just_return;
806 put_pixels_tab[2] = just_return;
807 put_pixels_tab[3] = just_return;
808
809 put_no_rnd_pixels_tab[0] = just_return;
810 put_no_rnd_pixels_tab[1] = just_return;
811 put_no_rnd_pixels_tab[2] = just_return;
812 put_no_rnd_pixels_tab[3] = just_return;
813
814 avg_pixels_tab[0] = just_return;
815 avg_pixels_tab[1] = just_return;
816 avg_pixels_tab[2] = just_return;
817 avg_pixels_tab[3] = just_return;
818
819 avg_no_rnd_pixels_tab[0] = just_return;
820 avg_no_rnd_pixels_tab[1] = just_return;
821 avg_no_rnd_pixels_tab[2] = just_return;
822 avg_no_rnd_pixels_tab[3] = just_return;
823
d6a4c0b1
ZK
824 //av_fdct = just_return;
825 //ff_idct = just_return;
826#endif
de6d9b64 827}
4f12a497
FB
828
829/* remove any non bit exact operation (testing purpose). NOTE that
830 this function should be kept as small as possible because it is
831 always difficult to test automatically non bit exact cases. */
eb4b3dd3 832void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
833{
834 if (mm_flags & MM_MMX) {
b3184779 835 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
836 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
837 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
838 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
839 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
840 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
841 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 842
b3184779 843 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
844 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
845 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
846 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
847 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
848 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
849 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
850 }
851 }
852}