huffyuv
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"

/* Multimedia extension flags.  Assigned from mm_support() by
   dsputil_init_mmx() and tested against the MM_* bits when the function
   tables are filled in. */
int mm_flags;
eb4b3dd3 25/* FIXME use them in static form */
ba6802de
MN
26int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
27int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30
31int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
32int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35
36int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
37int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40
41int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
42int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45
/* pixel operations */

/* 64-bit constants, 8-byte aligned so they can serve as movq memory operands:
     mm_bone - 0x01 in every byte
     mm_wone - 0x0001 in every 16-bit word
     mm_wtwo - 0x0002 in every 16-bit word */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* Emit an 8-byte alignment directive at the current position in the code. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear MMX register regd by xor-ing it with itself.  regd is given as a
   bare register name, e.g. MOVQ_ZERO(mm7). */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* Put 0x0001 in every 16-bit word of regd without a memory access:
   pcmpeqd of a register with itself sets all bits, psrlw $15 then keeps
   only the top bit of each word, moved down to bit 0. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Put 0xfe in every byte of regd: all-ones (0xff per byte) added to itself
   with wrapping byte arithmetic gives 0xfe. */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

/* MOVQ_BONE(regd): 0x01 in every byte of regd.
   MOVQ_WTWO(regd): 0x0002 in every 16-bit word of regd.
   regd is given as a bare register name, e.g. MOVQ_WTWO(mm6). */
#ifndef PIC
/* Non-PIC build: load the values from the static constants mm_bone/mm_wtwo. */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
// psrlw $15 turns each 0xffff word into 0x0001; packuswb then packs those
// words into 0x01 bytes
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

// 0x0001 per word as above, shifted left once to give 0x0002
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// Byte-wise average of two MMX registers, as asm template fragments.
// The arguments are pasted into the template verbatim, so they must be
// complete operand names (e.g. %%mm0).
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
//
// PAVGB_MMX_NO_RND: regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
//                   i.e. (a + b) >> 1 for every byte (rounds down)
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

// PAVGB_MMX: regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
//            i.e. (a + b + 1) >> 1 for every byte (rounds up)
// Masking with regfe clears bit 0 of every byte first, so the 64-bit shift
// cannot carry a bit into the neighbouring byte.
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// Paired variants: average (rega, regb) into regr and (regc, regd) into regp
// in one interleaved sequence.  rega and regc are left unmodified, regb and
// regd are trashed.  Arguments are pasted verbatim and must be complete
// operand names (e.g. %%mm0).
// mm6 is supposed to contain 0xfefefefefefefefe
//
// PAVGBP_MMX_NO_RND: per byte (a + b) >> 1, computed as
//                    (a & b) + (((a ^ b) & 0xfe) >> 1)
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

// PAVGBP_MMX: per byte (a + b + 1) >> 1, computed as
//             (a | b) - (((a ^ b) & 0xfe) >> 1)
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

91abb473
ZK
132/***********************************/
133/* MMX no rounding */
134#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
fca0f0e5 135#define SET_RND MOVQ_WONE
6aa6ea8e 136#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
39825f31 137#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
fca0f0e5 138
91abb473
ZK
139#include "dsputil_mmx_rnd.h"
140
141#undef DEF
fca0f0e5 142#undef SET_RND
6aa6ea8e 143#undef PAVGBP
39825f31 144#undef PAVGB
91abb473
ZK
145/***********************************/
146/* MMX rounding */
147
148#define DEF(x, y) x ## _ ## y ##_mmx
fca0f0e5 149#define SET_RND MOVQ_WTWO
6aa6ea8e 150#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
39825f31 151#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
fca0f0e5 152
91abb473
ZK
153#include "dsputil_mmx_rnd.h"
154
155#undef DEF
fca0f0e5 156#undef SET_RND
6aa6ea8e 157#undef PAVGBP
39825f31 158#undef PAVGB
a7bd8797 159
de6d9b64
FB
160/***********************************/
161/* 3Dnow specific */
162
163#define DEF(x) x ## _3dnow
164/* for Athlons PAVGUSB is prefered */
165#define PAVGB "pavgusb"
166
167#include "dsputil_mmx_avg.h"
168
169#undef DEF
170#undef PAVGB
171
172/***********************************/
173/* MMX2 specific */
174
607dce96 175#define DEF(x) x ## _mmx2
de6d9b64
FB
176
177/* Introduced only in MMX2 set */
178#define PAVGB "pavgb"
179
180#include "dsputil_mmx_avg.h"
181
182#undef DEF
183#undef PAVGB
184
/***********************************/
/* standard MMX */

188static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
189{
607dce96
MN
190 asm volatile(
191 "movl $-128, %%eax \n\t"
192 "pxor %%mm7, %%mm7 \n\t"
193 ".balign 16 \n\t"
194 "1: \n\t"
195 "movq (%0), %%mm0 \n\t"
196 "movq (%0, %2), %%mm2 \n\t"
197 "movq %%mm0, %%mm1 \n\t"
198 "movq %%mm2, %%mm3 \n\t"
199 "punpcklbw %%mm7, %%mm0 \n\t"
200 "punpckhbw %%mm7, %%mm1 \n\t"
201 "punpcklbw %%mm7, %%mm2 \n\t"
202 "punpckhbw %%mm7, %%mm3 \n\t"
203 "movq %%mm0, (%1, %%eax)\n\t"
204 "movq %%mm1, 8(%1, %%eax)\n\t"
205 "movq %%mm2, 16(%1, %%eax)\n\t"
206 "movq %%mm3, 24(%1, %%eax)\n\t"
207 "addl %3, %0 \n\t"
208 "addl $32, %%eax \n\t"
209 "js 1b \n\t"
210 : "+r" (pixels)
211 : "r" (block+64), "r" (line_size), "r" (line_size*2)
212 : "%eax"
213 );
de6d9b64
FB
214}
215
9dbcbd92
MN
216static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
217{
218 asm volatile(
607dce96 219 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 220 "movl $-128, %%eax \n\t"
607dce96 221 ".balign 16 \n\t"
9dbcbd92
MN
222 "1: \n\t"
223 "movq (%0), %%mm0 \n\t"
224 "movq (%1), %%mm2 \n\t"
225 "movq %%mm0, %%mm1 \n\t"
226 "movq %%mm2, %%mm3 \n\t"
227 "punpcklbw %%mm7, %%mm0 \n\t"
228 "punpckhbw %%mm7, %%mm1 \n\t"
229 "punpcklbw %%mm7, %%mm2 \n\t"
230 "punpckhbw %%mm7, %%mm3 \n\t"
231 "psubw %%mm2, %%mm0 \n\t"
232 "psubw %%mm3, %%mm1 \n\t"
233 "movq %%mm0, (%2, %%eax)\n\t"
234 "movq %%mm1, 8(%2, %%eax)\n\t"
235 "addl %3, %0 \n\t"
236 "addl %3, %1 \n\t"
237 "addl $16, %%eax \n\t"
238 "jnz 1b \n\t"
239 : "+r" (s1), "+r" (s2)
240 : "r" (block+64), "r" (stride)
241 : "%eax"
242 );
243}
244
eb4b3dd3 245void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
246{
247 const DCTELEM *p;
248 UINT8 *pix;
de6d9b64
FB
249
250 /* read the pixels */
251 p = block;
252 pix = pixels;
d6a4c0b1 253 /* unrolled loop */
de6d9b64 254 __asm __volatile(
a822a479
NK
255 "movq %3, %%mm0\n\t"
256 "movq 8%3, %%mm1\n\t"
257 "movq 16%3, %%mm2\n\t"
258 "movq 24%3, %%mm3\n\t"
259 "movq 32%3, %%mm4\n\t"
260 "movq 40%3, %%mm5\n\t"
261 "movq 48%3, %%mm6\n\t"
262 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
263 "packuswb %%mm1, %%mm0\n\t"
264 "packuswb %%mm3, %%mm2\n\t"
265 "packuswb %%mm5, %%mm4\n\t"
266 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
267 "movq %%mm0, (%0)\n\t"
268 "movq %%mm2, (%0, %1)\n\t"
269 "movq %%mm4, (%0, %1, 2)\n\t"
270 "movq %%mm6, (%0, %2)\n\t"
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
272 :"memory");
273 pix += line_size*4;
274 p += 32;
d6a4c0b1
ZK
275
276 // if here would be an exact copy of the code above
277 // compiler would generate some very strange code
278 // thus using "r"
279 __asm __volatile(
280 "movq (%3), %%mm0\n\t"
281 "movq 8(%3), %%mm1\n\t"
282 "movq 16(%3), %%mm2\n\t"
283 "movq 24(%3), %%mm3\n\t"
284 "movq 32(%3), %%mm4\n\t"
285 "movq 40(%3), %%mm5\n\t"
286 "movq 48(%3), %%mm6\n\t"
287 "movq 56(%3), %%mm7\n\t"
288 "packuswb %%mm1, %%mm0\n\t"
289 "packuswb %%mm3, %%mm2\n\t"
290 "packuswb %%mm5, %%mm4\n\t"
291 "packuswb %%mm7, %%mm6\n\t"
292 "movq %%mm0, (%0)\n\t"
293 "movq %%mm2, (%0, %1)\n\t"
294 "movq %%mm4, (%0, %1, 2)\n\t"
295 "movq %%mm6, (%0, %2)\n\t"
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
297 :"memory");
de6d9b64
FB
298}
299
eb4b3dd3 300void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
de6d9b64
FB
301{
302 const DCTELEM *p;
303 UINT8 *pix;
304 int i;
305
306 /* read the pixels */
307 p = block;
308 pix = pixels;
d6a4c0b1
ZK
309 MOVQ_ZERO(mm7);
310 i = 4;
cd8e5f96 311 do {
de6d9b64 312 __asm __volatile(
cd8e5f96
ZK
313 "movq (%2), %%mm0\n\t"
314 "movq 8(%2), %%mm1\n\t"
315 "movq 16(%2), %%mm2\n\t"
316 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
317 "movq %0, %%mm4\n\t"
318 "movq %1, %%mm6\n\t"
319 "movq %%mm4, %%mm5\n\t"
320 "punpcklbw %%mm7, %%mm4\n\t"
321 "punpckhbw %%mm7, %%mm5\n\t"
322 "paddsw %%mm4, %%mm0\n\t"
323 "paddsw %%mm5, %%mm1\n\t"
324 "movq %%mm6, %%mm5\n\t"
325 "punpcklbw %%mm7, %%mm6\n\t"
326 "punpckhbw %%mm7, %%mm5\n\t"
327 "paddsw %%mm6, %%mm2\n\t"
328 "paddsw %%mm5, %%mm3\n\t"
329 "packuswb %%mm1, %%mm0\n\t"
330 "packuswb %%mm3, %%mm2\n\t"
331 "movq %%mm0, %0\n\t"
332 "movq %%mm2, %1\n\t"
a822a479 333 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 334 :"r"(p)
de6d9b64
FB
335 :"memory");
336 pix += line_size*2;
337 p += 16;
cd8e5f96 338 } while (--i);
de6d9b64
FB
339}
340
b3184779 341static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
de6d9b64 342{
39825f31 343 __asm __volatile(
31ddcf98 344 "lea (%3, %3), %%eax \n\t"
52af45ad 345 ".balign 8 \n\t"
31ddcf98
ZK
346 "1: \n\t"
347 "movq (%1), %%mm0 \n\t"
348 "movq (%1, %3), %%mm1 \n\t"
349 "movq %%mm0, (%2) \n\t"
350 "movq %%mm1, (%2, %3) \n\t"
351 "addl %%eax, %1 \n\t"
352 "addl %%eax, %2 \n\t"
353 "movq (%1), %%mm0 \n\t"
354 "movq (%1, %3), %%mm1 \n\t"
355 "movq %%mm0, (%2) \n\t"
356 "movq %%mm1, (%2, %3) \n\t"
357 "addl %%eax, %1 \n\t"
358 "addl %%eax, %2 \n\t"
359 "subl $4, %0 \n\t"
360 "jnz 1b \n\t"
361 : "+g"(h), "+r" (pixels), "+r" (block)
362 : "r"(line_size)
363 : "%eax", "memory"
364 );
de6d9b64
FB
365}
366
b3184779
MN
367static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
368{
369 __asm __volatile(
370 "lea (%3, %3), %%eax \n\t"
371 ".balign 8 \n\t"
372 "1: \n\t"
373 "movq (%1), %%mm0 \n\t"
374 "movq 8(%1), %%mm4 \n\t"
375 "movq (%1, %3), %%mm1 \n\t"
376 "movq 8(%1, %3), %%mm5 \n\t"
377 "movq %%mm0, (%2) \n\t"
378 "movq %%mm4, 8(%2) \n\t"
379 "movq %%mm1, (%2, %3) \n\t"
380 "movq %%mm5, 8(%2, %3) \n\t"
381 "addl %%eax, %1 \n\t"
382 "addl %%eax, %2 \n\t"
383 "movq (%1), %%mm0 \n\t"
384 "movq 8(%1), %%mm4 \n\t"
385 "movq (%1, %3), %%mm1 \n\t"
386 "movq 8(%1, %3), %%mm5 \n\t"
387 "movq %%mm0, (%2) \n\t"
388 "movq %%mm4, 8(%2) \n\t"
389 "movq %%mm1, (%2, %3) \n\t"
390 "movq %%mm5, 8(%2, %3) \n\t"
391 "addl %%eax, %1 \n\t"
392 "addl %%eax, %2 \n\t"
393 "subl $4, %0 \n\t"
394 "jnz 1b \n\t"
395 : "+g"(h), "+r" (pixels), "+r" (block)
396 : "r"(line_size)
397 : "%eax", "memory"
398 );
399}
400
649c00c9
MN
401static void clear_blocks_mmx(DCTELEM *blocks)
402{
39825f31 403 __asm __volatile(
649c00c9
MN
404 "pxor %%mm7, %%mm7 \n\t"
405 "movl $-128*6, %%eax \n\t"
406 "1: \n\t"
407 "movq %%mm7, (%0, %%eax) \n\t"
408 "movq %%mm7, 8(%0, %%eax) \n\t"
409 "movq %%mm7, 16(%0, %%eax) \n\t"
410 "movq %%mm7, 24(%0, %%eax) \n\t"
411 "addl $32, %%eax \n\t"
412 " js 1b \n\t"
413 : : "r" (((int)blocks)+128*6)
414 : "%eax"
415 );
416}
417
084c726b
MN
418static int pix_sum16_mmx(UINT8 * pix, int line_size){
419 const int h=16;
420 int sum;
421 int index= -line_size*h;
422
423 __asm __volatile(
424 "pxor %%mm7, %%mm7 \n\t"
425 "pxor %%mm6, %%mm6 \n\t"
426 "1: \n\t"
427 "movq (%2, %1), %%mm0 \n\t"
428 "movq (%2, %1), %%mm1 \n\t"
429 "movq 8(%2, %1), %%mm2 \n\t"
430 "movq 8(%2, %1), %%mm3 \n\t"
431 "punpcklbw %%mm7, %%mm0 \n\t"
432 "punpckhbw %%mm7, %%mm1 \n\t"
433 "punpcklbw %%mm7, %%mm2 \n\t"
434 "punpckhbw %%mm7, %%mm3 \n\t"
435 "paddw %%mm0, %%mm1 \n\t"
436 "paddw %%mm2, %%mm3 \n\t"
437 "paddw %%mm1, %%mm3 \n\t"
438 "paddw %%mm3, %%mm6 \n\t"
439 "addl %3, %1 \n\t"
440 " js 1b \n\t"
441 "movq %%mm6, %%mm5 \n\t"
442 "psrlq $32, %%mm6 \n\t"
443 "paddw %%mm5, %%mm6 \n\t"
444 "movq %%mm6, %%mm5 \n\t"
445 "psrlq $16, %%mm6 \n\t"
446 "paddw %%mm5, %%mm6 \n\t"
447 "movd %%mm6, %0 \n\t"
448 "andl $0xFFFF, %0 \n\t"
449 : "=&r" (sum), "+r" (index)
450 : "r" (pix - index), "r" (line_size)
451 );
452
453 return sum;
454}
455
/**
 * dst[i] += src[i] for 0 <= i < w, with wrapping byte arithmetic.
 *
 * The MMX loop handles 16 bytes per pass while at least 16 bytes remain
 * (i < w - 15); the scalar loop finishes the remaining 0..15 bytes.  The
 * bound is tested before the first pass and with a signed comparison, so
 * w < 16 (including w <= 0) never touches memory outside [0, w).
 *
 * The index is a long and the add/cmp carry no size suffix, so the operand
 * size follows the register width.  No emms is executed here.
 *
 * @param dst bytes to update in place
 * @param src bytes to add
 * @param w   number of bytes
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;

    __asm __volatile(
        "jmp 2f                     \n\t"
        "1:                         \n\t"
        "movq (%1, %0), %%mm0       \n\t"
        "movq (%2, %0), %%mm1       \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%2, %0)       \n\t"
        "movq 8(%1, %0), %%mm0      \n\t"
        "movq 8(%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%2, %0)      \n\t"
        "add $16, %0                \n\t"
        "2:                         \n\t"
        "cmp %3, %0                 \n\t"
        " jl 1b                     \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        /* "memory": dst is written through a register operand */
        : "memory"
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

/**
 * dst[i] = src1[i] - src2[i] for 0 <= i < w, with wrapping byte arithmetic.
 *
 * The MMX loop handles 16 bytes per pass while at least 16 bytes remain
 * (i < w - 15); the scalar loop finishes the remaining 0..15 bytes.  The
 * bound is tested before the first pass and with a signed comparison, so
 * w < 16 (including w <= 0) never touches memory outside [0, w).
 *
 * The index is a long and the add/cmp carry no size suffix, so the operand
 * size follows the register width.  No emms is executed here.
 *
 * @param dst  receives the differences
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    __asm __volatile(
        "jmp 2f                     \n\t"
        "1:                         \n\t"
        "movq (%2, %0), %%mm0       \n\t"
        "movq (%1, %0), %%mm1       \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%3, %0)       \n\t"
        "movq 8(%2, %0), %%mm0      \n\t"
        "movq 8(%1, %0), %%mm1      \n\t"
        "psubb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%3, %0)      \n\t"
        "add $16, %0                \n\t"
        "2:                         \n\t"
        "cmp %4, %0                 \n\t"
        " jl 1b                     \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
        /* "memory": dst is written through a register operand */
        : "memory"
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}


61a4e8ae 501#if 0
d6a4c0b1 502static void just_return() { return; }
61a4e8ae 503#endif
d6a4c0b1 504
eb4b3dd3 505void dsputil_init_mmx(DSPContext* c, unsigned mask)
de6d9b64
FB
506{
507 mm_flags = mm_support();
1565dabc
LB
508#if 0
509 fprintf(stderr, "libavcodec: CPU flags:");
de6d9b64 510 if (mm_flags & MM_MMX)
1565dabc 511 fprintf(stderr, " mmx");
de6d9b64 512 if (mm_flags & MM_MMXEXT)
1565dabc 513 fprintf(stderr, " mmxext");
de6d9b64 514 if (mm_flags & MM_3DNOW)
1565dabc 515 fprintf(stderr, " 3dnow");
de6d9b64 516 if (mm_flags & MM_SSE)
1565dabc 517 fprintf(stderr, " sse");
de6d9b64 518 if (mm_flags & MM_SSE2)
1565dabc
LB
519 fprintf(stderr, " sse2");
520 fprintf(stderr, "\n");
de6d9b64
FB
521#endif
522
523 if (mm_flags & MM_MMX) {
eb4b3dd3
ZK
524 c->get_pixels = get_pixels_mmx;
525 c->diff_pixels = diff_pixels_mmx;
526 c->put_pixels_clamped = put_pixels_clamped_mmx;
527 c->add_pixels_clamped = add_pixels_clamped_mmx;
528 c->clear_blocks = clear_blocks_mmx;
529 c->pix_sum = pix_sum16_mmx;
530
531 c->pix_abs16x16 = pix_abs16x16_mmx;
532 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
533 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
534 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
535 c->pix_abs8x8 = pix_abs8x8_mmx;
536 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
537 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
538 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
539
540 c->put_pixels_tab[0][0] = put_pixels16_mmx;
541 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
542 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
543 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
544
545 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
546 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
547 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
548 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
549
550 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
551 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
552 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
553 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
554
555 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
556 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
557 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
558 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
559
560 c->put_pixels_tab[1][0] = put_pixels8_mmx;
561 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
562 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
563 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
564
565 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
566 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
567 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
568 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
569
570 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
571 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
572 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
573 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
574
575 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
576 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
577 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
578 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
11f18faf
MN
579
580 c->add_bytes= add_bytes_mmx;
581 c->diff_bytes= diff_bytes_mmx;
607dce96 582
de6d9b64 583 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
584 c->pix_abs16x16 = pix_abs16x16_mmx2;
585 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
586 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
587 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
588
589 c->pix_abs8x8 = pix_abs8x8_mmx2;
590 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
591 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
592 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
593
594 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
595 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
596 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
597 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
598
599 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
600 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
601 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
602 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
603
604 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
605 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
606 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
607 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
608
609 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
610 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
611 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
612 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
de6d9b64 613 } else if (mm_flags & MM_3DNOW) {
eb4b3dd3
ZK
614 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
615 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
616 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
617 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
618
619 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
620 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
621 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
622 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
623
624 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
625 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
626 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
627 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
628
629 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
630 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
631 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
632 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
de6d9b64
FB
633 }
634 }
d6a4c0b1
ZK
635
636#if 0
637 // for speed testing
638 get_pixels = just_return;
639 put_pixels_clamped = just_return;
640 add_pixels_clamped = just_return;
641
642 pix_abs16x16 = just_return;
643 pix_abs16x16_x2 = just_return;
644 pix_abs16x16_y2 = just_return;
645 pix_abs16x16_xy2 = just_return;
646
647 put_pixels_tab[0] = just_return;
648 put_pixels_tab[1] = just_return;
649 put_pixels_tab[2] = just_return;
650 put_pixels_tab[3] = just_return;
651
652 put_no_rnd_pixels_tab[0] = just_return;
653 put_no_rnd_pixels_tab[1] = just_return;
654 put_no_rnd_pixels_tab[2] = just_return;
655 put_no_rnd_pixels_tab[3] = just_return;
656
657 avg_pixels_tab[0] = just_return;
658 avg_pixels_tab[1] = just_return;
659 avg_pixels_tab[2] = just_return;
660 avg_pixels_tab[3] = just_return;
661
662 avg_no_rnd_pixels_tab[0] = just_return;
663 avg_no_rnd_pixels_tab[1] = just_return;
664 avg_no_rnd_pixels_tab[2] = just_return;
665 avg_no_rnd_pixels_tab[3] = just_return;
666
d6a4c0b1
ZK
667 //av_fdct = just_return;
668 //ff_idct = just_return;
669#endif
de6d9b64 670}
4f12a497
FB
671
672/* remove any non bit exact operation (testing purpose). NOTE that
673 this function should be kept as small as possible because it is
674 always difficult to test automatically non bit exact cases. */
eb4b3dd3 675void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
4f12a497
FB
676{
677 if (mm_flags & MM_MMX) {
b3184779 678 /* MMX2 & 3DNOW */
eb4b3dd3
ZK
679 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
680 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
681 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
682 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
683 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
684 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
4afeaec9 685
b3184779 686 if (mm_flags & MM_MMXEXT) {
eb4b3dd3
ZK
687 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
688 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
689 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
690 c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
691 c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
692 c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
4f12a497
FB
693 }
694 }
695}