Added Launch, FaviconURL and Redirect examples
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
de6d9b64
FB
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
/* pixel operations */

/*
 * 64-bit constants loaded into MMX registers by the asm code in this file:
 *   mm_bfe  - 0xfe in every byte
 *   mm_bone - 0x01 in every byte
 *   mm_wone - 0x0001 in every 16-bit word
 *   mm_wtwo - 0x0002 in every 16-bit word
 *
 * mm_bfe and mm_bone are referenced only by symbol name, through MANGLE()
 * inside the MOVQ_BFE / MOVQ_BONE asm strings.  The compiler sees no C-level
 * use of them, so they are marked "used" to make sure they are still emitted.
 */
static const uint64_t mm_bfe  __attribute__ ((used, aligned(8))) = 0xfefefefefefefefeULL;
static const uint64_t mm_bone __attribute__ ((used, aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 58
d6a4c0b1
ZK
/* Emit an 8-byte alignment directive at the point of use. */
#define JUMPALIGN()     __asm__ __volatile__ (".balign 8" : : )
/* Clear MMX register regd (xor with itself). */
#define MOVQ_ZERO(regd) __asm__ __volatile__ ("pxor %%" #regd ", %%" #regd : : )

/*
 * Constant loaders.
 *   MOVQ_WONE / MOVQ_WTWO are complete asm statements (0x0001 / 0x0002 per word).
 *   MOVQ_BONE / MOVQ_BFE  are string fragments meant to be pasted into a
 *                         larger asm template (0x01 / 0xfe per byte).
 */
#ifdef PIC
/* for shared library it's better to build the constants in registers:
 * pcmpeqd reg, reg -> all bits set (-1) */

/* -1 >> 15 per word -> 0x0001 */
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd : : )

/* (-1 >> 15) << 1 per word -> 0x0002 */
#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd : : )

/* 0x0001 words packed down to bytes -> 0x01 per byte */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t" \
    "packuswb " #regd ", " #regd " \n\t"

/* 0xff + 0xff per byte (wrapping) -> 0xfe per byte */
#define MOVQ_BFE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "paddb " #regd ", " #regd " \n\t"

#else /* !PIC */
/* load the constants straight from the static tables */
#define MOVQ_WONE(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" : : "m" (mm_wone))
#define MOVQ_WTWO(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" : : "m" (mm_wtwo))
#define MOVQ_BONE(regd) "movq " MANGLE(mm_bone) ", " #regd " \n\t"
#define MOVQ_BFE(regd)  "movq " MANGLE(mm_bfe) ", " #regd " \n\t"
#endif /* PIC */
90
def60345
ZK
/*
 * Byte-wise average helpers built from plain MMX instructions.
 * Each macro expands to an asm string fragment.
 *
 *   rega / regc : first source, left unmodified
 *   regb / regd : second source, trashed
 *   regr / regp : receives the result
 *   mm7         : supposed to contain 0xfefefefefefefefe
 *
 * No-rounding form:  r = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * Rounding form:     r = (a | b) - (((a ^ b) & 0xfe) >> 1)
 *
 * The PAVGBP_* variants process two independent register triples with the
 * instructions of both averages interleaved.
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t" \
    "pand " #regb ", " #regr " \n\t" \
    "pxor " #rega ", " #regb " \n\t" \
    "pand %%mm7, " #regb " \n\t" \
    "psrlq $1, " #regb " \n\t" \
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t" \
    "por " #regb ", " #regr " \n\t" \
    "pxor " #rega ", " #regb " \n\t" \
    "pand %%mm7, " #regb " \n\t" \
    "psrlq $1, " #regb " \n\t" \
    "psubb " #regb ", " #regr " \n\t"

#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t" \
    "movq " #regc ", " #regp " \n\t" \
    "pand " #regb ", " #regr " \n\t" \
    "pand " #regd ", " #regp " \n\t" \
    "pxor " #rega ", " #regb " \n\t" \
    "pxor " #regc ", " #regd " \n\t" \
    "pand %%mm7, " #regb " \n\t" \
    "pand %%mm7, " #regd " \n\t" \
    "psrlq $1, " #regb " \n\t" \
    "psrlq $1, " #regd " \n\t" \
    "paddb " #regb ", " #regr " \n\t" \
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t" \
    "movq " #regc ", " #regp " \n\t" \
    "por " #regb ", " #regr " \n\t" \
    "por " #regd ", " #regp " \n\t" \
    "pxor " #rega ", " #regb " \n\t" \
    "pxor " #regc ", " #regd " \n\t" \
    "pand %%mm7, " #regb " \n\t" \
    "pand %%mm7, " #regd " \n\t" \
    "psrlq $1, " #regd " \n\t" \
    "psrlq $1, " #regb " \n\t" \
    "psubb " #regb ", " #regr " \n\t" \
    "psubb " #regd ", " #regp " \n\t"
137
91abb473
ZK
138/***********************************/
139/* MMX no rounding */
140#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
141
142#define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6)
143#define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c)
6aa6ea8e 144#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
91abb473
ZK
145#include "dsputil_mmx_rnd.h"
146
147#undef DEF
148#undef PAVGB
149#undef PAVGBR
6aa6ea8e 150#undef PAVGBP
91abb473
ZK
151/***********************************/
152/* MMX rounding */
153
154#define DEF(x, y) x ## _ ## y ##_mmx
155
156#define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6)
157#define PAVGBR(a, b, c) PAVGB_MMX(a, b, c)
6aa6ea8e 158#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
91abb473
ZK
159#include "dsputil_mmx_rnd.h"
160
161#undef DEF
162#undef PAVGB
163#undef PAVGBR
6aa6ea8e 164#undef PAVGBP
a7bd8797 165
de6d9b64
FB
166/***********************************/
167/* 3Dnow specific */
168
169#define DEF(x) x ## _3dnow
170/* for Athlons PAVGUSB is prefered */
171#define PAVGB "pavgusb"
172
173#include "dsputil_mmx_avg.h"
174
175#undef DEF
176#undef PAVGB
177
178/***********************************/
179/* MMX2 specific */
180
607dce96 181#define DEF(x) x ## _mmx2
de6d9b64
FB
182
183/* Introduced only in MMX2 set */
184#define PAVGB "pavgb"
185
186#include "dsputil_mmx_avg.h"
187
188#undef DEF
189#undef PAVGB
190
191/***********************************/
192/* standard MMX */
193
194static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
195{
607dce96
MN
196 asm volatile(
197 "movl $-128, %%eax \n\t"
198 "pxor %%mm7, %%mm7 \n\t"
199 ".balign 16 \n\t"
200 "1: \n\t"
201 "movq (%0), %%mm0 \n\t"
202 "movq (%0, %2), %%mm2 \n\t"
203 "movq %%mm0, %%mm1 \n\t"
204 "movq %%mm2, %%mm3 \n\t"
205 "punpcklbw %%mm7, %%mm0 \n\t"
206 "punpckhbw %%mm7, %%mm1 \n\t"
207 "punpcklbw %%mm7, %%mm2 \n\t"
208 "punpckhbw %%mm7, %%mm3 \n\t"
209 "movq %%mm0, (%1, %%eax)\n\t"
210 "movq %%mm1, 8(%1, %%eax)\n\t"
211 "movq %%mm2, 16(%1, %%eax)\n\t"
212 "movq %%mm3, 24(%1, %%eax)\n\t"
213 "addl %3, %0 \n\t"
214 "addl $32, %%eax \n\t"
215 "js 1b \n\t"
216 : "+r" (pixels)
217 : "r" (block+64), "r" (line_size), "r" (line_size*2)
218 : "%eax"
219 );
de6d9b64
FB
220}
221
9dbcbd92
MN
222static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
223{
224 asm volatile(
607dce96 225 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 226 "movl $-128, %%eax \n\t"
607dce96 227 ".balign 16 \n\t"
9dbcbd92
MN
228 "1: \n\t"
229 "movq (%0), %%mm0 \n\t"
230 "movq (%1), %%mm2 \n\t"
231 "movq %%mm0, %%mm1 \n\t"
232 "movq %%mm2, %%mm3 \n\t"
233 "punpcklbw %%mm7, %%mm0 \n\t"
234 "punpckhbw %%mm7, %%mm1 \n\t"
235 "punpcklbw %%mm7, %%mm2 \n\t"
236 "punpckhbw %%mm7, %%mm3 \n\t"
237 "psubw %%mm2, %%mm0 \n\t"
238 "psubw %%mm3, %%mm1 \n\t"
239 "movq %%mm0, (%2, %%eax)\n\t"
240 "movq %%mm1, 8(%2, %%eax)\n\t"
241 "addl %3, %0 \n\t"
242 "addl %3, %1 \n\t"
243 "addl $16, %%eax \n\t"
244 "jnz 1b \n\t"
245 : "+r" (s1), "+r" (s2)
246 : "r" (block+64), "r" (stride)
247 : "%eax"
248 );
249}
250
de6d9b64
FB
251static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
252{
253 const DCTELEM *p;
254 UINT8 *pix;
de6d9b64
FB
255
256 /* read the pixels */
257 p = block;
258 pix = pixels;
d6a4c0b1 259 /* unrolled loop */
de6d9b64 260 __asm __volatile(
a822a479
NK
261 "movq %3, %%mm0\n\t"
262 "movq 8%3, %%mm1\n\t"
263 "movq 16%3, %%mm2\n\t"
264 "movq 24%3, %%mm3\n\t"
265 "movq 32%3, %%mm4\n\t"
266 "movq 40%3, %%mm5\n\t"
267 "movq 48%3, %%mm6\n\t"
268 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
269 "packuswb %%mm1, %%mm0\n\t"
270 "packuswb %%mm3, %%mm2\n\t"
271 "packuswb %%mm5, %%mm4\n\t"
272 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
273 "movq %%mm0, (%0)\n\t"
274 "movq %%mm2, (%0, %1)\n\t"
275 "movq %%mm4, (%0, %1, 2)\n\t"
276 "movq %%mm6, (%0, %2)\n\t"
277 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
278 :"memory");
279 pix += line_size*4;
280 p += 32;
d6a4c0b1
ZK
281
282 // if here would be an exact copy of the code above
283 // compiler would generate some very strange code
284 // thus using "r"
285 __asm __volatile(
286 "movq (%3), %%mm0\n\t"
287 "movq 8(%3), %%mm1\n\t"
288 "movq 16(%3), %%mm2\n\t"
289 "movq 24(%3), %%mm3\n\t"
290 "movq 32(%3), %%mm4\n\t"
291 "movq 40(%3), %%mm5\n\t"
292 "movq 48(%3), %%mm6\n\t"
293 "movq 56(%3), %%mm7\n\t"
294 "packuswb %%mm1, %%mm0\n\t"
295 "packuswb %%mm3, %%mm2\n\t"
296 "packuswb %%mm5, %%mm4\n\t"
297 "packuswb %%mm7, %%mm6\n\t"
298 "movq %%mm0, (%0)\n\t"
299 "movq %%mm2, (%0, %1)\n\t"
300 "movq %%mm4, (%0, %1, 2)\n\t"
301 "movq %%mm6, (%0, %2)\n\t"
302 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
303 :"memory");
de6d9b64
FB
304}
305
306static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
307{
308 const DCTELEM *p;
309 UINT8 *pix;
310 int i;
311
312 /* read the pixels */
313 p = block;
314 pix = pixels;
d6a4c0b1
ZK
315 MOVQ_ZERO(mm7);
316 i = 4;
cd8e5f96 317 do {
de6d9b64 318 __asm __volatile(
cd8e5f96
ZK
319 "movq (%2), %%mm0\n\t"
320 "movq 8(%2), %%mm1\n\t"
321 "movq 16(%2), %%mm2\n\t"
322 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
323 "movq %0, %%mm4\n\t"
324 "movq %1, %%mm6\n\t"
325 "movq %%mm4, %%mm5\n\t"
326 "punpcklbw %%mm7, %%mm4\n\t"
327 "punpckhbw %%mm7, %%mm5\n\t"
328 "paddsw %%mm4, %%mm0\n\t"
329 "paddsw %%mm5, %%mm1\n\t"
330 "movq %%mm6, %%mm5\n\t"
331 "punpcklbw %%mm7, %%mm6\n\t"
332 "punpckhbw %%mm7, %%mm5\n\t"
333 "paddsw %%mm6, %%mm2\n\t"
334 "paddsw %%mm5, %%mm3\n\t"
335 "packuswb %%mm1, %%mm0\n\t"
336 "packuswb %%mm3, %%mm2\n\t"
337 "movq %%mm0, %0\n\t"
338 "movq %%mm2, %1\n\t"
a822a479 339 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 340 :"r"(p)
de6d9b64
FB
341 :"memory");
342 pix += line_size*2;
343 p += 16;
cd8e5f96 344 } while (--i);
de6d9b64
FB
345}
346
347static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
348{
31ddcf98
ZK
349 asm volatile
350 (
351 "lea (%3, %3), %%eax \n\t"
52af45ad 352 ".balign 8 \n\t"
31ddcf98
ZK
353 "1: \n\t"
354 "movq (%1), %%mm0 \n\t"
355 "movq (%1, %3), %%mm1 \n\t"
356 "movq %%mm0, (%2) \n\t"
357 "movq %%mm1, (%2, %3) \n\t"
358 "addl %%eax, %1 \n\t"
359 "addl %%eax, %2 \n\t"
360 "movq (%1), %%mm0 \n\t"
361 "movq (%1, %3), %%mm1 \n\t"
362 "movq %%mm0, (%2) \n\t"
363 "movq %%mm1, (%2, %3) \n\t"
364 "addl %%eax, %1 \n\t"
365 "addl %%eax, %2 \n\t"
366 "subl $4, %0 \n\t"
367 "jnz 1b \n\t"
368 : "+g"(h), "+r" (pixels), "+r" (block)
369 : "r"(line_size)
370 : "%eax", "memory"
371 );
de6d9b64
FB
372}
373
6aa6ea8e 374#if 1
de6d9b64
FB
375static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
376{
377 UINT8 *p;
378 const UINT8 *pix;
379 p = block;
d6a4c0b1
ZK
380 pix = pixels; // 1s
381 MOVQ_ZERO(mm7);
382 MOVQ_WTWO(mm6);
383 JUMPALIGN();
de6d9b64
FB
384 do {
385 __asm __volatile(
386 "movq %1, %%mm0\n\t"
387 "movq %2, %%mm1\n\t"
388 "movq 1%1, %%mm4\n\t"
389 "movq 1%2, %%mm5\n\t"
390 "movq %%mm0, %%mm2\n\t"
391 "movq %%mm1, %%mm3\n\t"
392 "punpcklbw %%mm7, %%mm0\n\t"
393 "punpcklbw %%mm7, %%mm1\n\t"
394 "punpckhbw %%mm7, %%mm2\n\t"
395 "punpckhbw %%mm7, %%mm3\n\t"
396 "paddusw %%mm1, %%mm0\n\t"
397 "paddusw %%mm3, %%mm2\n\t"
398 "movq %%mm4, %%mm1\n\t"
399 "movq %%mm5, %%mm3\n\t"
400 "punpcklbw %%mm7, %%mm4\n\t"
401 "punpcklbw %%mm7, %%mm5\n\t"
402 "punpckhbw %%mm7, %%mm1\n\t"
403 "punpckhbw %%mm7, %%mm3\n\t"
404 "paddusw %%mm5, %%mm4\n\t"
405 "paddusw %%mm3, %%mm1\n\t"
406 "paddusw %%mm6, %%mm4\n\t"
407 "paddusw %%mm6, %%mm1\n\t"
408 "paddusw %%mm4, %%mm0\n\t"
409 "paddusw %%mm1, %%mm2\n\t"
410 "psrlw $2, %%mm0\n\t"
411 "psrlw $2, %%mm2\n\t"
412 "packuswb %%mm2, %%mm0\n\t"
413 "movq %%mm0, %0\n\t"
414 :"=m"(*p)
415 :"m"(*pix),
416 "m"(*(pix+line_size))
417 :"memory");
418 pix += line_size;
419 p += line_size;
420 } while(--h);
de6d9b64
FB
421}
422
de6d9b64
FB
423static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
424{
425 UINT8 *p;
426 const UINT8 *pix;
427 p = block;
428 pix = pixels;
d6a4c0b1
ZK
429 MOVQ_ZERO(mm7);
430 MOVQ_WONE(mm6);
431 JUMPALIGN();
de6d9b64
FB
432 do {
433 __asm __volatile(
434 "movq %1, %%mm0\n\t"
435 "movq %2, %%mm1\n\t"
436 "movq 1%1, %%mm4\n\t"
437 "movq 1%2, %%mm5\n\t"
438 "movq %%mm0, %%mm2\n\t"
439 "movq %%mm1, %%mm3\n\t"
440 "punpcklbw %%mm7, %%mm0\n\t"
441 "punpcklbw %%mm7, %%mm1\n\t"
442 "punpckhbw %%mm7, %%mm2\n\t"
443 "punpckhbw %%mm7, %%mm3\n\t"
444 "paddusw %%mm1, %%mm0\n\t"
445 "paddusw %%mm3, %%mm2\n\t"
446 "movq %%mm4, %%mm1\n\t"
447 "movq %%mm5, %%mm3\n\t"
448 "punpcklbw %%mm7, %%mm4\n\t"
449 "punpcklbw %%mm7, %%mm5\n\t"
450 "punpckhbw %%mm7, %%mm1\n\t"
451 "punpckhbw %%mm7, %%mm3\n\t"
452 "paddusw %%mm5, %%mm4\n\t"
453 "paddusw %%mm3, %%mm1\n\t"
454 "paddusw %%mm6, %%mm4\n\t"
455 "paddusw %%mm6, %%mm1\n\t"
456 "paddusw %%mm4, %%mm0\n\t"
457 "paddusw %%mm1, %%mm2\n\t"
458 "psrlw $2, %%mm0\n\t"
459 "psrlw $2, %%mm2\n\t"
460 "packuswb %%mm2, %%mm0\n\t"
461 "movq %%mm0, %0\n\t"
462 :"=m"(*p)
463 :"m"(*pix),
464 "m"(*(pix+line_size))
465 :"memory");
466 pix += line_size;
467 p += line_size;
468 } while(--h);
de6d9b64 469}
91abb473 470#endif
de6d9b64
FB
471static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
472{
473 UINT8 *p;
474 const UINT8 *pix;
475 p = block;
476 pix = pixels;
d6a4c0b1
ZK
477 MOVQ_ZERO(mm7);
478 MOVQ_WONE(mm6);
479 JUMPALIGN();
de6d9b64
FB
480 do {
481 __asm __volatile(
482 "movq %0, %%mm0\n\t"
483 "movq %1, %%mm1\n\t"
484 "movq %%mm0, %%mm2\n\t"
485 "movq %%mm1, %%mm3\n\t"
486 "punpcklbw %%mm7, %%mm0\n\t"
487 "punpcklbw %%mm7, %%mm1\n\t"
488 "punpckhbw %%mm7, %%mm2\n\t"
489 "punpckhbw %%mm7, %%mm3\n\t"
490 "paddusw %%mm1, %%mm0\n\t"
491 "paddusw %%mm3, %%mm2\n\t"
492 "paddusw %%mm6, %%mm0\n\t"
493 "paddusw %%mm6, %%mm2\n\t"
494 "psrlw $1, %%mm0\n\t"
495 "psrlw $1, %%mm2\n\t"
496 "packuswb %%mm2, %%mm0\n\t"
497 "movq %%mm0, %0\n\t"
a822a479 498 :"+m"(*p)
de6d9b64
FB
499 :"m"(*pix)
500 :"memory");
501 pix += line_size;
502 p += line_size;
503 }
504 while (--h);
de6d9b64
FB
505}
506
507static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
508{
509 UINT8 *p;
510 const UINT8 *pix;
511 p = block;
512 pix = pixels;
d6a4c0b1
ZK
513 MOVQ_ZERO(mm7);
514 MOVQ_WONE(mm6);
515 JUMPALIGN();
de6d9b64
FB
516 do {
517 __asm __volatile(
518 "movq %1, %%mm1\n\t"
519 "movq %0, %%mm0\n\t"
520 "movq 1%1, %%mm4\n\t"
521 "movq %%mm0, %%mm2\n\t"
522 "movq %%mm1, %%mm3\n\t"
523 "movq %%mm4, %%mm5\n\t"
524 "punpcklbw %%mm7, %%mm1\n\t"
525 "punpckhbw %%mm7, %%mm3\n\t"
526 "punpcklbw %%mm7, %%mm4\n\t"
527 "punpckhbw %%mm7, %%mm5\n\t"
528 "punpcklbw %%mm7, %%mm0\n\t"
529 "punpckhbw %%mm7, %%mm2\n\t"
530 "paddusw %%mm4, %%mm1\n\t"
531 "paddusw %%mm5, %%mm3\n\t"
532 "paddusw %%mm6, %%mm1\n\t"
533 "paddusw %%mm6, %%mm3\n\t"
534 "psrlw $1, %%mm1\n\t"
535 "psrlw $1, %%mm3\n\t"
536 "paddusw %%mm6, %%mm0\n\t"
537 "paddusw %%mm6, %%mm2\n\t"
538 "paddusw %%mm1, %%mm0\n\t"
539 "paddusw %%mm3, %%mm2\n\t"
540 "psrlw $1, %%mm0\n\t"
541 "psrlw $1, %%mm2\n\t"
542 "packuswb %%mm2, %%mm0\n\t"
543 "movq %%mm0, %0\n\t"
a822a479 544 :"+m"(*p)
de6d9b64
FB
545 :"m"(*pix)
546 :"memory");
547 pix += line_size;
548 p += line_size;
549 } while (--h);
de6d9b64
FB
550}
551
552static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
553{
554 UINT8 *p;
555 const UINT8 *pix;
556 p = block;
557 pix = pixels;
d6a4c0b1
ZK
558 MOVQ_ZERO(mm7);
559 MOVQ_WONE(mm6);
560 JUMPALIGN();
de6d9b64
FB
561 do {
562 __asm __volatile(
563 "movq %1, %%mm1\n\t"
564 "movq %0, %%mm0\n\t"
565 "movq %2, %%mm4\n\t"
566 "movq %%mm0, %%mm2\n\t"
567 "movq %%mm1, %%mm3\n\t"
568 "movq %%mm4, %%mm5\n\t"
569 "punpcklbw %%mm7, %%mm1\n\t"
570 "punpckhbw %%mm7, %%mm3\n\t"
571 "punpcklbw %%mm7, %%mm4\n\t"
572 "punpckhbw %%mm7, %%mm5\n\t"
573 "punpcklbw %%mm7, %%mm0\n\t"
574 "punpckhbw %%mm7, %%mm2\n\t"
575 "paddusw %%mm4, %%mm1\n\t"
576 "paddusw %%mm5, %%mm3\n\t"
577 "paddusw %%mm6, %%mm1\n\t"
578 "paddusw %%mm6, %%mm3\n\t"
579 "psrlw $1, %%mm1\n\t"
580 "psrlw $1, %%mm3\n\t"
581 "paddusw %%mm6, %%mm0\n\t"
582 "paddusw %%mm6, %%mm2\n\t"
583 "paddusw %%mm1, %%mm0\n\t"
584 "paddusw %%mm3, %%mm2\n\t"
585 "psrlw $1, %%mm0\n\t"
586 "psrlw $1, %%mm2\n\t"
587 "packuswb %%mm2, %%mm0\n\t"
588 "movq %%mm0, %0\n\t"
a822a479 589 :"+m"(*p)
de6d9b64
FB
590 :"m"(*pix), "m"(*(pix+line_size))
591 :"memory");
592 pix += line_size;
593 p += line_size ;
594 } while(--h);
de6d9b64
FB
595}
596
597static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
598{
599 UINT8 *p;
600 const UINT8 *pix;
601 p = block;
602 pix = pixels;
d6a4c0b1
ZK
603 MOVQ_ZERO(mm7);
604 // this doesn't seem to be used offten - so
605 // the inside usage of mm_wone is not optimized
606 MOVQ_WTWO(mm6);
de6d9b64
FB
607 do {
608 __asm __volatile(
609 "movq %1, %%mm0\n\t"
610 "movq %2, %%mm1\n\t"
611 "movq 1%1, %%mm4\n\t"
612 "movq 1%2, %%mm5\n\t"
613 "movq %%mm0, %%mm2\n\t"
614 "movq %%mm1, %%mm3\n\t"
615 "punpcklbw %%mm7, %%mm0\n\t"
616 "punpcklbw %%mm7, %%mm1\n\t"
617 "punpckhbw %%mm7, %%mm2\n\t"
618 "punpckhbw %%mm7, %%mm3\n\t"
619 "paddusw %%mm1, %%mm0\n\t"
620 "paddusw %%mm3, %%mm2\n\t"
621 "movq %%mm4, %%mm1\n\t"
622 "movq %%mm5, %%mm3\n\t"
623 "punpcklbw %%mm7, %%mm4\n\t"
624 "punpcklbw %%mm7, %%mm5\n\t"
625 "punpckhbw %%mm7, %%mm1\n\t"
626 "punpckhbw %%mm7, %%mm3\n\t"
627 "paddusw %%mm5, %%mm4\n\t"
628 "paddusw %%mm3, %%mm1\n\t"
629 "paddusw %%mm6, %%mm4\n\t"
630 "paddusw %%mm6, %%mm1\n\t"
631 "paddusw %%mm4, %%mm0\n\t"
632 "paddusw %%mm1, %%mm2\n\t"
633 "movq %3, %%mm5\n\t"
634 "psrlw $2, %%mm0\n\t"
635 "movq %0, %%mm1\n\t"
636 "psrlw $2, %%mm2\n\t"
637 "movq %%mm1, %%mm3\n\t"
638 "punpcklbw %%mm7, %%mm1\n\t"
639 "punpckhbw %%mm7, %%mm3\n\t"
640 "paddusw %%mm1, %%mm0\n\t"
641 "paddusw %%mm3, %%mm2\n\t"
642 "paddusw %%mm5, %%mm0\n\t"
643 "paddusw %%mm5, %%mm2\n\t"
644 "psrlw $1, %%mm0\n\t"
645 "psrlw $1, %%mm2\n\t"
646 "packuswb %%mm2, %%mm0\n\t"
647 "movq %%mm0, %0\n\t"
a822a479 648 :"+m"(*p)
de6d9b64 649 :"m"(*pix),
a9b3f630 650 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
651 :"memory");
652 pix += line_size;
653 p += line_size ;
654 } while(--h);
de6d9b64
FB
655}
656
657static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
658{
659 UINT8 *p;
660 const UINT8 *pix;
661 p = block;
662 pix = pixels;
d6a4c0b1 663 MOVQ_ZERO(mm7);
de6d9b64
FB
664 do {
665 __asm __volatile(
666 "movq %1, %%mm0\n\t"
667 "movq %0, %%mm1\n\t"
668 "movq %%mm0, %%mm2\n\t"
669 "movq %%mm1, %%mm3\n\t"
670 "punpcklbw %%mm7, %%mm0\n\t"
671 "punpcklbw %%mm7, %%mm1\n\t"
672 "punpckhbw %%mm7, %%mm2\n\t"
673 "punpckhbw %%mm7, %%mm3\n\t"
674 "paddusw %%mm1, %%mm0\n\t"
675 "paddusw %%mm3, %%mm2\n\t"
676 "psrlw $1, %%mm0\n\t"
677 "psrlw $1, %%mm2\n\t"
678 "packuswb %%mm2, %%mm0\n\t"
679 "movq %%mm0, %0\n\t"
a822a479 680 :"+m"(*p)
de6d9b64
FB
681 :"m"(*pix)
682 :"memory");
683 pix += line_size;
684 p += line_size ;
685 } while (--h);
de6d9b64
FB
686}
687
688static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
689{
690 UINT8 *p;
691 const UINT8 *pix;
692 p = block;
693 pix = pixels;
d6a4c0b1 694 MOVQ_ZERO(mm7);
de6d9b64
FB
695 do {
696 __asm __volatile(
697 "movq %1, %%mm0\n\t"
698 "movq 1%1, %%mm1\n\t"
699 "movq %0, %%mm4\n\t"
700 "movq %%mm0, %%mm2\n\t"
701 "movq %%mm1, %%mm3\n\t"
702 "movq %%mm4, %%mm5\n\t"
703 "punpcklbw %%mm7, %%mm0\n\t"
704 "punpcklbw %%mm7, %%mm1\n\t"
705 "punpckhbw %%mm7, %%mm2\n\t"
706 "punpckhbw %%mm7, %%mm3\n\t"
707 "punpcklbw %%mm7, %%mm4\n\t"
708 "punpckhbw %%mm7, %%mm5\n\t"
709 "paddusw %%mm1, %%mm0\n\t"
710 "paddusw %%mm3, %%mm2\n\t"
711 "psrlw $1, %%mm0\n\t"
712 "psrlw $1, %%mm2\n\t"
713 "paddusw %%mm4, %%mm0\n\t"
714 "paddusw %%mm5, %%mm2\n\t"
715 "psrlw $1, %%mm0\n\t"
716 "psrlw $1, %%mm2\n\t"
717 "packuswb %%mm2, %%mm0\n\t"
718 "movq %%mm0, %0\n\t"
a822a479 719 :"+m"(*p)
de6d9b64
FB
720 :"m"(*pix)
721 :"memory");
722 pix += line_size;
723 p += line_size;
724 } while (--h);
de6d9b64
FB
725}
726
727static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
728{
729 UINT8 *p;
730 const UINT8 *pix;
731 p = block;
732 pix = pixels;
d6a4c0b1 733 MOVQ_ZERO(mm7);
de6d9b64
FB
734 do {
735 __asm __volatile(
736 "movq %1, %%mm0\n\t"
737 "movq %2, %%mm1\n\t"
738 "movq %0, %%mm4\n\t"
739 "movq %%mm0, %%mm2\n\t"
740 "movq %%mm1, %%mm3\n\t"
741 "movq %%mm4, %%mm5\n\t"
742 "punpcklbw %%mm7, %%mm0\n\t"
743 "punpcklbw %%mm7, %%mm1\n\t"
744 "punpckhbw %%mm7, %%mm2\n\t"
745 "punpckhbw %%mm7, %%mm3\n\t"
746 "punpcklbw %%mm7, %%mm4\n\t"
747 "punpckhbw %%mm7, %%mm5\n\t"
748 "paddusw %%mm1, %%mm0\n\t"
749 "paddusw %%mm3, %%mm2\n\t"
750 "psrlw $1, %%mm0\n\t"
751 "psrlw $1, %%mm2\n\t"
752 "paddusw %%mm4, %%mm0\n\t"
753 "paddusw %%mm5, %%mm2\n\t"
754 "psrlw $1, %%mm0\n\t"
755 "psrlw $1, %%mm2\n\t"
756 "packuswb %%mm2, %%mm0\n\t"
757 "movq %%mm0, %0\n\t"
a822a479 758 :"+m"(*p)
de6d9b64
FB
759 :"m"(*pix), "m"(*(pix+line_size))
760 :"memory");
761 pix += line_size;
762 p += line_size ;
763 } while(--h);
de6d9b64
FB
764}
765
766static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
767{
768 UINT8 *p;
769 const UINT8 *pix;
770 p = block;
771 pix = pixels;
d6a4c0b1
ZK
772 MOVQ_ZERO(mm7);
773 MOVQ_WONE(mm6);
774 JUMPALIGN();
de6d9b64
FB
775 do {
776 __asm __volatile(
777 "movq %1, %%mm0\n\t"
778 "movq %2, %%mm1\n\t"
779 "movq 1%1, %%mm4\n\t"
780 "movq 1%2, %%mm5\n\t"
781 "movq %%mm0, %%mm2\n\t"
782 "movq %%mm1, %%mm3\n\t"
783 "punpcklbw %%mm7, %%mm0\n\t"
784 "punpcklbw %%mm7, %%mm1\n\t"
785 "punpckhbw %%mm7, %%mm2\n\t"
786 "punpckhbw %%mm7, %%mm3\n\t"
787 "paddusw %%mm1, %%mm0\n\t"
788 "paddusw %%mm3, %%mm2\n\t"
789 "movq %%mm4, %%mm1\n\t"
790 "movq %%mm5, %%mm3\n\t"
791 "punpcklbw %%mm7, %%mm4\n\t"
792 "punpcklbw %%mm7, %%mm5\n\t"
793 "punpckhbw %%mm7, %%mm1\n\t"
794 "punpckhbw %%mm7, %%mm3\n\t"
795 "paddusw %%mm5, %%mm4\n\t"
796 "paddusw %%mm3, %%mm1\n\t"
797 "paddusw %%mm6, %%mm4\n\t"
798 "paddusw %%mm6, %%mm1\n\t"
799 "paddusw %%mm4, %%mm0\n\t"
800 "paddusw %%mm1, %%mm2\n\t"
801 "movq %0, %%mm1\n\t"
802 "psrlw $2, %%mm0\n\t"
803 "movq %%mm1, %%mm3\n\t"
804 "psrlw $2, %%mm2\n\t"
805 "punpcklbw %%mm7, %%mm1\n\t"
806 "punpckhbw %%mm7, %%mm3\n\t"
807 "paddusw %%mm1, %%mm0\n\t"
808 "paddusw %%mm3, %%mm2\n\t"
809 "psrlw $1, %%mm0\n\t"
810 "psrlw $1, %%mm2\n\t"
811 "packuswb %%mm2, %%mm0\n\t"
812 "movq %%mm0, %0\n\t"
a822a479 813 :"+m"(*p)
de6d9b64
FB
814 :"m"(*pix),
815 "m"(*(pix+line_size))
816 :"memory");
817 pix += line_size;
818 p += line_size;
819 } while(--h);
de6d9b64
FB
820}
821
649c00c9
MN
822static void clear_blocks_mmx(DCTELEM *blocks)
823{
824 asm volatile(
825 "pxor %%mm7, %%mm7 \n\t"
826 "movl $-128*6, %%eax \n\t"
827 "1: \n\t"
828 "movq %%mm7, (%0, %%eax) \n\t"
829 "movq %%mm7, 8(%0, %%eax) \n\t"
830 "movq %%mm7, 16(%0, %%eax) \n\t"
831 "movq %%mm7, 24(%0, %%eax) \n\t"
832 "addl $32, %%eax \n\t"
833 " js 1b \n\t"
834 : : "r" (((int)blocks)+128*6)
835 : "%eax"
836 );
837}
838
#if 0
/* Empty stand-in used by the disabled speed-testing block in dsputil_init_mmx(). */
static void just_return()
{
}
#endif
d6a4c0b1 842
de6d9b64
FB
843void dsputil_init_mmx(void)
844{
845 mm_flags = mm_support();
f4470e09
MN
846#if 1
847 printf("libavcodec: CPU flags:");
de6d9b64
FB
848 if (mm_flags & MM_MMX)
849 printf(" mmx");
850 if (mm_flags & MM_MMXEXT)
851 printf(" mmxext");
852 if (mm_flags & MM_3DNOW)
853 printf(" 3dnow");
854 if (mm_flags & MM_SSE)
855 printf(" sse");
856 if (mm_flags & MM_SSE2)
857 printf(" sse2");
858 printf("\n");
859#endif
860
861 if (mm_flags & MM_MMX) {
862 get_pixels = get_pixels_mmx;
9dbcbd92 863 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
864 put_pixels_clamped = put_pixels_clamped_mmx;
865 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 866 clear_blocks= clear_blocks_mmx;
dcb9cd4b 867
ba6802de
MN
868 pix_abs16x16 = pix_abs16x16_mmx;
869 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
870 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 871 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
872 pix_abs8x8 = pix_abs8x8_mmx;
873 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
874 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
875 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
876 av_fdct = fdct_mmx;
877
878 put_pixels_tab[0] = put_pixels_mmx;
879 put_pixels_tab[1] = put_pixels_x2_mmx;
880 put_pixels_tab[2] = put_pixels_y2_mmx;
881 put_pixels_tab[3] = put_pixels_xy2_mmx;
882
883 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
884 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
885 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
886 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
dcb9cd4b 887
de6d9b64
FB
888 avg_pixels_tab[0] = avg_pixels_mmx;
889 avg_pixels_tab[1] = avg_pixels_x2_mmx;
890 avg_pixels_tab[2] = avg_pixels_y2_mmx;
891 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
892
893 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
894 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
895 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
896 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
607dce96 897
de6d9b64 898 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
899 pix_abs16x16 = pix_abs16x16_mmx2;
900 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
901 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
902 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 903
ba6802de
MN
904 pix_abs8x8 = pix_abs8x8_mmx2;
905 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
906 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
907 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96
MN
908
909 put_pixels_tab[1] = put_pixels_x2_mmx2;
910 put_pixels_tab[2] = put_pixels_y2_mmx2;
911 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
912 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
dcb9cd4b 913
607dce96
MN
914 avg_pixels_tab[0] = avg_pixels_mmx2;
915 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
916 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
917 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
de6d9b64
FB
918 } else if (mm_flags & MM_3DNOW) {
919 put_pixels_tab[1] = put_pixels_x2_3dnow;
920 put_pixels_tab[2] = put_pixels_y2_3dnow;
607dce96
MN
921 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
922 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
61a4e8ae 923
de6d9b64
FB
924 avg_pixels_tab[0] = avg_pixels_3dnow;
925 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
926 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
927 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
de6d9b64 928 }
4af7bcc1 929
8def0299
FB
930 /* idct */
931 if (mm_flags & MM_MMXEXT) {
932 ff_idct = ff_mmxext_idct;
933 } else {
934 ff_idct = ff_mmx_idct;
935 }
d962f6fd
A
936#ifdef SIMPLE_IDCT
937// ff_idct = simple_idct;
938 ff_idct = simple_idct_mmx;
939#endif
de6d9b64 940 }
d6a4c0b1
ZK
941
942#if 0
943 // for speed testing
944 get_pixels = just_return;
945 put_pixels_clamped = just_return;
946 add_pixels_clamped = just_return;
947
948 pix_abs16x16 = just_return;
949 pix_abs16x16_x2 = just_return;
950 pix_abs16x16_y2 = just_return;
951 pix_abs16x16_xy2 = just_return;
952
953 put_pixels_tab[0] = just_return;
954 put_pixels_tab[1] = just_return;
955 put_pixels_tab[2] = just_return;
956 put_pixels_tab[3] = just_return;
957
958 put_no_rnd_pixels_tab[0] = just_return;
959 put_no_rnd_pixels_tab[1] = just_return;
960 put_no_rnd_pixels_tab[2] = just_return;
961 put_no_rnd_pixels_tab[3] = just_return;
962
963 avg_pixels_tab[0] = just_return;
964 avg_pixels_tab[1] = just_return;
965 avg_pixels_tab[2] = just_return;
966 avg_pixels_tab[3] = just_return;
967
968 avg_no_rnd_pixels_tab[0] = just_return;
969 avg_no_rnd_pixels_tab[1] = just_return;
970 avg_no_rnd_pixels_tab[2] = just_return;
971 avg_no_rnd_pixels_tab[3] = just_return;
972
d6a4c0b1
ZK
973 //av_fdct = just_return;
974 //ff_idct = just_return;
975#endif
de6d9b64 976}
4f12a497
FB
977
978/* remove any non bit exact operation (testing purpose). NOTE that
979 this function should be kept as small as possible because it is
980 always difficult to test automatically non bit exact cases. */
981void dsputil_set_bit_exact_mmx(void)
982{
983 if (mm_flags & MM_MMX) {
984 if (mm_flags & MM_MMXEXT) {
985 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
986 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
987 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
988 } else if (mm_flags & MM_3DNOW) {
989 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
990 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
991 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
992 }
993 }
994}