* code with new PAVGB for MMX only CPU splited into separate file
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
25int mm_flags; /* multimedia extension flags */
26
ba6802de
MN
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
31
32int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
36
37int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
41
42int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
46
8def0299
FB
47/* external functions, from idct_mmx.c */
48void ff_mmx_idct(DCTELEM *block);
49void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 50
/* pixel operations */
/* 8-byte aligned 64-bit constants for the MMX code below.
 * mm_bfe and mm_bone are referenced only by symbol name from inside asm
 * strings (see MOVQ_BFE / MOVQ_BONE), which the compiler cannot see; they are
 * marked "used" so that they are not discarded as unreferenced statics.
 * mm_wone and mm_wtwo are passed as "m" operands and need no such marking. */
static const uint64_t mm_bfe __attribute__ ((used, aligned(8))) = 0xfefefefefefefefeULL;
static const uint64_t mm_bone __attribute__ ((used, aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };

/* emit an 8-byte alignment directive at the current position */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* clear MMX register regd (bare register name, e.g. mm7) */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * Constant loaders.
 * MOVQ_WONE / MOVQ_WTWO are complete asm statements and take a bare register
 * name (the "%%" prefix is added here).
 * MOVQ_BONE / MOVQ_BFE expand to string fragments meant to be pasted into a
 * larger asm template; the register argument is pasted as given.
 */
#ifndef PIC
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
#define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t"
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* all bits set, then each word shifted right by 15 -> 0x0001 per word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* as MOVQ_WONE, then each word shifted left by 1 -> 0x0002 per word */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)

/* 0x0001 per word, packed down to bytes -> 0x01 per byte */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"

/* 0xff + 0xff with byte wrap-around -> 0xfe per byte */
#define MOVQ_BFE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t"\
    "paddb " #regd ", " #regd " \n\t"
#endif

/*
 * Byte-wise average of two MMX registers using plain MMX instructions.
 * regr receives the result, rega is left unmodified and regb is trashed.
 * mm7 is supposed to contain 0xfefefefefefefefe
 *
 * PAVGB_MMX_NO_RND: (a & b) + (((a ^ b) & 0xfe) >> 1), the average rounded down
 * PAVGB_MMX:        (a | b) - (((a ^ b) & 0xfe) >> 1), the average rounded up
 * The 0xfe mask keeps the 64-bit shift from moving a bit into the
 * neighbouring byte.
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm7, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm7, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

91abb473
ZK
110/***********************************/
111/* MMX no rounding */
112#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
113
114#define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6)
115#define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c)
116#include "dsputil_mmx_rnd.h"
117
118#undef DEF
119#undef PAVGB
120#undef PAVGBR
121/***********************************/
122/* MMX rounding */
123
124#define DEF(x, y) x ## _ ## y ##_mmx
125
126#define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6)
127#define PAVGBR(a, b, c) PAVGB_MMX(a, b, c)
128#include "dsputil_mmx_rnd.h"
129
130#undef DEF
131#undef PAVGB
132#undef PAVGBR
a7bd8797 133
de6d9b64
FB
134/***********************************/
135/* 3Dnow specific */
136
137#define DEF(x) x ## _3dnow
138/* for Athlons PAVGUSB is prefered */
139#define PAVGB "pavgusb"
140
141#include "dsputil_mmx_avg.h"
142
143#undef DEF
144#undef PAVGB
145
146/***********************************/
147/* MMX2 specific */
148
607dce96 149#define DEF(x) x ## _mmx2
de6d9b64
FB
150
151/* Introduced only in MMX2 set */
152#define PAVGB "pavgb"
153
154#include "dsputil_mmx_avg.h"
155
156#undef DEF
157#undef PAVGB
158
159/***********************************/
160/* standard MMX */
161
162static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
163{
607dce96
MN
164 asm volatile(
165 "movl $-128, %%eax \n\t"
166 "pxor %%mm7, %%mm7 \n\t"
167 ".balign 16 \n\t"
168 "1: \n\t"
169 "movq (%0), %%mm0 \n\t"
170 "movq (%0, %2), %%mm2 \n\t"
171 "movq %%mm0, %%mm1 \n\t"
172 "movq %%mm2, %%mm3 \n\t"
173 "punpcklbw %%mm7, %%mm0 \n\t"
174 "punpckhbw %%mm7, %%mm1 \n\t"
175 "punpcklbw %%mm7, %%mm2 \n\t"
176 "punpckhbw %%mm7, %%mm3 \n\t"
177 "movq %%mm0, (%1, %%eax)\n\t"
178 "movq %%mm1, 8(%1, %%eax)\n\t"
179 "movq %%mm2, 16(%1, %%eax)\n\t"
180 "movq %%mm3, 24(%1, %%eax)\n\t"
181 "addl %3, %0 \n\t"
182 "addl $32, %%eax \n\t"
183 "js 1b \n\t"
184 : "+r" (pixels)
185 : "r" (block+64), "r" (line_size), "r" (line_size*2)
186 : "%eax"
187 );
de6d9b64
FB
188}
189
9dbcbd92
MN
190static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
191{
192 asm volatile(
607dce96 193 "pxor %%mm7, %%mm7 \n\t"
9dbcbd92 194 "movl $-128, %%eax \n\t"
607dce96 195 ".balign 16 \n\t"
9dbcbd92
MN
196 "1: \n\t"
197 "movq (%0), %%mm0 \n\t"
198 "movq (%1), %%mm2 \n\t"
199 "movq %%mm0, %%mm1 \n\t"
200 "movq %%mm2, %%mm3 \n\t"
201 "punpcklbw %%mm7, %%mm0 \n\t"
202 "punpckhbw %%mm7, %%mm1 \n\t"
203 "punpcklbw %%mm7, %%mm2 \n\t"
204 "punpckhbw %%mm7, %%mm3 \n\t"
205 "psubw %%mm2, %%mm0 \n\t"
206 "psubw %%mm3, %%mm1 \n\t"
207 "movq %%mm0, (%2, %%eax)\n\t"
208 "movq %%mm1, 8(%2, %%eax)\n\t"
209 "addl %3, %0 \n\t"
210 "addl %3, %1 \n\t"
211 "addl $16, %%eax \n\t"
212 "jnz 1b \n\t"
213 : "+r" (s1), "+r" (s2)
214 : "r" (block+64), "r" (stride)
215 : "%eax"
216 );
217}
218
de6d9b64
FB
219static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
220{
221 const DCTELEM *p;
222 UINT8 *pix;
de6d9b64
FB
223
224 /* read the pixels */
225 p = block;
226 pix = pixels;
d6a4c0b1 227 /* unrolled loop */
de6d9b64 228 __asm __volatile(
a822a479
NK
229 "movq %3, %%mm0\n\t"
230 "movq 8%3, %%mm1\n\t"
231 "movq 16%3, %%mm2\n\t"
232 "movq 24%3, %%mm3\n\t"
233 "movq 32%3, %%mm4\n\t"
234 "movq 40%3, %%mm5\n\t"
235 "movq 48%3, %%mm6\n\t"
236 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
237 "packuswb %%mm1, %%mm0\n\t"
238 "packuswb %%mm3, %%mm2\n\t"
239 "packuswb %%mm5, %%mm4\n\t"
240 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
241 "movq %%mm0, (%0)\n\t"
242 "movq %%mm2, (%0, %1)\n\t"
243 "movq %%mm4, (%0, %1, 2)\n\t"
244 "movq %%mm6, (%0, %2)\n\t"
245 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
246 :"memory");
247 pix += line_size*4;
248 p += 32;
d6a4c0b1
ZK
249
250 // if here would be an exact copy of the code above
251 // compiler would generate some very strange code
252 // thus using "r"
253 __asm __volatile(
254 "movq (%3), %%mm0\n\t"
255 "movq 8(%3), %%mm1\n\t"
256 "movq 16(%3), %%mm2\n\t"
257 "movq 24(%3), %%mm3\n\t"
258 "movq 32(%3), %%mm4\n\t"
259 "movq 40(%3), %%mm5\n\t"
260 "movq 48(%3), %%mm6\n\t"
261 "movq 56(%3), %%mm7\n\t"
262 "packuswb %%mm1, %%mm0\n\t"
263 "packuswb %%mm3, %%mm2\n\t"
264 "packuswb %%mm5, %%mm4\n\t"
265 "packuswb %%mm7, %%mm6\n\t"
266 "movq %%mm0, (%0)\n\t"
267 "movq %%mm2, (%0, %1)\n\t"
268 "movq %%mm4, (%0, %1, 2)\n\t"
269 "movq %%mm6, (%0, %2)\n\t"
270 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
271 :"memory");
de6d9b64
FB
272}
273
274static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
275{
276 const DCTELEM *p;
277 UINT8 *pix;
278 int i;
279
280 /* read the pixels */
281 p = block;
282 pix = pixels;
d6a4c0b1
ZK
283 MOVQ_ZERO(mm7);
284 i = 4;
cd8e5f96 285 do {
de6d9b64 286 __asm __volatile(
cd8e5f96
ZK
287 "movq (%2), %%mm0\n\t"
288 "movq 8(%2), %%mm1\n\t"
289 "movq 16(%2), %%mm2\n\t"
290 "movq 24(%2), %%mm3\n\t"
de6d9b64
FB
291 "movq %0, %%mm4\n\t"
292 "movq %1, %%mm6\n\t"
293 "movq %%mm4, %%mm5\n\t"
294 "punpcklbw %%mm7, %%mm4\n\t"
295 "punpckhbw %%mm7, %%mm5\n\t"
296 "paddsw %%mm4, %%mm0\n\t"
297 "paddsw %%mm5, %%mm1\n\t"
298 "movq %%mm6, %%mm5\n\t"
299 "punpcklbw %%mm7, %%mm6\n\t"
300 "punpckhbw %%mm7, %%mm5\n\t"
301 "paddsw %%mm6, %%mm2\n\t"
302 "paddsw %%mm5, %%mm3\n\t"
303 "packuswb %%mm1, %%mm0\n\t"
304 "packuswb %%mm3, %%mm2\n\t"
305 "movq %%mm0, %0\n\t"
306 "movq %%mm2, %1\n\t"
a822a479 307 :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 308 :"r"(p)
de6d9b64
FB
309 :"memory");
310 pix += line_size*2;
311 p += 16;
cd8e5f96 312 } while (--i);
de6d9b64
FB
313}
314
315static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
316{
31ddcf98
ZK
317 asm volatile
318 (
319 "lea (%3, %3), %%eax \n\t"
52af45ad 320 ".balign 8 \n\t"
31ddcf98
ZK
321 "1: \n\t"
322 "movq (%1), %%mm0 \n\t"
323 "movq (%1, %3), %%mm1 \n\t"
324 "movq %%mm0, (%2) \n\t"
325 "movq %%mm1, (%2, %3) \n\t"
326 "addl %%eax, %1 \n\t"
327 "addl %%eax, %2 \n\t"
328 "movq (%1), %%mm0 \n\t"
329 "movq (%1, %3), %%mm1 \n\t"
330 "movq %%mm0, (%2) \n\t"
331 "movq %%mm1, (%2, %3) \n\t"
332 "addl %%eax, %1 \n\t"
333 "addl %%eax, %2 \n\t"
334 "subl $4, %0 \n\t"
335 "jnz 1b \n\t"
336 : "+g"(h), "+r" (pixels), "+r" (block)
337 : "r"(line_size)
338 : "%eax", "memory"
339 );
de6d9b64
FB
340}
341
/* Disabled word-arithmetic versions of the xy2 "put" functions.
 * NOTE(review): dsputil_init_mmx() still installs functions with these names,
 * so they are presumably generated by the dsputil_mmx_rnd.h template now;
 * confirm before deleting this block. */
#if 0
/* block = (a + b + c + d + 2) >> 2 for the four neighbours
   pix[x], pix[x+1], pix[x+line_size], pix[x+line_size+1] */
static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels; // 1s
    MOVQ_ZERO(mm7);
    MOVQ_WTWO(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"
            "movq %2, %%mm1\n\t"
            "movq 1%1, %%mm4\n\t"
            "movq 1%2, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}

/* same as above but with a rounding term of 1 instead of 2 */
static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"
            "movq %2, %%mm1\n\t"
            "movq 1%1, %%mm4\n\t"
            "movq 1%2, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
#endif
de6d9b64
FB
439static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
440{
441 UINT8 *p;
442 const UINT8 *pix;
443 p = block;
444 pix = pixels;
d6a4c0b1
ZK
445 MOVQ_ZERO(mm7);
446 MOVQ_WONE(mm6);
447 JUMPALIGN();
de6d9b64
FB
448 do {
449 __asm __volatile(
450 "movq %0, %%mm0\n\t"
451 "movq %1, %%mm1\n\t"
452 "movq %%mm0, %%mm2\n\t"
453 "movq %%mm1, %%mm3\n\t"
454 "punpcklbw %%mm7, %%mm0\n\t"
455 "punpcklbw %%mm7, %%mm1\n\t"
456 "punpckhbw %%mm7, %%mm2\n\t"
457 "punpckhbw %%mm7, %%mm3\n\t"
458 "paddusw %%mm1, %%mm0\n\t"
459 "paddusw %%mm3, %%mm2\n\t"
460 "paddusw %%mm6, %%mm0\n\t"
461 "paddusw %%mm6, %%mm2\n\t"
462 "psrlw $1, %%mm0\n\t"
463 "psrlw $1, %%mm2\n\t"
464 "packuswb %%mm2, %%mm0\n\t"
465 "movq %%mm0, %0\n\t"
a822a479 466 :"+m"(*p)
de6d9b64
FB
467 :"m"(*pix)
468 :"memory");
469 pix += line_size;
470 p += line_size;
471 }
472 while (--h);
de6d9b64
FB
473}
474
475static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
476{
477 UINT8 *p;
478 const UINT8 *pix;
479 p = block;
480 pix = pixels;
d6a4c0b1
ZK
481 MOVQ_ZERO(mm7);
482 MOVQ_WONE(mm6);
483 JUMPALIGN();
de6d9b64
FB
484 do {
485 __asm __volatile(
486 "movq %1, %%mm1\n\t"
487 "movq %0, %%mm0\n\t"
488 "movq 1%1, %%mm4\n\t"
489 "movq %%mm0, %%mm2\n\t"
490 "movq %%mm1, %%mm3\n\t"
491 "movq %%mm4, %%mm5\n\t"
492 "punpcklbw %%mm7, %%mm1\n\t"
493 "punpckhbw %%mm7, %%mm3\n\t"
494 "punpcklbw %%mm7, %%mm4\n\t"
495 "punpckhbw %%mm7, %%mm5\n\t"
496 "punpcklbw %%mm7, %%mm0\n\t"
497 "punpckhbw %%mm7, %%mm2\n\t"
498 "paddusw %%mm4, %%mm1\n\t"
499 "paddusw %%mm5, %%mm3\n\t"
500 "paddusw %%mm6, %%mm1\n\t"
501 "paddusw %%mm6, %%mm3\n\t"
502 "psrlw $1, %%mm1\n\t"
503 "psrlw $1, %%mm3\n\t"
504 "paddusw %%mm6, %%mm0\n\t"
505 "paddusw %%mm6, %%mm2\n\t"
506 "paddusw %%mm1, %%mm0\n\t"
507 "paddusw %%mm3, %%mm2\n\t"
508 "psrlw $1, %%mm0\n\t"
509 "psrlw $1, %%mm2\n\t"
510 "packuswb %%mm2, %%mm0\n\t"
511 "movq %%mm0, %0\n\t"
a822a479 512 :"+m"(*p)
de6d9b64
FB
513 :"m"(*pix)
514 :"memory");
515 pix += line_size;
516 p += line_size;
517 } while (--h);
de6d9b64
FB
518}
519
520static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
521{
522 UINT8 *p;
523 const UINT8 *pix;
524 p = block;
525 pix = pixels;
d6a4c0b1
ZK
526 MOVQ_ZERO(mm7);
527 MOVQ_WONE(mm6);
528 JUMPALIGN();
de6d9b64
FB
529 do {
530 __asm __volatile(
531 "movq %1, %%mm1\n\t"
532 "movq %0, %%mm0\n\t"
533 "movq %2, %%mm4\n\t"
534 "movq %%mm0, %%mm2\n\t"
535 "movq %%mm1, %%mm3\n\t"
536 "movq %%mm4, %%mm5\n\t"
537 "punpcklbw %%mm7, %%mm1\n\t"
538 "punpckhbw %%mm7, %%mm3\n\t"
539 "punpcklbw %%mm7, %%mm4\n\t"
540 "punpckhbw %%mm7, %%mm5\n\t"
541 "punpcklbw %%mm7, %%mm0\n\t"
542 "punpckhbw %%mm7, %%mm2\n\t"
543 "paddusw %%mm4, %%mm1\n\t"
544 "paddusw %%mm5, %%mm3\n\t"
545 "paddusw %%mm6, %%mm1\n\t"
546 "paddusw %%mm6, %%mm3\n\t"
547 "psrlw $1, %%mm1\n\t"
548 "psrlw $1, %%mm3\n\t"
549 "paddusw %%mm6, %%mm0\n\t"
550 "paddusw %%mm6, %%mm2\n\t"
551 "paddusw %%mm1, %%mm0\n\t"
552 "paddusw %%mm3, %%mm2\n\t"
553 "psrlw $1, %%mm0\n\t"
554 "psrlw $1, %%mm2\n\t"
555 "packuswb %%mm2, %%mm0\n\t"
556 "movq %%mm0, %0\n\t"
a822a479 557 :"+m"(*p)
de6d9b64
FB
558 :"m"(*pix), "m"(*(pix+line_size))
559 :"memory");
560 pix += line_size;
561 p += line_size ;
562 } while(--h);
de6d9b64
FB
563}
564
565static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
566{
567 UINT8 *p;
568 const UINT8 *pix;
569 p = block;
570 pix = pixels;
d6a4c0b1
ZK
571 MOVQ_ZERO(mm7);
572 // this doesn't seem to be used offten - so
573 // the inside usage of mm_wone is not optimized
574 MOVQ_WTWO(mm6);
de6d9b64
FB
575 do {
576 __asm __volatile(
577 "movq %1, %%mm0\n\t"
578 "movq %2, %%mm1\n\t"
579 "movq 1%1, %%mm4\n\t"
580 "movq 1%2, %%mm5\n\t"
581 "movq %%mm0, %%mm2\n\t"
582 "movq %%mm1, %%mm3\n\t"
583 "punpcklbw %%mm7, %%mm0\n\t"
584 "punpcklbw %%mm7, %%mm1\n\t"
585 "punpckhbw %%mm7, %%mm2\n\t"
586 "punpckhbw %%mm7, %%mm3\n\t"
587 "paddusw %%mm1, %%mm0\n\t"
588 "paddusw %%mm3, %%mm2\n\t"
589 "movq %%mm4, %%mm1\n\t"
590 "movq %%mm5, %%mm3\n\t"
591 "punpcklbw %%mm7, %%mm4\n\t"
592 "punpcklbw %%mm7, %%mm5\n\t"
593 "punpckhbw %%mm7, %%mm1\n\t"
594 "punpckhbw %%mm7, %%mm3\n\t"
595 "paddusw %%mm5, %%mm4\n\t"
596 "paddusw %%mm3, %%mm1\n\t"
597 "paddusw %%mm6, %%mm4\n\t"
598 "paddusw %%mm6, %%mm1\n\t"
599 "paddusw %%mm4, %%mm0\n\t"
600 "paddusw %%mm1, %%mm2\n\t"
601 "movq %3, %%mm5\n\t"
602 "psrlw $2, %%mm0\n\t"
603 "movq %0, %%mm1\n\t"
604 "psrlw $2, %%mm2\n\t"
605 "movq %%mm1, %%mm3\n\t"
606 "punpcklbw %%mm7, %%mm1\n\t"
607 "punpckhbw %%mm7, %%mm3\n\t"
608 "paddusw %%mm1, %%mm0\n\t"
609 "paddusw %%mm3, %%mm2\n\t"
610 "paddusw %%mm5, %%mm0\n\t"
611 "paddusw %%mm5, %%mm2\n\t"
612 "psrlw $1, %%mm0\n\t"
613 "psrlw $1, %%mm2\n\t"
614 "packuswb %%mm2, %%mm0\n\t"
615 "movq %%mm0, %0\n\t"
a822a479 616 :"+m"(*p)
de6d9b64 617 :"m"(*pix),
a9b3f630 618 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
619 :"memory");
620 pix += line_size;
621 p += line_size ;
622 } while(--h);
de6d9b64
FB
623}
624
625static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
626{
627 UINT8 *p;
628 const UINT8 *pix;
629 p = block;
630 pix = pixels;
d6a4c0b1 631 MOVQ_ZERO(mm7);
de6d9b64
FB
632 do {
633 __asm __volatile(
634 "movq %1, %%mm0\n\t"
635 "movq %0, %%mm1\n\t"
636 "movq %%mm0, %%mm2\n\t"
637 "movq %%mm1, %%mm3\n\t"
638 "punpcklbw %%mm7, %%mm0\n\t"
639 "punpcklbw %%mm7, %%mm1\n\t"
640 "punpckhbw %%mm7, %%mm2\n\t"
641 "punpckhbw %%mm7, %%mm3\n\t"
642 "paddusw %%mm1, %%mm0\n\t"
643 "paddusw %%mm3, %%mm2\n\t"
644 "psrlw $1, %%mm0\n\t"
645 "psrlw $1, %%mm2\n\t"
646 "packuswb %%mm2, %%mm0\n\t"
647 "movq %%mm0, %0\n\t"
a822a479 648 :"+m"(*p)
de6d9b64
FB
649 :"m"(*pix)
650 :"memory");
651 pix += line_size;
652 p += line_size ;
653 } while (--h);
de6d9b64
FB
654}
655
656static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
657{
658 UINT8 *p;
659 const UINT8 *pix;
660 p = block;
661 pix = pixels;
d6a4c0b1 662 MOVQ_ZERO(mm7);
de6d9b64
FB
663 do {
664 __asm __volatile(
665 "movq %1, %%mm0\n\t"
666 "movq 1%1, %%mm1\n\t"
667 "movq %0, %%mm4\n\t"
668 "movq %%mm0, %%mm2\n\t"
669 "movq %%mm1, %%mm3\n\t"
670 "movq %%mm4, %%mm5\n\t"
671 "punpcklbw %%mm7, %%mm0\n\t"
672 "punpcklbw %%mm7, %%mm1\n\t"
673 "punpckhbw %%mm7, %%mm2\n\t"
674 "punpckhbw %%mm7, %%mm3\n\t"
675 "punpcklbw %%mm7, %%mm4\n\t"
676 "punpckhbw %%mm7, %%mm5\n\t"
677 "paddusw %%mm1, %%mm0\n\t"
678 "paddusw %%mm3, %%mm2\n\t"
679 "psrlw $1, %%mm0\n\t"
680 "psrlw $1, %%mm2\n\t"
681 "paddusw %%mm4, %%mm0\n\t"
682 "paddusw %%mm5, %%mm2\n\t"
683 "psrlw $1, %%mm0\n\t"
684 "psrlw $1, %%mm2\n\t"
685 "packuswb %%mm2, %%mm0\n\t"
686 "movq %%mm0, %0\n\t"
a822a479 687 :"+m"(*p)
de6d9b64
FB
688 :"m"(*pix)
689 :"memory");
690 pix += line_size;
691 p += line_size;
692 } while (--h);
de6d9b64
FB
693}
694
695static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
696{
697 UINT8 *p;
698 const UINT8 *pix;
699 p = block;
700 pix = pixels;
d6a4c0b1 701 MOVQ_ZERO(mm7);
de6d9b64
FB
702 do {
703 __asm __volatile(
704 "movq %1, %%mm0\n\t"
705 "movq %2, %%mm1\n\t"
706 "movq %0, %%mm4\n\t"
707 "movq %%mm0, %%mm2\n\t"
708 "movq %%mm1, %%mm3\n\t"
709 "movq %%mm4, %%mm5\n\t"
710 "punpcklbw %%mm7, %%mm0\n\t"
711 "punpcklbw %%mm7, %%mm1\n\t"
712 "punpckhbw %%mm7, %%mm2\n\t"
713 "punpckhbw %%mm7, %%mm3\n\t"
714 "punpcklbw %%mm7, %%mm4\n\t"
715 "punpckhbw %%mm7, %%mm5\n\t"
716 "paddusw %%mm1, %%mm0\n\t"
717 "paddusw %%mm3, %%mm2\n\t"
718 "psrlw $1, %%mm0\n\t"
719 "psrlw $1, %%mm2\n\t"
720 "paddusw %%mm4, %%mm0\n\t"
721 "paddusw %%mm5, %%mm2\n\t"
722 "psrlw $1, %%mm0\n\t"
723 "psrlw $1, %%mm2\n\t"
724 "packuswb %%mm2, %%mm0\n\t"
725 "movq %%mm0, %0\n\t"
a822a479 726 :"+m"(*p)
de6d9b64
FB
727 :"m"(*pix), "m"(*(pix+line_size))
728 :"memory");
729 pix += line_size;
730 p += line_size ;
731 } while(--h);
de6d9b64
FB
732}
733
734static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
735{
736 UINT8 *p;
737 const UINT8 *pix;
738 p = block;
739 pix = pixels;
d6a4c0b1
ZK
740 MOVQ_ZERO(mm7);
741 MOVQ_WONE(mm6);
742 JUMPALIGN();
de6d9b64
FB
743 do {
744 __asm __volatile(
745 "movq %1, %%mm0\n\t"
746 "movq %2, %%mm1\n\t"
747 "movq 1%1, %%mm4\n\t"
748 "movq 1%2, %%mm5\n\t"
749 "movq %%mm0, %%mm2\n\t"
750 "movq %%mm1, %%mm3\n\t"
751 "punpcklbw %%mm7, %%mm0\n\t"
752 "punpcklbw %%mm7, %%mm1\n\t"
753 "punpckhbw %%mm7, %%mm2\n\t"
754 "punpckhbw %%mm7, %%mm3\n\t"
755 "paddusw %%mm1, %%mm0\n\t"
756 "paddusw %%mm3, %%mm2\n\t"
757 "movq %%mm4, %%mm1\n\t"
758 "movq %%mm5, %%mm3\n\t"
759 "punpcklbw %%mm7, %%mm4\n\t"
760 "punpcklbw %%mm7, %%mm5\n\t"
761 "punpckhbw %%mm7, %%mm1\n\t"
762 "punpckhbw %%mm7, %%mm3\n\t"
763 "paddusw %%mm5, %%mm4\n\t"
764 "paddusw %%mm3, %%mm1\n\t"
765 "paddusw %%mm6, %%mm4\n\t"
766 "paddusw %%mm6, %%mm1\n\t"
767 "paddusw %%mm4, %%mm0\n\t"
768 "paddusw %%mm1, %%mm2\n\t"
769 "movq %0, %%mm1\n\t"
770 "psrlw $2, %%mm0\n\t"
771 "movq %%mm1, %%mm3\n\t"
772 "psrlw $2, %%mm2\n\t"
773 "punpcklbw %%mm7, %%mm1\n\t"
774 "punpckhbw %%mm7, %%mm3\n\t"
775 "paddusw %%mm1, %%mm0\n\t"
776 "paddusw %%mm3, %%mm2\n\t"
777 "psrlw $1, %%mm0\n\t"
778 "psrlw $1, %%mm2\n\t"
779 "packuswb %%mm2, %%mm0\n\t"
780 "movq %%mm0, %0\n\t"
a822a479 781 :"+m"(*p)
de6d9b64
FB
782 :"m"(*pix),
783 "m"(*(pix+line_size))
784 :"memory");
785 pix += line_size;
786 p += line_size;
787 } while(--h);
de6d9b64
FB
788}
789
649c00c9
MN
790static void clear_blocks_mmx(DCTELEM *blocks)
791{
792 asm volatile(
793 "pxor %%mm7, %%mm7 \n\t"
794 "movl $-128*6, %%eax \n\t"
795 "1: \n\t"
796 "movq %%mm7, (%0, %%eax) \n\t"
797 "movq %%mm7, 8(%0, %%eax) \n\t"
798 "movq %%mm7, 16(%0, %%eax) \n\t"
799 "movq %%mm7, 24(%0, %%eax) \n\t"
800 "addl $32, %%eax \n\t"
801 " js 1b \n\t"
802 : : "r" (((int)blocks)+128*6)
803 : "%eax"
804 );
805}
806
#if 0
/* no-op stand-in for the disabled "for speed testing" assignments at the
   end of dsputil_init_mmx() */
static void just_return() { return; }
#endif

de6d9b64
FB
811void dsputil_init_mmx(void)
812{
813 mm_flags = mm_support();
f4470e09
MN
814#if 1
815 printf("libavcodec: CPU flags:");
de6d9b64
FB
816 if (mm_flags & MM_MMX)
817 printf(" mmx");
818 if (mm_flags & MM_MMXEXT)
819 printf(" mmxext");
820 if (mm_flags & MM_3DNOW)
821 printf(" 3dnow");
822 if (mm_flags & MM_SSE)
823 printf(" sse");
824 if (mm_flags & MM_SSE2)
825 printf(" sse2");
826 printf("\n");
827#endif
828
829 if (mm_flags & MM_MMX) {
830 get_pixels = get_pixels_mmx;
9dbcbd92 831 diff_pixels = diff_pixels_mmx;
de6d9b64
FB
832 put_pixels_clamped = put_pixels_clamped_mmx;
833 add_pixels_clamped = add_pixels_clamped_mmx;
649c00c9 834 clear_blocks= clear_blocks_mmx;
dcb9cd4b 835
ba6802de
MN
836 pix_abs16x16 = pix_abs16x16_mmx;
837 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
838 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
de6d9b64 839 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
ba6802de
MN
840 pix_abs8x8 = pix_abs8x8_mmx;
841 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
842 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
843 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
de6d9b64
FB
844 av_fdct = fdct_mmx;
845
846 put_pixels_tab[0] = put_pixels_mmx;
847 put_pixels_tab[1] = put_pixels_x2_mmx;
848 put_pixels_tab[2] = put_pixels_y2_mmx;
849 put_pixels_tab[3] = put_pixels_xy2_mmx;
850
851 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
852 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
853 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
854 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
dcb9cd4b 855
de6d9b64
FB
856 avg_pixels_tab[0] = avg_pixels_mmx;
857 avg_pixels_tab[1] = avg_pixels_x2_mmx;
858 avg_pixels_tab[2] = avg_pixels_y2_mmx;
859 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
860
861 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
862 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
863 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
864 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
607dce96 865
de6d9b64 866 if (mm_flags & MM_MMXEXT) {
ba6802de
MN
867 pix_abs16x16 = pix_abs16x16_mmx2;
868 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
869 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
870 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
dcb9cd4b 871
ba6802de
MN
872 pix_abs8x8 = pix_abs8x8_mmx2;
873 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
874 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
875 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
607dce96
MN
876
877 put_pixels_tab[1] = put_pixels_x2_mmx2;
878 put_pixels_tab[2] = put_pixels_y2_mmx2;
879 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
880 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
dcb9cd4b 881
607dce96
MN
882 avg_pixels_tab[0] = avg_pixels_mmx2;
883 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
884 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
885 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
de6d9b64
FB
886 } else if (mm_flags & MM_3DNOW) {
887 put_pixels_tab[1] = put_pixels_x2_3dnow;
888 put_pixels_tab[2] = put_pixels_y2_3dnow;
607dce96
MN
889 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
890 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
61a4e8ae 891
de6d9b64
FB
892 avg_pixels_tab[0] = avg_pixels_3dnow;
893 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
894 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
895 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
de6d9b64 896 }
4af7bcc1 897
8def0299
FB
898 /* idct */
899 if (mm_flags & MM_MMXEXT) {
900 ff_idct = ff_mmxext_idct;
901 } else {
902 ff_idct = ff_mmx_idct;
903 }
d962f6fd
A
904#ifdef SIMPLE_IDCT
905// ff_idct = simple_idct;
906 ff_idct = simple_idct_mmx;
907#endif
de6d9b64 908 }
d6a4c0b1
ZK
909
910#if 0
911 // for speed testing
912 get_pixels = just_return;
913 put_pixels_clamped = just_return;
914 add_pixels_clamped = just_return;
915
916 pix_abs16x16 = just_return;
917 pix_abs16x16_x2 = just_return;
918 pix_abs16x16_y2 = just_return;
919 pix_abs16x16_xy2 = just_return;
920
921 put_pixels_tab[0] = just_return;
922 put_pixels_tab[1] = just_return;
923 put_pixels_tab[2] = just_return;
924 put_pixels_tab[3] = just_return;
925
926 put_no_rnd_pixels_tab[0] = just_return;
927 put_no_rnd_pixels_tab[1] = just_return;
928 put_no_rnd_pixels_tab[2] = just_return;
929 put_no_rnd_pixels_tab[3] = just_return;
930
931 avg_pixels_tab[0] = just_return;
932 avg_pixels_tab[1] = just_return;
933 avg_pixels_tab[2] = just_return;
934 avg_pixels_tab[3] = just_return;
935
936 avg_no_rnd_pixels_tab[0] = just_return;
937 avg_no_rnd_pixels_tab[1] = just_return;
938 avg_no_rnd_pixels_tab[2] = just_return;
939 avg_no_rnd_pixels_tab[3] = just_return;
940
d6a4c0b1
ZK
941 //av_fdct = just_return;
942 //ff_idct = just_return;
943#endif
de6d9b64 944}
4f12a497
FB
945
946/* remove any non bit exact operation (testing purpose). NOTE that
947 this function should be kept as small as possible because it is
948 always difficult to test automatically non bit exact cases. */
949void dsputil_set_bit_exact_mmx(void)
950{
951 if (mm_flags & MM_MMX) {
952 if (mm_flags & MM_MMXEXT) {
953 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
954 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
955 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
956 } else if (mm_flags & MM_3DNOW) {
957 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
958 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
959 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
960 }
961 }
962}