* simplified indexing
[libav.git] / libavcodec / i386 / dsputil_mmx.c
CommitLineData
de6d9b64
FB
1/*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
22#include "../dsputil.h"
d962f6fd 23#include "../simple_idct.h"
de6d9b64 24
7d650cb5
FB
/* CPU multimedia-extension capability bits; assigned by dsputil_init_mmx() */
int mm_flags;
26
de6d9b64
FB
27int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
28int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
29int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
30int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
31int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
32
8def0299
FB
33/* external functions, from idct_mmx.c */
34void ff_mmx_idct(DCTELEM *block);
35void ff_mmxext_idct(DCTELEM *block);
4af7bcc1 36
de6d9b64 37/* pixel operations */
a9b3f630
NK
/*
 * Packed rounding constants loaded into an MMX register with movq:
 *   mm_wone = four 16-bit words each holding 1
 *   mm_wtwo = four 16-bit words each holding 2
 * (the same bit patterns as unsigned short[4] tables {1,1,1,1} / {2,2,2,2}).
 *
 * The ULL suffix gives the literals an explicit 64-bit type, so they do not
 * depend on how the compiler types an unsuffixed constant wider than `long`.
 * The 8-byte alignment keeps each constant on a natural 64-bit boundary.
 */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64
FB
42
43/***********************************/
44/* 3Dnow specific */
45
46#define DEF(x) x ## _3dnow
47/* for Athlons PAVGUSB is prefered */
48#define PAVGB "pavgusb"
49
50#include "dsputil_mmx_avg.h"
51
52#undef DEF
53#undef PAVGB
54
55/***********************************/
56/* MMX2 specific */
57
58#define DEF(x) x ## _sse
59
60/* Introduced only in MMX2 set */
61#define PAVGB "pavgb"
62
63#include "dsputil_mmx_avg.h"
64
65#undef DEF
66#undef PAVGB
67
68/***********************************/
69/* standard MMX */
70
71static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
72{
73 DCTELEM *p;
74 const UINT8 *pix;
75 int i;
76
77 /* read the pixels */
78 p = block;
79 pix = pixels;
a822a479 80 __asm __volatile("pxor %%mm7, %%mm7":);
de6d9b64
FB
81 for(i=0;i<4;i++) {
82 __asm __volatile(
83 "movq %1, %%mm0\n\t"
84 "movq %2, %%mm1\n\t"
85 "movq %%mm0, %%mm2\n\t"
86 "movq %%mm1, %%mm3\n\t"
87 "punpcklbw %%mm7, %%mm0\n\t"
88 "punpckhbw %%mm7, %%mm2\n\t"
89 "punpcklbw %%mm7, %%mm1\n\t"
90 "punpckhbw %%mm7, %%mm3\n\t"
91 "movq %%mm0, %0\n\t"
92 "movq %%mm2, 8%0\n\t"
93 "movq %%mm1, 16%0\n\t"
94 "movq %%mm3, 24%0\n\t"
95 :"=m"(*p)
96 :"m"(*pix), "m"(*(pix+line_size))
97 :"memory");
98 pix += line_size*2;
99 p += 16;
100 }
de6d9b64
FB
101}
102
103static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
104{
105 const DCTELEM *p;
106 UINT8 *pix;
107 int i;
108
109 /* read the pixels */
110 p = block;
111 pix = pixels;
112 for(i=0;i<2;i++) {
113 __asm __volatile(
a822a479
NK
114 "movq %3, %%mm0\n\t"
115 "movq 8%3, %%mm1\n\t"
116 "movq 16%3, %%mm2\n\t"
117 "movq 24%3, %%mm3\n\t"
118 "movq 32%3, %%mm4\n\t"
119 "movq 40%3, %%mm5\n\t"
120 "movq 48%3, %%mm6\n\t"
121 "movq 56%3, %%mm7\n\t"
de6d9b64
FB
122 "packuswb %%mm1, %%mm0\n\t"
123 "packuswb %%mm3, %%mm2\n\t"
124 "packuswb %%mm5, %%mm4\n\t"
125 "packuswb %%mm7, %%mm6\n\t"
a822a479
NK
126 "movq %%mm0, (%0)\n\t"
127 "movq %%mm2, (%0, %1)\n\t"
128 "movq %%mm4, (%0, %1, 2)\n\t"
129 "movq %%mm6, (%0, %2)\n\t"
130 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
de6d9b64
FB
131 :"memory");
132 pix += line_size*4;
133 p += 32;
134 }
de6d9b64
FB
135}
136
137static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
138{
139 const DCTELEM *p;
140 UINT8 *pix;
141 int i;
142
143 /* read the pixels */
144 p = block;
145 pix = pixels;
a822a479 146 __asm __volatile("pxor %%mm7, %%mm7":);
de6d9b64
FB
147 for(i=0;i<4;i++) {
148 __asm __volatile(
149 "movq %2, %%mm0\n\t"
150 "movq 8%2, %%mm1\n\t"
151 "movq 16%2, %%mm2\n\t"
152 "movq 24%2, %%mm3\n\t"
153 "movq %0, %%mm4\n\t"
154 "movq %1, %%mm6\n\t"
155 "movq %%mm4, %%mm5\n\t"
156 "punpcklbw %%mm7, %%mm4\n\t"
157 "punpckhbw %%mm7, %%mm5\n\t"
158 "paddsw %%mm4, %%mm0\n\t"
159 "paddsw %%mm5, %%mm1\n\t"
160 "movq %%mm6, %%mm5\n\t"
161 "punpcklbw %%mm7, %%mm6\n\t"
162 "punpckhbw %%mm7, %%mm5\n\t"
163 "paddsw %%mm6, %%mm2\n\t"
164 "paddsw %%mm5, %%mm3\n\t"
165 "packuswb %%mm1, %%mm0\n\t"
166 "packuswb %%mm3, %%mm2\n\t"
167 "movq %%mm0, %0\n\t"
168 "movq %%mm2, %1\n\t"
a822a479 169 :"+m"(*pix), "+m"(*(pix+line_size))
de6d9b64
FB
170 :"m"(*p)
171 :"memory");
172 pix += line_size*2;
173 p += 16;
174 }
de6d9b64
FB
175}
176
177static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
178{
179 int dh, hh;
180 UINT8 *p;
181 const UINT8 *pix;
182 p = block;
183 pix = pixels;
184 hh=h>>2;
185 dh=h&3;
186 while(hh--) {
187 __asm __volatile(
a822a479
NK
188 "movq (%1), %%mm0 \n\t"
189 "movq (%1, %2), %%mm1 \n\t"
190 "movq (%1, %2, 2), %%mm2 \n\t"
191 "movq (%1, %3), %%mm3 \n\t"
192 "movq %%mm0, (%0) \n\t"
193 "movq %%mm1, (%0, %2) \n\t"
194 "movq %%mm2, (%0, %2, 2) \n\t"
195 "movq %%mm3, (%0, %3) \n\t"
196 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
de6d9b64
FB
197 :"memory");
198 pix = pix + line_size*4;
199 p = p + line_size*4;
200 }
201 while(dh--) {
202 __asm __volatile(
203 "movq %1, %%mm0\n\t"
204 "movq %%mm0, %0\n\t"
205 :"=m"(*p)
206 :"m"(*pix)
207 :"memory");
208 pix = pix + line_size;
209 p = p + line_size;
210 }
de6d9b64
FB
211}
212
213static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
214{
215 UINT8 *p;
216 const UINT8 *pix;
217 p = block;
218 pix = pixels;
219 __asm __volatile(
220 "pxor %%mm7, %%mm7\n\t"
221 "movq %0, %%mm4\n\t"
a822a479 222 ::"m"(mm_wone));
de6d9b64
FB
223 do {
224 __asm __volatile(
225 "movq %1, %%mm0\n\t"
226 "movq 1%1, %%mm1\n\t"
227 "movq %%mm0, %%mm2\n\t"
228 "movq %%mm1, %%mm3\n\t"
229 "punpcklbw %%mm7, %%mm0\n\t"
230 "punpcklbw %%mm7, %%mm1\n\t"
231 "punpckhbw %%mm7, %%mm2\n\t"
232 "punpckhbw %%mm7, %%mm3\n\t"
233 "paddusw %%mm1, %%mm0\n\t"
234 "paddusw %%mm3, %%mm2\n\t"
235 "paddusw %%mm4, %%mm0\n\t"
236 "paddusw %%mm4, %%mm2\n\t"
237 "psrlw $1, %%mm0\n\t"
238 "psrlw $1, %%mm2\n\t"
239 "packuswb %%mm2, %%mm0\n\t"
240 "movq %%mm0, %0\n\t"
241 :"=m"(*p)
242 :"m"(*pix)
243 :"memory");
244 pix += line_size; p += line_size;
245 } while (--h);
de6d9b64
FB
246}
247
248static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
249{
250 UINT8 *p;
251 const UINT8 *pix;
252 p = block;
253 pix = pixels;
254 __asm __volatile(
255 "pxor %%mm7, %%mm7\n\t"
256 "movq %0, %%mm4\n\t"
a822a479 257 ::"m"(mm_wone));
de6d9b64
FB
258 do {
259 __asm __volatile(
260 "movq %1, %%mm0\n\t"
261 "movq %2, %%mm1\n\t"
262 "movq %%mm0, %%mm2\n\t"
263 "movq %%mm1, %%mm3\n\t"
264 "punpcklbw %%mm7, %%mm0\n\t"
265 "punpcklbw %%mm7, %%mm1\n\t"
266 "punpckhbw %%mm7, %%mm2\n\t"
267 "punpckhbw %%mm7, %%mm3\n\t"
268 "paddusw %%mm1, %%mm0\n\t"
269 "paddusw %%mm3, %%mm2\n\t"
270 "paddusw %%mm4, %%mm0\n\t"
271 "paddusw %%mm4, %%mm2\n\t"
272 "psrlw $1, %%mm0\n\t"
273 "psrlw $1, %%mm2\n\t"
274 "packuswb %%mm2, %%mm0\n\t"
275 "movq %%mm0, %0\n\t"
276 :"=m"(*p)
277 :"m"(*pix),
278 "m"(*(pix+line_size))
279 :"memory");
280 pix += line_size;
281 p += line_size;
282 } while (--h);
de6d9b64
FB
283}
284
285static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
286{
287 UINT8 *p;
288 const UINT8 *pix;
289 p = block;
290 pix = pixels;
291 __asm __volatile(
292 "pxor %%mm7, %%mm7\n\t"
293 "movq %0, %%mm6\n\t"
a822a479 294 ::"m"(mm_wtwo));
de6d9b64
FB
295 do {
296 __asm __volatile(
297 "movq %1, %%mm0\n\t"
298 "movq %2, %%mm1\n\t"
299 "movq 1%1, %%mm4\n\t"
300 "movq 1%2, %%mm5\n\t"
301 "movq %%mm0, %%mm2\n\t"
302 "movq %%mm1, %%mm3\n\t"
303 "punpcklbw %%mm7, %%mm0\n\t"
304 "punpcklbw %%mm7, %%mm1\n\t"
305 "punpckhbw %%mm7, %%mm2\n\t"
306 "punpckhbw %%mm7, %%mm3\n\t"
307 "paddusw %%mm1, %%mm0\n\t"
308 "paddusw %%mm3, %%mm2\n\t"
309 "movq %%mm4, %%mm1\n\t"
310 "movq %%mm5, %%mm3\n\t"
311 "punpcklbw %%mm7, %%mm4\n\t"
312 "punpcklbw %%mm7, %%mm5\n\t"
313 "punpckhbw %%mm7, %%mm1\n\t"
314 "punpckhbw %%mm7, %%mm3\n\t"
315 "paddusw %%mm5, %%mm4\n\t"
316 "paddusw %%mm3, %%mm1\n\t"
317 "paddusw %%mm6, %%mm4\n\t"
318 "paddusw %%mm6, %%mm1\n\t"
319 "paddusw %%mm4, %%mm0\n\t"
320 "paddusw %%mm1, %%mm2\n\t"
321 "psrlw $2, %%mm0\n\t"
322 "psrlw $2, %%mm2\n\t"
323 "packuswb %%mm2, %%mm0\n\t"
324 "movq %%mm0, %0\n\t"
325 :"=m"(*p)
326 :"m"(*pix),
327 "m"(*(pix+line_size))
328 :"memory");
329 pix += line_size;
330 p += line_size;
331 } while(--h);
de6d9b64
FB
332}
333
334static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
335{
336 UINT8 *p;
337 const UINT8 *pix;
338 p = block;
339 pix = pixels;
a822a479 340 __asm __volatile("pxor %%mm7, %%mm7\n\t":);
de6d9b64
FB
341 do {
342 __asm __volatile(
343 "movq %1, %%mm0\n\t"
344 "movq 1%1, %%mm1\n\t"
345 "movq %%mm0, %%mm2\n\t"
346 "movq %%mm1, %%mm3\n\t"
347 "punpcklbw %%mm7, %%mm0\n\t"
348 "punpcklbw %%mm7, %%mm1\n\t"
349 "punpckhbw %%mm7, %%mm2\n\t"
350 "punpckhbw %%mm7, %%mm3\n\t"
351 "paddusw %%mm1, %%mm0\n\t"
352 "paddusw %%mm3, %%mm2\n\t"
353 "psrlw $1, %%mm0\n\t"
354 "psrlw $1, %%mm2\n\t"
355 "packuswb %%mm2, %%mm0\n\t"
356 "movq %%mm0, %0\n\t"
357 :"=m"(*p)
358 :"m"(*pix)
359 :"memory");
360 pix += line_size;
361 p += line_size;
362 } while (--h);
de6d9b64
FB
363}
364
365static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
366{
367 UINT8 *p;
368 const UINT8 *pix;
369 p = block;
370 pix = pixels;
a822a479 371 __asm __volatile("pxor %%mm7, %%mm7\n\t":);
de6d9b64
FB
372 do {
373 __asm __volatile(
374 "movq %1, %%mm0\n\t"
375 "movq %2, %%mm1\n\t"
376 "movq %%mm0, %%mm2\n\t"
377 "movq %%mm1, %%mm3\n\t"
378 "punpcklbw %%mm7, %%mm0\n\t"
379 "punpcklbw %%mm7, %%mm1\n\t"
380 "punpckhbw %%mm7, %%mm2\n\t"
381 "punpckhbw %%mm7, %%mm3\n\t"
382 "paddusw %%mm1, %%mm0\n\t"
383 "paddusw %%mm3, %%mm2\n\t"
384 "psrlw $1, %%mm0\n\t"
385 "psrlw $1, %%mm2\n\t"
386 "packuswb %%mm2, %%mm0\n\t"
387 "movq %%mm0, %0\n\t"
388 :"=m"(*p)
389 :"m"(*pix),
390 "m"(*(pix+line_size))
391 :"memory");
392 pix += line_size;
393 p += line_size;
394 } while(--h);
de6d9b64
FB
395}
396
397static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
398{
399 UINT8 *p;
400 const UINT8 *pix;
401 p = block;
402 pix = pixels;
403 __asm __volatile(
404 "pxor %%mm7, %%mm7\n\t"
405 "movq %0, %%mm6\n\t"
a822a479 406 ::"m"(mm_wone));
de6d9b64
FB
407 do {
408 __asm __volatile(
409 "movq %1, %%mm0\n\t"
410 "movq %2, %%mm1\n\t"
411 "movq 1%1, %%mm4\n\t"
412 "movq 1%2, %%mm5\n\t"
413 "movq %%mm0, %%mm2\n\t"
414 "movq %%mm1, %%mm3\n\t"
415 "punpcklbw %%mm7, %%mm0\n\t"
416 "punpcklbw %%mm7, %%mm1\n\t"
417 "punpckhbw %%mm7, %%mm2\n\t"
418 "punpckhbw %%mm7, %%mm3\n\t"
419 "paddusw %%mm1, %%mm0\n\t"
420 "paddusw %%mm3, %%mm2\n\t"
421 "movq %%mm4, %%mm1\n\t"
422 "movq %%mm5, %%mm3\n\t"
423 "punpcklbw %%mm7, %%mm4\n\t"
424 "punpcklbw %%mm7, %%mm5\n\t"
425 "punpckhbw %%mm7, %%mm1\n\t"
426 "punpckhbw %%mm7, %%mm3\n\t"
427 "paddusw %%mm5, %%mm4\n\t"
428 "paddusw %%mm3, %%mm1\n\t"
429 "paddusw %%mm6, %%mm4\n\t"
430 "paddusw %%mm6, %%mm1\n\t"
431 "paddusw %%mm4, %%mm0\n\t"
432 "paddusw %%mm1, %%mm2\n\t"
433 "psrlw $2, %%mm0\n\t"
434 "psrlw $2, %%mm2\n\t"
435 "packuswb %%mm2, %%mm0\n\t"
436 "movq %%mm0, %0\n\t"
437 :"=m"(*p)
438 :"m"(*pix),
439 "m"(*(pix+line_size))
440 :"memory");
441 pix += line_size;
442 p += line_size;
443 } while(--h);
de6d9b64
FB
444}
445
446static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
447{
448 UINT8 *p;
449 const UINT8 *pix;
450 p = block;
451 pix = pixels;
452 __asm __volatile(
453 "pxor %%mm7, %%mm7\n\t"
454 "movq %0, %%mm6\n\t"
a822a479 455 ::"m"(mm_wone));
de6d9b64
FB
456 do {
457 __asm __volatile(
458 "movq %0, %%mm0\n\t"
459 "movq %1, %%mm1\n\t"
460 "movq %%mm0, %%mm2\n\t"
461 "movq %%mm1, %%mm3\n\t"
462 "punpcklbw %%mm7, %%mm0\n\t"
463 "punpcklbw %%mm7, %%mm1\n\t"
464 "punpckhbw %%mm7, %%mm2\n\t"
465 "punpckhbw %%mm7, %%mm3\n\t"
466 "paddusw %%mm1, %%mm0\n\t"
467 "paddusw %%mm3, %%mm2\n\t"
468 "paddusw %%mm6, %%mm0\n\t"
469 "paddusw %%mm6, %%mm2\n\t"
470 "psrlw $1, %%mm0\n\t"
471 "psrlw $1, %%mm2\n\t"
472 "packuswb %%mm2, %%mm0\n\t"
473 "movq %%mm0, %0\n\t"
a822a479 474 :"+m"(*p)
de6d9b64
FB
475 :"m"(*pix)
476 :"memory");
477 pix += line_size;
478 p += line_size;
479 }
480 while (--h);
de6d9b64
FB
481}
482
483static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
484{
485 UINT8 *p;
486 const UINT8 *pix;
487 p = block;
488 pix = pixels;
489 __asm __volatile(
490 "pxor %%mm7, %%mm7\n\t"
491 "movq %0, %%mm6\n\t"
a822a479 492 ::"m"(mm_wone));
de6d9b64
FB
493 do {
494 __asm __volatile(
495 "movq %1, %%mm1\n\t"
496 "movq %0, %%mm0\n\t"
497 "movq 1%1, %%mm4\n\t"
498 "movq %%mm0, %%mm2\n\t"
499 "movq %%mm1, %%mm3\n\t"
500 "movq %%mm4, %%mm5\n\t"
501 "punpcklbw %%mm7, %%mm1\n\t"
502 "punpckhbw %%mm7, %%mm3\n\t"
503 "punpcklbw %%mm7, %%mm4\n\t"
504 "punpckhbw %%mm7, %%mm5\n\t"
505 "punpcklbw %%mm7, %%mm0\n\t"
506 "punpckhbw %%mm7, %%mm2\n\t"
507 "paddusw %%mm4, %%mm1\n\t"
508 "paddusw %%mm5, %%mm3\n\t"
509 "paddusw %%mm6, %%mm1\n\t"
510 "paddusw %%mm6, %%mm3\n\t"
511 "psrlw $1, %%mm1\n\t"
512 "psrlw $1, %%mm3\n\t"
513 "paddusw %%mm6, %%mm0\n\t"
514 "paddusw %%mm6, %%mm2\n\t"
515 "paddusw %%mm1, %%mm0\n\t"
516 "paddusw %%mm3, %%mm2\n\t"
517 "psrlw $1, %%mm0\n\t"
518 "psrlw $1, %%mm2\n\t"
519 "packuswb %%mm2, %%mm0\n\t"
520 "movq %%mm0, %0\n\t"
a822a479 521 :"+m"(*p)
de6d9b64
FB
522 :"m"(*pix)
523 :"memory");
524 pix += line_size;
525 p += line_size;
526 } while (--h);
de6d9b64
FB
527}
528
529static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
530{
531 UINT8 *p;
532 const UINT8 *pix;
533 p = block;
534 pix = pixels;
535 __asm __volatile(
536 "pxor %%mm7, %%mm7\n\t"
537 "movq %0, %%mm6\n\t"
a822a479 538 ::"m"(mm_wone));
de6d9b64
FB
539 do {
540 __asm __volatile(
541 "movq %1, %%mm1\n\t"
542 "movq %0, %%mm0\n\t"
543 "movq %2, %%mm4\n\t"
544 "movq %%mm0, %%mm2\n\t"
545 "movq %%mm1, %%mm3\n\t"
546 "movq %%mm4, %%mm5\n\t"
547 "punpcklbw %%mm7, %%mm1\n\t"
548 "punpckhbw %%mm7, %%mm3\n\t"
549 "punpcklbw %%mm7, %%mm4\n\t"
550 "punpckhbw %%mm7, %%mm5\n\t"
551 "punpcklbw %%mm7, %%mm0\n\t"
552 "punpckhbw %%mm7, %%mm2\n\t"
553 "paddusw %%mm4, %%mm1\n\t"
554 "paddusw %%mm5, %%mm3\n\t"
555 "paddusw %%mm6, %%mm1\n\t"
556 "paddusw %%mm6, %%mm3\n\t"
557 "psrlw $1, %%mm1\n\t"
558 "psrlw $1, %%mm3\n\t"
559 "paddusw %%mm6, %%mm0\n\t"
560 "paddusw %%mm6, %%mm2\n\t"
561 "paddusw %%mm1, %%mm0\n\t"
562 "paddusw %%mm3, %%mm2\n\t"
563 "psrlw $1, %%mm0\n\t"
564 "psrlw $1, %%mm2\n\t"
565 "packuswb %%mm2, %%mm0\n\t"
566 "movq %%mm0, %0\n\t"
a822a479 567 :"+m"(*p)
de6d9b64
FB
568 :"m"(*pix), "m"(*(pix+line_size))
569 :"memory");
570 pix += line_size;
571 p += line_size ;
572 } while(--h);
de6d9b64
FB
573}
574
575static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
576{
577 UINT8 *p;
578 const UINT8 *pix;
579 p = block;
580 pix = pixels;
581 __asm __volatile(
582 "pxor %%mm7, %%mm7\n\t"
583 "movq %0, %%mm6\n\t"
a822a479 584 ::"m"(mm_wtwo));
de6d9b64
FB
585 do {
586 __asm __volatile(
587 "movq %1, %%mm0\n\t"
588 "movq %2, %%mm1\n\t"
589 "movq 1%1, %%mm4\n\t"
590 "movq 1%2, %%mm5\n\t"
591 "movq %%mm0, %%mm2\n\t"
592 "movq %%mm1, %%mm3\n\t"
593 "punpcklbw %%mm7, %%mm0\n\t"
594 "punpcklbw %%mm7, %%mm1\n\t"
595 "punpckhbw %%mm7, %%mm2\n\t"
596 "punpckhbw %%mm7, %%mm3\n\t"
597 "paddusw %%mm1, %%mm0\n\t"
598 "paddusw %%mm3, %%mm2\n\t"
599 "movq %%mm4, %%mm1\n\t"
600 "movq %%mm5, %%mm3\n\t"
601 "punpcklbw %%mm7, %%mm4\n\t"
602 "punpcklbw %%mm7, %%mm5\n\t"
603 "punpckhbw %%mm7, %%mm1\n\t"
604 "punpckhbw %%mm7, %%mm3\n\t"
605 "paddusw %%mm5, %%mm4\n\t"
606 "paddusw %%mm3, %%mm1\n\t"
607 "paddusw %%mm6, %%mm4\n\t"
608 "paddusw %%mm6, %%mm1\n\t"
609 "paddusw %%mm4, %%mm0\n\t"
610 "paddusw %%mm1, %%mm2\n\t"
611 "movq %3, %%mm5\n\t"
612 "psrlw $2, %%mm0\n\t"
613 "movq %0, %%mm1\n\t"
614 "psrlw $2, %%mm2\n\t"
615 "movq %%mm1, %%mm3\n\t"
616 "punpcklbw %%mm7, %%mm1\n\t"
617 "punpckhbw %%mm7, %%mm3\n\t"
618 "paddusw %%mm1, %%mm0\n\t"
619 "paddusw %%mm3, %%mm2\n\t"
620 "paddusw %%mm5, %%mm0\n\t"
621 "paddusw %%mm5, %%mm2\n\t"
622 "psrlw $1, %%mm0\n\t"
623 "psrlw $1, %%mm2\n\t"
624 "packuswb %%mm2, %%mm0\n\t"
625 "movq %%mm0, %0\n\t"
a822a479 626 :"+m"(*p)
de6d9b64 627 :"m"(*pix),
a9b3f630 628 "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64
FB
629 :"memory");
630 pix += line_size;
631 p += line_size ;
632 } while(--h);
de6d9b64
FB
633}
634
635static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
636{
637 UINT8 *p;
638 const UINT8 *pix;
639 p = block;
640 pix = pixels;
a822a479 641 __asm __volatile("pxor %%mm7, %%mm7\n\t":);
de6d9b64
FB
642 do {
643 __asm __volatile(
644 "movq %1, %%mm0\n\t"
645 "movq %0, %%mm1\n\t"
646 "movq %%mm0, %%mm2\n\t"
647 "movq %%mm1, %%mm3\n\t"
648 "punpcklbw %%mm7, %%mm0\n\t"
649 "punpcklbw %%mm7, %%mm1\n\t"
650 "punpckhbw %%mm7, %%mm2\n\t"
651 "punpckhbw %%mm7, %%mm3\n\t"
652 "paddusw %%mm1, %%mm0\n\t"
653 "paddusw %%mm3, %%mm2\n\t"
654 "psrlw $1, %%mm0\n\t"
655 "psrlw $1, %%mm2\n\t"
656 "packuswb %%mm2, %%mm0\n\t"
657 "movq %%mm0, %0\n\t"
a822a479 658 :"+m"(*p)
de6d9b64
FB
659 :"m"(*pix)
660 :"memory");
661 pix += line_size;
662 p += line_size ;
663 } while (--h);
de6d9b64
FB
664}
665
666static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
667{
668 UINT8 *p;
669 const UINT8 *pix;
670 p = block;
671 pix = pixels;
672 __asm __volatile(
a822a479 673 "pxor %%mm7, %%mm7\n\t":);
de6d9b64
FB
674 do {
675 __asm __volatile(
676 "movq %1, %%mm0\n\t"
677 "movq 1%1, %%mm1\n\t"
678 "movq %0, %%mm4\n\t"
679 "movq %%mm0, %%mm2\n\t"
680 "movq %%mm1, %%mm3\n\t"
681 "movq %%mm4, %%mm5\n\t"
682 "punpcklbw %%mm7, %%mm0\n\t"
683 "punpcklbw %%mm7, %%mm1\n\t"
684 "punpckhbw %%mm7, %%mm2\n\t"
685 "punpckhbw %%mm7, %%mm3\n\t"
686 "punpcklbw %%mm7, %%mm4\n\t"
687 "punpckhbw %%mm7, %%mm5\n\t"
688 "paddusw %%mm1, %%mm0\n\t"
689 "paddusw %%mm3, %%mm2\n\t"
690 "psrlw $1, %%mm0\n\t"
691 "psrlw $1, %%mm2\n\t"
692 "paddusw %%mm4, %%mm0\n\t"
693 "paddusw %%mm5, %%mm2\n\t"
694 "psrlw $1, %%mm0\n\t"
695 "psrlw $1, %%mm2\n\t"
696 "packuswb %%mm2, %%mm0\n\t"
697 "movq %%mm0, %0\n\t"
a822a479 698 :"+m"(*p)
de6d9b64
FB
699 :"m"(*pix)
700 :"memory");
701 pix += line_size;
702 p += line_size;
703 } while (--h);
de6d9b64
FB
704}
705
706static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
707{
708 UINT8 *p;
709 const UINT8 *pix;
710 p = block;
711 pix = pixels;
712 __asm __volatile(
a822a479 713 "pxor %%mm7, %%mm7\n\t":);
de6d9b64
FB
714 do {
715 __asm __volatile(
716 "movq %1, %%mm0\n\t"
717 "movq %2, %%mm1\n\t"
718 "movq %0, %%mm4\n\t"
719 "movq %%mm0, %%mm2\n\t"
720 "movq %%mm1, %%mm3\n\t"
721 "movq %%mm4, %%mm5\n\t"
722 "punpcklbw %%mm7, %%mm0\n\t"
723 "punpcklbw %%mm7, %%mm1\n\t"
724 "punpckhbw %%mm7, %%mm2\n\t"
725 "punpckhbw %%mm7, %%mm3\n\t"
726 "punpcklbw %%mm7, %%mm4\n\t"
727 "punpckhbw %%mm7, %%mm5\n\t"
728 "paddusw %%mm1, %%mm0\n\t"
729 "paddusw %%mm3, %%mm2\n\t"
730 "psrlw $1, %%mm0\n\t"
731 "psrlw $1, %%mm2\n\t"
732 "paddusw %%mm4, %%mm0\n\t"
733 "paddusw %%mm5, %%mm2\n\t"
734 "psrlw $1, %%mm0\n\t"
735 "psrlw $1, %%mm2\n\t"
736 "packuswb %%mm2, %%mm0\n\t"
737 "movq %%mm0, %0\n\t"
a822a479 738 :"+m"(*p)
de6d9b64
FB
739 :"m"(*pix), "m"(*(pix+line_size))
740 :"memory");
741 pix += line_size;
742 p += line_size ;
743 } while(--h);
de6d9b64
FB
744}
745
746static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
747{
748 UINT8 *p;
749 const UINT8 *pix;
750 p = block;
751 pix = pixels;
752 __asm __volatile(
753 "pxor %%mm7, %%mm7\n\t"
754 "movq %0, %%mm6\n\t"
a822a479 755 ::"m"(mm_wone));
de6d9b64
FB
756 do {
757 __asm __volatile(
758 "movq %1, %%mm0\n\t"
759 "movq %2, %%mm1\n\t"
760 "movq 1%1, %%mm4\n\t"
761 "movq 1%2, %%mm5\n\t"
762 "movq %%mm0, %%mm2\n\t"
763 "movq %%mm1, %%mm3\n\t"
764 "punpcklbw %%mm7, %%mm0\n\t"
765 "punpcklbw %%mm7, %%mm1\n\t"
766 "punpckhbw %%mm7, %%mm2\n\t"
767 "punpckhbw %%mm7, %%mm3\n\t"
768 "paddusw %%mm1, %%mm0\n\t"
769 "paddusw %%mm3, %%mm2\n\t"
770 "movq %%mm4, %%mm1\n\t"
771 "movq %%mm5, %%mm3\n\t"
772 "punpcklbw %%mm7, %%mm4\n\t"
773 "punpcklbw %%mm7, %%mm5\n\t"
774 "punpckhbw %%mm7, %%mm1\n\t"
775 "punpckhbw %%mm7, %%mm3\n\t"
776 "paddusw %%mm5, %%mm4\n\t"
777 "paddusw %%mm3, %%mm1\n\t"
778 "paddusw %%mm6, %%mm4\n\t"
779 "paddusw %%mm6, %%mm1\n\t"
780 "paddusw %%mm4, %%mm0\n\t"
781 "paddusw %%mm1, %%mm2\n\t"
782 "movq %0, %%mm1\n\t"
783 "psrlw $2, %%mm0\n\t"
784 "movq %%mm1, %%mm3\n\t"
785 "psrlw $2, %%mm2\n\t"
786 "punpcklbw %%mm7, %%mm1\n\t"
787 "punpckhbw %%mm7, %%mm3\n\t"
788 "paddusw %%mm1, %%mm0\n\t"
789 "paddusw %%mm3, %%mm2\n\t"
790 "psrlw $1, %%mm0\n\t"
791 "psrlw $1, %%mm2\n\t"
792 "packuswb %%mm2, %%mm0\n\t"
793 "movq %%mm0, %0\n\t"
a822a479 794 :"+m"(*p)
de6d9b64
FB
795 :"m"(*pix),
796 "m"(*(pix+line_size))
797 :"memory");
798 pix += line_size;
799 p += line_size;
800 } while(--h);
de6d9b64
FB
801}
802
803static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
804{
805 DCTELEM *p;
806 const UINT8 *pix;
807 p = block;
808 pix = pixels;
a822a479 809 __asm __volatile("pxor %%mm7, %%mm7":);
de6d9b64
FB
810 do {
811 __asm __volatile(
812 "movq %0, %%mm0\n\t"
813 "movq %1, %%mm2\n\t"
814 "movq 8%0, %%mm1\n\t"
815 "movq %%mm2, %%mm3\n\t"
816 "punpcklbw %%mm7, %%mm2\n\t"
817 "punpckhbw %%mm7, %%mm3\n\t"
818 "psubsw %%mm2, %%mm0\n\t"
819 "psubsw %%mm3, %%mm1\n\t"
820 "movq %%mm0, %0\n\t"
821 "movq %%mm1, 8%0\n\t"
a822a479 822 :"+m"(*p)
de6d9b64
FB
823 :"m"(*pix)
824 :"memory");
825 pix += line_size;
826 p += 8;
827 } while (--h);
de6d9b64
FB
828}
829
830static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
831{
832 DCTELEM *p;
833 const UINT8 *pix;
834 p = block;
835 pix = pixels;
836 __asm __volatile(
837 "pxor %%mm7, %%mm7\n\t"
838 "movq %0, %%mm6"
a822a479 839 ::"m"(mm_wone));
de6d9b64
FB
840 do {
841 __asm __volatile(
842 "movq %0, %%mm0\n\t"
843 "movq %1, %%mm2\n\t"
844 "movq 8%0, %%mm1\n\t"
845 "movq 1%1, %%mm4\n\t"
846 "movq %%mm2, %%mm3\n\t"
847 "movq %%mm4, %%mm5\n\t"
848 "punpcklbw %%mm7, %%mm2\n\t"
849 "punpckhbw %%mm7, %%mm3\n\t"
850 "punpcklbw %%mm7, %%mm4\n\t"
851 "punpckhbw %%mm7, %%mm5\n\t"
852 "paddusw %%mm4, %%mm2\n\t"
853 "paddusw %%mm5, %%mm3\n\t"
854 "paddusw %%mm6, %%mm2\n\t"
855 "paddusw %%mm6, %%mm3\n\t"
856 "psrlw $1, %%mm2\n\t"
857 "psrlw $1, %%mm3\n\t"
858 "psubsw %%mm2, %%mm0\n\t"
859 "psubsw %%mm3, %%mm1\n\t"
860 "movq %%mm0, %0\n\t"
861 "movq %%mm1, 8%0\n\t"
a822a479 862 :"+m"(*p)
de6d9b64
FB
863 :"m"(*pix)
864 :"memory");
865 pix += line_size;
866 p += 8;
867 } while (--h);
de6d9b64
FB
868}
869
870static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
871{
872 DCTELEM *p;
873 const UINT8 *pix;
874 p = block;
875 pix = pixels;
876 __asm __volatile(
877 "pxor %%mm7, %%mm7\n\t"
878 "movq %0, %%mm6"
a822a479 879 ::"m"(mm_wone));
de6d9b64
FB
880 do {
881 __asm __volatile(
882 "movq %0, %%mm0\n\t"
883 "movq %1, %%mm2\n\t"
884 "movq 8%0, %%mm1\n\t"
885 "movq %2, %%mm4\n\t"
886 "movq %%mm2, %%mm3\n\t"
887 "movq %%mm4, %%mm5\n\t"
888 "punpcklbw %%mm7, %%mm2\n\t"
889 "punpckhbw %%mm7, %%mm3\n\t"
890 "punpcklbw %%mm7, %%mm4\n\t"
891 "punpckhbw %%mm7, %%mm5\n\t"
892 "paddusw %%mm4, %%mm2\n\t"
893 "paddusw %%mm5, %%mm3\n\t"
894 "paddusw %%mm6, %%mm2\n\t"
895 "paddusw %%mm6, %%mm3\n\t"
896 "psrlw $1, %%mm2\n\t"
897 "psrlw $1, %%mm3\n\t"
898 "psubsw %%mm2, %%mm0\n\t"
899 "psubsw %%mm3, %%mm1\n\t"
900 "movq %%mm0, %0\n\t"
901 "movq %%mm1, 8%0\n\t"
a822a479 902 :"+m"(*p)
de6d9b64
FB
903 :"m"(*pix), "m"(*(pix+line_size))
904 :"memory");
905 pix += line_size;
906 p += 8;
907 } while (--h);
de6d9b64
FB
908}
909
910static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
911{
912 DCTELEM *p;
913 const UINT8 *pix;
914 p = block;
915 pix = pixels;
916 __asm __volatile(
917 "pxor %%mm7, %%mm7\n\t"
918 "movq %0, %%mm6\n\t"
a822a479 919 ::"m"(mm_wtwo));
de6d9b64
FB
920 do {
921 __asm __volatile(
922 "movq %1, %%mm0\n\t"
923 "movq %2, %%mm1\n\t"
924 "movq 1%1, %%mm4\n\t"
925 "movq 1%2, %%mm5\n\t"
926 "movq %%mm0, %%mm2\n\t"
927 "movq %%mm1, %%mm3\n\t"
928 "punpcklbw %%mm7, %%mm0\n\t"
929 "punpcklbw %%mm7, %%mm1\n\t"
930 "punpckhbw %%mm7, %%mm2\n\t"
931 "punpckhbw %%mm7, %%mm3\n\t"
932 "paddusw %%mm1, %%mm0\n\t"
933 "paddusw %%mm3, %%mm2\n\t"
934 "movq %%mm4, %%mm1\n\t"
935 "movq %%mm5, %%mm3\n\t"
936 "punpcklbw %%mm7, %%mm4\n\t"
937 "punpcklbw %%mm7, %%mm5\n\t"
938 "punpckhbw %%mm7, %%mm1\n\t"
939 "punpckhbw %%mm7, %%mm3\n\t"
940 "paddusw %%mm5, %%mm4\n\t"
941 "paddusw %%mm3, %%mm1\n\t"
942 "paddusw %%mm6, %%mm4\n\t"
943 "paddusw %%mm6, %%mm1\n\t"
944 "paddusw %%mm4, %%mm0\n\t"
945 "paddusw %%mm1, %%mm2\n\t"
946 "movq %0, %%mm1\n\t"
947 "movq 8%0, %%mm3\n\t"
948 "psrlw $2, %%mm0\n\t"
949 "psrlw $2, %%mm2\n\t"
950 "psubsw %%mm0, %%mm1\n\t"
951 "psubsw %%mm2, %%mm3\n\t"
952 "movq %%mm1, %0\n\t"
953 "movq %%mm3, 8%0\n\t"
a822a479 954 :"+m"(*p)
de6d9b64
FB
955 :"m"(*pix),
956 "m"(*(pix+line_size))
957 :"memory");
958 pix += line_size;
959 p += 8 ;
960 } while(--h);
de6d9b64
FB
961}
962
963void dsputil_init_mmx(void)
964{
965 mm_flags = mm_support();
f4470e09
MN
966#if 1
967 printf("libavcodec: CPU flags:");
de6d9b64
FB
968 if (mm_flags & MM_MMX)
969 printf(" mmx");
970 if (mm_flags & MM_MMXEXT)
971 printf(" mmxext");
972 if (mm_flags & MM_3DNOW)
973 printf(" 3dnow");
974 if (mm_flags & MM_SSE)
975 printf(" sse");
976 if (mm_flags & MM_SSE2)
977 printf(" sse2");
978 printf("\n");
979#endif
980
981 if (mm_flags & MM_MMX) {
982 get_pixels = get_pixels_mmx;
983 put_pixels_clamped = put_pixels_clamped_mmx;
984 add_pixels_clamped = add_pixels_clamped_mmx;
985
986 pix_abs16x16 = pix_abs16x16_mmx;
987 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
988 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
989 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
990 av_fdct = fdct_mmx;
991
992 put_pixels_tab[0] = put_pixels_mmx;
993 put_pixels_tab[1] = put_pixels_x2_mmx;
994 put_pixels_tab[2] = put_pixels_y2_mmx;
995 put_pixels_tab[3] = put_pixels_xy2_mmx;
996
997 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
998 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
999 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1000 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
1001
1002 avg_pixels_tab[0] = avg_pixels_mmx;
1003 avg_pixels_tab[1] = avg_pixels_x2_mmx;
1004 avg_pixels_tab[2] = avg_pixels_y2_mmx;
1005 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1006
1007 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
1008 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
1009 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
1010 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
1011
1012 sub_pixels_tab[0] = sub_pixels_mmx;
1013 sub_pixels_tab[1] = sub_pixels_x2_mmx;
1014 sub_pixels_tab[2] = sub_pixels_y2_mmx;
1015 sub_pixels_tab[3] = sub_pixels_xy2_mmx;
1016
1017 if (mm_flags & MM_MMXEXT) {
1018 pix_abs16x16 = pix_abs16x16_sse;
1019 }
1020
1021 if (mm_flags & MM_SSE) {
1022 put_pixels_tab[1] = put_pixels_x2_sse;
1023 put_pixels_tab[2] = put_pixels_y2_sse;
1024
1025 avg_pixels_tab[0] = avg_pixels_sse;
1026 avg_pixels_tab[1] = avg_pixels_x2_sse;
1027 avg_pixels_tab[2] = avg_pixels_y2_sse;
1028 avg_pixels_tab[3] = avg_pixels_xy2_sse;
1029
1030 sub_pixels_tab[1] = sub_pixels_x2_sse;
1031 sub_pixels_tab[2] = sub_pixels_y2_sse;
1032 } else if (mm_flags & MM_3DNOW) {
1033 put_pixels_tab[1] = put_pixels_x2_3dnow;
1034 put_pixels_tab[2] = put_pixels_y2_3dnow;
1035
1036 avg_pixels_tab[0] = avg_pixels_3dnow;
1037 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
1038 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
1039 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
1040
1041 sub_pixels_tab[1] = sub_pixels_x2_3dnow;
1042 sub_pixels_tab[2] = sub_pixels_y2_3dnow;
1043 }
4af7bcc1 1044
8def0299
FB
1045 /* idct */
1046 if (mm_flags & MM_MMXEXT) {
1047 ff_idct = ff_mmxext_idct;
1048 } else {
1049 ff_idct = ff_mmx_idct;
1050 }
d962f6fd
A
1051#ifdef SIMPLE_IDCT
1052// ff_idct = simple_idct;
1053 ff_idct = simple_idct_mmx;
1054#endif
de6d9b64
FB
1055 }
1056}