add support for intel mac. mp3lib is not fixed yet.
[libav.git] / postproc / rgb2rgb_template.c
CommitLineData
fcfbc150 1/*
a3aece93
NK
2 *
3 * rgb2rgb.c, Software RGB to RGB convertor
6611aa83
NK
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
a3aece93 7 * Written by Nick Kurshev.
1de97d84 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
6cb38650 9 * lot of big-endian byteorder fixes by Alex Beregszaszi
a3aece93 10 */
a3aece93 11
0d9f3d85
A
12#include <stddef.h>
13#include <inttypes.h> /* for __WORDSIZE */
14
fac8012c
NP
15#include "asmalign.h"
16
0d9f3d85 17#ifndef __WORDSIZE
ff78c596
A
18// #warning You have misconfigured system and probably will lose performance!
19#define __WORDSIZE MP_WORDSIZE
0d9f3d85
A
20#endif
21
1de97d84
MN
/*
 * Per-instruction-set assembler mnemonics used by the conversion routines
 * below.  Everything is #undef'ed first so the definitions chosen here always
 * win (presumably because this template is included more than once with
 * different HAVE_* settings -- confirm against the including file).
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
/*
 * No prefetch instruction available.  The macro is pasted directly in front
 * of the operand text (e.g. PREFETCH" 32%1"), so it has to expand to something
 * that makes the assembler ignore the rest of the line; presumably "#" is the
 * comment introducer for the Apple assembler and "/" for the others.
 */
#ifdef __APPLE__
#define PREFETCH "#"
#define PREFETCHW "#"
#else /* was a bare "#elif", which is an error when this branch is evaluated */
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#ifdef __APPLE__
#define SFENCE "#"
#else /* was a bare "#elif", which is an error when this branch is evaluated */
#define SFENCE "/nop"
#endif
#endif
1de97d84 72
7f526efd 73static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
b234ae81 74{
fde33ab5 75 uint8_t *dest = dst;
56993147
NK
76 const uint8_t *s = src;
77 const uint8_t *end;
49a0c6ee 78#ifdef HAVE_MMX
d8dad2a5 79 const uint8_t *mm_end;
49a0c6ee 80#endif
b234ae81 81 end = s + src_size;
49a0c6ee 82#ifdef HAVE_MMX
a3aece93 83 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
d8dad2a5 84 mm_end = end - 23;
a3aece93 85 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
49a0c6ee
NK
86 while(s < mm_end)
87 {
88 __asm __volatile(
a3aece93 89 PREFETCH" 32%1\n\t"
49a0c6ee 90 "movd %1, %%mm0\n\t"
0155db7f
NK
91 "punpckldq 3%1, %%mm0\n\t"
92 "movd 6%1, %%mm1\n\t"
93 "punpckldq 9%1, %%mm1\n\t"
94 "movd 12%1, %%mm2\n\t"
95 "punpckldq 15%1, %%mm2\n\t"
96 "movd 18%1, %%mm3\n\t"
97 "punpckldq 21%1, %%mm3\n\t"
49a0c6ee 98 "pand %%mm7, %%mm0\n\t"
0155db7f 99 "pand %%mm7, %%mm1\n\t"
49a0c6ee 100 "pand %%mm7, %%mm2\n\t"
0155db7f 101 "pand %%mm7, %%mm3\n\t"
96b956cc 102 MOVNTQ" %%mm0, %0\n\t"
0155db7f
NK
103 MOVNTQ" %%mm1, 8%0\n\t"
104 MOVNTQ" %%mm2, 16%0\n\t"
105 MOVNTQ" %%mm3, 24%0"
49a0c6ee
NK
106 :"=m"(*dest)
107 :"m"(*s)
108 :"memory");
0155db7f
NK
109 dest += 32;
110 s += 24;
49a0c6ee 111 }
79811694 112 __asm __volatile(SFENCE:::"memory");
96b956cc 113 __asm __volatile(EMMS:::"memory");
49a0c6ee 114#endif
b234ae81
NK
115 while(s < end)
116 {
6cb38650 117#ifdef WORDS_BIGENDIAN
f688668c 118 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
6cb38650 119 *dest++ = 0;
f688668c
AC
120 *dest++ = s[2];
121 *dest++ = s[1];
122 *dest++ = s[0];
123 s+=3;
6cb38650 124#else
fde33ab5
NK
125 *dest++ = *s++;
126 *dest++ = *s++;
127 *dest++ = *s++;
128 *dest++ = 0;
6cb38650 129#endif
b234ae81
NK
130 }
131}
59ac5a93 132
7f526efd 133static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
59ac5a93
NK
134{
135 uint8_t *dest = dst;
56993147
NK
136 const uint8_t *s = src;
137 const uint8_t *end;
494a6294 138#ifdef HAVE_MMX
d8dad2a5 139 const uint8_t *mm_end;
494a6294 140#endif
59ac5a93 141 end = s + src_size;
494a6294 142#ifdef HAVE_MMX
a3aece93 143 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
d8dad2a5 144 mm_end = end - 31;
494a6294
NK
145 while(s < mm_end)
146 {
147 __asm __volatile(
a3aece93 148 PREFETCH" 32%1\n\t"
494a6294
NK
149 "movq %1, %%mm0\n\t"
150 "movq 8%1, %%mm1\n\t"
2b3eef22
NK
151 "movq 16%1, %%mm4\n\t"
152 "movq 24%1, %%mm5\n\t"
494a6294
NK
153 "movq %%mm0, %%mm2\n\t"
154 "movq %%mm1, %%mm3\n\t"
2b3eef22
NK
155 "movq %%mm4, %%mm6\n\t"
156 "movq %%mm5, %%mm7\n\t"
494a6294
NK
157 "psrlq $8, %%mm2\n\t"
158 "psrlq $8, %%mm3\n\t"
2b3eef22
NK
159 "psrlq $8, %%mm6\n\t"
160 "psrlq $8, %%mm7\n\t"
161 "pand %2, %%mm0\n\t"
162 "pand %2, %%mm1\n\t"
163 "pand %2, %%mm4\n\t"
164 "pand %2, %%mm5\n\t"
165 "pand %3, %%mm2\n\t"
166 "pand %3, %%mm3\n\t"
167 "pand %3, %%mm6\n\t"
168 "pand %3, %%mm7\n\t"
169 "por %%mm2, %%mm0\n\t"
170 "por %%mm3, %%mm1\n\t"
171 "por %%mm6, %%mm4\n\t"
172 "por %%mm7, %%mm5\n\t"
173
174 "movq %%mm1, %%mm2\n\t"
175 "movq %%mm4, %%mm3\n\t"
176 "psllq $48, %%mm2\n\t"
177 "psllq $32, %%mm3\n\t"
178 "pand %4, %%mm2\n\t"
179 "pand %5, %%mm3\n\t"
494a6294 180 "por %%mm2, %%mm0\n\t"
2b3eef22
NK
181 "psrlq $16, %%mm1\n\t"
182 "psrlq $32, %%mm4\n\t"
183 "psllq $16, %%mm5\n\t"
494a6294 184 "por %%mm3, %%mm1\n\t"
2b3eef22
NK
185 "pand %6, %%mm5\n\t"
186 "por %%mm5, %%mm4\n\t"
1de97d84 187
494a6294 188 MOVNTQ" %%mm0, %0\n\t"
2b3eef22
NK
189 MOVNTQ" %%mm1, 8%0\n\t"
190 MOVNTQ" %%mm4, 16%0"
494a6294 191 :"=m"(*dest)
2b3eef22
NK
192 :"m"(*s),"m"(mask24l),
193 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
494a6294 194 :"memory");
2b3eef22
NK
195 dest += 24;
196 s += 32;
494a6294
NK
197 }
198 __asm __volatile(SFENCE:::"memory");
199 __asm __volatile(EMMS:::"memory");
200#endif
59ac5a93
NK
201 while(s < end)
202 {
6cb38650 203#ifdef WORDS_BIGENDIAN
f688668c 204 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
6cb38650 205 s++;
f688668c
AC
206 dest[2] = *s++;
207 dest[1] = *s++;
208 dest[0] = *s++;
209 dest += 3;
6cb38650 210#else
59ac5a93
NK
211 *dest++ = *s++;
212 *dest++ = *s++;
213 *dest++ = *s++;
214 s++;
6cb38650 215#endif
59ac5a93
NK
216 }
217}
b238eb2e 218
a3aece93
NK
219/*
220 Original by Strepto/Astral
221 ported to gcc & bugfixed : A'rpi
51da31f1 222 MMX2, 3DNOW optimization by Nick Kurshev
9b2c28e6 223 32bit c version, and and&add trick by Michael Niedermayer
a3aece93 224*/
7f526efd 225static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
b238eb2e 226{
0d9f3d85
A
227 register const uint8_t* s=src;
228 register uint8_t* d=dst;
229 register const uint8_t *end;
d8dad2a5 230 const uint8_t *mm_end;
0d9f3d85 231 end = s + src_size;
b238eb2e 232#ifdef HAVE_MMX
0d9f3d85
A
233 __asm __volatile(PREFETCH" %0"::"m"(*s));
234 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
d8dad2a5 235 mm_end = end - 15;
0d9f3d85 236 while(s<mm_end)
a3aece93
NK
237 {
238 __asm __volatile(
239 PREFETCH" 32%1\n\t"
240 "movq %1, %%mm0\n\t"
241 "movq 8%1, %%mm2\n\t"
242 "movq %%mm0, %%mm1\n\t"
243 "movq %%mm2, %%mm3\n\t"
244 "pand %%mm4, %%mm0\n\t"
a3aece93 245 "pand %%mm4, %%mm2\n\t"
9b2c28e6
MN
246 "paddw %%mm1, %%mm0\n\t"
247 "paddw %%mm3, %%mm2\n\t"
a3aece93
NK
248 MOVNTQ" %%mm0, %0\n\t"
249 MOVNTQ" %%mm2, 8%0"
0d9f3d85
A
250 :"=m"(*d)
251 :"m"(*s)
9b2c28e6 252 );
0d9f3d85
A
253 d+=16;
254 s+=16;
b238eb2e 255 }
a3aece93
NK
256 __asm __volatile(SFENCE:::"memory");
257 __asm __volatile(EMMS:::"memory");
b238eb2e 258#endif
d8dad2a5 259 mm_end = end - 3;
0d9f3d85
A
260 while(s < mm_end)
261 {
262 register unsigned x= *((uint32_t *)s);
263 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
264 d+=4;
265 s+=4;
266 }
267 if(s < end)
268 {
269 register unsigned short x= *((uint16_t *)s);
270 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
271 }
b238eb2e 272}
fcfbc150 273
7f526efd 274static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
ac4d0aea
MN
275{
276 register const uint8_t* s=src;
277 register uint8_t* d=dst;
278 register const uint8_t *end;
0598bcbb 279 const uint8_t *mm_end;
ac4d0aea
MN
280 end = s + src_size;
281#ifdef HAVE_MMX
282 __asm __volatile(PREFETCH" %0"::"m"(*s));
283 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
284 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
0598bcbb 285 mm_end = end - 15;
ac4d0aea
MN
286 while(s<mm_end)
287 {
288 __asm __volatile(
289 PREFETCH" 32%1\n\t"
290 "movq %1, %%mm0\n\t"
291 "movq 8%1, %%mm2\n\t"
292 "movq %%mm0, %%mm1\n\t"
293 "movq %%mm2, %%mm3\n\t"
294 "psrlq $1, %%mm0\n\t"
295 "psrlq $1, %%mm2\n\t"
296 "pand %%mm7, %%mm0\n\t"
297 "pand %%mm7, %%mm2\n\t"
298 "pand %%mm6, %%mm1\n\t"
299 "pand %%mm6, %%mm3\n\t"
300 "por %%mm1, %%mm0\n\t"
301 "por %%mm3, %%mm2\n\t"
302 MOVNTQ" %%mm0, %0\n\t"
303 MOVNTQ" %%mm2, 8%0"
304 :"=m"(*d)
305 :"m"(*s)
306 );
307 d+=16;
308 s+=16;
309 }
310 __asm __volatile(SFENCE:::"memory");
311 __asm __volatile(EMMS:::"memory");
312#endif
0598bcbb
MN
313 mm_end = end - 3;
314 while(s < mm_end)
ac4d0aea
MN
315 {
316 register uint32_t x= *((uint32_t *)s);
317 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
318 s+=4;
319 d+=4;
320 }
321 if(s < end)
322 {
323 register uint16_t x= *((uint16_t *)s);
324 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
325 s+=2;
326 d+=2;
327 }
328}
329
7f526efd 330static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
fcfbc150 331{
53445e83 332 const uint8_t *s = src;
0d9f3d85
A
333 const uint8_t *end;
334#ifdef HAVE_MMX
335 const uint8_t *mm_end;
336#endif
53445e83
NK
337 uint16_t *d = (uint16_t *)dst;
338 end = s + src_size;
0d9f3d85 339#ifdef HAVE_MMX
aeae5d53
MN
340 mm_end = end - 15;
341#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
342 asm volatile(
343 "movq %3, %%mm5 \n\t"
344 "movq %4, %%mm6 \n\t"
345 "movq %5, %%mm7 \n\t"
fac8012c 346 ASMALIGN16
aeae5d53
MN
347 "1: \n\t"
348 PREFETCH" 32(%1) \n\t"
349 "movd (%1), %%mm0 \n\t"
350 "movd 4(%1), %%mm3 \n\t"
351 "punpckldq 8(%1), %%mm0 \n\t"
352 "punpckldq 12(%1), %%mm3 \n\t"
353 "movq %%mm0, %%mm1 \n\t"
354 "movq %%mm3, %%mm4 \n\t"
355 "pand %%mm6, %%mm0 \n\t"
356 "pand %%mm6, %%mm3 \n\t"
357 "pmaddwd %%mm7, %%mm0 \n\t"
358 "pmaddwd %%mm7, %%mm3 \n\t"
359 "pand %%mm5, %%mm1 \n\t"
360 "pand %%mm5, %%mm4 \n\t"
361 "por %%mm1, %%mm0 \n\t"
362 "por %%mm4, %%mm3 \n\t"
363 "psrld $5, %%mm0 \n\t"
364 "pslld $11, %%mm3 \n\t"
365 "por %%mm3, %%mm0 \n\t"
366 MOVNTQ" %%mm0, (%0) \n\t"
6e1c66bc
AJ
367 "add $16, %1 \n\t"
368 "add $8, %0 \n\t"
369 "cmp %2, %1 \n\t"
aeae5d53
MN
370 " jb 1b \n\t"
371 : "+r" (d), "+r"(s)
372 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
373 );
374#else
53445e83
NK
375 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
376 __asm __volatile(
377 "movq %0, %%mm7\n\t"
378 "movq %1, %%mm6\n\t"
379 ::"m"(red_16mask),"m"(green_16mask));
380 while(s < mm_end)
381 {
382 __asm __volatile(
383 PREFETCH" 32%1\n\t"
384 "movd %1, %%mm0\n\t"
385 "movd 4%1, %%mm3\n\t"
386 "punpckldq 8%1, %%mm0\n\t"
387 "punpckldq 12%1, %%mm3\n\t"
388 "movq %%mm0, %%mm1\n\t"
389 "movq %%mm0, %%mm2\n\t"
390 "movq %%mm3, %%mm4\n\t"
391 "movq %%mm3, %%mm5\n\t"
392 "psrlq $3, %%mm0\n\t"
393 "psrlq $3, %%mm3\n\t"
394 "pand %2, %%mm0\n\t"
395 "pand %2, %%mm3\n\t"
396 "psrlq $5, %%mm1\n\t"
397 "psrlq $5, %%mm4\n\t"
398 "pand %%mm6, %%mm1\n\t"
399 "pand %%mm6, %%mm4\n\t"
400 "psrlq $8, %%mm2\n\t"
401 "psrlq $8, %%mm5\n\t"
402 "pand %%mm7, %%mm2\n\t"
403 "pand %%mm7, %%mm5\n\t"
404 "por %%mm1, %%mm0\n\t"
405 "por %%mm4, %%mm3\n\t"
406 "por %%mm2, %%mm0\n\t"
407 "por %%mm5, %%mm3\n\t"
408 "psllq $16, %%mm3\n\t"
409 "por %%mm3, %%mm0\n\t"
410 MOVNTQ" %%mm0, %0\n\t"
411 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
412 d += 4;
413 s += 16;
414 }
aeae5d53 415#endif
0d9f3d85
A
416 __asm __volatile(SFENCE:::"memory");
417 __asm __volatile(EMMS:::"memory");
418#endif
53445e83
NK
419 while(s < end)
420 {
d07355da
RR
421 register int rgb = *(uint32_t*)s; s += 4;
422 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
53445e83 423 }
fcfbc150
MN
424}
425
7f526efd 426static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
ac4d0aea
MN
427{
428 const uint8_t *s = src;
429 const uint8_t *end;
430#ifdef HAVE_MMX
431 const uint8_t *mm_end;
432#endif
433 uint16_t *d = (uint16_t *)dst;
434 end = s + src_size;
435#ifdef HAVE_MMX
436 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
437 __asm __volatile(
438 "movq %0, %%mm7\n\t"
439 "movq %1, %%mm6\n\t"
440 ::"m"(red_16mask),"m"(green_16mask));
0598bcbb 441 mm_end = end - 15;
ac4d0aea
MN
442 while(s < mm_end)
443 {
444 __asm __volatile(
445 PREFETCH" 32%1\n\t"
446 "movd %1, %%mm0\n\t"
447 "movd 4%1, %%mm3\n\t"
448 "punpckldq 8%1, %%mm0\n\t"
449 "punpckldq 12%1, %%mm3\n\t"
450 "movq %%mm0, %%mm1\n\t"
451 "movq %%mm0, %%mm2\n\t"
452 "movq %%mm3, %%mm4\n\t"
453 "movq %%mm3, %%mm5\n\t"
454 "psllq $8, %%mm0\n\t"
455 "psllq $8, %%mm3\n\t"
456 "pand %%mm7, %%mm0\n\t"
457 "pand %%mm7, %%mm3\n\t"
458 "psrlq $5, %%mm1\n\t"
459 "psrlq $5, %%mm4\n\t"
460 "pand %%mm6, %%mm1\n\t"
461 "pand %%mm6, %%mm4\n\t"
462 "psrlq $19, %%mm2\n\t"
463 "psrlq $19, %%mm5\n\t"
464 "pand %2, %%mm2\n\t"
465 "pand %2, %%mm5\n\t"
466 "por %%mm1, %%mm0\n\t"
467 "por %%mm4, %%mm3\n\t"
468 "por %%mm2, %%mm0\n\t"
469 "por %%mm5, %%mm3\n\t"
470 "psllq $16, %%mm3\n\t"
471 "por %%mm3, %%mm0\n\t"
472 MOVNTQ" %%mm0, %0\n\t"
473 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
474 d += 4;
475 s += 16;
476 }
477 __asm __volatile(SFENCE:::"memory");
478 __asm __volatile(EMMS:::"memory");
479#endif
480 while(s < end)
481 {
1d773cfd
AC
482 register int rgb = *(uint32_t*)s; s += 4;
483 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
ac4d0aea
MN
484 }
485}
486
7f526efd 487static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
fcfbc150 488{
53445e83 489 const uint8_t *s = src;
0d9f3d85
A
490 const uint8_t *end;
491#ifdef HAVE_MMX
492 const uint8_t *mm_end;
493#endif
53445e83
NK
494 uint16_t *d = (uint16_t *)dst;
495 end = s + src_size;
0d9f3d85 496#ifdef HAVE_MMX
aeae5d53
MN
497 mm_end = end - 15;
498#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
499 asm volatile(
500 "movq %3, %%mm5 \n\t"
501 "movq %4, %%mm6 \n\t"
502 "movq %5, %%mm7 \n\t"
fac8012c 503 ASMALIGN16
aeae5d53
MN
504 "1: \n\t"
505 PREFETCH" 32(%1) \n\t"
506 "movd (%1), %%mm0 \n\t"
507 "movd 4(%1), %%mm3 \n\t"
508 "punpckldq 8(%1), %%mm0 \n\t"
509 "punpckldq 12(%1), %%mm3 \n\t"
510 "movq %%mm0, %%mm1 \n\t"
511 "movq %%mm3, %%mm4 \n\t"
512 "pand %%mm6, %%mm0 \n\t"
513 "pand %%mm6, %%mm3 \n\t"
514 "pmaddwd %%mm7, %%mm0 \n\t"
515 "pmaddwd %%mm7, %%mm3 \n\t"
516 "pand %%mm5, %%mm1 \n\t"
517 "pand %%mm5, %%mm4 \n\t"
518 "por %%mm1, %%mm0 \n\t"
519 "por %%mm4, %%mm3 \n\t"
520 "psrld $6, %%mm0 \n\t"
521 "pslld $10, %%mm3 \n\t"
522 "por %%mm3, %%mm0 \n\t"
523 MOVNTQ" %%mm0, (%0) \n\t"
6e1c66bc
AJ
524 "add $16, %1 \n\t"
525 "add $8, %0 \n\t"
526 "cmp %2, %1 \n\t"
aeae5d53
MN
527 " jb 1b \n\t"
528 : "+r" (d), "+r"(s)
529 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
530 );
531#else
53445e83
NK
532 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
533 __asm __volatile(
534 "movq %0, %%mm7\n\t"
535 "movq %1, %%mm6\n\t"
536 ::"m"(red_15mask),"m"(green_15mask));
537 while(s < mm_end)
538 {
539 __asm __volatile(
540 PREFETCH" 32%1\n\t"
541 "movd %1, %%mm0\n\t"
542 "movd 4%1, %%mm3\n\t"
543 "punpckldq 8%1, %%mm0\n\t"
544 "punpckldq 12%1, %%mm3\n\t"
545 "movq %%mm0, %%mm1\n\t"
546 "movq %%mm0, %%mm2\n\t"
547 "movq %%mm3, %%mm4\n\t"
548 "movq %%mm3, %%mm5\n\t"
549 "psrlq $3, %%mm0\n\t"
550 "psrlq $3, %%mm3\n\t"
551 "pand %2, %%mm0\n\t"
552 "pand %2, %%mm3\n\t"
553 "psrlq $6, %%mm1\n\t"
554 "psrlq $6, %%mm4\n\t"
555 "pand %%mm6, %%mm1\n\t"
556 "pand %%mm6, %%mm4\n\t"
557 "psrlq $9, %%mm2\n\t"
558 "psrlq $9, %%mm5\n\t"
559 "pand %%mm7, %%mm2\n\t"
560 "pand %%mm7, %%mm5\n\t"
561 "por %%mm1, %%mm0\n\t"
562 "por %%mm4, %%mm3\n\t"
563 "por %%mm2, %%mm0\n\t"
564 "por %%mm5, %%mm3\n\t"
565 "psllq $16, %%mm3\n\t"
566 "por %%mm3, %%mm0\n\t"
567 MOVNTQ" %%mm0, %0\n\t"
568 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
569 d += 4;
570 s += 16;
571 }
aeae5d53 572#endif
0d9f3d85
A
573 __asm __volatile(SFENCE:::"memory");
574 __asm __volatile(EMMS:::"memory");
575#endif
53445e83
NK
576 while(s < end)
577 {
1d773cfd
AC
578 register int rgb = *(uint32_t*)s; s += 4;
579 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
53445e83 580 }
fcfbc150
MN
581}
582
7f526efd 583static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
ac4d0aea
MN
584{
585 const uint8_t *s = src;
586 const uint8_t *end;
587#ifdef HAVE_MMX
588 const uint8_t *mm_end;
589#endif
590 uint16_t *d = (uint16_t *)dst;
591 end = s + src_size;
592#ifdef HAVE_MMX
593 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
594 __asm __volatile(
595 "movq %0, %%mm7\n\t"
596 "movq %1, %%mm6\n\t"
597 ::"m"(red_15mask),"m"(green_15mask));
0598bcbb 598 mm_end = end - 15;
ac4d0aea
MN
599 while(s < mm_end)
600 {
601 __asm __volatile(
602 PREFETCH" 32%1\n\t"
603 "movd %1, %%mm0\n\t"
604 "movd 4%1, %%mm3\n\t"
605 "punpckldq 8%1, %%mm0\n\t"
606 "punpckldq 12%1, %%mm3\n\t"
607 "movq %%mm0, %%mm1\n\t"
608 "movq %%mm0, %%mm2\n\t"
609 "movq %%mm3, %%mm4\n\t"
610 "movq %%mm3, %%mm5\n\t"
611 "psllq $7, %%mm0\n\t"
612 "psllq $7, %%mm3\n\t"
613 "pand %%mm7, %%mm0\n\t"
614 "pand %%mm7, %%mm3\n\t"
615 "psrlq $6, %%mm1\n\t"
616 "psrlq $6, %%mm4\n\t"
617 "pand %%mm6, %%mm1\n\t"
618 "pand %%mm6, %%mm4\n\t"
619 "psrlq $19, %%mm2\n\t"
620 "psrlq $19, %%mm5\n\t"
621 "pand %2, %%mm2\n\t"
622 "pand %2, %%mm5\n\t"
623 "por %%mm1, %%mm0\n\t"
624 "por %%mm4, %%mm3\n\t"
625 "por %%mm2, %%mm0\n\t"
626 "por %%mm5, %%mm3\n\t"
627 "psllq $16, %%mm3\n\t"
628 "por %%mm3, %%mm0\n\t"
629 MOVNTQ" %%mm0, %0\n\t"
630 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
631 d += 4;
632 s += 16;
633 }
634 __asm __volatile(SFENCE:::"memory");
635 __asm __volatile(EMMS:::"memory");
636#endif
637 while(s < end)
638 {
1d773cfd
AC
639 register int rgb = *(uint32_t*)s; s += 4;
640 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
ac4d0aea
MN
641 }
642}
643
7f526efd 644static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
996e1a7c 645{
3eb2151c 646 const uint8_t *s = src;
0d9f3d85
A
647 const uint8_t *end;
648#ifdef HAVE_MMX
649 const uint8_t *mm_end;
650#endif
90226a43 651 uint16_t *d = (uint16_t *)dst;
3eb2151c 652 end = s + src_size;
0d9f3d85 653#ifdef HAVE_MMX
0155db7f
NK
654 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
655 __asm __volatile(
656 "movq %0, %%mm7\n\t"
657 "movq %1, %%mm6\n\t"
53445e83 658 ::"m"(red_16mask),"m"(green_16mask));
d8dad2a5 659 mm_end = end - 11;
3eb2151c 660 while(s < mm_end)
0155db7f
NK
661 {
662 __asm __volatile(
663 PREFETCH" 32%1\n\t"
664 "movd %1, %%mm0\n\t"
3eb2151c
NK
665 "movd 3%1, %%mm3\n\t"
666 "punpckldq 6%1, %%mm0\n\t"
0155db7f
NK
667 "punpckldq 9%1, %%mm3\n\t"
668 "movq %%mm0, %%mm1\n\t"
669 "movq %%mm0, %%mm2\n\t"
670 "movq %%mm3, %%mm4\n\t"
671 "movq %%mm3, %%mm5\n\t"
672 "psrlq $3, %%mm0\n\t"
673 "psrlq $3, %%mm3\n\t"
3eb2151c
NK
674 "pand %2, %%mm0\n\t"
675 "pand %2, %%mm3\n\t"
676 "psrlq $5, %%mm1\n\t"
677 "psrlq $5, %%mm4\n\t"
678 "pand %%mm6, %%mm1\n\t"
679 "pand %%mm6, %%mm4\n\t"
680 "psrlq $8, %%mm2\n\t"
681 "psrlq $8, %%mm5\n\t"
682 "pand %%mm7, %%mm2\n\t"
683 "pand %%mm7, %%mm5\n\t"
0155db7f 684 "por %%mm1, %%mm0\n\t"
0155db7f 685 "por %%mm4, %%mm3\n\t"
3eb2151c 686 "por %%mm2, %%mm0\n\t"
0155db7f 687 "por %%mm5, %%mm3\n\t"
3eb2151c
NK
688 "psllq $16, %%mm3\n\t"
689 "por %%mm3, %%mm0\n\t"
0155db7f 690 MOVNTQ" %%mm0, %0\n\t"
53445e83 691 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
3eb2151c
NK
692 d += 4;
693 s += 12;
0155db7f 694 }
0d9f3d85
A
695 __asm __volatile(SFENCE:::"memory");
696 __asm __volatile(EMMS:::"memory");
697#endif
3eb2151c
NK
698 while(s < end)
699 {
700 const int b= *s++;
701 const int g= *s++;
702 const int r= *s++;
703 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
704 }
996e1a7c
NK
705}
706
7f526efd 707static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
ac4d0aea
MN
708{
709 const uint8_t *s = src;
710 const uint8_t *end;
711#ifdef HAVE_MMX
712 const uint8_t *mm_end;
713#endif
714 uint16_t *d = (uint16_t *)dst;
715 end = s + src_size;
716#ifdef HAVE_MMX
717 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
718 __asm __volatile(
719 "movq %0, %%mm7\n\t"
720 "movq %1, %%mm6\n\t"
721 ::"m"(red_16mask),"m"(green_16mask));
0598bcbb 722 mm_end = end - 15;
ac4d0aea
MN
723 while(s < mm_end)
724 {
725 __asm __volatile(
726 PREFETCH" 32%1\n\t"
727 "movd %1, %%mm0\n\t"
728 "movd 3%1, %%mm3\n\t"
729 "punpckldq 6%1, %%mm0\n\t"
730 "punpckldq 9%1, %%mm3\n\t"
731 "movq %%mm0, %%mm1\n\t"
732 "movq %%mm0, %%mm2\n\t"
733 "movq %%mm3, %%mm4\n\t"
734 "movq %%mm3, %%mm5\n\t"
735 "psllq $8, %%mm0\n\t"
736 "psllq $8, %%mm3\n\t"
737 "pand %%mm7, %%mm0\n\t"
738 "pand %%mm7, %%mm3\n\t"
739 "psrlq $5, %%mm1\n\t"
740 "psrlq $5, %%mm4\n\t"
741 "pand %%mm6, %%mm1\n\t"
742 "pand %%mm6, %%mm4\n\t"
743 "psrlq $19, %%mm2\n\t"
744 "psrlq $19, %%mm5\n\t"
745 "pand %2, %%mm2\n\t"
746 "pand %2, %%mm5\n\t"
747 "por %%mm1, %%mm0\n\t"
748 "por %%mm4, %%mm3\n\t"
749 "por %%mm2, %%mm0\n\t"
750 "por %%mm5, %%mm3\n\t"
751 "psllq $16, %%mm3\n\t"
752 "por %%mm3, %%mm0\n\t"
753 MOVNTQ" %%mm0, %0\n\t"
754 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
755 d += 4;
756 s += 12;
757 }
758 __asm __volatile(SFENCE:::"memory");
759 __asm __volatile(EMMS:::"memory");
760#endif
761 while(s < end)
762 {
763 const int r= *s++;
764 const int g= *s++;
765 const int b= *s++;
766 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
767 }
768}
769
7f526efd 770static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
996e1a7c 771{
53445e83 772 const uint8_t *s = src;
0d9f3d85
A
773 const uint8_t *end;
774#ifdef HAVE_MMX
775 const uint8_t *mm_end;
776#endif
53445e83
NK
777 uint16_t *d = (uint16_t *)dst;
778 end = s + src_size;
0d9f3d85 779#ifdef HAVE_MMX
53445e83
NK
780 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
781 __asm __volatile(
782 "movq %0, %%mm7\n\t"
783 "movq %1, %%mm6\n\t"
784 ::"m"(red_15mask),"m"(green_15mask));
d8dad2a5 785 mm_end = end - 11;
53445e83
NK
786 while(s < mm_end)
787 {
788 __asm __volatile(
789 PREFETCH" 32%1\n\t"
790 "movd %1, %%mm0\n\t"
791 "movd 3%1, %%mm3\n\t"
792 "punpckldq 6%1, %%mm0\n\t"
793 "punpckldq 9%1, %%mm3\n\t"
794 "movq %%mm0, %%mm1\n\t"
795 "movq %%mm0, %%mm2\n\t"
796 "movq %%mm3, %%mm4\n\t"
797 "movq %%mm3, %%mm5\n\t"
798 "psrlq $3, %%mm0\n\t"
799 "psrlq $3, %%mm3\n\t"
800 "pand %2, %%mm0\n\t"
801 "pand %2, %%mm3\n\t"
802 "psrlq $6, %%mm1\n\t"
803 "psrlq $6, %%mm4\n\t"
804 "pand %%mm6, %%mm1\n\t"
805 "pand %%mm6, %%mm4\n\t"
806 "psrlq $9, %%mm2\n\t"
807 "psrlq $9, %%mm5\n\t"
808 "pand %%mm7, %%mm2\n\t"
809 "pand %%mm7, %%mm5\n\t"
810 "por %%mm1, %%mm0\n\t"
811 "por %%mm4, %%mm3\n\t"
812 "por %%mm2, %%mm0\n\t"
813 "por %%mm5, %%mm3\n\t"
814 "psllq $16, %%mm3\n\t"
815 "por %%mm3, %%mm0\n\t"
816 MOVNTQ" %%mm0, %0\n\t"
817 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
818 d += 4;
819 s += 12;
820 }
0d9f3d85
A
821 __asm __volatile(SFENCE:::"memory");
822 __asm __volatile(EMMS:::"memory");
823#endif
53445e83
NK
824 while(s < end)
825 {
826 const int b= *s++;
827 const int g= *s++;
828 const int r= *s++;
829 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
830 }
0d9f3d85
A
831}
832
7f526efd 833static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
ac4d0aea
MN
834{
835 const uint8_t *s = src;
836 const uint8_t *end;
837#ifdef HAVE_MMX
838 const uint8_t *mm_end;
839#endif
840 uint16_t *d = (uint16_t *)dst;
841 end = s + src_size;
842#ifdef HAVE_MMX
843 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
844 __asm __volatile(
845 "movq %0, %%mm7\n\t"
846 "movq %1, %%mm6\n\t"
847 ::"m"(red_15mask),"m"(green_15mask));
0598bcbb 848 mm_end = end - 15;
ac4d0aea
MN
849 while(s < mm_end)
850 {
851 __asm __volatile(
852 PREFETCH" 32%1\n\t"
853 "movd %1, %%mm0\n\t"
854 "movd 3%1, %%mm3\n\t"
855 "punpckldq 6%1, %%mm0\n\t"
856 "punpckldq 9%1, %%mm3\n\t"
857 "movq %%mm0, %%mm1\n\t"
858 "movq %%mm0, %%mm2\n\t"
859 "movq %%mm3, %%mm4\n\t"
860 "movq %%mm3, %%mm5\n\t"
861 "psllq $7, %%mm0\n\t"
862 "psllq $7, %%mm3\n\t"
863 "pand %%mm7, %%mm0\n\t"
864 "pand %%mm7, %%mm3\n\t"
865 "psrlq $6, %%mm1\n\t"
866 "psrlq $6, %%mm4\n\t"
867 "pand %%mm6, %%mm1\n\t"
868 "pand %%mm6, %%mm4\n\t"
869 "psrlq $19, %%mm2\n\t"
870 "psrlq $19, %%mm5\n\t"
871 "pand %2, %%mm2\n\t"
872 "pand %2, %%mm5\n\t"
873 "por %%mm1, %%mm0\n\t"
874 "por %%mm4, %%mm3\n\t"
875 "por %%mm2, %%mm0\n\t"
876 "por %%mm5, %%mm3\n\t"
877 "psllq $16, %%mm3\n\t"
878 "por %%mm3, %%mm0\n\t"
879 MOVNTQ" %%mm0, %0\n\t"
880 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
881 d += 4;
882 s += 12;
883 }
884 __asm __volatile(SFENCE:::"memory");
885 __asm __volatile(EMMS:::"memory");
886#endif
887 while(s < end)
888 {
889 const int r= *s++;
890 const int g= *s++;
891 const int b= *s++;
892 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
893 }
894}
895
0d9f3d85
A
896/*
897 I use here less accurate approximation by simply
898 left-shifting the input
899 value and filling the low order bits with
900 zeroes. This method improves png's
901 compression but this scheme cannot reproduce white exactly, since it does not
902 generate an all-ones maximum value; the net effect is to darken the
903 image slightly.
904
905 The better method should be "left bit replication":
906
907 4 3 2 1 0
908 ---------
909 1 1 0 1 1
910
911 7 6 5 4 3 2 1 0
912 ----------------
913 1 1 0 1 1 1 1 0
914 |=======| |===|
915 | Leftmost Bits Repeated to Fill Open Bits
916 |
917 Original Bits
918*/
7f526efd 919static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
0d9f3d85
A
920{
921 const uint16_t *end;
922#ifdef HAVE_MMX
923 const uint16_t *mm_end;
924#endif
925 uint8_t *d = (uint8_t *)dst;
926 const uint16_t *s = (uint16_t *)src;
927 end = s + src_size/2;
928#ifdef HAVE_MMX
929 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
d8dad2a5 930 mm_end = end - 7;
0d9f3d85
A
931 while(s < mm_end)
932 {
933 __asm __volatile(
934 PREFETCH" 32%1\n\t"
935 "movq %1, %%mm0\n\t"
936 "movq %1, %%mm1\n\t"
937 "movq %1, %%mm2\n\t"
938 "pand %2, %%mm0\n\t"
939 "pand %3, %%mm1\n\t"
940 "pand %4, %%mm2\n\t"
941 "psllq $3, %%mm0\n\t"
942 "psrlq $2, %%mm1\n\t"
943 "psrlq $7, %%mm2\n\t"
944 "movq %%mm0, %%mm3\n\t"
945 "movq %%mm1, %%mm4\n\t"
946 "movq %%mm2, %%mm5\n\t"
947 "punpcklwd %5, %%mm0\n\t"
948 "punpcklwd %5, %%mm1\n\t"
949 "punpcklwd %5, %%mm2\n\t"
950 "punpckhwd %5, %%mm3\n\t"
951 "punpckhwd %5, %%mm4\n\t"
952 "punpckhwd %5, %%mm5\n\t"
953 "psllq $8, %%mm1\n\t"
954 "psllq $16, %%mm2\n\t"
955 "por %%mm1, %%mm0\n\t"
956 "por %%mm2, %%mm0\n\t"
957 "psllq $8, %%mm4\n\t"
958 "psllq $16, %%mm5\n\t"
959 "por %%mm4, %%mm3\n\t"
960 "por %%mm5, %%mm3\n\t"
961
962 "movq %%mm0, %%mm6\n\t"
963 "movq %%mm3, %%mm7\n\t"
964
965 "movq 8%1, %%mm0\n\t"
966 "movq 8%1, %%mm1\n\t"
967 "movq 8%1, %%mm2\n\t"
968 "pand %2, %%mm0\n\t"
969 "pand %3, %%mm1\n\t"
970 "pand %4, %%mm2\n\t"
971 "psllq $3, %%mm0\n\t"
972 "psrlq $2, %%mm1\n\t"
973 "psrlq $7, %%mm2\n\t"
974 "movq %%mm0, %%mm3\n\t"
975 "movq %%mm1, %%mm4\n\t"
976 "movq %%mm2, %%mm5\n\t"
977 "punpcklwd %5, %%mm0\n\t"
978 "punpcklwd %5, %%mm1\n\t"
979 "punpcklwd %5, %%mm2\n\t"
980 "punpckhwd %5, %%mm3\n\t"
981 "punpckhwd %5, %%mm4\n\t"
982 "punpckhwd %5, %%mm5\n\t"
983 "psllq $8, %%mm1\n\t"
984 "psllq $16, %%mm2\n\t"
985 "por %%mm1, %%mm0\n\t"
986 "por %%mm2, %%mm0\n\t"
987 "psllq $8, %%mm4\n\t"
988 "psllq $16, %%mm5\n\t"
989 "por %%mm4, %%mm3\n\t"
990 "por %%mm5, %%mm3\n\t"
991
992 :"=m"(*d)
993 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
994 :"memory");
995 /* Borrowed 32 to 24 */
996 __asm __volatile(
997 "movq %%mm0, %%mm4\n\t"
998 "movq %%mm3, %%mm5\n\t"
999 "movq %%mm6, %%mm0\n\t"
1000 "movq %%mm7, %%mm1\n\t"
1001
1002 "movq %%mm4, %%mm6\n\t"
1003 "movq %%mm5, %%mm7\n\t"
1004 "movq %%mm0, %%mm2\n\t"
1005 "movq %%mm1, %%mm3\n\t"
1006
1007 "psrlq $8, %%mm2\n\t"
1008 "psrlq $8, %%mm3\n\t"
1009 "psrlq $8, %%mm6\n\t"
1010 "psrlq $8, %%mm7\n\t"
1011 "pand %2, %%mm0\n\t"
1012 "pand %2, %%mm1\n\t"
1013 "pand %2, %%mm4\n\t"
1014 "pand %2, %%mm5\n\t"
1015 "pand %3, %%mm2\n\t"
1016 "pand %3, %%mm3\n\t"
1017 "pand %3, %%mm6\n\t"
1018 "pand %3, %%mm7\n\t"
1019 "por %%mm2, %%mm0\n\t"
1020 "por %%mm3, %%mm1\n\t"
1021 "por %%mm6, %%mm4\n\t"
1022 "por %%mm7, %%mm5\n\t"
1023
1024 "movq %%mm1, %%mm2\n\t"
1025 "movq %%mm4, %%mm3\n\t"
1026 "psllq $48, %%mm2\n\t"
1027 "psllq $32, %%mm3\n\t"
1028 "pand %4, %%mm2\n\t"
1029 "pand %5, %%mm3\n\t"
1030 "por %%mm2, %%mm0\n\t"
1031 "psrlq $16, %%mm1\n\t"
1032 "psrlq $32, %%mm4\n\t"
1033 "psllq $16, %%mm5\n\t"
1034 "por %%mm3, %%mm1\n\t"
1035 "pand %6, %%mm5\n\t"
1036 "por %%mm5, %%mm4\n\t"
1037
1038 MOVNTQ" %%mm0, %0\n\t"
1039 MOVNTQ" %%mm1, 8%0\n\t"
1040 MOVNTQ" %%mm4, 16%0"
1041
1042 :"=m"(*d)
1043 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1044 :"memory");
1045 d += 24;
1046 s += 8;
1047 }
53445e83
NK
1048 __asm __volatile(SFENCE:::"memory");
1049 __asm __volatile(EMMS:::"memory");
0d9f3d85
A
1050#endif
1051 while(s < end)
1052 {
1053 register uint16_t bgr;
1054 bgr = *s++;
1055 *d++ = (bgr&0x1F)<<3;
1056 *d++ = (bgr&0x3E0)>>2;
1057 *d++ = (bgr&0x7C00)>>7;
1058 }
1059}
1060
7f526efd 1061static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
0d9f3d85
A
1062{
1063 const uint16_t *end;
1064#ifdef HAVE_MMX
1065 const uint16_t *mm_end;
1066#endif
1067 uint8_t *d = (uint8_t *)dst;
1068 const uint16_t *s = (const uint16_t *)src;
1069 end = s + src_size/2;
1070#ifdef HAVE_MMX
1071 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
d8dad2a5 1072 mm_end = end - 7;
0d9f3d85
A
1073 while(s < mm_end)
1074 {
1075 __asm __volatile(
1076 PREFETCH" 32%1\n\t"
1077 "movq %1, %%mm0\n\t"
1078 "movq %1, %%mm1\n\t"
1079 "movq %1, %%mm2\n\t"
1080 "pand %2, %%mm0\n\t"
1081 "pand %3, %%mm1\n\t"
1082 "pand %4, %%mm2\n\t"
1083 "psllq $3, %%mm0\n\t"
1084 "psrlq $3, %%mm1\n\t"
1085 "psrlq $8, %%mm2\n\t"
1086 "movq %%mm0, %%mm3\n\t"
1087 "movq %%mm1, %%mm4\n\t"
1088 "movq %%mm2, %%mm5\n\t"
1089 "punpcklwd %5, %%mm0\n\t"
1090 "punpcklwd %5, %%mm1\n\t"
1091 "punpcklwd %5, %%mm2\n\t"
1092 "punpckhwd %5, %%mm3\n\t"
1093 "punpckhwd %5, %%mm4\n\t"
1094 "punpckhwd %5, %%mm5\n\t"
1095 "psllq $8, %%mm1\n\t"
1096 "psllq $16, %%mm2\n\t"
1097 "por %%mm1, %%mm0\n\t"
1098 "por %%mm2, %%mm0\n\t"
1099 "psllq $8, %%mm4\n\t"
1100 "psllq $16, %%mm5\n\t"
1101 "por %%mm4, %%mm3\n\t"
1102 "por %%mm5, %%mm3\n\t"
1103
1104 "movq %%mm0, %%mm6\n\t"
1105 "movq %%mm3, %%mm7\n\t"
1106
1107 "movq 8%1, %%mm0\n\t"
1108 "movq 8%1, %%mm1\n\t"
1109 "movq 8%1, %%mm2\n\t"
1110 "pand %2, %%mm0\n\t"
1111 "pand %3, %%mm1\n\t"
1112 "pand %4, %%mm2\n\t"
1113 "psllq $3, %%mm0\n\t"
1114 "psrlq $3, %%mm1\n\t"
1115 "psrlq $8, %%mm2\n\t"
1116 "movq %%mm0, %%mm3\n\t"
1117 "movq %%mm1, %%mm4\n\t"
1118 "movq %%mm2, %%mm5\n\t"
1119 "punpcklwd %5, %%mm0\n\t"
1120 "punpcklwd %5, %%mm1\n\t"
1121 "punpcklwd %5, %%mm2\n\t"
1122 "punpckhwd %5, %%mm3\n\t"
1123 "punpckhwd %5, %%mm4\n\t"
1124 "punpckhwd %5, %%mm5\n\t"
1125 "psllq $8, %%mm1\n\t"
1126 "psllq $16, %%mm2\n\t"
1127 "por %%mm1, %%mm0\n\t"
1128 "por %%mm2, %%mm0\n\t"
1129 "psllq $8, %%mm4\n\t"
1130 "psllq $16, %%mm5\n\t"
1131 "por %%mm4, %%mm3\n\t"
1132 "por %%mm5, %%mm3\n\t"
1133 :"=m"(*d)
1134 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1135 :"memory");
1136 /* Borrowed 32 to 24 */
1137 __asm __volatile(
1138 "movq %%mm0, %%mm4\n\t"
1139 "movq %%mm3, %%mm5\n\t"
1140 "movq %%mm6, %%mm0\n\t"
1141 "movq %%mm7, %%mm1\n\t"
1142
1143 "movq %%mm4, %%mm6\n\t"
1144 "movq %%mm5, %%mm7\n\t"
1145 "movq %%mm0, %%mm2\n\t"
1146 "movq %%mm1, %%mm3\n\t"
1147
1148 "psrlq $8, %%mm2\n\t"
1149 "psrlq $8, %%mm3\n\t"
1150 "psrlq $8, %%mm6\n\t"
1151 "psrlq $8, %%mm7\n\t"
1152 "pand %2, %%mm0\n\t"
1153 "pand %2, %%mm1\n\t"
1154 "pand %2, %%mm4\n\t"
1155 "pand %2, %%mm5\n\t"
1156 "pand %3, %%mm2\n\t"
1157 "pand %3, %%mm3\n\t"
1158 "pand %3, %%mm6\n\t"
1159 "pand %3, %%mm7\n\t"
1160 "por %%mm2, %%mm0\n\t"
1161 "por %%mm3, %%mm1\n\t"
1162 "por %%mm6, %%mm4\n\t"
1163 "por %%mm7, %%mm5\n\t"
1164
1165 "movq %%mm1, %%mm2\n\t"
1166 "movq %%mm4, %%mm3\n\t"
1167 "psllq $48, %%mm2\n\t"
1168 "psllq $32, %%mm3\n\t"
1169 "pand %4, %%mm2\n\t"
1170 "pand %5, %%mm3\n\t"
1171 "por %%mm2, %%mm0\n\t"
1172 "psrlq $16, %%mm1\n\t"
1173 "psrlq $32, %%mm4\n\t"
1174 "psllq $16, %%mm5\n\t"
1175 "por %%mm3, %%mm1\n\t"
1176 "pand %6, %%mm5\n\t"
1177 "por %%mm5, %%mm4\n\t"
1178
1179 MOVNTQ" %%mm0, %0\n\t"
1180 MOVNTQ" %%mm1, 8%0\n\t"
1181 MOVNTQ" %%mm4, 16%0"
1182
1183 :"=m"(*d)
1184 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1185 :"memory");
1186 d += 24;
1187 s += 8;
1188 }
1189 __asm __volatile(SFENCE:::"memory");
1190 __asm __volatile(EMMS:::"memory");
1191#endif
1192 while(s < end)
1193 {
1194 register uint16_t bgr;
1195 bgr = *s++;
1196 *d++ = (bgr&0x1F)<<3;
1197 *d++ = (bgr&0x7E0)>>3;
1198 *d++ = (bgr&0xF800)>>8;
1199 }
1200}
1201
7f526efd 1202static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
0d9f3d85
A
1203{
1204 const uint16_t *end;
1205#ifdef HAVE_MMX
1206 const uint16_t *mm_end;
1207#endif
1208 uint8_t *d = (uint8_t *)dst;
1209 const uint16_t *s = (const uint16_t *)src;
1210 end = s + src_size/2;
1211#ifdef HAVE_MMX
1212 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1213 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
d8dad2a5 1214 mm_end = end - 3;
0d9f3d85
A
1215 while(s < mm_end)
1216 {
1217 __asm __volatile(
1218 PREFETCH" 32%1\n\t"
1219 "movq %1, %%mm0\n\t"
1220 "movq %1, %%mm1\n\t"
1221 "movq %1, %%mm2\n\t"
1222 "pand %2, %%mm0\n\t"
1223 "pand %3, %%mm1\n\t"
1224 "pand %4, %%mm2\n\t"
1225 "psllq $3, %%mm0\n\t"
1226 "psrlq $2, %%mm1\n\t"
1227 "psrlq $7, %%mm2\n\t"
1228 "movq %%mm0, %%mm3\n\t"
1229 "movq %%mm1, %%mm4\n\t"
1230 "movq %%mm2, %%mm5\n\t"
1231 "punpcklwd %%mm7, %%mm0\n\t"
1232 "punpcklwd %%mm7, %%mm1\n\t"
1233 "punpcklwd %%mm7, %%mm2\n\t"
1234 "punpckhwd %%mm7, %%mm3\n\t"
1235 "punpckhwd %%mm7, %%mm4\n\t"
1236 "punpckhwd %%mm7, %%mm5\n\t"
1237 "psllq $8, %%mm1\n\t"
1238 "psllq $16, %%mm2\n\t"
1239 "por %%mm1, %%mm0\n\t"
1240 "por %%mm2, %%mm0\n\t"
1241 "psllq $8, %%mm4\n\t"
1242 "psllq $16, %%mm5\n\t"
1243 "por %%mm4, %%mm3\n\t"
1244 "por %%mm5, %%mm3\n\t"
1245 MOVNTQ" %%mm0, %0\n\t"
1246 MOVNTQ" %%mm3, 8%0\n\t"
1247 :"=m"(*d)
1248 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1249 :"memory");
1250 d += 16;
1251 s += 4;
1252 }
1253 __asm __volatile(SFENCE:::"memory");
1254 __asm __volatile(EMMS:::"memory");
1255#endif
1256 while(s < end)
996e1a7c 1257 {
deb2277c
MN
1258#if 0 //slightly slower on athlon
1259 int bgr= *s++;
1260 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1261#else
0d9f3d85
A
1262 register uint16_t bgr;
1263 bgr = *s++;
6cb38650
AB
1264#ifdef WORDS_BIGENDIAN
1265 *d++ = 0;
6cb38650 1266 *d++ = (bgr&0x7C00)>>7;
f688668c
AC
1267 *d++ = (bgr&0x3E0)>>2;
1268 *d++ = (bgr&0x1F)<<3;
6cb38650 1269#else
0d9f3d85
A
1270 *d++ = (bgr&0x1F)<<3;
1271 *d++ = (bgr&0x3E0)>>2;
1272 *d++ = (bgr&0x7C00)>>7;
1273 *d++ = 0;
deb2277c 1274#endif
6cb38650
AB
1275
1276#endif
0d9f3d85
A
1277 }
1278}
996e1a7c 1279
7f526efd 1280static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
0d9f3d85
A
1281{
1282 const uint16_t *end;
1283#ifdef HAVE_MMX
1284 const uint16_t *mm_end;
1285#endif
1286 uint8_t *d = (uint8_t *)dst;
1287 const uint16_t *s = (uint16_t *)src;
1288 end = s + src_size/2;
1289#ifdef HAVE_MMX
1290 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1291 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
d8dad2a5 1292 mm_end = end - 3;
0d9f3d85
A
1293 while(s < mm_end)
1294 {
1295 __asm __volatile(
1296 PREFETCH" 32%1\n\t"
1297 "movq %1, %%mm0\n\t"
1298 "movq %1, %%mm1\n\t"
1299 "movq %1, %%mm2\n\t"
1300 "pand %2, %%mm0\n\t"
1301 "pand %3, %%mm1\n\t"
1302 "pand %4, %%mm2\n\t"
1303 "psllq $3, %%mm0\n\t"
1304 "psrlq $3, %%mm1\n\t"
1305 "psrlq $8, %%mm2\n\t"
1306 "movq %%mm0, %%mm3\n\t"
1307 "movq %%mm1, %%mm4\n\t"
1308 "movq %%mm2, %%mm5\n\t"
1309 "punpcklwd %%mm7, %%mm0\n\t"
1310 "punpcklwd %%mm7, %%mm1\n\t"
1311 "punpcklwd %%mm7, %%mm2\n\t"
1312 "punpckhwd %%mm7, %%mm3\n\t"
1313 "punpckhwd %%mm7, %%mm4\n\t"
1314 "punpckhwd %%mm7, %%mm5\n\t"
1315 "psllq $8, %%mm1\n\t"
1316 "psllq $16, %%mm2\n\t"
1317 "por %%mm1, %%mm0\n\t"
1318 "por %%mm2, %%mm0\n\t"
1319 "psllq $8, %%mm4\n\t"
1320 "psllq $16, %%mm5\n\t"
1321 "por %%mm4, %%mm3\n\t"
1322 "por %%mm5, %%mm3\n\t"
1323 MOVNTQ" %%mm0, %0\n\t"
1324 MOVNTQ" %%mm3, 8%0\n\t"
1325 :"=m"(*d)
1326 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1327 :"memory");
1328 d += 16;
1329 s += 4;
996e1a7c 1330 }
0d9f3d85
A
1331 __asm __volatile(SFENCE:::"memory");
1332 __asm __volatile(EMMS:::"memory");
53445e83 1333#endif
0d9f3d85
A
1334 while(s < end)
1335 {
1336 register uint16_t bgr;
1337 bgr = *s++;
6cb38650
AB
1338#ifdef WORDS_BIGENDIAN
1339 *d++ = 0;
6cb38650 1340 *d++ = (bgr&0xF800)>>8;
f688668c
AC
1341 *d++ = (bgr&0x7E0)>>3;
1342 *d++ = (bgr&0x1F)<<3;
6cb38650 1343#else
0d9f3d85
A
1344 *d++ = (bgr&0x1F)<<3;
1345 *d++ = (bgr&0x7E0)>>3;
1346 *d++ = (bgr&0xF800)>>8;
1347 *d++ = 0;
6cb38650 1348#endif
0d9f3d85 1349 }
996e1a7c 1350}
fcfbc150 1351
7f526efd 1352static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
99969243 1353{
99969243 1354#ifdef HAVE_MMX
0d9f3d85 1355/* TODO: unroll this loop */
99969243 1356 asm volatile (
6e1c66bc 1357 "xor %%"REG_a", %%"REG_a" \n\t"
fac8012c 1358 ASMALIGN16
99969243 1359 "1: \n\t"
6e1c66bc
AJ
1360 PREFETCH" 32(%0, %%"REG_a") \n\t"
1361 "movq (%0, %%"REG_a"), %%mm0 \n\t"
99969243
MN
1362 "movq %%mm0, %%mm1 \n\t"
1363 "movq %%mm0, %%mm2 \n\t"
1364 "pslld $16, %%mm0 \n\t"
1365 "psrld $16, %%mm1 \n\t"
0d9f3d85
A
1366 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1367 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1368 "pand "MANGLE(mask32b)", %%mm1 \n\t"
99969243
MN
1369 "por %%mm0, %%mm2 \n\t"
1370 "por %%mm1, %%mm2 \n\t"
6e1c66bc
AJ
1371 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1372 "add $8, %%"REG_a" \n\t"
1373 "cmp %2, %%"REG_a" \n\t"
99969243 1374 " jb 1b \n\t"
7f526efd 1375 :: "r" (src), "r"(dst), "r" (src_size-7)
6e1c66bc 1376 : "%"REG_a
99969243 1377 );
9395185f
MN
1378
1379 __asm __volatile(SFENCE:::"memory");
1380 __asm __volatile(EMMS:::"memory");
99969243 1381#else
0d9f3d85
A
1382 unsigned i;
1383 unsigned num_pixels = src_size >> 2;
99969243
MN
1384 for(i=0; i<num_pixels; i++)
1385 {
d7b8e4b6
MN
1386#ifdef WORDS_BIGENDIAN
1387 dst[4*i + 1] = src[4*i + 3];
1388 dst[4*i + 2] = src[4*i + 2];
1389 dst[4*i + 3] = src[4*i + 1];
1390#else
1391 dst[4*i + 0] = src[4*i + 2];
1392 dst[4*i + 1] = src[4*i + 1];
1393 dst[4*i + 2] = src[4*i + 0];
1394#endif
99969243
MN
1395 }
1396#endif
1397}
1398
7f526efd 1399static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
74d35835 1400{
0d9f3d85 1401 unsigned i;
74d35835 1402#ifdef HAVE_MMX
6e1c66bc 1403 long mmx_size= 23 - src_size;
74d35835
MN
1404 asm volatile (
1405 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1406 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1407 "movq "MANGLE(mask24b)", %%mm7 \n\t"
fac8012c 1408 ASMALIGN16
74d35835 1409 "1: \n\t"
6e1c66bc
AJ
1410 PREFETCH" 32(%1, %%"REG_a") \n\t"
1411 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1412 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1413 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
74d35835
MN
1414 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1415 "pand %%mm5, %%mm0 \n\t"
1416 "pand %%mm6, %%mm1 \n\t"
1417 "pand %%mm7, %%mm2 \n\t"
1418 "por %%mm0, %%mm1 \n\t"
1419 "por %%mm2, %%mm1 \n\t"
6e1c66bc
AJ
1420 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1421 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1422 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1423 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
74d35835
MN
1424 "pand %%mm7, %%mm0 \n\t"
1425 "pand %%mm5, %%mm1 \n\t"
1426 "pand %%mm6, %%mm2 \n\t"
1427 "por %%mm0, %%mm1 \n\t"
1428 "por %%mm2, %%mm1 \n\t"
6e1c66bc
AJ
1429 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1430 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1431 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1432 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
74d35835
MN
1433 "pand %%mm6, %%mm0 \n\t"
1434 "pand %%mm7, %%mm1 \n\t"
1435 "pand %%mm5, %%mm2 \n\t"
1436 "por %%mm0, %%mm1 \n\t"
1437 "por %%mm2, %%mm1 \n\t"
6e1c66bc
AJ
1438 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1439 "add $24, %%"REG_a" \n\t"
74d35835
MN
1440 " js 1b \n\t"
1441 : "+a" (mmx_size)
1442 : "r" (src-mmx_size), "r"(dst-mmx_size)
1443 );
1444
1445 __asm __volatile(SFENCE:::"memory");
1446 __asm __volatile(EMMS:::"memory");
1447
218ad65d 1448 if(mmx_size==23) return; //finihsed, was multiple of 8
0d9f3d85 1449
74d35835
MN
1450 src+= src_size;
1451 dst+= src_size;
0d9f3d85 1452 src_size= 23-mmx_size;
74d35835
MN
1453 src-= src_size;
1454 dst-= src_size;
1455#endif
1456 for(i=0; i<src_size; i+=3)
1457 {
0d9f3d85 1458 register uint8_t x;
74d35835
MN
1459 x = src[i + 2];
1460 dst[i + 1] = src[i + 1];
1461 dst[i + 2] = src[i + 0];
1462 dst[i + 0] = x;
1463 }
1464}
1465
b1ec5875 1466static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
7f526efd
RD
1467 long width, long height,
1468 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
d9d58d17 1469{
7f526efd
RD
1470 long y;
1471 const long chromWidth= width>>1;
42b5fcb8
MN
1472 for(y=0; y<height; y++)
1473 {
4060205b 1474#ifdef HAVE_MMX
42b5fcb8
MN
1475//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1476 asm volatile(
6e1c66bc 1477 "xor %%"REG_a", %%"REG_a" \n\t"
fac8012c 1478 ASMALIGN16
42b5fcb8 1479 "1: \n\t"
6e1c66bc
AJ
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1481 PREFETCH" 32(%2, %%"REG_a") \n\t"
1482 PREFETCH" 32(%3, %%"REG_a") \n\t"
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
42b5fcb8 1484 "movq %%mm0, %%mm2 \n\t" // U(0)
6e1c66bc 1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
42b5fcb8
MN
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
4060205b 1488
6e1c66bc
AJ
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
42b5fcb8
MN
1491 "movq %%mm3, %%mm4 \n\t" // Y(0)
1492 "movq %%mm5, %%mm6 \n\t" // Y(8)
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
4060205b 1497
6e1c66bc
AJ
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
4060205b 1502
6e1c66bc
AJ
1503 "add $8, %%"REG_a" \n\t"
1504 "cmp %4, %%"REG_a" \n\t"
42b5fcb8 1505 " jb 1b \n\t"
7f526efd 1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
6e1c66bc 1507 : "%"REG_a
42b5fcb8 1508 );
4060205b 1509#else
b3b8bf64
MN
1510
1511#if defined ARCH_ALPHA && defined HAVE_MVI
1512#define pl2yuy2(n) \
1513 y1 = yc[n]; \
1514 y2 = yc2[n]; \
1515 u = uc[n]; \
1516 v = vc[n]; \
1517 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1518 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1519 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1520 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1521 yuv1 = (u << 8) + (v << 24); \
1522 yuv2 = yuv1 + y2; \
1523 yuv1 += y1; \
1524 qdst[n] = yuv1; \
1525 qdst2[n] = yuv2;
1526
1527 int i;
1528 uint64_t *qdst = (uint64_t *) dst;
1529 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1530 const uint32_t *yc = (uint32_t *) ysrc;
1531 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1532 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1533 for(i = 0; i < chromWidth; i += 8){
1534 uint64_t y1, y2, yuv1, yuv2;
1535 uint64_t u, v;
1536 /* Prefetch */
1537 asm("ldq $31,64(%0)" :: "r"(yc));
1538 asm("ldq $31,64(%0)" :: "r"(yc2));
1539 asm("ldq $31,64(%0)" :: "r"(uc));
1540 asm("ldq $31,64(%0)" :: "r"(vc));
1541
1542 pl2yuy2(0);
1543 pl2yuy2(1);
1544 pl2yuy2(2);
1545 pl2yuy2(3);
1546
1547 yc += 4;
1548 yc2 += 4;
1549 uc += 4;
1550 vc += 4;
1551 qdst += 4;
1552 qdst2 += 4;
1553 }
1554 y++;
1555 ysrc += lumStride;
1556 dst += dstStride;
1557
1558#elif __WORDSIZE >= 64
42b5fcb8 1559 int i;
0d9f3d85
A
1560 uint64_t *ldst = (uint64_t *) dst;
1561 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1562 for(i = 0; i < chromWidth; i += 2){
1563 uint64_t k, l;
1564 k = yc[0] + (uc[0] << 8) +
1565 (yc[1] << 16) + (vc[0] << 24);
1566 l = yc[2] + (uc[1] << 8) +
1567 (yc[3] << 16) + (vc[1] << 24);
1568 *ldst++ = k + (l << 32);
1569 yc += 4;
1570 uc += 2;
1571 vc += 2;
42b5fcb8 1572 }
0d9f3d85
A
1573
1574#else
1575 int i, *idst = (int32_t *) dst;
1576 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1577 for(i = 0; i < chromWidth; i++){
da7f8893
MN
1578#ifdef WORDS_BIGENDIAN
1579 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1580 (yc[1] << 8) + (vc[0] << 0);
1581#else
0d9f3d85
A
1582 *idst++ = yc[0] + (uc[0] << 8) +
1583 (yc[1] << 16) + (vc[0] << 24);
da7f8893 1584#endif
0d9f3d85
A
1585 yc += 2;
1586 uc++;
1587 vc++;
1588 }
1589#endif
42b5fcb8 1590#endif
b1ec5875 1591 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
42b5fcb8
MN
1592 {
1593 usrc += chromStride;
1594 vsrc += chromStride;
1595 }
1596 ysrc += lumStride;
1597 dst += dstStride;
d9d58d17 1598 }
42b5fcb8
MN
1599#ifdef HAVE_MMX
1600asm( EMMS" \n\t"
1601 SFENCE" \n\t"
1602 :::"memory");
4060205b 1603#endif
d9d58d17
MN
1604}
1605
dabcdbc4
MN
1606/**
1607 *
1608 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1609 * problem for anyone then tell me, and ill fix it)
1610 */
b1ec5875 1611static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
7f526efd
RD
1612 long width, long height,
1613 long lumStride, long chromStride, long dstStride)
b1ec5875
MN
1614{
1615 //FIXME interpolate chroma
1616 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
caeaabe7
AB
1617}
1618
1619static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
7f526efd
RD
1620 long width, long height,
1621 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
caeaabe7 1622{
7f526efd
RD
1623 long y;
1624 const long chromWidth= width>>1;
caeaabe7
AB
1625 for(y=0; y<height; y++)
1626 {
7ac25f2d
MN
1627#ifdef HAVE_MMX
1628//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1629 asm volatile(
6e1c66bc 1630 "xor %%"REG_a", %%"REG_a" \n\t"
fac8012c 1631 ASMALIGN16
7ac25f2d 1632 "1: \n\t"
6e1c66bc
AJ
1633 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1634 PREFETCH" 32(%2, %%"REG_a") \n\t"
1635 PREFETCH" 32(%3, %%"REG_a") \n\t"
1636 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
7ac25f2d 1637 "movq %%mm0, %%mm2 \n\t" // U(0)
6e1c66bc 1638 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
7ac25f2d
MN
1639 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1640 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1641
6e1c66bc
AJ
1642 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1643 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
7ac25f2d
MN
1644 "movq %%mm0, %%mm4 \n\t" // Y(0)
1645 "movq %%mm2, %%mm6 \n\t" // Y(8)
1646 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1647 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1648 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1649 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1650
6e1c66bc
AJ
1651 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1652 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1653 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1654 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
7ac25f2d 1655
6e1c66bc
AJ
1656 "add $8, %%"REG_a" \n\t"
1657 "cmp %4, %%"REG_a" \n\t"
7ac25f2d 1658 " jb 1b \n\t"
7f526efd 1659 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
6e1c66bc 1660 : "%"REG_a
7ac25f2d
MN
1661 );
1662#else
1663//FIXME adapt the alpha asm code from yv12->yuy2
1664
caeaabe7
AB
1665#if __WORDSIZE >= 64
1666 int i;
1667 uint64_t *ldst = (uint64_t *) dst;
1668 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1669 for(i = 0; i < chromWidth; i += 2){
1670 uint64_t k, l;
1671 k = uc[0] + (yc[0] << 8) +
1672 (vc[0] << 16) + (yc[1] << 24);
1673 l = uc[1] + (yc[2] << 8) +
1674 (vc[1] << 16) + (yc[3] << 24);
1675 *ldst++ = k + (l << 32);
1676 yc += 4;
1677 uc += 2;
1678 vc += 2;
1679 }
1680
1681#else
1682 int i, *idst = (int32_t *) dst;
1683 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1684 for(i = 0; i < chromWidth; i++){
da7f8893
MN
1685#ifdef WORDS_BIGENDIAN
1686 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1687 (vc[0] << 8) + (yc[1] << 0);
1688#else
caeaabe7
AB
1689 *idst++ = uc[0] + (yc[0] << 8) +
1690 (vc[0] << 16) + (yc[1] << 24);
da7f8893 1691#endif
caeaabe7
AB
1692 yc += 2;
1693 uc++;
1694 vc++;
1695 }
1696#endif
7ac25f2d 1697#endif
caeaabe7
AB
1698 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1699 {
1700 usrc += chromStride;
1701 vsrc += chromStride;
1702 }
1703 ysrc += lumStride;
1704 dst += dstStride;
1705 }
7ac25f2d
MN
1706#ifdef HAVE_MMX
1707asm( EMMS" \n\t"
1708 SFENCE" \n\t"
1709 :::"memory");
1710#endif
caeaabe7
AB
1711}
1712
1713/**
1714 *
1715 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1716 * problem for anyone then tell me, and ill fix it)
1717 */
1718static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
7f526efd
RD
1719 long width, long height,
1720 long lumStride, long chromStride, long dstStride)
caeaabe7
AB
1721{
1722 //FIXME interpolate chroma
1723 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
b1ec5875
MN
1724}
1725
1726/**
1727 *
1728 * width should be a multiple of 16
1729 */
1730static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
7f526efd
RD
1731 long width, long height,
1732 long lumStride, long chromStride, long dstStride)
b1ec5875
MN
1733{
1734 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1735}
1736
1737/**
1738 *
1739 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1740 * problem for anyone then tell me, and ill fix it)
1741 */
1de97d84 1742static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
7f526efd
RD
1743 long width, long height,
1744 long lumStride, long chromStride, long srcStride)
d9d58d17 1745{
7f526efd
RD
1746 long y;
1747 const long chromWidth= width>>1;
dabcdbc4
MN
1748 for(y=0; y<height; y+=2)
1749 {
bd09433f 1750#ifdef HAVE_MMX
dabcdbc4 1751 asm volatile(
6e1c66bc 1752 "xor %%"REG_a", %%"REG_a" \n\t"
dabcdbc4
MN
1753 "pcmpeqw %%mm7, %%mm7 \n\t"
1754 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
fac8012c 1755 ASMALIGN16
dabcdbc4 1756 "1: \n\t"
6e1c66bc
AJ
1757 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1758 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1759 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
dabcdbc4
MN
1760 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1761 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1762 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1763 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1766 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1768
6e1c66bc 1769 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
dabcdbc4 1770
6e1c66bc
AJ
1771 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1772 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
dabcdbc4
MN
1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1774 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1776 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1778 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1779 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1780 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1781
6e1c66bc 1782 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
dabcdbc4
MN
1783
1784 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1785 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1786 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1787 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1788 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1789 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1790 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1791 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1792
6e1c66bc
AJ
1793 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1794 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
dabcdbc4 1795
6e1c66bc
AJ
1796 "add $8, %%"REG_a" \n\t"
1797 "cmp %4, %%"REG_a" \n\t"
dabcdbc4 1798 " jb 1b \n\t"
7f526efd 1799 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
6e1c66bc 1800 : "memory", "%"REG_a
cfc15dc6 1801 );
dabcdbc4 1802
ed346065
MN
1803 ydst += lumStride;
1804 src += srcStride;
1805
cfc15dc6 1806 asm volatile(
6e1c66bc 1807 "xor %%"REG_a", %%"REG_a" \n\t"
fac8012c 1808 ASMALIGN16
dabcdbc4 1809 "1: \n\t"
6e1c66bc
AJ
1810 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1811 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1812 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1813 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1814 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
dabcdbc4
MN
1815 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1816 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1817 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1818 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1819 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1821
6e1c66bc
AJ
1822 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1823 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
dabcdbc4 1824
6e1c66bc
AJ
1825 "add $8, %%"REG_a" \n\t"
1826 "cmp %4, %%"REG_a" \n\t"
dabcdbc4
MN
1827 " jb 1b \n\t"
1828
7f526efd 1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
6e1c66bc 1830 : "memory", "%"REG_a
dabcdbc4 1831 );
bd09433f 1832#else
7f526efd 1833 long i;
dabcdbc4
MN
1834 for(i=0; i<chromWidth; i++)
1835 {
1836 ydst[2*i+0] = src[4*i+0];
1837 udst[i] = src[4*i+1];
1838 ydst[2*i+1] = src[4*i+2];
1839 vdst[i] = src[4*i+3];
1840 }
1841 ydst += lumStride;
1842 src += srcStride;
1843
1844 for(i=0; i<chromWidth; i++)
1845 {
1846 ydst[2*i+0] = src[4*i+0];
1847 ydst[2*i+1] = src[4*i+2];
1848 }
1849#endif
1850 udst += chromStride;
1851 vdst += chromStride;
1852 ydst += lumStride;
1853 src += srcStride;
d9d58d17 1854 }
dabcdbc4 1855#ifdef HAVE_MMX
ed8c0670
MN
1856asm volatile( EMMS" \n\t"
1857 SFENCE" \n\t"
1858 :::"memory");
bd09433f 1859#endif
42b5fcb8 1860}
81c0590e 1861
d661d18d
AB
1862static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1863 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
7f526efd 1864 long width, long height, long lumStride, long chromStride)
d661d18d
AB
1865{
1866 /* Y Plane */
1867 memcpy(ydst, ysrc, width*height);
1868
1869 /* XXX: implement upscaling for U,V */
1870}
1871
7f526efd 1872static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
b241cbf2 1873{
7f526efd 1874 long x,y;
b241cbf2 1875
b2609d4c
MN
1876 dst[0]= src[0];
1877
b241cbf2 1878 // first line
b2609d4c
MN
1879 for(x=0; x<srcWidth-1; x++){
1880 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1881 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
b241cbf2 1882 }
b2609d4c
MN
1883 dst[2*srcWidth-1]= src[srcWidth-1];
1884
1885 dst+= dstStride;
b241cbf2
MN
1886
1887 for(y=1; y<srcHeight; y++){
1888#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
6e1c66bc 1889 const long mmxSize= srcWidth&~15;
b241cbf2 1890 asm volatile(
6e1c66bc 1891 "mov %4, %%"REG_a" \n\t"
b241cbf2 1892 "1: \n\t"
6e1c66bc
AJ
1893 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1894 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1895 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1896 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1897 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1898 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
b2609d4c
MN
1899 PAVGB" %%mm0, %%mm5 \n\t"
1900 PAVGB" %%mm0, %%mm3 \n\t"
1901 PAVGB" %%mm0, %%mm5 \n\t"
1902 PAVGB" %%mm0, %%mm3 \n\t"
1903 PAVGB" %%mm1, %%mm4 \n\t"
1904 PAVGB" %%mm1, %%mm2 \n\t"
1905 PAVGB" %%mm1, %%mm4 \n\t"
1906 PAVGB" %%mm1, %%mm2 \n\t"
1907 "movq %%mm5, %%mm7 \n\t"
1908 "movq %%mm4, %%mm6 \n\t"
1909 "punpcklbw %%mm3, %%mm5 \n\t"
1910 "punpckhbw %%mm3, %%mm7 \n\t"
1911 "punpcklbw %%mm2, %%mm4 \n\t"
1912 "punpckhbw %%mm2, %%mm6 \n\t"
b241cbf2 1913#if 1
6e1c66bc
AJ
1914 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1915 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1916 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1917 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
b241cbf2 1918#else
6e1c66bc
AJ
1919 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1920 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1921 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1922 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
b241cbf2 1923#endif
6e1c66bc 1924 "add $8, %%"REG_a" \n\t"
b241cbf2 1925 " js 1b \n\t"
b2609d4c 1926 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
b241cbf2
MN
1927 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1928 "g" (-mmxSize)
6e1c66bc 1929 : "%"REG_a
b241cbf2
MN
1930
1931 );
b241cbf2 1932#else
7f526efd 1933 const long mmxSize=1;
b2609d4c
MN
1934#endif
1935 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1936 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
b241cbf2 1937
b2609d4c 1938 for(x=mmxSize-1; x<srcWidth-1; x++){
b241cbf2
MN
1939 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1940 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1941 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1942 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1943 }
b2609d4c
MN
1944 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1945 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
b241cbf2
MN
1946
1947 dst+=dstStride*2;
1948 src+=srcStride;
1949 }
b241cbf2
MN
1950
1951 // last line
b2609d4c
MN
1952#if 1
1953 dst[0]= src[0];
1954
1955 for(x=0; x<srcWidth-1; x++){
1956 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1957 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1958 }
1959 dst[2*srcWidth-1]= src[srcWidth-1];
1960#else
b241cbf2
MN
1961 for(x=0; x<srcWidth; x++){
1962 dst[2*x+0]=
1963 dst[2*x+1]= src[x];
1964 }
b2609d4c
MN
1965#endif
1966
b241cbf2
MN
1967#ifdef HAVE_MMX
1968asm volatile( EMMS" \n\t"
1969 SFENCE" \n\t"
1970 :::"memory");
1971#endif
1972}
1973
81c0590e
A
1974/**
1975 *
1976 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1977 * problem for anyone then tell me, and ill fix it)
1de97d84 1978 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
81c0590e 1979 */
1de97d84 1980static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
7f526efd
RD
1981 long width, long height,
1982 long lumStride, long chromStride, long srcStride)
81c0590e 1983{
7f526efd
RD
1984 long y;
1985 const long chromWidth= width>>1;
81c0590e
A
1986 for(y=0; y<height; y+=2)
1987 {
ed8c0670
MN
1988#ifdef HAVE_MMX
1989 asm volatile(
1990 "xorl %%eax, %%eax \n\t"
1991 "pcmpeqw %%mm7, %%mm7 \n\t"
1992 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
fac8012c 1993 ASMALIGN16
ed8c0670
MN
1994 "1: \n\t"
1995 PREFETCH" 64(%0, %%eax, 4) \n\t"
1996 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1997 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1998 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1999 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2000 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2001 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2004 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2006
2007 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2008
2009 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2010 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2011 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2012 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2013 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2014 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2016 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2017 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2018 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2019
2020 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2021
2022 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2023 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2024 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2025 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2026 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2027 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2028 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2029 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2030
2031 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2032 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2033
2034 "addl $8, %%eax \n\t"
2035 "cmpl %4, %%eax \n\t"
2036 " jb 1b \n\t"
4596673c 2037 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
ed8c0670
MN
2038 : "memory", "%eax"
2039 );
2040
2041 ydst += lumStride;
2042 src += srcStride;
2043
2044 asm volatile(
2045 "xorl %%eax, %%eax \n\t"
fac8012c 2046 ASMALIGN16
ed8c0670
MN
2047 "1: \n\t"
2048 PREFETCH" 64(%0, %%eax, 4) \n\t"
2049 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2050 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2051 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2052 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2053 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2054 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2055 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2056 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2057 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2059
2060 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2061 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2062
2063 "addl $8, %%eax \n\t"
2064 "cmpl %4, %%eax \n\t"
2065 " jb 1b \n\t"
2066
4596673c 2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
ed8c0670
MN
2068 : "memory", "%eax"
2069 );
2070#else
7f526efd 2071 long i;
81c0590e
A
2072 for(i=0; i<chromWidth; i++)
2073 {
2074 udst[i] = src[4*i+0];
2075 ydst[2*i+0] = src[4*i+1];
2076 vdst[i] = src[4*i+2];
2077 ydst[2*i+1] = src[4*i+3];
2078 }
2079 ydst += lumStride;
2080 src += srcStride;
2081
2082 for(i=0; i<chromWidth; i++)
2083 {
2084 ydst[2*i+0] = src[4*i+1];
2085 ydst[2*i+1] = src[4*i+3];
2086 }
ed8c0670 2087#endif
81c0590e
A
2088 udst += chromStride;
2089 vdst += chromStride;
2090 ydst += lumStride;
2091 src += srcStride;
2092 }
ed8c0670
MN
2093#ifdef HAVE_MMX
2094asm volatile( EMMS" \n\t"
2095 SFENCE" \n\t"
2096 :::"memory");
2097#endif
81c0590e
A
2098}
2099
1de97d84
MN
2100/**
2101 *
2102 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2103 * problem for anyone then tell me, and ill fix it)
21316f3c 2104 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME: write HQ version
1de97d84
MN
2105 */
2106static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
7f526efd
RD
2107 long width, long height,
2108 long lumStride, long chromStride, long srcStride)
1de97d84 2109{
7f526efd
RD
2110 long y;
2111 const long chromWidth= width>>1;
21316f3c
MN
2112#ifdef HAVE_MMX
2113 for(y=0; y<height-2; y+=2)
2114 {
7f526efd 2115 long i;
21316f3c
MN
2116 for(i=0; i<2; i++)
2117 {
2118 asm volatile(
6e1c66bc 2119 "mov %2, %%"REG_a" \n\t"
854288bb
FB
2120 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2121 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c 2122 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 2123 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
fac8012c 2124 ASMALIGN16
21316f3c 2125 "1: \n\t"
6e1c66bc
AJ
2126 PREFETCH" 64(%0, %%"REG_b") \n\t"
2127 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2128 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
21316f3c
MN
2129 "punpcklbw %%mm7, %%mm0 \n\t"
2130 "punpcklbw %%mm7, %%mm1 \n\t"
6e1c66bc
AJ
2131 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2132 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2133 "punpcklbw %%mm7, %%mm2 \n\t"
2134 "punpcklbw %%mm7, %%mm3 \n\t"
2135 "pmaddwd %%mm6, %%mm0 \n\t"
2136 "pmaddwd %%mm6, %%mm1 \n\t"
2137 "pmaddwd %%mm6, %%mm2 \n\t"
2138 "pmaddwd %%mm6, %%mm3 \n\t"
2139#ifndef FAST_BGR2YV12
2140 "psrad $8, %%mm0 \n\t"
2141 "psrad $8, %%mm1 \n\t"
2142 "psrad $8, %%mm2 \n\t"
2143 "psrad $8, %%mm3 \n\t"
2144#endif
2145 "packssdw %%mm1, %%mm0 \n\t"
2146 "packssdw %%mm3, %%mm2 \n\t"
2147 "pmaddwd %%mm5, %%mm0 \n\t"
2148 "pmaddwd %%mm5, %%mm2 \n\t"
2149 "packssdw %%mm2, %%mm0 \n\t"
2150 "psraw $7, %%mm0 \n\t"
2151
6e1c66bc
AJ
2152 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2153 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
21316f3c
MN
2154 "punpcklbw %%mm7, %%mm4 \n\t"
2155 "punpcklbw %%mm7, %%mm1 \n\t"
6e1c66bc
AJ
2156 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2157 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2158 "punpcklbw %%mm7, %%mm2 \n\t"
2159 "punpcklbw %%mm7, %%mm3 \n\t"
2160 "pmaddwd %%mm6, %%mm4 \n\t"
2161 "pmaddwd %%mm6, %%mm1 \n\t"
2162 "pmaddwd %%mm6, %%mm2 \n\t"
2163 "pmaddwd %%mm6, %%mm3 \n\t"
2164#ifndef FAST_BGR2YV12
2165 "psrad $8, %%mm4 \n\t"
2166 "psrad $8, %%mm1 \n\t"
2167 "psrad $8, %%mm2 \n\t"
2168 "psrad $8, %%mm3 \n\t"
2169#endif
2170 "packssdw %%mm1, %%mm4 \n\t"
2171 "packssdw %%mm3, %%mm2 \n\t"
2172 "pmaddwd %%mm5, %%mm4 \n\t"
2173 "pmaddwd %%mm5, %%mm2 \n\t"
6e1c66bc 2174 "add $24, %%"REG_b" \n\t"
21316f3c
MN
2175 "packssdw %%mm2, %%mm4 \n\t"
2176 "psraw $7, %%mm4 \n\t"
2177
2178 "packuswb %%mm4, %%mm0 \n\t"
854288bb 2179 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
21316f3c 2180
6e1c66bc
AJ
2181 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2182 "add $8, %%"REG_a" \n\t"
21316f3c 2183 " js 1b \n\t"
7f526efd 2184 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
6e1c66bc 2185 : "%"REG_a, "%"REG_b
21316f3c
MN
2186 );
2187 ydst += lumStride;
2188 src += srcStride;
2189 }
2190 src -= srcStride*2;
2191 asm volatile(
6e1c66bc 2192 "mov %4, %%"REG_a" \n\t"
854288bb
FB
2193 "movq "MANGLE(w1111)", %%mm5 \n\t"
2194 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
21316f3c 2195 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
2196 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2197 "add %%"REG_b", %%"REG_b" \n\t"
fac8012c 2198 ASMALIGN16
21316f3c 2199 "1: \n\t"
6e1c66bc
AJ
2200 PREFETCH" 64(%0, %%"REG_b") \n\t"
2201 PREFETCH" 64(%1, %%"REG_b") \n\t"
21316f3c 2202#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
6e1c66bc
AJ
2203 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2204 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2205 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2206 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2207 PAVGB" %%mm1, %%mm0 \n\t"
2208 PAVGB" %%mm3, %%mm2 \n\t"
2209 "movq %%mm0, %%mm1 \n\t"
2210 "movq %%mm2, %%mm3 \n\t"
2211 "psrlq $24, %%mm0 \n\t"
2212 "psrlq $24, %%mm2 \n\t"
2213 PAVGB" %%mm1, %%mm0 \n\t"
2214 PAVGB" %%mm3, %%mm2 \n\t"
2215 "punpcklbw %%mm7, %%mm0 \n\t"
2216 "punpcklbw %%mm7, %%mm2 \n\t"
2217#else
6e1c66bc
AJ
2218 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2219 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2220 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2221 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm1 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm3 \n\t"
2226 "paddw %%mm1, %%mm0 \n\t"
2227 "paddw %%mm3, %%mm2 \n\t"
2228 "paddw %%mm2, %%mm0 \n\t"
6e1c66bc
AJ
2229 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2230 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2231 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2232 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2233 "punpcklbw %%mm7, %%mm4 \n\t"
2234 "punpcklbw %%mm7, %%mm1 \n\t"
2235 "punpcklbw %%mm7, %%mm2 \n\t"
2236 "punpcklbw %%mm7, %%mm3 \n\t"
2237 "paddw %%mm1, %%mm4 \n\t"
2238 "paddw %%mm3, %%mm2 \n\t"
2239 "paddw %%mm4, %%mm2 \n\t"
2240 "psrlw $2, %%mm0 \n\t"
2241 "psrlw $2, %%mm2 \n\t"
2242#endif
854288bb
FB
2243 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2244 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
2245
2246 "pmaddwd %%mm0, %%mm1 \n\t"
2247 "pmaddwd %%mm2, %%mm3 \n\t"
2248 "pmaddwd %%mm6, %%mm0 \n\t"
2249 "pmaddwd %%mm6, %%mm2 \n\t"
2250#ifndef FAST_BGR2YV12
2251 "psrad $8, %%mm0 \n\t"
2252 "psrad $8, %%mm1 \n\t"
2253 "psrad $8, %%mm2 \n\t"
2254 "psrad $8, %%mm3 \n\t"
2255#endif
2256 "packssdw %%mm2, %%mm0 \n\t"
2257 "packssdw %%mm3, %%mm1 \n\t"
2258 "pmaddwd %%mm5, %%mm0 \n\t"
2259 "pmaddwd %%mm5, %%mm1 \n\t"
2260 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2261 "psraw $7, %%mm0 \n\t"
2262
2263#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
6e1c66bc
AJ
2264 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2265 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2266 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2267 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2268 PAVGB" %%mm1, %%mm4 \n\t"
2269 PAVGB" %%mm3, %%mm2 \n\t"
2270 "movq %%mm4, %%mm1 \n\t"
2271 "movq %%mm2, %%mm3 \n\t"
2272 "psrlq $24, %%mm4 \n\t"
2273 "psrlq $24, %%mm2 \n\t"
2274 PAVGB" %%mm1, %%mm4 \n\t"
2275 PAVGB" %%mm3, %%mm2 \n\t"
2276 "punpcklbw %%mm7, %%mm4 \n\t"
2277 "punpcklbw %%mm7, %%mm2 \n\t"
2278#else
6e1c66bc
AJ
2279 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2280 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2281 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2282 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm1 \n\t"
2285 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm3 \n\t"
2287 "paddw %%mm1, %%mm4 \n\t"
2288 "paddw %%mm3, %%mm2 \n\t"
2289 "paddw %%mm2, %%mm4 \n\t"
6e1c66bc
AJ
2290 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2291 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2292 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2293 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
21316f3c
MN
2294 "punpcklbw %%mm7, %%mm5 \n\t"
2295 "punpcklbw %%mm7, %%mm1 \n\t"
2296 "punpcklbw %%mm7, %%mm2 \n\t"
2297 "punpcklbw %%mm7, %%mm3 \n\t"
2298 "paddw %%mm1, %%mm5 \n\t"
2299 "paddw %%mm3, %%mm2 \n\t"
2300 "paddw %%mm5, %%mm2 \n\t"
854288bb 2301 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c
MN
2302 "psrlw $2, %%mm4 \n\t"
2303 "psrlw $2, %%mm2 \n\t"
2304#endif
854288bb
FB
2305 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2306 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
2307
2308 "pmaddwd %%mm4, %%mm1 \n\t"
2309 "pmaddwd %%mm2, %%mm3 \n\t"
2310 "pmaddwd %%mm6, %%mm4 \n\t"
2311 "pmaddwd %%mm6, %%mm2 \n\t"
2312#ifndef FAST_BGR2YV12
2313 "psrad $8, %%mm4 \n\t"
2314 "psrad $8, %%mm1 \n\t"
2315 "psrad $8, %%mm2 \n\t"
2316 "psrad $8, %%mm3 \n\t"
2317#endif
2318 "packssdw %%mm2, %%mm4 \n\t"
2319 "packssdw %%mm3, %%mm1 \n\t"
2320 "pmaddwd %%mm5, %%mm4 \n\t"
2321 "pmaddwd %%mm5, %%mm1 \n\t"
6e1c66bc 2322 "add $24, %%"REG_b" \n\t"
21316f3c
MN
2323 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2324 "psraw $7, %%mm4 \n\t"
2325
2326 "movq %%mm0, %%mm1 \n\t"
2327 "punpckldq %%mm4, %%mm0 \n\t"
2328 "punpckhdq %%mm4, %%mm1 \n\t"
2329 "packsswb %%mm1, %%mm0 \n\t"
854288bb 2330 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
6e1c66bc 2331 "movd %%mm0, (%2, %%"REG_a") \n\t"
21316f3c 2332 "punpckhdq %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
2333 "movd %%mm0, (%3, %%"REG_a") \n\t"
2334 "add $4, %%"REG_a" \n\t"
21316f3c 2335 " js 1b \n\t"
7f526efd 2336 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
6e1c66bc 2337 : "%"REG_a, "%"REG_b
21316f3c
MN
2338 );
2339
2340 udst += chromStride;
2341 vdst += chromStride;
2342 src += srcStride*2;
2343 }
2344
2345 asm volatile( EMMS" \n\t"
2346 SFENCE" \n\t"
2347 :::"memory");
2348#else
2349 y=0;
2350#endif
2351 for(; y<height; y+=2)
1de97d84 2352 {
7f526efd 2353 long i;
1de97d84
MN
2354 for(i=0; i<chromWidth; i++)
2355 {
2356 unsigned int b= src[6*i+0];
2357 unsigned int g= src[6*i+1];
2358 unsigned int r= src[6*i+2];
2359
aa21f0c3
MN
2360 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2361 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2362 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
81c0590e 2363
1de97d84
MN
2364 udst[i] = U;
2365 vdst[i] = V;
2366 ydst[2*i] = Y;
2367
2368 b= src[6*i+3];
2369 g= src[6*i+4];
2370 r= src[6*i+5];
2371
aa21f0c3 2372 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2373 ydst[2*i+1] = Y;
2374 }
2375 ydst += lumStride;
2376 src += srcStride;
2377
2378 for(i=0; i<chromWidth; i++)
2379 {
2380 unsigned int b= src[6*i+0];
2381 unsigned int g= src[6*i+1];
2382 unsigned int r= src[6*i+2];
2383
aa21f0c3 2384 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2385
2386 ydst[2*i] = Y;
2387
2388 b= src[6*i+3];
2389 g= src[6*i+4];
2390 r= src[6*i+5];
2391
aa21f0c3 2392 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2393 ydst[2*i+1] = Y;
2394 }
2395 udst += chromStride;
2396 vdst += chromStride;
2397 ydst += lumStride;
2398 src += srcStride;
2399 }
2400}
5d55fdb4
MN
2401
2402void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
7f526efd
RD
2403 long width, long height, long src1Stride,
2404 long src2Stride, long dstStride){
2405 long h;
5d55fdb4
MN
2406
2407 for(h=0; h < height; h++)
2408 {
7f526efd 2409 long w;
5d55fdb4
MN
2410
2411#ifdef HAVE_MMX
2412#ifdef HAVE_SSE2
2413 asm(
6e1c66bc 2414 "xor %%"REG_a", %%"REG_a" \n\t"
5d55fdb4 2415 "1: \n\t"
6e1c66bc
AJ
2416 PREFETCH" 64(%1, %%"REG_a") \n\t"
2417 PREFETCH" 64(%2, %%"REG_a") \n\t"
2418 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2419 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2420 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
5d55fdb4
MN
2421 "punpcklbw %%xmm2, %%xmm0 \n\t"
2422 "punpckhbw %%xmm2, %%xmm1 \n\t"
6e1c66bc
AJ
2423 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2424 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2425 "add $16, %%"REG_a" \n\t"
2426 "cmp %3, %%"REG_a" \n\t"
5d55fdb4 2427 " jb 1b \n\t"
7f526efd 2428 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
6e1c66bc 2429 : "memory", "%"REG_a""
5d55fdb4
MN
2430 );
2431#else
2432 asm(
6e1c66bc 2433 "xor %%"REG_a", %%"REG_a" \n\t"
5d55fdb4 2434 "1: \n\t"
6e1c66bc
AJ
2435 PREFETCH" 64(%1, %%"REG_a") \n\t"
2436 PREFETCH" 64(%2, %%"REG_a") \n\t"
2437 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2438 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
5d55fdb4
MN
2439 "movq %%mm0, %%mm1 \n\t"
2440 "movq %%mm2, %%mm3 \n\t"
6e1c66bc
AJ
2441 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2442 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
5d55fdb4
MN
2443 "punpcklbw %%mm4, %%mm0 \n\t"
2444 "punpckhbw %%mm4, %%mm1 \n\t"
2445 "punpcklbw %%mm5, %%mm2 \n\t"
2446 "punpckhbw %%mm5, %%mm3 \n\t"
6e1c66bc
AJ
2447 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2448 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2449 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2450 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2451 "add $16, %%"REG_a" \n\t"
2452 "cmp %3, %%"REG_a" \n\t"
5d55fdb4 2453 " jb 1b \n\t"
7f526efd 2454 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
6e1c66bc 2455 : "memory", "%"REG_a
5d55fdb4
MN
2456 );
2457#endif
2458 for(w= (width&(~15)); w < width; w++)
2459 {
2460 dest[2*w+0] = src1[w];
2461 dest[2*w+1] = src2[w];
2462 }
2463#else
2464 for(w=0; w < width; w++)
2465 {
2466 dest[2*w+0] = src1[w];
2467 dest[2*w+1] = src2[w];
2468 }
2469#endif
2470 dest += dstStride;
2471 src1 += src1Stride;
2472 src2 += src2Stride;
2473 }
2474#ifdef HAVE_MMX
2475 asm(
2476 EMMS" \n\t"
2477 SFENCE" \n\t"
2478 ::: "memory"
2479 );
2480#endif
2481}
ac4d0aea
MN
2482
2483static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2484 uint8_t *dst1, uint8_t *dst2,
7f526efd
RD
2485 long width, long height,
2486 long srcStride1, long srcStride2,
2487 long dstStride1, long dstStride2)
ac4d0aea 2488{
7f526efd 2489 long y,x,w,h;
ac4d0aea
MN
2490 w=width/2; h=height/2;
2491#ifdef HAVE_MMX
2492 asm volatile(
2493 PREFETCH" %0\n\t"
2494 PREFETCH" %1\n\t"
2495 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2496#endif
2497 for(y=0;y<h;y++){
2498 const uint8_t* s1=src1+srcStride1*(y>>1);
2499 uint8_t* d=dst1+dstStride1*y;
2500 x=0;
2501#ifdef HAVE_MMX
f0b62bbd 2502 for(;x<w-31;x+=32)
ac4d0aea
MN
2503 {
2504 asm volatile(
2505 PREFETCH" 32%1\n\t"
2506 "movq %1, %%mm0\n\t"
2507 "movq 8%1, %%mm2\n\t"
2508 "movq 16%1, %%mm4\n\t"
2509 "movq 24%1, %%mm6\n\t"
2510 "movq %%mm0, %%mm1\n\t"
2511 "movq %%mm2, %%mm3\n\t"
2512 "movq %%mm4, %%mm5\n\t"
2513 "movq %%mm6, %%mm7\n\t"
2514 "punpcklbw %%mm0, %%mm0\n\t"
2515 "punpckhbw %%mm1, %%mm1\n\t"
2516 "punpcklbw %%mm2, %%mm2\n\t"
2517 "punpckhbw %%mm3, %%mm3\n\t"
2518 "punpcklbw %%mm4, %%mm4\n\t"
2519 "punpckhbw %%mm5, %%mm5\n\t"
2520 "punpcklbw %%mm6, %%mm6\n\t"
2521 "punpckhbw %%mm7, %%mm7\n\t"
2522 MOVNTQ" %%mm0, %0\n\t"
2523 MOVNTQ" %%mm1, 8%0\n\t"
2524 MOVNTQ" %%mm2, 16%0\n\t"
2525 MOVNTQ" %%mm3, 24%0\n\t"
2526 MOVNTQ" %%mm4, 32%0\n\t"
2527 MOVNTQ" %%mm5, 40%0\n\t"
2528 MOVNTQ" %%mm6, 48%0\n\t"
2529 MOVNTQ" %%mm7, 56%0"
2530 :"=m"(d[2*x])
2531 :"m"(s1[x])
2532 :"memory");
2533 }
2534#endif
2535 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2536 }
2537 for(y=0;y<h;y++){
2538 const uint8_t* s2=src2+srcStride2*(y>>1);
2539 uint8_t* d=dst2+dstStride2*y;
2540 x=0;
2541#ifdef HAVE_MMX
f0b62bbd 2542 for(;x<w-31;x+=32)
ac4d0aea
MN
2543 {
2544 asm volatile(
2545 PREFETCH" 32%1\n\t"
2546 "movq %1, %%mm0\n\t"
2547 "movq 8%1, %%mm2\n\t"
2548 "movq 16%1, %%mm4\n\t"
2549 "movq 24%1, %%mm6\n\t"
2550 "movq %%mm0, %%mm1\n\t"
2551 "movq %%mm2, %%mm3\n\t"
2552 "movq %%mm4, %%mm5\n\t"
2553 "movq %%mm6, %%mm7\n\t"
2554 "punpcklbw %%mm0, %%mm0\n\t"
2555 "punpckhbw %%mm1, %%mm1\n\t"
2556 "punpcklbw %%mm2, %%mm2\n\t"
2557 "punpckhbw %%mm3, %%mm3\n\t"
2558 "punpcklbw %%mm4, %%mm4\n\t"
2559 "punpckhbw %%mm5, %%mm5\n\t"
2560 "punpcklbw %%mm6, %%mm6\n\t"
2561 "punpckhbw %%mm7, %%mm7\n\t"
2562 MOVNTQ" %%mm0, %0\n\t"
2563 MOVNTQ" %%mm1, 8%0\n\t"
2564 MOVNTQ" %%mm2, 16%0\n\t"
2565 MOVNTQ" %%mm3, 24%0\n\t"
2566 MOVNTQ" %%mm4, 32%0\n\t"
2567 MOVNTQ" %%mm5, 40%0\n\t"
2568 MOVNTQ" %%mm6, 48%0\n\t"
2569 MOVNTQ" %%mm7, 56%0"
2570 :"=m"(d[2*x])
2571 :"m"(s2[x])
2572 :"memory");
2573 }
2574#endif
2575 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2576 }
2577#ifdef HAVE_MMX
2578 asm(
2579 EMMS" \n\t"
2580 SFENCE" \n\t"
2581 ::: "memory"
2582 );
2583#endif
2584}
2585
2586static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2587 uint8_t *dst,
7f526efd
RD
2588 long width, long height,
2589 long srcStride1, long srcStride2,
2590 long srcStride3, long dstStride)
ac4d0aea 2591{
7f526efd 2592 long y,x,w,h;
ac4d0aea 2593 w=width/2; h=height;
ac4d0aea
MN
2594 for(y=0;y<h;y++){
2595 const uint8_t* yp=src1+srcStride1*y;
2596 const uint8_t* up=src2+srcStride2*(y>>2);
2597 const uint8_t* vp=src3+srcStride3*(y>>2);
2598 uint8_t* d=dst+dstStride*y;
ac4d0aea
MN
2599 x=0;
2600#ifdef HAVE_MMX
4596673c 2601 for(;x<w-7;x+=8)
ac4d0aea
MN
2602 {
2603 asm volatile(
4596673c
MN
2604 PREFETCH" 32(%1, %0)\n\t"
2605 PREFETCH" 32(%2, %0)\n\t"
2606 PREFETCH" 32(%3, %0)\n\t"
2607 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2608 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2609 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
ac4d0aea
MN
2610 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2611 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2612 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2613 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2614 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2615 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2616 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2617
2618 "movq %%mm1, %%mm6\n\t"
2619 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2620 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2621 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
4596673c
MN
2622 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2623 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
ac4d0aea
MN
2624
2625 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
4596673c 2626 "movq 8(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2627 "movq %%mm0, %%mm3\n\t"
2628 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2629 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
4596673c
MN
2630 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2631 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
ac4d0aea
MN
2632
2633 "movq %%mm4, %%mm6\n\t"
4596673c 2634 "movq 16(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2635 "movq %%mm0, %%mm3\n\t"
2636 "punpcklbw %%mm5, %%mm4\n\t"
2637 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2638 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
4596673c
MN
2639 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2640 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
ac4d0aea
MN
2641
2642 "punpckhbw %%mm5, %%mm6\n\t"
4596673c 2643 "movq 24(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2644 "movq %%mm0, %%mm3\n\t"
2645 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2646 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
4596673c
MN
2647 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2648 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
ac4d0aea 2649
4596673c
MN
2650 : "+r" (x)
2651 : "r"(yp), "r" (up), "r"(vp), "r"(d)
ac4d0aea
MN
2652 :"memory");
2653 }
2654#endif
4596673c 2655 for(; x<w; x++)
ac4d0aea 2656 {
7f526efd 2657 const long x2= x<<2;
ac4d0aea
MN
2658 d[8*x+0]=yp[x2];
2659 d[8*x+1]=up[x];
2660 d[8*x+2]=yp[x2+1];
2661 d[8*x+3]=vp[x];
2662 d[8*x+4]=yp[x2+2];
2663 d[8*x+5]=up[x];
2664 d[8*x+6]=yp[x2+3];
2665 d[8*x+7]=vp[x];
2666 }
2667 }
2668#ifdef HAVE_MMX
2669 asm(
2670 EMMS" \n\t"
2671 SFENCE" \n\t"
2672 ::: "memory"
2673 );
2674#endif
2675}