lot of bigendian fixes
[libav.git] / postproc / rgb2rgb_template.c
CommitLineData
/*
 *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  pluralize by Software PAL8 to RGB converter
 *  Software YUV to YUV converter
 *  Software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 *  lots of big-endian byte-order fixes by Alex Beregszaszi
 */
a3aece93 11
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/*
 * NOTE(review): this template is apparently #included several times with
 * different HAVE_* optimization macros and a RENAME() wrapper — TODO confirm
 * against rgb2rgb.c. Undefine everything first so each inclusion starts clean.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Width in bytes of the SIMD registers available on this build target. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Pick the prefetch/average instructions supported by the target CPU;
 * "/nop" makes the asm templates expand to a harmless comment when neither
 * 3DNow! nor MMX2 is available. */
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Non-temporal stores (and the fence they require) only exist with MMX2;
 * otherwise fall back to plain movq and no fence. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
61
/*
 * Expand packed 24bit pixels to 32bit by appending a zero filler byte.
 * src_size is in bytes; writes src_size/3*4 bytes to dst.
 * MMX path: 8 pixels (24 bytes in, 32 bytes out) per iteration, using mm7
 * (mask32) to clear the filler byte; scalar loop finishes the tail.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	mm_end = end - 23;	/* last position where a full 24-byte chunk fits */
	__asm __volatile("movq	%0, %%mm7"::"m"(mask32):"memory");
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			/* gather four 3-byte pixels into each quadword (overlapping
			 * dword loads), then mask off the stray 4th byte */
			"movd	%1, %%mm0\n\t"
			"punpckldq 3%1, %%mm0\n\t"
			"movd	6%1, %%mm1\n\t"
			"punpckldq 9%1, %%mm1\n\t"
			"movd	12%1, %%mm2\n\t"
			"punpckldq 15%1, %%mm2\n\t"
			"movd	18%1, %%mm3\n\t"
			"punpckldq 21%1, %%mm3\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm1\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm7, %%mm3\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm2, 16%0\n\t"
			MOVNTQ"	%%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: copy 3 bytes, insert the filler on the correct side */
	while(s < end)
	{
#ifdef WORDS_BIGENDIAN
		*dest++ = 0;
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
#else
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = 0;
#endif
	}
}
59ac5a93 119
/*
 * Drop the filler byte from 32bit pixels, producing packed 24bit pixels.
 * src_size is in bytes; writes src_size/4*3 bytes to dst.
 * MMX path: 8 pixels (32 bytes in, 24 bytes out) per iteration. Each 32bit
 * pixel is compacted with shift/mask (mask24l/mask24h), then the partial
 * quadwords are stitched together with the mask24hh.. constants.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	mm_end = end - 31;	/* need a full 32-byte input chunk per iteration */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm1\n\t"
			"movq	16%1, %%mm4\n\t"
			"movq	24%1, %%mm5\n\t"
			"movq	%%mm0, %%mm2\n\t"
			"movq	%%mm1, %%mm3\n\t"
			"movq	%%mm4, %%mm6\n\t"
			"movq	%%mm5, %%mm7\n\t"
			/* compact each pair of pixels: keep low 3 bytes of pixel 0,
			 * shift pixel 1 down by one byte and merge */
			"psrlq	$8, %%mm2\n\t"
			"psrlq	$8, %%mm3\n\t"
			"psrlq	$8, %%mm6\n\t"
			"psrlq	$8, %%mm7\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%2, %%mm1\n\t"
			"pand	%2, %%mm4\n\t"
			"pand	%2, %%mm5\n\t"
			"pand	%3, %%mm2\n\t"
			"pand	%3, %%mm3\n\t"
			"pand	%3, %%mm6\n\t"
			"pand	%3, %%mm7\n\t"
			"por	%%mm2, %%mm0\n\t"
			"por	%%mm3, %%mm1\n\t"
			"por	%%mm6, %%mm4\n\t"
			"por	%%mm7, %%mm5\n\t"

			/* stitch the four 6-byte groups into three output quadwords */
			"movq	%%mm1, %%mm2\n\t"
			"movq	%%mm4, %%mm3\n\t"
			"psllq	$48, %%mm2\n\t"
			"psllq	$32, %%mm3\n\t"
			"pand	%4, %%mm2\n\t"
			"pand	%5, %%mm3\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psrlq	$16, %%mm1\n\t"
			"psrlq	$32, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm3, %%mm1\n\t"
			"pand	%6, %%mm5\n\t"
			"por	%%mm5, %%mm4\n\t"

			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm1, 8%0\n\t"
			MOVNTQ"	%%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: skip the filler byte on the correct side */
	while(s < end)
	{
#ifdef WORDS_BIGENDIAN
		s++;
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
#else
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		s++;
#endif
	}
}
b238eb2e 203
a3aece93
NK
204/*
205 Original by Strepto/Astral
206 ported to gcc & bugfixed : A'rpi
51da31f1 207 MMX2, 3DNOW optimization by Nick Kurshev
9b2c28e6 208 32bit c version, and and&add trick by Michael Niedermayer
a3aece93 209*/
/*
 * Convert RGB555 to RGB565 in place-compatible fashion.
 * Trick: x + (x & 0x7FE0) shifts the G and R fields left by one bit
 * (adding a field to itself doubles it); B stays put and G's new LSB is 0.
 * mask15s is presumably the packed 0x7FE07FE0 constant — TODO confirm.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s));
	__asm __volatile("movq	%0, %%mm4"::"m"(mask15s));
	mm_end = end - 15;	/* 16 bytes (8 pixels) per MMX iteration */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm2\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm2, %%mm3\n\t"
			"pand	%%mm4, %%mm0\n\t"
			"pand	%%mm4, %%mm2\n\t"
			/* x + (x & mask15s): the and&add trick described above */
			"paddw	%%mm1, %%mm0\n\t"
			"paddw	%%mm3, %%mm2\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* 32bit C version of the same and&add trick (two pixels at a time) */
	mm_end = end - 3;
	while(s < mm_end)
	{
		register unsigned x= *((uint32_t *)s);
		*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		d+=4;
		s+=4;
	}
	/* at most one 16bit pixel can remain */
	if(s < end)
	{
		register unsigned short x= *((uint16_t *)s);
		*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
	}
}
fcfbc150 258
/*
 * Convert RGB565 to RGB555: shift the R and G fields right by one bit
 * (dropping G's LSB) while leaving B untouched.
 * mask15rg/mask15b are presumably packed 0x7FE07FE0 / 0x001F001F — TODO confirm.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s));
	__asm __volatile("movq	%0, %%mm7"::"m"(mask15rg));
	__asm __volatile("movq	%0, %%mm6"::"m"(mask15b));
	mm_end = end - 15;	/* 16 bytes (8 pixels) per MMX iteration */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH"	32%1\n\t"
			"movq	%1, %%mm0\n\t"
			"movq	8%1, %%mm2\n\t"
			"movq	%%mm0, %%mm1\n\t"
			"movq	%%mm2, %%mm3\n\t"
			/* (x>>1) & mask15rg  |  x & mask15b */
			"psrlq	$1, %%mm0\n\t"
			"psrlq	$1, %%mm2\n\t"
			"pand	%%mm7, %%mm0\n\t"
			"pand	%%mm7, %%mm2\n\t"
			"pand	%%mm6, %%mm1\n\t"
			"pand	%%mm6, %%mm3\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm3, %%mm2\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* 32bit C version, two pixels per step */
	mm_end = end - 3;
	while(s < mm_end)
	{
		register uint32_t x= *((uint32_t *)s);
		*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
		s+=4;
		d+=4;
	}
	/* at most one 16bit pixel can remain */
	if(s < end)
	{
		register uint16_t x= *((uint16_t *)s);
		*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
		s+=2;
		d+=2;
	}
}
314
1de97d84 315static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
fcfbc150 316{
53445e83 317 const uint8_t *s = src;
0d9f3d85
A
318 const uint8_t *end;
319#ifdef HAVE_MMX
320 const uint8_t *mm_end;
321#endif
53445e83
NK
322 uint16_t *d = (uint16_t *)dst;
323 end = s + src_size;
0d9f3d85 324#ifdef HAVE_MMX
aeae5d53
MN
325 mm_end = end - 15;
326#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
327 asm volatile(
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
331 ".balign 16 \n\t"
332 "1: \n\t"
333 PREFETCH" 32(%1) \n\t"
334 "movd (%1), %%mm0 \n\t"
335 "movd 4(%1), %%mm3 \n\t"
336 "punpckldq 8(%1), %%mm0 \n\t"
337 "punpckldq 12(%1), %%mm3 \n\t"
338 "movq %%mm0, %%mm1 \n\t"
339 "movq %%mm3, %%mm4 \n\t"
340 "pand %%mm6, %%mm0 \n\t"
341 "pand %%mm6, %%mm3 \n\t"
342 "pmaddwd %%mm7, %%mm0 \n\t"
343 "pmaddwd %%mm7, %%mm3 \n\t"
344 "pand %%mm5, %%mm1 \n\t"
345 "pand %%mm5, %%mm4 \n\t"
346 "por %%mm1, %%mm0 \n\t"
347 "por %%mm4, %%mm3 \n\t"
348 "psrld $5, %%mm0 \n\t"
349 "pslld $11, %%mm3 \n\t"
350 "por %%mm3, %%mm0 \n\t"
351 MOVNTQ" %%mm0, (%0) \n\t"
352 "addl $16, %1 \n\t"
353 "addl $8, %0 \n\t"
354 "cmpl %2, %1 \n\t"
355 " jb 1b \n\t"
356 : "+r" (d), "+r"(s)
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
358 );
359#else
53445e83
NK
360 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
361 __asm __volatile(
362 "movq %0, %%mm7\n\t"
363 "movq %1, %%mm6\n\t"
364 ::"m"(red_16mask),"m"(green_16mask));
365 while(s < mm_end)
366 {
367 __asm __volatile(
368 PREFETCH" 32%1\n\t"
369 "movd %1, %%mm0\n\t"
370 "movd 4%1, %%mm3\n\t"
371 "punpckldq 8%1, %%mm0\n\t"
372 "punpckldq 12%1, %%mm3\n\t"
373 "movq %%mm0, %%mm1\n\t"
374 "movq %%mm0, %%mm2\n\t"
375 "movq %%mm3, %%mm4\n\t"
376 "movq %%mm3, %%mm5\n\t"
377 "psrlq $3, %%mm0\n\t"
378 "psrlq $3, %%mm3\n\t"
379 "pand %2, %%mm0\n\t"
380 "pand %2, %%mm3\n\t"
381 "psrlq $5, %%mm1\n\t"
382 "psrlq $5, %%mm4\n\t"
383 "pand %%mm6, %%mm1\n\t"
384 "pand %%mm6, %%mm4\n\t"
385 "psrlq $8, %%mm2\n\t"
386 "psrlq $8, %%mm5\n\t"
387 "pand %%mm7, %%mm2\n\t"
388 "pand %%mm7, %%mm5\n\t"
389 "por %%mm1, %%mm0\n\t"
390 "por %%mm4, %%mm3\n\t"
391 "por %%mm2, %%mm0\n\t"
392 "por %%mm5, %%mm3\n\t"
393 "psllq $16, %%mm3\n\t"
394 "por %%mm3, %%mm0\n\t"
395 MOVNTQ" %%mm0, %0\n\t"
396 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
397 d += 4;
398 s += 16;
399 }
aeae5d53 400#endif
0d9f3d85
A
401 __asm __volatile(SFENCE:::"memory");
402 __asm __volatile(EMMS:::"memory");
403#endif
53445e83
NK
404 while(s < end)
405 {
6cb38650 406 // FIXME on bigendian
ae4cffd9 407 const int src= *s; s += 4;
deb2277c
MN
408 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
409// *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
53445e83 410 }
fcfbc150
MN
411}
412
ac4d0aea
MN
413static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
414{
415 const uint8_t *s = src;
416 const uint8_t *end;
417#ifdef HAVE_MMX
418 const uint8_t *mm_end;
419#endif
420 uint16_t *d = (uint16_t *)dst;
421 end = s + src_size;
422#ifdef HAVE_MMX
423 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
424 __asm __volatile(
425 "movq %0, %%mm7\n\t"
426 "movq %1, %%mm6\n\t"
427 ::"m"(red_16mask),"m"(green_16mask));
0598bcbb 428 mm_end = end - 15;
ac4d0aea
MN
429 while(s < mm_end)
430 {
431 __asm __volatile(
432 PREFETCH" 32%1\n\t"
433 "movd %1, %%mm0\n\t"
434 "movd 4%1, %%mm3\n\t"
435 "punpckldq 8%1, %%mm0\n\t"
436 "punpckldq 12%1, %%mm3\n\t"
437 "movq %%mm0, %%mm1\n\t"
438 "movq %%mm0, %%mm2\n\t"
439 "movq %%mm3, %%mm4\n\t"
440 "movq %%mm3, %%mm5\n\t"
441 "psllq $8, %%mm0\n\t"
442 "psllq $8, %%mm3\n\t"
443 "pand %%mm7, %%mm0\n\t"
444 "pand %%mm7, %%mm3\n\t"
445 "psrlq $5, %%mm1\n\t"
446 "psrlq $5, %%mm4\n\t"
447 "pand %%mm6, %%mm1\n\t"
448 "pand %%mm6, %%mm4\n\t"
449 "psrlq $19, %%mm2\n\t"
450 "psrlq $19, %%mm5\n\t"
451 "pand %2, %%mm2\n\t"
452 "pand %2, %%mm5\n\t"
453 "por %%mm1, %%mm0\n\t"
454 "por %%mm4, %%mm3\n\t"
455 "por %%mm2, %%mm0\n\t"
456 "por %%mm5, %%mm3\n\t"
457 "psllq $16, %%mm3\n\t"
458 "por %%mm3, %%mm0\n\t"
459 MOVNTQ" %%mm0, %0\n\t"
460 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461 d += 4;
462 s += 16;
463 }
464 __asm __volatile(SFENCE:::"memory");
465 __asm __volatile(EMMS:::"memory");
466#endif
467 while(s < end)
468 {
6cb38650 469 // FIXME on bigendian
ae4cffd9 470 const int src= *s; s += 4;
deb2277c 471 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
ac4d0aea
MN
472 }
473}
474
1de97d84 475static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
fcfbc150 476{
53445e83 477 const uint8_t *s = src;
0d9f3d85
A
478 const uint8_t *end;
479#ifdef HAVE_MMX
480 const uint8_t *mm_end;
481#endif
53445e83
NK
482 uint16_t *d = (uint16_t *)dst;
483 end = s + src_size;
0d9f3d85 484#ifdef HAVE_MMX
aeae5d53
MN
485 mm_end = end - 15;
486#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
487 asm volatile(
488 "movq %3, %%mm5 \n\t"
489 "movq %4, %%mm6 \n\t"
490 "movq %5, %%mm7 \n\t"
491 ".balign 16 \n\t"
492 "1: \n\t"
493 PREFETCH" 32(%1) \n\t"
494 "movd (%1), %%mm0 \n\t"
495 "movd 4(%1), %%mm3 \n\t"
496 "punpckldq 8(%1), %%mm0 \n\t"
497 "punpckldq 12(%1), %%mm3 \n\t"
498 "movq %%mm0, %%mm1 \n\t"
499 "movq %%mm3, %%mm4 \n\t"
500 "pand %%mm6, %%mm0 \n\t"
501 "pand %%mm6, %%mm3 \n\t"
502 "pmaddwd %%mm7, %%mm0 \n\t"
503 "pmaddwd %%mm7, %%mm3 \n\t"
504 "pand %%mm5, %%mm1 \n\t"
505 "pand %%mm5, %%mm4 \n\t"
506 "por %%mm1, %%mm0 \n\t"
507 "por %%mm4, %%mm3 \n\t"
508 "psrld $6, %%mm0 \n\t"
509 "pslld $10, %%mm3 \n\t"
510 "por %%mm3, %%mm0 \n\t"
511 MOVNTQ" %%mm0, (%0) \n\t"
512 "addl $16, %1 \n\t"
513 "addl $8, %0 \n\t"
514 "cmpl %2, %1 \n\t"
515 " jb 1b \n\t"
516 : "+r" (d), "+r"(s)
517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518 );
519#else
53445e83
NK
520 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
521 __asm __volatile(
522 "movq %0, %%mm7\n\t"
523 "movq %1, %%mm6\n\t"
524 ::"m"(red_15mask),"m"(green_15mask));
525 while(s < mm_end)
526 {
527 __asm __volatile(
528 PREFETCH" 32%1\n\t"
529 "movd %1, %%mm0\n\t"
530 "movd 4%1, %%mm3\n\t"
531 "punpckldq 8%1, %%mm0\n\t"
532 "punpckldq 12%1, %%mm3\n\t"
533 "movq %%mm0, %%mm1\n\t"
534 "movq %%mm0, %%mm2\n\t"
535 "movq %%mm3, %%mm4\n\t"
536 "movq %%mm3, %%mm5\n\t"
537 "psrlq $3, %%mm0\n\t"
538 "psrlq $3, %%mm3\n\t"
539 "pand %2, %%mm0\n\t"
540 "pand %2, %%mm3\n\t"
541 "psrlq $6, %%mm1\n\t"
542 "psrlq $6, %%mm4\n\t"
543 "pand %%mm6, %%mm1\n\t"
544 "pand %%mm6, %%mm4\n\t"
545 "psrlq $9, %%mm2\n\t"
546 "psrlq $9, %%mm5\n\t"
547 "pand %%mm7, %%mm2\n\t"
548 "pand %%mm7, %%mm5\n\t"
549 "por %%mm1, %%mm0\n\t"
550 "por %%mm4, %%mm3\n\t"
551 "por %%mm2, %%mm0\n\t"
552 "por %%mm5, %%mm3\n\t"
553 "psllq $16, %%mm3\n\t"
554 "por %%mm3, %%mm0\n\t"
555 MOVNTQ" %%mm0, %0\n\t"
556 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557 d += 4;
558 s += 16;
559 }
aeae5d53 560#endif
0d9f3d85
A
561 __asm __volatile(SFENCE:::"memory");
562 __asm __volatile(EMMS:::"memory");
563#endif
53445e83
NK
564 while(s < end)
565 {
6cb38650 566 // FIXME on bigendian
ae4cffd9 567 const int src= *s; s += 4;
deb2277c 568 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
53445e83 569 }
fcfbc150
MN
570}
571
ac4d0aea
MN
572static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
573{
574 const uint8_t *s = src;
575 const uint8_t *end;
576#ifdef HAVE_MMX
577 const uint8_t *mm_end;
578#endif
579 uint16_t *d = (uint16_t *)dst;
580 end = s + src_size;
581#ifdef HAVE_MMX
582 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
583 __asm __volatile(
584 "movq %0, %%mm7\n\t"
585 "movq %1, %%mm6\n\t"
586 ::"m"(red_15mask),"m"(green_15mask));
0598bcbb 587 mm_end = end - 15;
ac4d0aea
MN
588 while(s < mm_end)
589 {
590 __asm __volatile(
591 PREFETCH" 32%1\n\t"
592 "movd %1, %%mm0\n\t"
593 "movd 4%1, %%mm3\n\t"
594 "punpckldq 8%1, %%mm0\n\t"
595 "punpckldq 12%1, %%mm3\n\t"
596 "movq %%mm0, %%mm1\n\t"
597 "movq %%mm0, %%mm2\n\t"
598 "movq %%mm3, %%mm4\n\t"
599 "movq %%mm3, %%mm5\n\t"
600 "psllq $7, %%mm0\n\t"
601 "psllq $7, %%mm3\n\t"
602 "pand %%mm7, %%mm0\n\t"
603 "pand %%mm7, %%mm3\n\t"
604 "psrlq $6, %%mm1\n\t"
605 "psrlq $6, %%mm4\n\t"
606 "pand %%mm6, %%mm1\n\t"
607 "pand %%mm6, %%mm4\n\t"
608 "psrlq $19, %%mm2\n\t"
609 "psrlq $19, %%mm5\n\t"
610 "pand %2, %%mm2\n\t"
611 "pand %2, %%mm5\n\t"
612 "por %%mm1, %%mm0\n\t"
613 "por %%mm4, %%mm3\n\t"
614 "por %%mm2, %%mm0\n\t"
615 "por %%mm5, %%mm3\n\t"
616 "psllq $16, %%mm3\n\t"
617 "por %%mm3, %%mm0\n\t"
618 MOVNTQ" %%mm0, %0\n\t"
619 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
620 d += 4;
621 s += 16;
622 }
623 __asm __volatile(SFENCE:::"memory");
624 __asm __volatile(EMMS:::"memory");
625#endif
626 while(s < end)
627 {
6cb38650 628 // FIXME on bigendian
ae4cffd9 629 const int src= *s; s += 4;
deb2277c 630 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
ac4d0aea
MN
631 }
632}
633
/*
 * Convert packed 24bit pixels (B,G,R byte order) to RGB565.
 * src_size is in bytes; writes src_size/3 16bit pixels.
 * MMX path: 4 pixels (12 bytes) per iteration via overlapping dword loads.
 * NOTE(review): with mm_end = end - 11 the last iteration's "punpckldq 9%1"
 * reads up to offset 12, i.e. one byte past the buffer; the tobgr variants
 * use end - 15 to avoid this — confirm whether callers over-allocate.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 11;
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		/* blue: >>3 into bits 4..0 */
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		/* green: >>5 into bits 10..5 */
		"psrlq	$5, %%mm1\n\t"
		"psrlq	$5, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		/* red: >>8 into bits 15..11 */
		"psrlq	$8, %%mm2\n\t"
		"psrlq	$8, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
696
/*
 * Convert packed 24bit pixels (R,G,B byte order) to BGR565, i.e. the
 * byte-swapped sibling of rgb24to16: the first byte lands in the high
 * (red) field of the 16bit word.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 15;	/* conservative: the loop consumes 12 bytes but reads up to offset 12 */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		/* first byte shifts UP into the red field (the swap) */
		"psllq	$8, %%mm0\n\t"
		"psllq	$8, %%mm3\n\t"
		"pand	%%mm7, %%mm0\n\t"
		"pand	%%mm7, %%mm3\n\t"
		"psrlq	$5, %%mm1\n\t"
		"psrlq	$5, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$19, %%mm2\n\t"
		"psrlq	$19, %%mm5\n\t"
		"pand	%2, %%mm2\n\t"
		"pand	%2, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
759
/*
 * Convert packed 24bit pixels (B,G,R byte order) to RGB555.
 * Same structure as rgb24to16 but with 5bit green (shifts 6/9, 15bit masks).
 * NOTE(review): mm_end = end - 11 allows a 1-byte overread on the final
 * iteration (reads up to offset 12) — see rgb24to16.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 11;
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"psrlq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm3\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm3\n\t"
		"psrlq	$6, %%mm1\n\t"
		"psrlq	$6, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$9, %%mm2\n\t"
		"psrlq	$9, %%mm5\n\t"
		"pand	%%mm7, %%mm2\n\t"
		"pand	%%mm7, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
822
/*
 * Convert packed 24bit pixels (R,G,B byte order) to BGR555 — the
 * byte-swapped sibling of rgb24to15.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq	%0, %%mm7\n\t"
	    "movq	%1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 15;	/* conservative bound, avoids the overread of the non-bgr variant */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd	%1, %%mm0\n\t"
		"movd	3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm3, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		/* first byte shifts UP into the red field (the swap) */
		"psllq	$7, %%mm0\n\t"
		"psllq	$7, %%mm3\n\t"
		"pand	%%mm7, %%mm0\n\t"
		"pand	%%mm7, %%mm3\n\t"
		"psrlq	$6, %%mm1\n\t"
		"psrlq	$6, %%mm4\n\t"
		"pand	%%mm6, %%mm1\n\t"
		"pand	%%mm6, %%mm4\n\t"
		"psrlq	$19, %%mm2\n\t"
		"psrlq	$19, %%mm5\n\t"
		"pand	%2, %%mm2\n\t"
		"pand	%2, %%mm5\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm5, %%mm3\n\t"
		"psllq	$16, %%mm3\n\t"
		"por	%%mm3, %%mm0\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
885
0d9f3d85
A
886/*
887 I use here less accurate approximation by simply
888 left-shifting the input
889 value and filling the low order bits with
890 zeroes. This method improves png's
891 compression but this scheme cannot reproduce white exactly, since it does not
892 generate an all-ones maximum value; the net effect is to darken the
893 image slightly.
894
895 The better method should be "left bit replication":
896
897 4 3 2 1 0
898 ---------
899 1 1 0 1 1
900
901 7 6 5 4 3 2 1 0
902 ----------------
903 1 1 0 1 1 1 1 0
904 |=======| |===|
905 | Leftmost Bits Repeated to Fill Open Bits
906 |
907 Original Bits
908*/
/*
 * Expand RGB555 to packed 24bit pixels by shifting each 5bit field into
 * the top of its output byte and zero-filling the low bits (the simple,
 * slightly darkening scheme described in the comment above).
 * MMX path: 8 pixels per iteration. The first asm block widens the three
 * channels of 2x4 pixels into byte-per-channel quadwords (results left in
 * mm0/mm3/mm6/mm7); the second block compacts those four 32bit-style
 * quadwords to 24 output bytes, reusing the rgb32to24 sequence.
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	mm_end = end - 7;	/* 8 pixels (16 bytes) per iteration */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		/* first 4 pixels: isolate B/G/R, scale to 8bit positions */
		"movq	%1, %%mm0\n\t"
		"movq	%1, %%mm1\n\t"
		"movq	%1, %%mm2\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%3, %%mm1\n\t"
		"pand	%4, %%mm2\n\t"
		"psllq	$3, %%mm0\n\t"
		"psrlq	$2, %%mm1\n\t"
		"psrlq	$7, %%mm2\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"movq	%%mm1, %%mm4\n\t"
		"movq	%%mm2, %%mm5\n\t"
		/* widen words to dwords against zero (mmx_null) */
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq	$8, %%mm1\n\t"
		"psllq	$16, %%mm2\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psllq	$8, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm5, %%mm3\n\t"

		/* stash pixels 0-3 while we process pixels 4-7 */
		"movq	%%mm0, %%mm6\n\t"
		"movq	%%mm3, %%mm7\n\t"

		"movq	8%1, %%mm0\n\t"
		"movq	8%1, %%mm1\n\t"
		"movq	8%1, %%mm2\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%3, %%mm1\n\t"
		"pand	%4, %%mm2\n\t"
		"psllq	$3, %%mm0\n\t"
		"psrlq	$2, %%mm1\n\t"
		"psrlq	$7, %%mm2\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"movq	%%mm1, %%mm4\n\t"
		"movq	%%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq	$8, %%mm1\n\t"
		"psllq	$16, %%mm2\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psllq	$8, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm5, %%mm3\n\t"

		:"=m"(*d)
		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
		:"memory");
	    /* Borrowed 32 to 24 */
	    __asm __volatile(
		"movq	%%mm0, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"movq	%%mm6, %%mm0\n\t"
		"movq	%%mm7, %%mm1\n\t"

		"movq	%%mm4, %%mm6\n\t"
		"movq	%%mm5, %%mm7\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm1, %%mm3\n\t"

		"psrlq	$8, %%mm2\n\t"
		"psrlq	$8, %%mm3\n\t"
		"psrlq	$8, %%mm6\n\t"
		"psrlq	$8, %%mm7\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm1\n\t"
		"pand	%2, %%mm4\n\t"
		"pand	%2, %%mm5\n\t"
		"pand	%3, %%mm2\n\t"
		"pand	%3, %%mm3\n\t"
		"pand	%3, %%mm6\n\t"
		"pand	%3, %%mm7\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm3, %%mm1\n\t"
		"por	%%mm6, %%mm4\n\t"
		"por	%%mm7, %%mm5\n\t"

		"movq	%%mm1, %%mm2\n\t"
		"movq	%%mm4, %%mm3\n\t"
		"psllq	$48, %%mm2\n\t"
		"psllq	$32, %%mm3\n\t"
		"pand	%4, %%mm2\n\t"
		"pand	%5, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psrlq	$16, %%mm1\n\t"
		"psrlq	$32, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm3, %%mm1\n\t"
		"pand	%6, %%mm5\n\t"
		"por	%%mm5, %%mm4\n\t"

		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm4, 16%0"

		:"=m"(*d)
		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
		:"memory");
	    d += 24;
	    s += 8;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: expand one 555 pixel to three bytes, low bits zero */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
	}
}
1050
/*
 * Expand RGB565 to packed 24bit pixels, zero-filling the low bits of each
 * output byte (same approximation as rgb15to24, see comment above it).
 * Differs from rgb15to24 only in the field masks and shift amounts
 * (6bit green, red one bit higher).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	mm_end = end - 7;	/* 8 pixels (16 bytes) per iteration */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		/* first 4 pixels: isolate B/G/R and scale to 8bit positions */
		"movq	%1, %%mm0\n\t"
		"movq	%1, %%mm1\n\t"
		"movq	%1, %%mm2\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%3, %%mm1\n\t"
		"pand	%4, %%mm2\n\t"
		"psllq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm1\n\t"
		"psrlq	$8, %%mm2\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"movq	%%mm1, %%mm4\n\t"
		"movq	%%mm2, %%mm5\n\t"
		/* widen words to dwords against zero (mmx_null) */
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq	$8, %%mm1\n\t"
		"psllq	$16, %%mm2\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psllq	$8, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm5, %%mm3\n\t"

		/* stash pixels 0-3 while we process pixels 4-7 */
		"movq	%%mm0, %%mm6\n\t"
		"movq	%%mm3, %%mm7\n\t"

		"movq	8%1, %%mm0\n\t"
		"movq	8%1, %%mm1\n\t"
		"movq	8%1, %%mm2\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%3, %%mm1\n\t"
		"pand	%4, %%mm2\n\t"
		"psllq	$3, %%mm0\n\t"
		"psrlq	$3, %%mm1\n\t"
		"psrlq	$8, %%mm2\n\t"
		"movq	%%mm0, %%mm3\n\t"
		"movq	%%mm1, %%mm4\n\t"
		"movq	%%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq	$8, %%mm1\n\t"
		"psllq	$16, %%mm2\n\t"
		"por	%%mm1, %%mm0\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psllq	$8, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm4, %%mm3\n\t"
		"por	%%mm5, %%mm3\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
		:"memory");
	    /* Borrowed 32 to 24 */
	    __asm __volatile(
		"movq	%%mm0, %%mm4\n\t"
		"movq	%%mm3, %%mm5\n\t"
		"movq	%%mm6, %%mm0\n\t"
		"movq	%%mm7, %%mm1\n\t"

		"movq	%%mm4, %%mm6\n\t"
		"movq	%%mm5, %%mm7\n\t"
		"movq	%%mm0, %%mm2\n\t"
		"movq	%%mm1, %%mm3\n\t"

		"psrlq	$8, %%mm2\n\t"
		"psrlq	$8, %%mm3\n\t"
		"psrlq	$8, %%mm6\n\t"
		"psrlq	$8, %%mm7\n\t"
		"pand	%2, %%mm0\n\t"
		"pand	%2, %%mm1\n\t"
		"pand	%2, %%mm4\n\t"
		"pand	%2, %%mm5\n\t"
		"pand	%3, %%mm2\n\t"
		"pand	%3, %%mm3\n\t"
		"pand	%3, %%mm6\n\t"
		"pand	%3, %%mm7\n\t"
		"por	%%mm2, %%mm0\n\t"
		"por	%%mm3, %%mm1\n\t"
		"por	%%mm6, %%mm4\n\t"
		"por	%%mm7, %%mm5\n\t"

		"movq	%%mm1, %%mm2\n\t"
		"movq	%%mm4, %%mm3\n\t"
		"psllq	$48, %%mm2\n\t"
		"psllq	$32, %%mm3\n\t"
		"pand	%4, %%mm2\n\t"
		"pand	%5, %%mm3\n\t"
		"por	%%mm2, %%mm0\n\t"
		"psrlq	$16, %%mm1\n\t"
		"psrlq	$32, %%mm4\n\t"
		"psllq	$16, %%mm5\n\t"
		"por	%%mm3, %%mm1\n\t"
		"pand	%6, %%mm5\n\t"
		"por	%%mm5, %%mm4\n\t"

		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm4, 16%0"

		:"=m"(*d)
		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
		:"memory");
	    d += 24;
	    s += 8;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: expand one 565 pixel to three bytes, low bits zero */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
	}
}
1191
/**
 * Convert packed 15bit BGR (xRRRRRGGGGGBBBBB in native 16bit words) to
 * packed 32bit RGB, one byte per component plus one pad byte written as 0.
 *
 * @param src      input buffer of 16bit pixels
 * @param dst      output buffer; must hold 2*src_size bytes
 * @param src_size size of the input in bytes (pixel count = src_size/2)
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	/* mm7 stays 0 for the whole loop, used to zero-extend words to dwords */
	__asm __volatile("pxor	%%mm7,%%mm7\n\t":::"memory");
	mm_end = end - 3; /* 4 pixels per iteration; the remainder goes to the C loop below */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* split the 4 input pixels into B, G and R planes and shift
			   each 5bit field up to 8bit precision (low bits zeroed) */
			"movq	%1, %%mm0\n\t"
			"movq	%1, %%mm1\n\t"
			"movq	%1, %%mm2\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%3, %%mm1\n\t"
			"pand	%4, %%mm2\n\t"
			"psllq	$3, %%mm0\n\t"
			"psrlq	$2, %%mm1\n\t"
			"psrlq	$7, %%mm2\n\t"
			/* widen words to dwords and merge as B | G<<8 | R<<16 (pad byte = 0) */
			"movq	%%mm0, %%mm3\n\t"
			"movq	%%mm1, %%mm4\n\t"
			"movq	%%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t"
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t"
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq	$8, %%mm1\n\t"
			"psllq	$16, %%mm2\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psllq	$8, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm5, %%mm3\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
			:"memory");
		d += 16;
		s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole conversion when MMX is unavailable) */
	while(s < end)
	{
#if 0 //slightly slower on athlon
		int bgr= *s++;
		*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
//FIXME this is very likely wrong for bigendian (and the following converters too)
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		/* big-endian byte order: pad byte first so the 32bit word matches */
		*d++ = 0;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = 0;
#endif

#endif
	}
}
996e1a7c 1270
0d9f3d85
A
/**
 * Convert packed 16bit BGR (RRRRRGGGGGGBBBBB, 5:6:5) to packed 32bit RGB,
 * one byte per component plus one pad byte written as 0.
 *
 * @param src      input buffer of 16bit pixels
 * @param dst      output buffer; must hold 2*src_size bytes
 * @param src_size size of the input in bytes (pixel count = src_size/2)
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
	/* mm7 stays 0 for the whole loop, used to zero-extend words to dwords */
	__asm __volatile("pxor	%%mm7,%%mm7\n\t":::"memory");
	mm_end = end - 3; /* 4 pixels per iteration; the remainder goes to the C loop below */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* split into B, G, R; note the 6bit green field (shift by 3/8
			   instead of the 2/7 used for 15bit input) */
			"movq	%1, %%mm0\n\t"
			"movq	%1, %%mm1\n\t"
			"movq	%1, %%mm2\n\t"
			"pand	%2, %%mm0\n\t"
			"pand	%3, %%mm1\n\t"
			"pand	%4, %%mm2\n\t"
			"psllq	$3, %%mm0\n\t"
			"psrlq	$3, %%mm1\n\t"
			"psrlq	$8, %%mm2\n\t"
			/* widen words to dwords and merge as B | G<<8 | R<<16 (pad byte = 0) */
			"movq	%%mm0, %%mm3\n\t"
			"movq	%%mm1, %%mm4\n\t"
			"movq	%%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t"
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t"
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq	$8, %%mm1\n\t"
			"psllq	$16, %%mm2\n\t"
			"por	%%mm1, %%mm0\n\t"
			"por	%%mm2, %%mm0\n\t"
			"psllq	$8, %%mm4\n\t"
			"psllq	$16, %%mm5\n\t"
			"por	%%mm4, %%mm3\n\t"
			"por	%%mm5, %%mm3\n\t"
			MOVNTQ"	%%mm0, %0\n\t"
			MOVNTQ"	%%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
			:"memory");
		d += 16;
		s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail (and the whole conversion when MMX is unavailable) */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		/* big-endian byte order: pad byte first so the 32bit word matches */
		*d++ = 0;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
		*d++ = 0;
#endif
	}
}
fcfbc150 1342
/**
 * Swap the R and B channels of packed 32bit pixels (RGB32 <-> BGR32).
 * The G and pad/alpha bytes keep their position; in the C fallback the
 * pad byte of dst is left untouched (not copied from src).
 *
 * @param src      input buffer of 32bit pixels
 * @param dst      output buffer of the same size
 * @param src_size size of the input in bytes (pixel count = src_size/4)
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
	asm volatile (
		"xorl %%eax, %%eax		\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%0, %%eax)		\n\t"
		"movq (%0, %%eax), %%mm0	\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"movq %%mm0, %%mm2		\n\t"
		/* move B up to R's slot and R down to B's slot, then mask and merge */
		"pslld $16, %%mm0		\n\t"
		"psrld $16, %%mm1		\n\t"
		"pand "MANGLE(mask32r)", %%mm0	\n\t"
		"pand "MANGLE(mask32g)", %%mm2	\n\t"
		"pand "MANGLE(mask32b)", %%mm1	\n\t"
		"por %%mm0, %%mm2		\n\t"
		"por %%mm1, %%mm2		\n\t"
		MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"
		:: "r" (src), "r"(dst), "r" (src_size-7)
		: "%eax"
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#else
	unsigned i;
	unsigned num_pixels = src_size >> 2;
	for(i=0; i<num_pixels; i++)
	{
#ifdef WORDS_BIGENDIAN
		/* on big-endian the pad byte sits at offset 0 of each pixel */
		dst[4*i + 1] = src[4*i + 3];
		dst[4*i + 2] = src[4*i + 2];
		dst[4*i + 3] = src[4*i + 1];
#else
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
#endif
	}
#endif
}
1389
74d35835
MN
/**
 * Swap the R and B channels of packed 24bit pixels (RGB24 <-> BGR24).
 *
 * The MMX path runs the loop with a negative index (%%eax starts at
 * 23-src_size and counts up to 0), converting 8 pixels (24 bytes) per
 * iteration; any remaining 1..7 pixels are redone by the C loop.
 *
 * @param src      input buffer of 24bit pixels
 * @param dst      output buffer of the same size
 * @param src_size size of the input in bytes (pixel count = src_size/3)
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	int mmx_size= 23 - src_size; /* negative loop counter; reaches 23 when done */
	asm volatile (
		"movq "MANGLE(mask24r)", %%mm5	\n\t"
		"movq "MANGLE(mask24g)", %%mm6	\n\t"
		"movq "MANGLE(mask24b)", %%mm7	\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%1, %%eax)		\n\t"
		/* each output qword is assembled from three overlapping input
		   reads, masked so every component lands in its swapped slot */
		"movq (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		"movq (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
		"movq 2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
		"psllq $16, %%mm0		\n\t" // 00 BGR BGR
		"pand %%mm5, %%mm0		\n\t"
		"pand %%mm6, %%mm1		\n\t"
		"pand %%mm7, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		MOVNTQ" %%mm1, (%2, %%eax)	\n\t" // RGB RGB RG
		"movq 8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
		"movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
		"pand %%mm7, %%mm0		\n\t"
		"pand %%mm5, %%mm1		\n\t"
		"pand %%mm6, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
		MOVNTQ" %%mm1, 8(%2, %%eax)	\n\t" // B RGB RGB R
		"movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
		"movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
		"pand %%mm6, %%mm0		\n\t"
		"pand %%mm7, %%mm1		\n\t"
		"pand %%mm5, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
		"addl $24, %%eax		\n\t"
		" js 1b				\n\t"
		: "+a" (mmx_size)
		: "r" (src-mmx_size), "r"(dst-mmx_size)
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");

	if(mmx_size==23) return; //finished, was multiple of 8

	/* rewind src/dst over the tail pixels the MMX loop did not cover */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	for(i=0; i<src_size; i+=3)
	{
		register uint8_t x;
		x          = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1456
/**
 * Interleave planar YUV into packed YUY2 (Y U Y V byte order).
 *
 * @param ysrc/usrc/vsrc    source planes
 * @param dst               packed output
 * @param width/height      luma dimensions; width is halved for chroma
 * @param lumStride         byte stride of the Y plane
 * @param chromStride       byte stride of the U and V planes
 * @param dstStride         byte stride of the packed output
 * @param vertLumPerChroma  luma lines per chroma line (2 for YV12, 1 for
 *                          YUV422P); must be a power of two — the chroma
 *                          pointers advance only every vertLumPerChroma lines
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%eax"
		);
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
		/* Alpha/MVI path: converts TWO luma lines per outer iteration
		   (note the extra y++ / pointer bumps at the end) */
#define pl2yuy2(n)					\
	y1 = yc[n];					\
	y2 = yc2[n];					\
	u = uc[n];					\
	v = vc[n];					\
	asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));	\
	asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));	\
	asm("unpkbl %1, %0" : "=r"(u) : "r"(u));	\
	asm("unpkbl %1, %0" : "=r"(v) : "r"(v));	\
	yuv1 = (u << 8) + (v << 24);			\
	yuv2 = yuv1 + y2;				\
	yuv1 += y1;					\
	qdst[n] = yuv1;					\
	qdst2[n] = yuv2;

		int i;
		uint64_t *qdst = (uint64_t *) dst;
		uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
		const uint32_t *yc = (uint32_t *) ysrc;
		const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
		const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
		for(i = 0; i < chromWidth; i += 8){
			uint64_t y1, y2, yuv1, yuv2;
			uint64_t u, v;
			/* Prefetch */
			asm("ldq $31,64(%0)" :: "r"(yc));
			asm("ldq $31,64(%0)" :: "r"(yc2));
			asm("ldq $31,64(%0)" :: "r"(uc));
			asm("ldq $31,64(%0)" :: "r"(vc));

			pl2yuy2(0);
			pl2yuy2(1);
			pl2yuy2(2);
			pl2yuy2(3);

			yc += 4;
			yc2 += 4;
			uc += 4;
			vc += 4;
			qdst += 4;
			qdst2 += 4;
		}
		y++;
		ysrc += lumStride;
		dst += dstStride;

#elif __WORDSIZE >= 64
		/* build two packed pixels at a time in a 64bit word
		   (little-endian layout — NOTE(review): looks LE-only, like the
		   32bit path's non-bigendian branch; confirm on 64bit BE targets) */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
			l = yc[2] + (uc[1] << 8) +
			    (yc[3] << 16) + (vc[1] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
				(yc[1] << 8) + (vc[0] << 0);
#else
			*idst++ = yc[0] + (uc[0] << 8) +
				(yc[1] << 16) + (vc[0] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma only once per vertLumPerChroma luma lines */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1596
dabcdbc4
MN
/**
 * Convert planar YV12 to packed YUY2.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Thin wrapper: YV12 has one chroma line per two luma lines, hence
 * vertLumPerChroma == 2.
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride)
{
	//FIXME interpolate chroma
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1609
/**
 * Interleave planar YUV into packed UYVY (U Y V Y byte order).
 * Same structure as yuvPlanartoyuy2() with the pack order swapped:
 * chroma bytes come first in each pair.
 *
 * @param vertLumPerChroma  luma lines per chroma line (power of two);
 *                          chroma pointers advance only every that many lines
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm0, %%mm4		\n\t" // Y(0)
			"movq %%mm2, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm3, %%mm0		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm3, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm5, %%mm2		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm5, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm0, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm2, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%eax"
		);
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
		/* build two packed pixels at a time in a 64bit word (LE layout) */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = uc[0] + (yc[0] << 8) +
			    (vc[0] << 16) + (yc[1] << 24);
			l = uc[1] + (yc[2] << 8) +
			    (vc[1] << 16) + (yc[3] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
				(vc[0] << 8) + (yc[1] << 0);
#else
			*idst++ = uc[0] + (yc[0] << 8) +
				(vc[0] << 16) + (yc[1] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma only once per vertLumPerChroma luma lines */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1703
/**
 * Convert planar YV12 to packed UYVY.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Thin wrapper: YV12 has one chroma line per two luma lines, hence
 * vertLumPerChroma == 2.
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride)
{
	//FIXME interpolate chroma
	RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1716
/**
 * Convert planar YUV422P to packed YUY2.
 *
 * Width should be a multiple of 16.
 * Thin wrapper: 4:2:2 has one chroma line per luma line, hence
 * vertLumPerChroma == 1.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride)
{
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1727
/**
 * Convert packed YUY2 to planar YV12.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Processes two source lines per iteration: the first line supplies both
 * luma and chroma, the second line supplies luma only (its chroma samples
 * are dropped, not averaged — see the FIXME on the 4:2:0 converters).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* line 0: split YUYV into Y plane + U/V planes */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* line 1: luma only (mm7 still holds the FF,00 mask from above) */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
        	SFENCE" \n\t"
        	:::"memory");
#endif
}
81c0590e 1852
d661d18d
AB
1853static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1854 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
f0b62bbd 1855 unsigned int width, unsigned int height, int lumStride, int chromStride)
d661d18d
AB
1856{
1857 /* Y Plane */
1858 memcpy(ydst, ysrc, width*height);
1859
1860 /* XXX: implement upscaling for U,V */
1861}
1862
b241cbf2
MN
/**
 * Upscale one plane by 2x in both directions using bilinear-style 3:1
 * interpolation (each output sample is (3*a + b)/4 of its two nearest
 * source samples).
 *
 * The MMX2/3DNow path approximates (3a+b)/4 with two chained PAVGB
 * operations and handles srcWidth&~15 columns; the C loop finishes the
 * remainder and also handles the first/last column and first/last line
 * special cases.
 *
 * @param src        source plane
 * @param dst        destination plane (2*srcWidth x 2*srcHeight)
 * @param srcStride  byte stride of src
 * @param dstStride  byte stride of dst
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;

	dst[0]= src[0];

	// first line
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];

	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const int mmxSize= srcWidth&~15;
		asm volatile(
			"movl %4, %%eax			\n\t"
			"1:				\n\t"
			"movq (%0, %%eax), %%mm0	\n\t"
			"movq (%1, %%eax), %%mm1	\n\t"
			"movq 1(%0, %%eax), %%mm2	\n\t"
			"movq 1(%1, %%eax), %%mm3	\n\t"
			"movq -1(%0, %%eax), %%mm4	\n\t"
			"movq -1(%1, %%eax), %%mm5	\n\t"
			/* two chained pavg's give the (3a+b+rounding)/4 weighting */
			PAVGB" %%mm0, %%mm5		\n\t"
			PAVGB" %%mm0, %%mm3		\n\t"
			PAVGB" %%mm0, %%mm5		\n\t"
			PAVGB" %%mm0, %%mm3		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm1, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm1, %%mm2		\n\t"
			"movq %%mm5, %%mm7		\n\t"
			"movq %%mm4, %%mm6		\n\t"
			"punpcklbw %%mm3, %%mm5		\n\t"
			"punpckhbw %%mm3, %%mm7		\n\t"
			"punpcklbw %%mm2, %%mm4		\n\t"
			"punpckhbw %%mm2, %%mm6		\n\t"
#if 1
			MOVNTQ" %%mm5, (%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm7, 8(%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm4, (%3, %%eax, 2)	\n\t"
			MOVNTQ" %%mm6, 8(%3, %%eax, 2)	\n\t"
#else
			"movq %%mm5, (%2, %%eax, 2)	\n\t"
			"movq %%mm7, 8(%2, %%eax, 2)	\n\t"
			"movq %%mm4, (%3, %%eax, 2)	\n\t"
			"movq %%mm6, 8(%3, %%eax, 2)	\n\t"
#endif
			"addl $8, %%eax			\n\t"
			" js 1b				\n\t"
			:: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"

		);
#else
		const int mmxSize=1; /* no SIMD: C loop below does the whole row */
#endif
		/* first output column of this row pair */
		dst[0        ]= (3*src[0] +   src[srcStride])>>2;
		dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

		for(x=mmxSize-1; x<srcWidth-1; x++){
			dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
			dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
		}
		/* last output column of this row pair */
		dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
		dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

		dst+=dstStride*2;
		src+=srcStride;
	}

	// last line
#if 1
	dst[0]= src[0];

	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
#else
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#endif

#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1964
81c0590e
A
/**
 * Convert packed UYVY to planar YV12.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line, others are
 * ignored. FIXME: write HQ version.
 * Mirror of yuy2toyv12() with the byte roles swapped: in UYVY the chroma
 * bytes are the even ones (pand) and the luma bytes the odd ones (psrlw).
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* line 0: split UYVY into Y plane + U/V planes */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2		\n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0		\n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4		\n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* line 1: luma only */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			udst[i] 	= src[4*i+0];
			ydst[2*i+0] 	= src[4*i+1];
			vdst[i] 	= src[4*i+2];
			ydst[2*i+1] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
2090
1de97d84
MN
2091/**
2092 *
2093 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2094 * problem for anyone then tell me, and ill fix it)
21316f3c 2095 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1de97d84
MN
2096 */
2097static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2098 unsigned int width, unsigned int height,
f0b62bbd 2099 int lumStride, int chromStride, int srcStride)
1de97d84 2100{
0d9f3d85
A
2101 unsigned y;
2102 const unsigned chromWidth= width>>1;
21316f3c
MN
2103#ifdef HAVE_MMX
2104 for(y=0; y<height-2; y+=2)
2105 {
0d9f3d85 2106 unsigned i;
21316f3c
MN
2107 for(i=0; i<2; i++)
2108 {
2109 asm volatile(
2110 "movl %2, %%eax \n\t"
854288bb
FB
2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2112 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c
MN
2113 "pxor %%mm7, %%mm7 \n\t"
2114 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2115 ".balign 16 \n\t"
2116 "1: \n\t"
2117 PREFETCH" 64(%0, %%ebx) \n\t"
2118 "movd (%0, %%ebx), %%mm0 \n\t"
2119 "movd 3(%0, %%ebx), %%mm1 \n\t"
2120 "punpcklbw %%mm7, %%mm0 \n\t"
2121 "punpcklbw %%mm7, %%mm1 \n\t"
2122 "movd 6(%0, %%ebx), %%mm2 \n\t"
2123 "movd 9(%0, %%ebx), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm3 \n\t"
2126 "pmaddwd %%mm6, %%mm0 \n\t"
2127 "pmaddwd %%mm6, %%mm1 \n\t"
2128 "pmaddwd %%mm6, %%mm2 \n\t"
2129 "pmaddwd %%mm6, %%mm3 \n\t"
2130#ifndef FAST_BGR2YV12
2131 "psrad $8, %%mm0 \n\t"
2132 "psrad $8, %%mm1 \n\t"
2133 "psrad $8, %%mm2 \n\t"
2134 "psrad $8, %%mm3 \n\t"
2135#endif
2136 "packssdw %%mm1, %%mm0 \n\t"
2137 "packssdw %%mm3, %%mm2 \n\t"
2138 "pmaddwd %%mm5, %%mm0 \n\t"
2139 "pmaddwd %%mm5, %%mm2 \n\t"
2140 "packssdw %%mm2, %%mm0 \n\t"
2141 "psraw $7, %%mm0 \n\t"
2142
2143 "movd 12(%0, %%ebx), %%mm4 \n\t"
2144 "movd 15(%0, %%ebx), %%mm1 \n\t"
2145 "punpcklbw %%mm7, %%mm4 \n\t"
2146 "punpcklbw %%mm7, %%mm1 \n\t"
2147 "movd 18(%0, %%ebx), %%mm2 \n\t"
2148 "movd 21(%0, %%ebx), %%mm3 \n\t"
2149 "punpcklbw %%mm7, %%mm2 \n\t"
2150 "punpcklbw %%mm7, %%mm3 \n\t"
2151 "pmaddwd %%mm6, %%mm4 \n\t"
2152 "pmaddwd %%mm6, %%mm1 \n\t"
2153 "pmaddwd %%mm6, %%mm2 \n\t"
2154 "pmaddwd %%mm6, %%mm3 \n\t"
2155#ifndef FAST_BGR2YV12
2156 "psrad $8, %%mm4 \n\t"
2157 "psrad $8, %%mm1 \n\t"
2158 "psrad $8, %%mm2 \n\t"
2159 "psrad $8, %%mm3 \n\t"
2160#endif
2161 "packssdw %%mm1, %%mm4 \n\t"
2162 "packssdw %%mm3, %%mm2 \n\t"
2163 "pmaddwd %%mm5, %%mm4 \n\t"
2164 "pmaddwd %%mm5, %%mm2 \n\t"
2165 "addl $24, %%ebx \n\t"
2166 "packssdw %%mm2, %%mm4 \n\t"
2167 "psraw $7, %%mm4 \n\t"
2168
2169 "packuswb %%mm4, %%mm0 \n\t"
854288bb 2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
21316f3c
MN
2171
2172 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2173 "addl $8, %%eax \n\t"
2174 " js 1b \n\t"
2175 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2176 : "%eax", "%ebx"
2177 );
2178 ydst += lumStride;
2179 src += srcStride;
2180 }
2181 src -= srcStride*2;
2182 asm volatile(
2183 "movl %4, %%eax \n\t"
854288bb
FB
2184 "movq "MANGLE(w1111)", %%mm5 \n\t"
2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
21316f3c
MN
2186 "pxor %%mm7, %%mm7 \n\t"
2187 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2188 "addl %%ebx, %%ebx \n\t"
2189 ".balign 16 \n\t"
2190 "1: \n\t"
2191 PREFETCH" 64(%0, %%ebx) \n\t"
2192 PREFETCH" 64(%1, %%ebx) \n\t"
2193#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2194 "movq (%0, %%ebx), %%mm0 \n\t"
2195 "movq (%1, %%ebx), %%mm1 \n\t"
2196 "movq 6(%0, %%ebx), %%mm2 \n\t"
2197 "movq 6(%1, %%ebx), %%mm3 \n\t"
2198 PAVGB" %%mm1, %%mm0 \n\t"
2199 PAVGB" %%mm3, %%mm2 \n\t"
2200 "movq %%mm0, %%mm1 \n\t"
2201 "movq %%mm2, %%mm3 \n\t"
2202 "psrlq $24, %%mm0 \n\t"
2203 "psrlq $24, %%mm2 \n\t"
2204 PAVGB" %%mm1, %%mm0 \n\t"
2205 PAVGB" %%mm3, %%mm2 \n\t"
2206 "punpcklbw %%mm7, %%mm0 \n\t"
2207 "punpcklbw %%mm7, %%mm2 \n\t"
2208#else
2209 "movd (%0, %%ebx), %%mm0 \n\t"
2210 "movd (%1, %%ebx), %%mm1 \n\t"
2211 "movd 3(%0, %%ebx), %%mm2 \n\t"
2212 "movd 3(%1, %%ebx), %%mm3 \n\t"
2213 "punpcklbw %%mm7, %%mm0 \n\t"
2214 "punpcklbw %%mm7, %%mm1 \n\t"
2215 "punpcklbw %%mm7, %%mm2 \n\t"
2216 "punpcklbw %%mm7, %%mm3 \n\t"
2217 "paddw %%mm1, %%mm0 \n\t"
2218 "paddw %%mm3, %%mm2 \n\t"
2219 "paddw %%mm2, %%mm0 \n\t"
2220 "movd 6(%0, %%ebx), %%mm4 \n\t"
2221 "movd 6(%1, %%ebx), %%mm1 \n\t"
2222 "movd 9(%0, %%ebx), %%mm2 \n\t"
2223 "movd 9(%1, %%ebx), %%mm3 \n\t"
2224 "punpcklbw %%mm7, %%mm4 \n\t"
2225 "punpcklbw %%mm7, %%mm1 \n\t"
2226 "punpcklbw %%mm7, %%mm2 \n\t"
2227 "punpcklbw %%mm7, %%mm3 \n\t"
2228 "paddw %%mm1, %%mm4 \n\t"
2229 "paddw %%mm3, %%mm2 \n\t"
2230 "paddw %%mm4, %%mm2 \n\t"
2231 "psrlw $2, %%mm0 \n\t"
2232 "psrlw $2, %%mm2 \n\t"
2233#endif
854288bb
FB
2234 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2235 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
2236
2237 "pmaddwd %%mm0, %%mm1 \n\t"
2238 "pmaddwd %%mm2, %%mm3 \n\t"
2239 "pmaddwd %%mm6, %%mm0 \n\t"
2240 "pmaddwd %%mm6, %%mm2 \n\t"
2241#ifndef FAST_BGR2YV12
2242 "psrad $8, %%mm0 \n\t"
2243 "psrad $8, %%mm1 \n\t"
2244 "psrad $8, %%mm2 \n\t"
2245 "psrad $8, %%mm3 \n\t"
2246#endif
2247 "packssdw %%mm2, %%mm0 \n\t"
2248 "packssdw %%mm3, %%mm1 \n\t"
2249 "pmaddwd %%mm5, %%mm0 \n\t"
2250 "pmaddwd %%mm5, %%mm1 \n\t"
2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2252 "psraw $7, %%mm0 \n\t"
2253
2254#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2255 "movq 12(%0, %%ebx), %%mm4 \n\t"
2256 "movq 12(%1, %%ebx), %%mm1 \n\t"
2257 "movq 18(%0, %%ebx), %%mm2 \n\t"
2258 "movq 18(%1, %%ebx), %%mm3 \n\t"
2259 PAVGB" %%mm1, %%mm4 \n\t"
2260 PAVGB" %%mm3, %%mm2 \n\t"
2261 "movq %%mm4, %%mm1 \n\t"
2262 "movq %%mm2, %%mm3 \n\t"
2263 "psrlq $24, %%mm4 \n\t"
2264 "psrlq $24, %%mm2 \n\t"
2265 PAVGB" %%mm1, %%mm4 \n\t"
2266 PAVGB" %%mm3, %%mm2 \n\t"
2267 "punpcklbw %%mm7, %%mm4 \n\t"
2268 "punpcklbw %%mm7, %%mm2 \n\t"
2269#else
2270 "movd 12(%0, %%ebx), %%mm4 \n\t"
2271 "movd 12(%1, %%ebx), %%mm1 \n\t"
2272 "movd 15(%0, %%ebx), %%mm2 \n\t"
2273 "movd 15(%1, %%ebx), %%mm3 \n\t"
2274 "punpcklbw %%mm7, %%mm4 \n\t"
2275 "punpcklbw %%mm7, %%mm1 \n\t"
2276 "punpcklbw %%mm7, %%mm2 \n\t"
2277 "punpcklbw %%mm7, %%mm3 \n\t"
2278 "paddw %%mm1, %%mm4 \n\t"
2279 "paddw %%mm3, %%mm2 \n\t"
2280 "paddw %%mm2, %%mm4 \n\t"
2281 "movd 18(%0, %%ebx), %%mm5 \n\t"
2282 "movd 18(%1, %%ebx), %%mm1 \n\t"
2283 "movd 21(%0, %%ebx), %%mm2 \n\t"
2284 "movd 21(%1, %%ebx), %%mm3 \n\t"
2285 "punpcklbw %%mm7, %%mm5 \n\t"
2286 "punpcklbw %%mm7, %%mm1 \n\t"
2287 "punpcklbw %%mm7, %%mm2 \n\t"
2288 "punpcklbw %%mm7, %%mm3 \n\t"
2289 "paddw %%mm1, %%mm5 \n\t"
2290 "paddw %%mm3, %%mm2 \n\t"
2291 "paddw %%mm5, %%mm2 \n\t"
854288bb 2292 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c
MN
2293 "psrlw $2, %%mm4 \n\t"
2294 "psrlw $2, %%mm2 \n\t"
2295#endif
854288bb
FB
2296 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2297 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
2298
2299 "pmaddwd %%mm4, %%mm1 \n\t"
2300 "pmaddwd %%mm2, %%mm3 \n\t"
2301 "pmaddwd %%mm6, %%mm4 \n\t"
2302 "pmaddwd %%mm6, %%mm2 \n\t"
2303#ifndef FAST_BGR2YV12
2304 "psrad $8, %%mm4 \n\t"
2305 "psrad $8, %%mm1 \n\t"
2306 "psrad $8, %%mm2 \n\t"
2307 "psrad $8, %%mm3 \n\t"
2308#endif
2309 "packssdw %%mm2, %%mm4 \n\t"
2310 "packssdw %%mm3, %%mm1 \n\t"
2311 "pmaddwd %%mm5, %%mm4 \n\t"
2312 "pmaddwd %%mm5, %%mm1 \n\t"
2313 "addl $24, %%ebx \n\t"
2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2315 "psraw $7, %%mm4 \n\t"
2316
2317 "movq %%mm0, %%mm1 \n\t"
2318 "punpckldq %%mm4, %%mm0 \n\t"
2319 "punpckhdq %%mm4, %%mm1 \n\t"
2320 "packsswb %%mm1, %%mm0 \n\t"
854288bb 2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
21316f3c
MN
2322
2323 "movd %%mm0, (%2, %%eax) \n\t"
2324 "punpckhdq %%mm0, %%mm0 \n\t"
2325 "movd %%mm0, (%3, %%eax) \n\t"
2326 "addl $4, %%eax \n\t"
2327 " js 1b \n\t"
4ccbc7d5 2328 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
21316f3c
MN
2329 : "%eax", "%ebx"
2330 );
2331
2332 udst += chromStride;
2333 vdst += chromStride;
2334 src += srcStride*2;
2335 }
2336
2337 asm volatile( EMMS" \n\t"
2338 SFENCE" \n\t"
2339 :::"memory");
2340#else
2341 y=0;
2342#endif
2343 for(; y<height; y+=2)
1de97d84 2344 {
0d9f3d85 2345 unsigned i;
1de97d84
MN
2346 for(i=0; i<chromWidth; i++)
2347 {
2348 unsigned int b= src[6*i+0];
2349 unsigned int g= src[6*i+1];
2350 unsigned int r= src[6*i+2];
2351
aa21f0c3
MN
2352 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2353 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2354 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
81c0590e 2355
1de97d84
MN
2356 udst[i] = U;
2357 vdst[i] = V;
2358 ydst[2*i] = Y;
2359
2360 b= src[6*i+3];
2361 g= src[6*i+4];
2362 r= src[6*i+5];
2363
aa21f0c3 2364 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2365 ydst[2*i+1] = Y;
2366 }
2367 ydst += lumStride;
2368 src += srcStride;
2369
2370 for(i=0; i<chromWidth; i++)
2371 {
2372 unsigned int b= src[6*i+0];
2373 unsigned int g= src[6*i+1];
2374 unsigned int r= src[6*i+2];
2375
aa21f0c3 2376 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2377
2378 ydst[2*i] = Y;
2379
2380 b= src[6*i+3];
2381 g= src[6*i+4];
2382 r= src[6*i+5];
2383
aa21f0c3 2384 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2385 ydst[2*i+1] = Y;
2386 }
2387 udst += chromStride;
2388 vdst += chromStride;
2389 ydst += lumStride;
2390 src += srcStride;
2391 }
2392}
5d55fdb4
MN
2393
2394void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
f0b62bbd
MN
2395 unsigned width, unsigned height, int src1Stride,
2396 int src2Stride, int dstStride){
0d9f3d85 2397 unsigned h;
5d55fdb4
MN
2398
2399 for(h=0; h < height; h++)
2400 {
0d9f3d85 2401 unsigned w;
5d55fdb4
MN
2402
2403#ifdef HAVE_MMX
2404#ifdef HAVE_SSE2
2405 asm(
2406 "xorl %%eax, %%eax \n\t"
2407 "1: \n\t"
2408 PREFETCH" 64(%1, %%eax) \n\t"
2409 PREFETCH" 64(%2, %%eax) \n\t"
2410 "movdqa (%1, %%eax), %%xmm0 \n\t"
2411 "movdqa (%1, %%eax), %%xmm1 \n\t"
2412 "movdqa (%2, %%eax), %%xmm2 \n\t"
2413 "punpcklbw %%xmm2, %%xmm0 \n\t"
2414 "punpckhbw %%xmm2, %%xmm1 \n\t"
2415 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2416 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2417 "addl $16, %%eax \n\t"
2418 "cmpl %3, %%eax \n\t"
2419 " jb 1b \n\t"
2420 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2421 : "memory", "%eax"
2422 );
2423#else
2424 asm(
2425 "xorl %%eax, %%eax \n\t"
2426 "1: \n\t"
2427 PREFETCH" 64(%1, %%eax) \n\t"
2428 PREFETCH" 64(%2, %%eax) \n\t"
2429 "movq (%1, %%eax), %%mm0 \n\t"
2430 "movq 8(%1, %%eax), %%mm2 \n\t"
2431 "movq %%mm0, %%mm1 \n\t"
2432 "movq %%mm2, %%mm3 \n\t"
2433 "movq (%2, %%eax), %%mm4 \n\t"
2434 "movq 8(%2, %%eax), %%mm5 \n\t"
2435 "punpcklbw %%mm4, %%mm0 \n\t"
2436 "punpckhbw %%mm4, %%mm1 \n\t"
2437 "punpcklbw %%mm5, %%mm2 \n\t"
2438 "punpckhbw %%mm5, %%mm3 \n\t"
2439 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2440 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2441 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2442 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2443 "addl $16, %%eax \n\t"
2444 "cmpl %3, %%eax \n\t"
2445 " jb 1b \n\t"
2446 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2447 : "memory", "%eax"
2448 );
2449#endif
2450 for(w= (width&(~15)); w < width; w++)
2451 {
2452 dest[2*w+0] = src1[w];
2453 dest[2*w+1] = src2[w];
2454 }
2455#else
2456 for(w=0; w < width; w++)
2457 {
2458 dest[2*w+0] = src1[w];
2459 dest[2*w+1] = src2[w];
2460 }
2461#endif
2462 dest += dstStride;
2463 src1 += src1Stride;
2464 src2 += src2Stride;
2465 }
2466#ifdef HAVE_MMX
2467 asm(
2468 EMMS" \n\t"
2469 SFENCE" \n\t"
2470 ::: "memory"
2471 );
2472#endif
2473}
ac4d0aea
MN
2474
2475static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2476 uint8_t *dst1, uint8_t *dst2,
2477 unsigned width, unsigned height,
f0b62bbd
MN
2478 int srcStride1, int srcStride2,
2479 int dstStride1, int dstStride2)
ac4d0aea 2480{
f0b62bbd
MN
2481 unsigned int y,x,h;
2482 int w;
ac4d0aea
MN
2483 w=width/2; h=height/2;
2484#ifdef HAVE_MMX
2485 asm volatile(
2486 PREFETCH" %0\n\t"
2487 PREFETCH" %1\n\t"
2488 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2489#endif
2490 for(y=0;y<h;y++){
2491 const uint8_t* s1=src1+srcStride1*(y>>1);
2492 uint8_t* d=dst1+dstStride1*y;
2493 x=0;
2494#ifdef HAVE_MMX
f0b62bbd 2495 for(;x<w-31;x+=32)
ac4d0aea
MN
2496 {
2497 asm volatile(
2498 PREFETCH" 32%1\n\t"
2499 "movq %1, %%mm0\n\t"
2500 "movq 8%1, %%mm2\n\t"
2501 "movq 16%1, %%mm4\n\t"
2502 "movq 24%1, %%mm6\n\t"
2503 "movq %%mm0, %%mm1\n\t"
2504 "movq %%mm2, %%mm3\n\t"
2505 "movq %%mm4, %%mm5\n\t"
2506 "movq %%mm6, %%mm7\n\t"
2507 "punpcklbw %%mm0, %%mm0\n\t"
2508 "punpckhbw %%mm1, %%mm1\n\t"
2509 "punpcklbw %%mm2, %%mm2\n\t"
2510 "punpckhbw %%mm3, %%mm3\n\t"
2511 "punpcklbw %%mm4, %%mm4\n\t"
2512 "punpckhbw %%mm5, %%mm5\n\t"
2513 "punpcklbw %%mm6, %%mm6\n\t"
2514 "punpckhbw %%mm7, %%mm7\n\t"
2515 MOVNTQ" %%mm0, %0\n\t"
2516 MOVNTQ" %%mm1, 8%0\n\t"
2517 MOVNTQ" %%mm2, 16%0\n\t"
2518 MOVNTQ" %%mm3, 24%0\n\t"
2519 MOVNTQ" %%mm4, 32%0\n\t"
2520 MOVNTQ" %%mm5, 40%0\n\t"
2521 MOVNTQ" %%mm6, 48%0\n\t"
2522 MOVNTQ" %%mm7, 56%0"
2523 :"=m"(d[2*x])
2524 :"m"(s1[x])
2525 :"memory");
2526 }
2527#endif
2528 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2529 }
2530 for(y=0;y<h;y++){
2531 const uint8_t* s2=src2+srcStride2*(y>>1);
2532 uint8_t* d=dst2+dstStride2*y;
2533 x=0;
2534#ifdef HAVE_MMX
f0b62bbd 2535 for(;x<w-31;x+=32)
ac4d0aea
MN
2536 {
2537 asm volatile(
2538 PREFETCH" 32%1\n\t"
2539 "movq %1, %%mm0\n\t"
2540 "movq 8%1, %%mm2\n\t"
2541 "movq 16%1, %%mm4\n\t"
2542 "movq 24%1, %%mm6\n\t"
2543 "movq %%mm0, %%mm1\n\t"
2544 "movq %%mm2, %%mm3\n\t"
2545 "movq %%mm4, %%mm5\n\t"
2546 "movq %%mm6, %%mm7\n\t"
2547 "punpcklbw %%mm0, %%mm0\n\t"
2548 "punpckhbw %%mm1, %%mm1\n\t"
2549 "punpcklbw %%mm2, %%mm2\n\t"
2550 "punpckhbw %%mm3, %%mm3\n\t"
2551 "punpcklbw %%mm4, %%mm4\n\t"
2552 "punpckhbw %%mm5, %%mm5\n\t"
2553 "punpcklbw %%mm6, %%mm6\n\t"
2554 "punpckhbw %%mm7, %%mm7\n\t"
2555 MOVNTQ" %%mm0, %0\n\t"
2556 MOVNTQ" %%mm1, 8%0\n\t"
2557 MOVNTQ" %%mm2, 16%0\n\t"
2558 MOVNTQ" %%mm3, 24%0\n\t"
2559 MOVNTQ" %%mm4, 32%0\n\t"
2560 MOVNTQ" %%mm5, 40%0\n\t"
2561 MOVNTQ" %%mm6, 48%0\n\t"
2562 MOVNTQ" %%mm7, 56%0"
2563 :"=m"(d[2*x])
2564 :"m"(s2[x])
2565 :"memory");
2566 }
2567#endif
2568 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2569 }
2570#ifdef HAVE_MMX
2571 asm(
2572 EMMS" \n\t"
2573 SFENCE" \n\t"
2574 ::: "memory"
2575 );
2576#endif
2577}
2578
2579static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2580 uint8_t *dst,
2581 unsigned width, unsigned height,
f0b62bbd
MN
2582 int srcStride1, int srcStride2,
2583 int srcStride3, int dstStride)
ac4d0aea 2584{
4596673c 2585 unsigned y,x,w,h;
ac4d0aea 2586 w=width/2; h=height;
ac4d0aea
MN
2587 for(y=0;y<h;y++){
2588 const uint8_t* yp=src1+srcStride1*y;
2589 const uint8_t* up=src2+srcStride2*(y>>2);
2590 const uint8_t* vp=src3+srcStride3*(y>>2);
2591 uint8_t* d=dst+dstStride*y;
ac4d0aea
MN
2592 x=0;
2593#ifdef HAVE_MMX
4596673c 2594 for(;x<w-7;x+=8)
ac4d0aea
MN
2595 {
2596 asm volatile(
4596673c
MN
2597 PREFETCH" 32(%1, %0)\n\t"
2598 PREFETCH" 32(%2, %0)\n\t"
2599 PREFETCH" 32(%3, %0)\n\t"
2600 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2601 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2602 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
ac4d0aea
MN
2603 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2604 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2605 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2606 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2607 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2608 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2609 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2610
2611 "movq %%mm1, %%mm6\n\t"
2612 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2613 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2614 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
4596673c
MN
2615 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2616 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
ac4d0aea
MN
2617
2618 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
4596673c 2619 "movq 8(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2620 "movq %%mm0, %%mm3\n\t"
2621 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2622 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
4596673c
MN
2623 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2624 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
ac4d0aea
MN
2625
2626 "movq %%mm4, %%mm6\n\t"
4596673c 2627 "movq 16(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2628 "movq %%mm0, %%mm3\n\t"
2629 "punpcklbw %%mm5, %%mm4\n\t"
2630 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2631 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
4596673c
MN
2632 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2633 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
ac4d0aea
MN
2634
2635 "punpckhbw %%mm5, %%mm6\n\t"
4596673c 2636 "movq 24(%1, %0, 4), %%mm0\n\t"
ac4d0aea
MN
2637 "movq %%mm0, %%mm3\n\t"
2638 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2639 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
4596673c
MN
2640 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2641 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
ac4d0aea 2642
4596673c
MN
2643 : "+r" (x)
2644 : "r"(yp), "r" (up), "r"(vp), "r"(d)
ac4d0aea
MN
2645 :"memory");
2646 }
2647#endif
4596673c 2648 for(; x<w; x++)
ac4d0aea 2649 {
4596673c 2650 const int x2= x<<2;
ac4d0aea
MN
2651 d[8*x+0]=yp[x2];
2652 d[8*x+1]=up[x];
2653 d[8*x+2]=yp[x2+1];
2654 d[8*x+3]=vp[x];
2655 d[8*x+4]=yp[x2+2];
2656 d[8*x+5]=up[x];
2657 d[8*x+6]=yp[x2+3];
2658 d[8*x+7]=vp[x];
2659 }
2660 }
2661#ifdef HAVE_MMX
2662 asm(
2663 EMMS" \n\t"
2664 SFENCE" \n\t"
2665 ::: "memory"
2666 );
2667#endif
2668}