fixing RGB32->RGB16 on big endian patch by (Colin Leroy <colin at colino dot net>)
[libav.git] / postproc / rgb2rgb_template.c
CommitLineData
fcfbc150 1/*
a3aece93
NK
2 *
3 * rgb2rgb.c, Software RGB to RGB convertor
6611aa83
NK
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
a3aece93 7 * Written by Nick Kurshev.
1de97d84 8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
a3aece93 9 */
a3aece93 10
0d9f3d85
A
11#include <stddef.h>
12#include <inttypes.h> /* for __WORDSIZE */
13
14#ifndef __WORDSIZE
ff78c596
A
15// #warning You have misconfigured system and probably will lose performance!
16#define __WORDSIZE MP_WORDSIZE
0d9f3d85
A
17#endif
18
1de97d84
MN
/*
 * Helper macros holding the instruction mnemonics that get pasted into the
 * inline assembly of the conversion routines below.  Any earlier definitions
 * are dropped first (presumably because this template is included more than
 * once with different HAVE_* settings -- confirm against the including file).
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* Size in bytes of one multimedia register. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and packed byte-average mnemonics.  Without 3DNow! or MMX2 the
 * prefetches become "/nop" (presumably ignored by the assembler -- confirm)
 * and PAVGB is left undefined.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* On K6, femms is faster than emms. On K7, femms is mapped directly onto emms. */
#if defined(HAVE_3DNOW)
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Store and store-fence mnemonics; the non-temporal forms need MMX2. */
#if !defined(HAVE_MMX2)
#define MOVNTQ "movq"
#define SFENCE "/nop"
#else
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#endif
60
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Every group of three source bytes is copied unchanged and followed by a
 * zero byte.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, receives 4 bytes per source pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /*
     * One MMX iteration consumes 24 source bytes, but its last load
     * ("punpckldq 21%1") fetches a whole dword, i.e. bytes 21..24.  Require
     * at least 25 readable bytes so that load never runs past the buffer;
     * the scalar loop below finishes whatever is left.
     */
    mm_end = end - 24;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while (s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: the whole buffer without MMX, otherwise the remainder. */
    while (s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
59ac5a93 111
/**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every 4-byte source group are copied; the fourth
 * byte is dropped.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, receives 3 bytes per source pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *in = src;
    uint8_t *out = dst;
    const uint8_t *end = src + src_size;
#ifdef HAVE_MMX
    /* The MMX loop eats 32 source bytes and emits 24 per iteration. */
    const uint8_t *mm_end = end - 31;

    __asm __volatile(PREFETCH" %0"::"m"(*in):"memory");
    for (; in < mm_end; in += 32, out += 24)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*out)
            :"m"(*in),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: keep three bytes, skip the fourth. */
    for (; in < end; in += 4)
    {
        *out++ = in[0];
        *out++ = in[1];
        *out++ = in[2];
    }
}
b238eb2e 188
a3aece93
NK
189/*
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
51da31f1 192 MMX2, 3DNOW optimization by Nick Kurshev
9b2c28e6 193 32bit c version, and and&add trick by Michael Niedermayer
a3aece93 194*/
/**
 * Convert 16-bit words from a 15-bit layout to a 16-bit layout.
 *
 * For every 16-bit word x the result is (x & 0x7FFF) + (x & 0x7FE0): the low
 * five bits stay in place and bits 5..14 are doubled, i.e. moved up by one
 * position (the "and & add" trick).
 *
 * @param src      source buffer of 16-bit words
 * @param dst      destination buffer, same size as the source
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /*
         * The asm reads 16 bytes at s and writes 16 bytes at d, while the
         * operands only describe one byte each; the "memory" clobber tells
         * the compiler about the full access, as the other routines in this
         * file already do.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            :"memory"
        );
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time through one 32-bit word. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
    }
    /* At most one pixel is left. */
    if (s < end)
    {
        register unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
fcfbc150 243
d661d18d
AB
/**
 * Swap the first and third byte of every 3-byte pixel.
 *
 * The previous loop compared a byte index that advanced by three against the
 * pixel count, so only about one third of the buffer was converted.  The loop
 * now walks every complete pixel; one or two trailing bytes that do not form
 * a whole pixel are left untouched.
 *
 * All three bytes are loaded before any is stored, so the conversion also
 * works when src and dst are the same buffer.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, same size as the source
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const unsigned num_pixels = src_size / 3;
    unsigned p;

    for (p = 0; p < num_pixels; p++)
    {
        const uint8_t *in = src + 3 * (size_t)p;
        uint8_t *out = dst + 3 * (size_t)p;
        const uint8_t c0 = in[0];
        const uint8_t c1 = in[1];
        const uint8_t c2 = in[2];

        out[0] = c2;
        out[1] = c1;
        out[2] = c0;
    }
}
254
ac4d0aea
MN
/**
 * Convert 16-bit words from a 16-bit layout to a 15-bit layout.
 *
 * For every 16-bit word x the result is ((x >> 1) & 0x7FE0) | (x & 0x001F):
 * the low five bits stay in place, bits 6..15 move down by one position and
 * bit 5 is discarded.
 *
 * @param src      source buffer of 16-bit words
 * @param dst      destination buffer, same size as the source
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /*
         * The asm reads 16 bytes at s and writes 16 bytes at d, while the
         * operands only describe one byte each; the "memory" clobber tells
         * the compiler about the full access, as the other routines in this
         * file already do.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            :"memory"
        );
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time through one 32-bit word. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x = *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s += 4;
        d += 4;
    }
    /* At most one pixel is left; the pointers are not used afterwards. */
    if (s < end)
    {
        register uint16_t x = *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
310
/**
 * Convert 4-byte pixels to 16-bit (5-6-5) pixels.
 *
 * In the scalar path the byte the code calls "b" supplies bits 0..4, the
 * upper six bits of "g" supply bits 5..10 and the upper five bits of "r"
 * supply bits 11..15.  Without WORDS_BIGENDIAN the source order is
 * b, g, r, (skipped); with it the order is (skipped), r, g, b.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (16 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for (; in < mm_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 4)
    {
#ifndef WORDS_BIGENDIAN
        const int b = in[0];
        const int g = in[1];
        const int r = in[2];
#else
        /* in[0] is skipped on big-endian hosts */
        const int r = in[1];
        const int g = in[2];
        const int b = in[3];
#endif
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
383
ac4d0aea
MN
/**
 * Convert 4-byte pixels to 16-bit (5-6-5) pixels with the outer components
 * swapped relative to rgb32to16.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits
 * 11..15, the upper six bits of byte 1 in bits 5..10 and the upper five bits
 * of byte 2 in bits 0..4; byte 3 is skipped.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (16 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for (; in < mm_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 4)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
447
/**
 * Convert 4-byte pixels to 15-bit (5-5-5) pixels stored in 16-bit words.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits 0..4,
 * those of byte 1 in bits 5..9 and those of byte 2 in bits 10..14; byte 3 is
 * skipped.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (16 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for (; in < mm_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 4)
    {
        const int b = in[0];
        const int g = in[1];
        const int r = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
511
ac4d0aea
MN
/**
 * Convert 4-byte pixels to 15-bit (5-5-5) pixels with the outer components
 * swapped relative to rgb32to15.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits
 * 10..14, those of byte 1 in bits 5..9 and those of byte 2 in bits 0..4;
 * byte 3 is skipped.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (16 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for (; in < mm_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 4)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
575
/**
 * Convert 3-byte pixels to 16-bit (5-6-5) pixels.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits 0..4,
 * the upper six bits of byte 1 in bits 5..10 and the upper five bits of
 * byte 2 in bits 11..15.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /*
     * One MMX iteration consumes 12 source bytes, but its last load
     * ("punpckldq 9%1") fetches a whole dword, i.e. bytes 9..12.  Require at
     * least 13 readable bytes so that load never runs past the buffer; the
     * scalar loop below finishes whatever is left.
     */
    mm_end = end - 12;
    while (s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: the whole buffer without MMX, otherwise the remainder. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
638
ac4d0aea
MN
/**
 * Convert 3-byte pixels to 16-bit (5-6-5) pixels with the outer components
 * swapped relative to rgb24to16.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits
 * 11..15, the upper six bits of byte 1 in bits 5..10 and the upper five bits
 * of byte 2 in bits 0..4.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (12 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for (; in < mm_end; in += 12, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 3)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
701
/**
 * Convert 3-byte pixels to 15-bit (5-5-5) pixels stored in 16-bit words.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits 0..4,
 * those of byte 1 in bits 5..9 and those of byte 2 in bits 10..14.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * One MMX iteration consumes 12 source bytes, but its last load
     * ("punpckldq 9%1") fetches a whole dword, i.e. bytes 9..12.  Require at
     * least 13 readable bytes so that load never runs past the buffer; the
     * scalar loop below finishes whatever is left.
     */
    mm_end = end - 12;
    while (s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: the whole buffer without MMX, otherwise the remainder. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
764
ac4d0aea
MN
/**
 * Convert 3-byte pixels to 15-bit (5-5-5) pixels with the outer components
 * swapped relative to rgb24to15.
 *
 * In the scalar path the upper five bits of source byte 0 land in bits
 * 10..14, those of byte 1 in bits 5..9 and those of byte 2 in bits 0..4.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, receives one uint16_t per pixel
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (12 source bytes) per MMX iteration. */
    const uint8_t *mm_end = end - 15;

    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for (; in < mm_end; in += 12, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in += 3)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
827
0d9f3d85
A
828/*
 I use a less accurate approximation here: the input value is simply
 left-shifted and the low-order bits are filled with zeroes. This method
 improves PNG compression, but it cannot reproduce white exactly, since it
 does not generate an all-ones maximum value; the net effect is to darken
 the image slightly.
836
837 The better method should be "left bit replication":
838
839 4 3 2 1 0
840 ---------
841 1 1 0 1 1
842
843 7 6 5 4 3 2 1 0
844 ----------------
845 1 1 0 1 1 1 1 0
846 |=======| |===|
847 | Leftmost Bits Repeated to Fill Open Bits
848 |
849 Original Bits
850*/
/**
 * Expand 15-bit (5-5-5) pixels stored in 16-bit words to 3-byte pixels.
 *
 * For every source word the scalar path emits, in order, bits 0..4, bits
 * 5..9 and bits 10..14, each moved into the top five bits of its output byte
 * with the low three bits left at zero.
 *
 * @param src      source buffer of 16-bit words
 * @param dst      destination buffer, receives 3 bytes per source word
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *in = (const uint16_t *)src;
    const uint16_t *end = in + src_size/2;
    uint8_t *out = (uint8_t *)dst;
#ifdef HAVE_MMX
    /* Eight pixels (16 source bytes, 24 output bytes) per MMX iteration. */
    const uint16_t *mm_end = end - 7;

    __asm __volatile(PREFETCH" %0"::"m"(*in):"memory");
    for (; in < mm_end; in += 8, out += 24)
    {
        /*
         * First statement: unpack the eight pixels into mm6/mm7 and mm0/mm3.
         * The second statement picks the values up from those registers, so
         * the two must stay back to back.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            :"=m"(*out)
            :"m"(*in),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Second statement: same packing sequence as the 32 -> 24 routine. */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*out)
            :"m"(*in),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in++)
    {
        const uint16_t pix = *in;
        out[0] = (pix&0x1F)<<3;
        out[1] = (pix&0x3E0)>>2;
        out[2] = (pix&0x7C00)>>7;
        out += 3;
    }
}
992
/**
 * Expand 16-bit (5-6-5) pixels to 3-byte pixels.
 *
 * For every source word the scalar path emits, in order, bits 0..4 shifted
 * left by three, bits 5..10 shifted right by three and bits 11..15 shifted
 * right by eight, so each field ends up in the top bits of its output byte.
 *
 * @param src      source buffer of 16-bit words
 * @param dst      destination buffer, receives 3 bytes per source word
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *in = (const uint16_t *)src;
    const uint16_t *end = in + src_size/2;
    uint8_t *out = (uint8_t *)dst;
#ifdef HAVE_MMX
    /* Eight pixels (16 source bytes, 24 output bytes) per MMX iteration. */
    const uint16_t *mm_end = end - 7;

    __asm __volatile(PREFETCH" %0"::"m"(*in):"memory");
    for (; in < mm_end; in += 8, out += 24)
    {
        /*
         * First statement: unpack the eight pixels into mm6/mm7 and mm0/mm3.
         * The second statement picks the values up from those registers, so
         * the two must stay back to back.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*out)
            :"m"(*in),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Second statement: same packing sequence as the 32 -> 24 routine. */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*out)
            :"m"(*in),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in++)
    {
        const uint16_t pix = *in;
        out[0] = (pix&0x1F)<<3;
        out[1] = (pix&0x7E0)>>3;
        out[2] = (pix&0xF800)>>8;
        out += 3;
    }
}
1133
/**
 * Expand 15-bit (5-5-5) pixels stored in 16-bit words to 4-byte pixels.
 *
 * For every source word the scalar path emits, in order, bits 0..4, bits
 * 5..9 and bits 10..14, each moved into the top five bits of its output
 * byte, followed by a zero byte.
 *
 * @param src      source buffer of 16-bit words
 * @param dst      destination buffer, receives 4 bytes per source word
 * @param src_size number of bytes in the source buffer
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *in = (const uint16_t *)src;
    const uint16_t *end = in + src_size/2;
    uint8_t *out = (uint8_t *)dst;
#ifdef HAVE_MMX
    /* Four pixels (8 source bytes, 16 output bytes) per MMX iteration. */
    const uint16_t *mm_end = end - 3;

    __asm __volatile(PREFETCH" %0"::"m"(*in):"memory");
    /* mm7 is kept at zero and used as the padding operand of the unpacks. */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    for (; in < mm_end; in += 4, out += 16)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*out)
            :"m"(*in),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, one pixel per iteration. */
    for (; in < end; in++)
    {
        const uint16_t pix = *in;
        out[0] = (pix&0x1F)<<3;
        out[1] = (pix&0x3E0)>>2;
        out[2] = (pix&0x7C00)>>7;
        out[3] = 0;
        out += 4;
    }
}
996e1a7c 1198
0d9f3d85
A
/**
 * Expand 16-bit 5-6-5 pixels to 4-byte pixels.
 *
 * Every 16-bit source word is split with the masks 0x1F, 0x7E0 and 0xF800.
 * Each field is moved into the top bits of its own output byte, low field
 * first, and a fourth byte of 0 is appended.
 *
 * @param src      source buffer, read as host-order uint16_t words
 * @param dst      destination buffer, receives 4 bytes per source word
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	/* keep the const qualifier of src, as rgb15to32 does */
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
	/* one MMX pass consumes 4 words, so stop 3 words before the end */
	mm_end = end - 3;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t"
			"pand %3, %%mm1\n\t"
			"pand %4, %%mm2\n\t"
			"psllq $3, %%mm0\n\t"
			"psrlq $3, %%mm1\n\t"
			"psrlq $8, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t"
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t"
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"
			"psllq $16, %%mm2\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
			:"memory");
		d += 16;
		s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar path: the whole buffer without MMX, the leftover words with it */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
		*d++ = 0;
	}
}
fcfbc150 1263
/**
 * Swap byte 0 and byte 2 of every 4-byte pixel (byte 1 is copied as is).
 *
 * The MMX loop moves 8 bytes (two pixels) per pass and, being a
 * do/while-style loop, always runs at least once. It is therefore only
 * entered when at least 8 bytes are available; anything it does not cover
 * (short buffers, an odd trailing pixel) goes through the scalar loop.
 *
 * The scalar loop never writes dst[4*i + 3].
 * NOTE(review): what the MMX path stores in that byte depends on
 * mask32r/mask32g/mask32b, which are defined outside this block.
 *
 * @param src      source pixels
 * @param dst      destination pixels; may be the same buffer as src
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i = 0;
	const unsigned num_pixels = src_size >> 2;
#ifdef HAVE_MMX
	/* src_size-7 would wrap around for tiny buffers, so guard the loop */
	if(src_size >= 8)
	{
/* TODO: unroll this loop */
		asm volatile (
			"xorl %%eax, %%eax \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 32(%0, %%eax) \n\t"
			"movq (%0, %%eax), %%mm0 \n\t"
			"movq %%mm0, %%mm1 \n\t"
			"movq %%mm0, %%mm2 \n\t"
			"pslld $16, %%mm0 \n\t"
			"psrld $16, %%mm1 \n\t"
			"pand "MANGLE(mask32r)", %%mm0 \n\t"
			"pand "MANGLE(mask32g)", %%mm2 \n\t"
			"pand "MANGLE(mask32b)", %%mm1 \n\t"
			"por %%mm0, %%mm2 \n\t"
			"por %%mm1, %%mm2 \n\t"
			MOVNTQ" %%mm2, (%1, %%eax) \n\t"
			"addl $8, %%eax \n\t"
			"cmpl %2, %%eax \n\t"
			" jb 1b \n\t"
			:: "r" (src), "r"(dst), "r" (src_size-7)
			/* dst is written through a plain register operand */
			: "%eax", "memory"
		);

		__asm __volatile(SFENCE:::"memory");
		__asm __volatile(EMMS:::"memory");

		/* pixels already handled above: whole 8-byte groups only */
		i = (src_size & ~7U) >> 2;
	}
#endif
	for(; i<num_pixels; i++)
	{
		/* read byte 0 before it is overwritten so src == dst works */
		const uint8_t first = src[4*i + 0];
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = first;
	}
}
1304
74d35835
MN
/**
 * Swap byte 0 and byte 2 of every 3-byte pixel (byte 1 is copied as is).
 *
 * The MMX loop converts 24 bytes (8 pixels) per pass, indexed by a negative
 * counter that climbs towards zero. It always runs at least once, so it is
 * only entered when at least 24 bytes are available. Whatever it leaves
 * over is converted by the scalar loop.
 *
 * @param src      source pixels
 * @param dst      destination pixels
 * @param src_size size of the source buffer in bytes
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	int mmx_size= 23 - src_size;
	/* mmx_size >= 0 means fewer than 24 bytes: running the loop body even
	   once would read and write past both buffers */
	if(mmx_size < 0)
	{
		asm volatile (
			"movq "MANGLE(mask24r)", %%mm5 \n\t"
			"movq "MANGLE(mask24g)", %%mm6 \n\t"
			"movq "MANGLE(mask24b)", %%mm7 \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 32(%1, %%eax) \n\t"
			"movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
			"movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
			"movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
			"psllq $16, %%mm0 \n\t" // 00 BGR BGR
			"pand %%mm5, %%mm0 \n\t"
			"pand %%mm6, %%mm1 \n\t"
			"pand %%mm7, %%mm2 \n\t"
			"por %%mm0, %%mm1 \n\t"
			"por %%mm2, %%mm1 \n\t"
			"movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
			MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
			"movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
			"movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
			"pand %%mm7, %%mm0 \n\t"
			"pand %%mm5, %%mm1 \n\t"
			"pand %%mm6, %%mm2 \n\t"
			"por %%mm0, %%mm1 \n\t"
			"por %%mm2, %%mm1 \n\t"
			"movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
			MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
			"movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
			"movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
			"pand %%mm6, %%mm0 \n\t"
			"pand %%mm7, %%mm1 \n\t"
			"pand %%mm5, %%mm2 \n\t"
			"por %%mm0, %%mm1 \n\t"
			"por %%mm2, %%mm1 \n\t"
			MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
			"addl $24, %%eax \n\t"
			" js 1b \n\t"
			: "+a" (mmx_size)
			: "r" (src-mmx_size), "r"(dst-mmx_size)
			/* dst is written through a plain register operand */
			: "memory"
		);

		__asm __volatile(SFENCE:::"memory");
		__asm __volatile(EMMS:::"memory");
	}

	if(mmx_size==23) return; //finished, src_size was a multiple of 24

	/* point src/dst at the 23-mmx_size bytes the loop above did not reach */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	for(i=0; i<src_size; i+=3)
	{
		/* byte 2 is saved first so that src == dst works */
		register uint8_t x;
		x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1371
/**
 * Interleave planar Y, U and V into packed Y U Y V bytes, line by line.
 *
 * Each output line takes two luma bytes per chroma pair. The chroma
 * pointers only advance after every vertLumPerChroma-th luma line, so the
 * same chroma line is reused in between. The test uses a bit mask, hence
 * vertLumPerChroma is expected to be a power of two.
 *
 * The MMX loop handles 8 chroma samples (16 pixels) per pass and always runs
 * at least once per line. The 64-bit C loop handles 2 chroma samples per
 * pass and builds each group in an integer, low byte first, so it relies on
 * a little-endian host.
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed destination
 * @param width,height     picture size in luma samples
 * @param lumStride        bytes between luma lines
 * @param chromStride      bytes between chroma lines
 * @param dstStride        bytes between destination lines
 * @param vertLumPerChroma number of luma lines that share one chroma line
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 32(%1, %%eax, 2) \n\t"
			PREFETCH" 32(%2, %%eax) \n\t"
			PREFETCH" 32(%3, %%eax) \n\t"
			"movq (%2, %%eax), %%mm0 \n\t" // U(0)
			"movq %%mm0, %%mm2 \n\t" // U(0)
			"movq (%3, %%eax), %%mm1 \n\t" // V(0)
			"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
			"movq %%mm3, %%mm4 \n\t" // Y(0)
			"movq %%mm5, %%mm6 \n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			/* dst is written through a plain register operand */
			: "%eax", "memory"
		);
#else
#if __WORDSIZE >= 64
		unsigned i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			/* widen before shifting: a uint8_t promotes to int, and a
			   value >= 128 shifted by 24 would overflow it and then
			   sign-extend into the upper half of the 64-bit word */
			k = (uint64_t)yc[0] | ((uint64_t)uc[0] << 8) |
			    ((uint64_t)yc[1] << 16) | ((uint64_t)vc[0] << 24);
			l = (uint64_t)yc[2] | ((uint64_t)uc[1] << 8) |
			    ((uint64_t)yc[3] << 16) | ((uint64_t)vc[1] << 24);
			*ldst++ = k | (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		unsigned i;
		uint32_t *idst = (uint32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
			/* same widening as above keeps the << 24 out of the sign bit */
			*idst++ = (uint32_t)yc[0] | ((uint32_t)uc[0] << 8) |
			    ((uint32_t)yc[1] << 16) | ((uint32_t)vc[0] << 24);
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* move to the next chroma line after every vertLumPerChroma lines */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm( EMMS" \n\t"
	SFENCE" \n\t"
	:::"memory");
#endif
}
1458
dabcdbc4
MN
/**
 * Pack planar data whose chroma lines are each shared by two luma lines.
 *
 * Thin wrapper: forwards every argument to yuvPlanartoyuy2 and passes 2 as
 * the number of luma lines per chroma line.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	//FIXME interpolate chroma
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        2);
}
1471
/**
 * Pack planar data that carries one chroma line for every luma line.
 *
 * Thin wrapper: forwards every argument to yuvPlanartoyuy2 and passes 1 as
 * the number of luma lines per chroma line.
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        1);
}
1482
/**
 * Split packed Y U Y V bytes into separate Y, U and V planes.
 *
 * Source lines are consumed in pairs. The first line of a pair supplies
 * luma and chroma; the second line supplies luma only, its chroma bytes are
 * skipped.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	const unsigned chromWidth= width>>1;
	unsigned row;

	for(row=0; row<height; row+=2)
	{
#ifndef HAVE_MMX
		unsigned i;
#endif

		/* first line of the pair: luma and chroma */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

			"movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

			"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax) \n\t"
			MOVNTQ" %%mm2, (%2, %%eax) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		for(i=0; i<chromWidth; i++)
		{
			const uint8_t *quad = src + 4*i;
			uint8_t *luma = ydst + 2*i;

			luma[0] = quad[0];
			udst[i] = quad[1];
			luma[1] = quad[2];
			vdst[i] = quad[3];
		}
#endif
		ydst += lumStride;
		src += srcStride;

		/* second line of the pair: luma only (mm7 still holds the mask) */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		for(i=0; i<chromWidth; i++)
		{
			const uint8_t *quad = src + 4*i;
			uint8_t *luma = ydst + 2*i;

			luma[0] = quad[0];
			luma[1] = quad[2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src += srcStride;
	}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
	SFENCE" \n\t"
	:::"memory");
#endif
}
81c0590e 1607
d661d18d
AB
/**
 * Copy the luma plane; the chroma planes are not converted yet.
 *
 * The plane is copied as one block of width*height bytes, so lumStride is
 * not applied. usrc, vsrc, udst, vdst, lumStride and chromStride are
 * currently unused.
 * NOTE(review): a plane with padding between lines (stride != width) would
 * need a per-line copy; confirm what callers pass.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
	uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
{
	/* Y Plane */
	/* widen first so the product is not truncated to unsigned int */
	memcpy(ydst, ysrc, (size_t)width*height);

	/* XXX: implement upscaling for U,V */
}
1617
b241cbf2
MN
/**
 * Upscale one plane by two in both directions.
 *
 * The first and the last destination line are the first and the last source
 * line with every sample written twice. Each pair of destination lines in
 * between is built from two neighbouring source lines with 3:1 weights, and
 * the outermost columns of those lines are copied rather than interpolated.
 *
 * The MMX2/3DNow loop handles 8 source samples per pass, always runs at
 * least once, and its first load starts one byte before the line.
 * NOTE(review): for a srcWidth that is not a multiple of 8 its last pass
 * reaches past the end of the line; confirm callers provide the padding.
 *
 * @param src                  source plane
 * @param dst                  destination plane, 2*srcWidth by 2*srcHeight
 * @param srcWidth,srcHeight   source size in samples
 * @param srcStride,dstStride  bytes between lines
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;

	// first line
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const int mmxSize= srcWidth;
		asm volatile(
			"movl %4, %%eax \n\t"
			"1: \n\t"
			"movq (%0, %%eax), %%mm0 \n\t"
			"movq (%1, %%eax), %%mm1 \n\t"
			"movq 1(%0, %%eax), %%mm2 \n\t"
			"movq 1(%1, %%eax), %%mm3 \n\t"
			"movq %%mm0, %%mm4 \n\t"
			"movq %%mm1, %%mm5 \n\t"
			PAVGB" %%mm3, %%mm0 \n\t"
			PAVGB" %%mm3, %%mm0 \n\t"
			PAVGB" %%mm4, %%mm3 \n\t"
			PAVGB" %%mm4, %%mm3 \n\t"
			PAVGB" %%mm2, %%mm1 \n\t"
			PAVGB" %%mm2, %%mm1 \n\t"
			PAVGB" %%mm5, %%mm2 \n\t"
			PAVGB" %%mm5, %%mm2 \n\t"
			"movq %%mm3, %%mm4 \n\t"
			"movq %%mm2, %%mm5 \n\t"
			"punpcklbw %%mm1, %%mm3 \n\t"
			"punpckhbw %%mm1, %%mm4 \n\t"
			"punpcklbw %%mm0, %%mm2 \n\t"
			"punpckhbw %%mm0, %%mm5 \n\t"
#if 1
			MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
			MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
			MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
#else
			"movq %%mm3, (%2, %%eax, 2) \n\t"
			"movq %%mm4, 8(%2, %%eax, 2) \n\t"
			"movq %%mm2, (%3, %%eax, 2) \n\t"
			"movq %%mm5, 8(%3, %%eax, 2) \n\t"
#endif
			"addl $8, %%eax \n\t"
			" js 1b \n\t"
			:: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			/* dst is written through plain register operands */
			: "%eax", "memory"

		);
		dst[0]=
		dst[dstStride]= src[0];
#else
		dst[0]=
		dst[dstStride]= src[0];

		for(x=0; x<srcWidth-1; x++){
			dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
			dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
		}
#endif
		dst[srcWidth*2 -1]=
		dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

		dst+=dstStride*2;
		src+=srcStride;
	}

	// last line
	/* src already points at the last source line here: the loop above
	   advanced it once per iteration, srcHeight-1 times in total, so it
	   must not be moved back before this copy */
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
	SFENCE" \n\t"
	:::"memory");
#endif
}
1706
81c0590e
A
/**
 * Split packed U Y V Y bytes into separate Y, U and V planes.
 *
 * Source lines are consumed in pairs. The first line of a pair supplies
 * luma and chroma; the second line supplies luma only, its chroma bytes are
 * skipped (FIXME write HQ version).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	const unsigned chromWidth= width>>1;
	unsigned row;

	for(row=0; row<height; row+=2)
	{
#ifndef HAVE_MMX
		unsigned i;
#endif

		/* first line of the pair: luma and chroma */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			"pcmpeqw %%mm7, %%mm7 \n\t"
			"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

			"movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

			"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax) \n\t"
			MOVNTQ" %%mm2, (%2, %%eax) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		for(i=0; i<chromWidth; i++)
		{
			const uint8_t *quad = src + 4*i;
			uint8_t *luma = ydst + 2*i;

			udst[i] = quad[0];
			luma[0] = quad[1];
			vdst[i] = quad[2];
			luma[1] = quad[3];
		}
#endif
		ydst += lumStride;
		src += srcStride;

		/* second line of the pair: luma only */
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			PREFETCH" 64(%0, %%eax, 4) \n\t"
			"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
			"movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
			"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

			"addl $8, %%eax \n\t"
			"cmpl %4, %%eax \n\t"
			" jb 1b \n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		for(i=0; i<chromWidth; i++)
		{
			const uint8_t *quad = src + 4*i;
			uint8_t *luma = ydst + 2*i;

			luma[0] = quad[1];
			luma[1] = quad[3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src += srcStride;
	}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
	SFENCE" \n\t"
	:::"memory");
#endif
}
1832
1de97d84
MN
1833/**
1834 *
1835 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1836 * problem for anyone then tell me, and ill fix it)
21316f3c 1837 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1de97d84
MN
1838 */
1839static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1840 unsigned int width, unsigned int height,
1841 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1842{
0d9f3d85
A
1843 unsigned y;
1844 const unsigned chromWidth= width>>1;
21316f3c
MN
1845#ifdef HAVE_MMX
1846 for(y=0; y<height-2; y+=2)
1847 {
0d9f3d85 1848 unsigned i;
21316f3c
MN
1849 for(i=0; i<2; i++)
1850 {
1851 asm volatile(
1852 "movl %2, %%eax \n\t"
854288bb
FB
1853 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1854 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c
MN
1855 "pxor %%mm7, %%mm7 \n\t"
1856 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1857 ".balign 16 \n\t"
1858 "1: \n\t"
1859 PREFETCH" 64(%0, %%ebx) \n\t"
1860 "movd (%0, %%ebx), %%mm0 \n\t"
1861 "movd 3(%0, %%ebx), %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm0 \n\t"
1863 "punpcklbw %%mm7, %%mm1 \n\t"
1864 "movd 6(%0, %%ebx), %%mm2 \n\t"
1865 "movd 9(%0, %%ebx), %%mm3 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "pmaddwd %%mm6, %%mm0 \n\t"
1869 "pmaddwd %%mm6, %%mm1 \n\t"
1870 "pmaddwd %%mm6, %%mm2 \n\t"
1871 "pmaddwd %%mm6, %%mm3 \n\t"
1872#ifndef FAST_BGR2YV12
1873 "psrad $8, %%mm0 \n\t"
1874 "psrad $8, %%mm1 \n\t"
1875 "psrad $8, %%mm2 \n\t"
1876 "psrad $8, %%mm3 \n\t"
1877#endif
1878 "packssdw %%mm1, %%mm0 \n\t"
1879 "packssdw %%mm3, %%mm2 \n\t"
1880 "pmaddwd %%mm5, %%mm0 \n\t"
1881 "pmaddwd %%mm5, %%mm2 \n\t"
1882 "packssdw %%mm2, %%mm0 \n\t"
1883 "psraw $7, %%mm0 \n\t"
1884
1885 "movd 12(%0, %%ebx), %%mm4 \n\t"
1886 "movd 15(%0, %%ebx), %%mm1 \n\t"
1887 "punpcklbw %%mm7, %%mm4 \n\t"
1888 "punpcklbw %%mm7, %%mm1 \n\t"
1889 "movd 18(%0, %%ebx), %%mm2 \n\t"
1890 "movd 21(%0, %%ebx), %%mm3 \n\t"
1891 "punpcklbw %%mm7, %%mm2 \n\t"
1892 "punpcklbw %%mm7, %%mm3 \n\t"
1893 "pmaddwd %%mm6, %%mm4 \n\t"
1894 "pmaddwd %%mm6, %%mm1 \n\t"
1895 "pmaddwd %%mm6, %%mm2 \n\t"
1896 "pmaddwd %%mm6, %%mm3 \n\t"
1897#ifndef FAST_BGR2YV12
1898 "psrad $8, %%mm4 \n\t"
1899 "psrad $8, %%mm1 \n\t"
1900 "psrad $8, %%mm2 \n\t"
1901 "psrad $8, %%mm3 \n\t"
1902#endif
1903 "packssdw %%mm1, %%mm4 \n\t"
1904 "packssdw %%mm3, %%mm2 \n\t"
1905 "pmaddwd %%mm5, %%mm4 \n\t"
1906 "pmaddwd %%mm5, %%mm2 \n\t"
1907 "addl $24, %%ebx \n\t"
1908 "packssdw %%mm2, %%mm4 \n\t"
1909 "psraw $7, %%mm4 \n\t"
1910
1911 "packuswb %%mm4, %%mm0 \n\t"
854288bb 1912 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
21316f3c
MN
1913
1914 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1915 "addl $8, %%eax \n\t"
1916 " js 1b \n\t"
1917 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1918 : "%eax", "%ebx"
1919 );
1920 ydst += lumStride;
1921 src += srcStride;
1922 }
1923 src -= srcStride*2;
1924 asm volatile(
1925 "movl %4, %%eax \n\t"
854288bb
FB
1926 "movq "MANGLE(w1111)", %%mm5 \n\t"
1927 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
21316f3c
MN
1928 "pxor %%mm7, %%mm7 \n\t"
1929 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1930 "addl %%ebx, %%ebx \n\t"
1931 ".balign 16 \n\t"
1932 "1: \n\t"
1933 PREFETCH" 64(%0, %%ebx) \n\t"
1934 PREFETCH" 64(%1, %%ebx) \n\t"
1935#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1936 "movq (%0, %%ebx), %%mm0 \n\t"
1937 "movq (%1, %%ebx), %%mm1 \n\t"
1938 "movq 6(%0, %%ebx), %%mm2 \n\t"
1939 "movq 6(%1, %%ebx), %%mm3 \n\t"
1940 PAVGB" %%mm1, %%mm0 \n\t"
1941 PAVGB" %%mm3, %%mm2 \n\t"
1942 "movq %%mm0, %%mm1 \n\t"
1943 "movq %%mm2, %%mm3 \n\t"
1944 "psrlq $24, %%mm0 \n\t"
1945 "psrlq $24, %%mm2 \n\t"
1946 PAVGB" %%mm1, %%mm0 \n\t"
1947 PAVGB" %%mm3, %%mm2 \n\t"
1948 "punpcklbw %%mm7, %%mm0 \n\t"
1949 "punpcklbw %%mm7, %%mm2 \n\t"
1950#else
1951 "movd (%0, %%ebx), %%mm0 \n\t"
1952 "movd (%1, %%ebx), %%mm1 \n\t"
1953 "movd 3(%0, %%ebx), %%mm2 \n\t"
1954 "movd 3(%1, %%ebx), %%mm3 \n\t"
1955 "punpcklbw %%mm7, %%mm0 \n\t"
1956 "punpcklbw %%mm7, %%mm1 \n\t"
1957 "punpcklbw %%mm7, %%mm2 \n\t"
1958 "punpcklbw %%mm7, %%mm3 \n\t"
1959 "paddw %%mm1, %%mm0 \n\t"
1960 "paddw %%mm3, %%mm2 \n\t"
1961 "paddw %%mm2, %%mm0 \n\t"
1962 "movd 6(%0, %%ebx), %%mm4 \n\t"
1963 "movd 6(%1, %%ebx), %%mm1 \n\t"
1964 "movd 9(%0, %%ebx), %%mm2 \n\t"
1965 "movd 9(%1, %%ebx), %%mm3 \n\t"
1966 "punpcklbw %%mm7, %%mm4 \n\t"
1967 "punpcklbw %%mm7, %%mm1 \n\t"
1968 "punpcklbw %%mm7, %%mm2 \n\t"
1969 "punpcklbw %%mm7, %%mm3 \n\t"
1970 "paddw %%mm1, %%mm4 \n\t"
1971 "paddw %%mm3, %%mm2 \n\t"
1972 "paddw %%mm4, %%mm2 \n\t"
1973 "psrlw $2, %%mm0 \n\t"
1974 "psrlw $2, %%mm2 \n\t"
1975#endif
854288bb
FB
1976 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1977 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
1978
1979 "pmaddwd %%mm0, %%mm1 \n\t"
1980 "pmaddwd %%mm2, %%mm3 \n\t"
1981 "pmaddwd %%mm6, %%mm0 \n\t"
1982 "pmaddwd %%mm6, %%mm2 \n\t"
1983#ifndef FAST_BGR2YV12
1984 "psrad $8, %%mm0 \n\t"
1985 "psrad $8, %%mm1 \n\t"
1986 "psrad $8, %%mm2 \n\t"
1987 "psrad $8, %%mm3 \n\t"
1988#endif
1989 "packssdw %%mm2, %%mm0 \n\t"
1990 "packssdw %%mm3, %%mm1 \n\t"
1991 "pmaddwd %%mm5, %%mm0 \n\t"
1992 "pmaddwd %%mm5, %%mm1 \n\t"
1993 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1994 "psraw $7, %%mm0 \n\t"
1995
1996#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1997 "movq 12(%0, %%ebx), %%mm4 \n\t"
1998 "movq 12(%1, %%ebx), %%mm1 \n\t"
1999 "movq 18(%0, %%ebx), %%mm2 \n\t"
2000 "movq 18(%1, %%ebx), %%mm3 \n\t"
2001 PAVGB" %%mm1, %%mm4 \n\t"
2002 PAVGB" %%mm3, %%mm2 \n\t"
2003 "movq %%mm4, %%mm1 \n\t"
2004 "movq %%mm2, %%mm3 \n\t"
2005 "psrlq $24, %%mm4 \n\t"
2006 "psrlq $24, %%mm2 \n\t"
2007 PAVGB" %%mm1, %%mm4 \n\t"
2008 PAVGB" %%mm3, %%mm2 \n\t"
2009 "punpcklbw %%mm7, %%mm4 \n\t"
2010 "punpcklbw %%mm7, %%mm2 \n\t"
2011#else
2012 "movd 12(%0, %%ebx), %%mm4 \n\t"
2013 "movd 12(%1, %%ebx), %%mm1 \n\t"
2014 "movd 15(%0, %%ebx), %%mm2 \n\t"
2015 "movd 15(%1, %%ebx), %%mm3 \n\t"
2016 "punpcklbw %%mm7, %%mm4 \n\t"
2017 "punpcklbw %%mm7, %%mm1 \n\t"
2018 "punpcklbw %%mm7, %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm3 \n\t"
2020 "paddw %%mm1, %%mm4 \n\t"
2021 "paddw %%mm3, %%mm2 \n\t"
2022 "paddw %%mm2, %%mm4 \n\t"
2023 "movd 18(%0, %%ebx), %%mm5 \n\t"
2024 "movd 18(%1, %%ebx), %%mm1 \n\t"
2025 "movd 21(%0, %%ebx), %%mm2 \n\t"
2026 "movd 21(%1, %%ebx), %%mm3 \n\t"
2027 "punpcklbw %%mm7, %%mm5 \n\t"
2028 "punpcklbw %%mm7, %%mm1 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "punpcklbw %%mm7, %%mm3 \n\t"
2031 "paddw %%mm1, %%mm5 \n\t"
2032 "paddw %%mm3, %%mm2 \n\t"
2033 "paddw %%mm5, %%mm2 \n\t"
854288bb 2034 "movq "MANGLE(w1111)", %%mm5 \n\t"
21316f3c
MN
2035 "psrlw $2, %%mm4 \n\t"
2036 "psrlw $2, %%mm2 \n\t"
2037#endif
854288bb
FB
2038 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2039 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
21316f3c
MN
2040
2041 "pmaddwd %%mm4, %%mm1 \n\t"
2042 "pmaddwd %%mm2, %%mm3 \n\t"
2043 "pmaddwd %%mm6, %%mm4 \n\t"
2044 "pmaddwd %%mm6, %%mm2 \n\t"
2045#ifndef FAST_BGR2YV12
2046 "psrad $8, %%mm4 \n\t"
2047 "psrad $8, %%mm1 \n\t"
2048 "psrad $8, %%mm2 \n\t"
2049 "psrad $8, %%mm3 \n\t"
2050#endif
2051 "packssdw %%mm2, %%mm4 \n\t"
2052 "packssdw %%mm3, %%mm1 \n\t"
2053 "pmaddwd %%mm5, %%mm4 \n\t"
2054 "pmaddwd %%mm5, %%mm1 \n\t"
2055 "addl $24, %%ebx \n\t"
2056 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2057 "psraw $7, %%mm4 \n\t"
2058
2059 "movq %%mm0, %%mm1 \n\t"
2060 "punpckldq %%mm4, %%mm0 \n\t"
2061 "punpckhdq %%mm4, %%mm1 \n\t"
2062 "packsswb %%mm1, %%mm0 \n\t"
854288bb 2063 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
21316f3c
MN
2064
2065 "movd %%mm0, (%2, %%eax) \n\t"
2066 "punpckhdq %%mm0, %%mm0 \n\t"
2067 "movd %%mm0, (%3, %%eax) \n\t"
2068 "addl $4, %%eax \n\t"
2069 " js 1b \n\t"
2070 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2071 : "%eax", "%ebx"
2072 );
2073
2074 udst += chromStride;
2075 vdst += chromStride;
2076 src += srcStride*2;
2077 }
2078
2079 asm volatile( EMMS" \n\t"
2080 SFENCE" \n\t"
2081 :::"memory");
2082#else
2083 y=0;
2084#endif
2085 for(; y<height; y+=2)
1de97d84 2086 {
0d9f3d85 2087 unsigned i;
1de97d84
MN
2088 for(i=0; i<chromWidth; i++)
2089 {
2090 unsigned int b= src[6*i+0];
2091 unsigned int g= src[6*i+1];
2092 unsigned int r= src[6*i+2];
2093
aa21f0c3
MN
2094 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2095 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2096 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
81c0590e 2097
1de97d84
MN
2098 udst[i] = U;
2099 vdst[i] = V;
2100 ydst[2*i] = Y;
2101
2102 b= src[6*i+3];
2103 g= src[6*i+4];
2104 r= src[6*i+5];
2105
aa21f0c3 2106 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2107 ydst[2*i+1] = Y;
2108 }
2109 ydst += lumStride;
2110 src += srcStride;
2111
2112 for(i=0; i<chromWidth; i++)
2113 {
2114 unsigned int b= src[6*i+0];
2115 unsigned int g= src[6*i+1];
2116 unsigned int r= src[6*i+2];
2117
aa21f0c3 2118 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2119
2120 ydst[2*i] = Y;
2121
2122 b= src[6*i+3];
2123 g= src[6*i+4];
2124 r= src[6*i+5];
2125
aa21f0c3 2126 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1de97d84
MN
2127 ydst[2*i+1] = Y;
2128 }
2129 udst += chromStride;
2130 vdst += chromStride;
2131 ydst += lumStride;
2132 src += srcStride;
2133 }
2134}
5d55fdb4
MN
2135
/**
 * Interleave two byte planes: dest gets src1[0], src2[0], src1[1], src2[1], ...
 *
 * The vector loops move 16 bytes of each source per pass and always run at
 * least once, so they are only entered for width >= 16. The bytes they do
 * not cover are interleaved by the scalar loop that follows.
 * NOTE(review): the SSE2 loop uses movdqa/movntdq, which require 16-byte
 * aligned addresses; confirm callers guarantee that for all three buffers.
 *
 * @param src1,src2   source planes
 * @param dest        destination, 2*width bytes per line
 * @param width       bytes per source line
 * @param height      number of lines
 * @param src1Stride  bytes between lines of src1
 * @param src2Stride  bytes between lines of src2
 * @param dstStride   bytes between lines of dest
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
	unsigned width, unsigned height, unsigned src1Stride,
	unsigned src2Stride, unsigned dstStride){
	unsigned h;

	for(h=0; h < height; h++)
	{
		unsigned w;

#ifdef HAVE_MMX
		/* width-15 would wrap around for narrow lines, so guard the loop */
		if(width >= 16)
		{
#ifdef HAVE_SSE2
			asm(
				"xorl %%eax, %%eax \n\t"
				"1: \n\t"
				PREFETCH" 64(%1, %%eax) \n\t"
				PREFETCH" 64(%2, %%eax) \n\t"
				"movdqa (%1, %%eax), %%xmm0 \n\t"
				"movdqa (%1, %%eax), %%xmm1 \n\t"
				"movdqa (%2, %%eax), %%xmm2 \n\t"
				"punpcklbw %%xmm2, %%xmm0 \n\t"
				"punpckhbw %%xmm2, %%xmm1 \n\t"
				"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
				"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
				"addl $16, %%eax \n\t"
				"cmpl %3, %%eax \n\t"
				" jb 1b \n\t"
				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
				: "memory", "%eax"
			);
#else
			asm(
				"xorl %%eax, %%eax \n\t"
				"1: \n\t"
				PREFETCH" 64(%1, %%eax) \n\t"
				PREFETCH" 64(%2, %%eax) \n\t"
				"movq (%1, %%eax), %%mm0 \n\t"
				"movq 8(%1, %%eax), %%mm2 \n\t"
				"movq %%mm0, %%mm1 \n\t"
				"movq %%mm2, %%mm3 \n\t"
				"movq (%2, %%eax), %%mm4 \n\t"
				"movq 8(%2, %%eax), %%mm5 \n\t"
				"punpcklbw %%mm4, %%mm0 \n\t"
				"punpckhbw %%mm4, %%mm1 \n\t"
				"punpcklbw %%mm5, %%mm2 \n\t"
				"punpckhbw %%mm5, %%mm3 \n\t"
				MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
				MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
				"addl $16, %%eax \n\t"
				"cmpl %3, %%eax \n\t"
				" jb 1b \n\t"
				::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
				: "memory", "%eax"
			);
#endif
		}
		/* bytes past the last whole group of 16 */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
		);
#endif
}
ac4d0aea
MN
2216
/**
 * Upscale two byte planes by a factor of two in both directions.
 *
 * Each plane is handled independently: every source byte is written twice in
 * a row (d[2*x] and d[2*x+1]) and every source row feeds two consecutive
 * destination rows (source row index is y>>1).
 * height/2 destination rows are produced; each reads width/2 source bytes and
 * writes 2*(width/2) destination bytes.
 *
 * @param src1       first source plane
 * @param src2       second source plane
 * @param dst1       destination for the upscaled first plane
 * @param dst2       destination for the upscaled second plane
 * @param width      twice the number of source bytes consumed per row
 * @param height     twice the number of destination rows produced
 * @param srcStride1 byte distance between rows of src1
 * @param srcStride2 byte distance between rows of src2
 * @param dstStride1 byte distance between rows of dst1
 * @param dstStride2 byte distance between rows of dst2
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
			uint8_t *dst1, uint8_t *dst2,
			unsigned width, unsigned height,
			unsigned srcStride1, unsigned srcStride2,
			unsigned dstStride1, unsigned dstStride2)
{
	unsigned y,x,w,h;
	w=width/2; h=height/2;
#ifdef HAVE_MMX
	asm volatile(
		PREFETCH" %0\n\t"
		PREFETCH" %1\n\t"
		::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
	/* First plane. */
	for(y=0;y<h;y++){
		const uint8_t* s1=src1+srcStride1*(y>>1);
		uint8_t* d=dst1+dstStride1*y;
		x=0;
#ifdef HAVE_MMX
		/*
		 * Only complete groups of 32 source bytes go through MMX; bounding the
		 * loop by x<w alone would let the last group run past the end of the
		 * row whenever w is not a multiple of 32.
		 * NOTE(review): the "8%0"-style operands rely on the compiler printing
		 * the memory operand without a leading displacement -- confirm.
		 */
		for(;x+32<=w;x+=32)
		{
			asm volatile(
				PREFETCH" 32%1\n\t"
				"movq %1, %%mm0\n\t"
				"movq 8%1, %%mm2\n\t"
				"movq 16%1, %%mm4\n\t"
				"movq 24%1, %%mm6\n\t"
				"movq %%mm0, %%mm1\n\t"
				"movq %%mm2, %%mm3\n\t"
				"movq %%mm4, %%mm5\n\t"
				"movq %%mm6, %%mm7\n\t"
				"punpcklbw %%mm0, %%mm0\n\t"
				"punpckhbw %%mm1, %%mm1\n\t"
				"punpcklbw %%mm2, %%mm2\n\t"
				"punpckhbw %%mm3, %%mm3\n\t"
				"punpcklbw %%mm4, %%mm4\n\t"
				"punpckhbw %%mm5, %%mm5\n\t"
				"punpcklbw %%mm6, %%mm6\n\t"
				"punpckhbw %%mm7, %%mm7\n\t"
				MOVNTQ" %%mm0, %0\n\t"
				MOVNTQ" %%mm1, 8%0\n\t"
				MOVNTQ" %%mm2, 16%0\n\t"
				MOVNTQ" %%mm3, 24%0\n\t"
				MOVNTQ" %%mm4, 32%0\n\t"
				MOVNTQ" %%mm5, 40%0\n\t"
				MOVNTQ" %%mm6, 48%0\n\t"
				MOVNTQ" %%mm7, 56%0"
				:"=m"(d[2*x])
				:"m"(s1[x])
				:"memory");
		}
#endif
		/* Scalar remainder (whole row when MMX is unavailable). */
		for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
	}
	/* Second plane, same procedure. */
	for(y=0;y<h;y++){
		const uint8_t* s2=src2+srcStride2*(y>>1);
		uint8_t* d=dst2+dstStride2*y;
		x=0;
#ifdef HAVE_MMX
		/* Complete 32 byte groups only, see the first plane above. */
		for(;x+32<=w;x+=32)
		{
			asm volatile(
				PREFETCH" 32%1\n\t"
				"movq %1, %%mm0\n\t"
				"movq 8%1, %%mm2\n\t"
				"movq 16%1, %%mm4\n\t"
				"movq 24%1, %%mm6\n\t"
				"movq %%mm0, %%mm1\n\t"
				"movq %%mm2, %%mm3\n\t"
				"movq %%mm4, %%mm5\n\t"
				"movq %%mm6, %%mm7\n\t"
				"punpcklbw %%mm0, %%mm0\n\t"
				"punpckhbw %%mm1, %%mm1\n\t"
				"punpcklbw %%mm2, %%mm2\n\t"
				"punpckhbw %%mm3, %%mm3\n\t"
				"punpcklbw %%mm4, %%mm4\n\t"
				"punpckhbw %%mm5, %%mm5\n\t"
				"punpcklbw %%mm6, %%mm6\n\t"
				"punpckhbw %%mm7, %%mm7\n\t"
				MOVNTQ" %%mm0, %0\n\t"
				MOVNTQ" %%mm1, 8%0\n\t"
				MOVNTQ" %%mm2, 16%0\n\t"
				MOVNTQ" %%mm3, 24%0\n\t"
				MOVNTQ" %%mm4, 32%0\n\t"
				MOVNTQ" %%mm5, 40%0\n\t"
				MOVNTQ" %%mm6, 48%0\n\t"
				MOVNTQ" %%mm7, 56%0"
				:"=m"(d[2*x])
				:"m"(s2[x])
				:"memory");
		}
#endif
		for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
	}
#ifdef HAVE_MMX
	/* Leave MMX state and order the non-temporal stores issued above. */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}
2321
/**
 * Pack three planes into a byte-interleaved Y U Y V stream.
 *
 * For each of the height output rows, width/2 samples are taken from src2 and
 * src3 (rows selected with y>>2, so each of those rows serves four output
 * rows). Every such sample pair is combined with four consecutive src1 bytes
 * and emitted as the 8 bytes  Y0 U Y1 V Y2 U Y3 V.
 * NOTE(review): a row therefore consumes 4*(width/2) bytes of src1 and writes
 * 8*(width/2) bytes of dst -- confirm the unit callers use for width.
 *
 * @param src1       plane supplying the Y bytes
 * @param src2       plane supplying the U bytes
 * @param src3       plane supplying the V bytes
 * @param dst        packed destination
 * @param width      twice the number of U/V samples consumed per row
 * @param height     number of output rows
 * @param srcStride1 byte distance between rows of src1
 * @param srcStride2 byte distance between rows of src2
 * @param srcStride3 byte distance between rows of src3
 * @param dstStride  byte distance between rows of dst
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
			uint8_t *dst,
			unsigned width, unsigned height,
			unsigned srcStride1, unsigned srcStride2,
			unsigned srcStride3, unsigned dstStride)
{
	unsigned y,x,x2,w,h;
	w=width/2; h=height;
#ifdef HAVE_MMX
	asm volatile(
		PREFETCH" %0\n\t"
		PREFETCH" %1\n\t"
		PREFETCH" %2\n\t"
		::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
#endif
	for(y=0;y<h;y++){
		const uint8_t* yp=src1+srcStride1*y;
		const uint8_t* up=src2+srcStride2*(y>>2);
		const uint8_t* vp=src3+srcStride3*(y>>2);
		uint8_t* d=dst+dstStride*y;
		x2=0;	/* index into yp, always 4*x */
		x=0;
#ifdef HAVE_MMX
		/*
		 * One iteration consumes 8 U, 8 V and 32 Y bytes and stores 64 bytes.
		 * Only complete groups are handled here; bounding the loop by x<w
		 * alone would overrun all four buffers whenever w is not a multiple
		 * of 8. The scalar loop below picks up the remainder.
		 * NOTE(review): the "8%0"-style operands rely on the compiler printing
		 * the memory operand without a leading displacement -- confirm.
		 */
		for(;x+8<=w;x+=8,x2+=32)
		{
			asm volatile(
				PREFETCH" 32%1\n\t"
				PREFETCH" 32%2\n\t"
				PREFETCH" 32%3\n\t"
				"movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
				"movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
				"movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
				"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
				"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
				"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
				"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
				"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
				"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
				"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

				"movq %%mm1, %%mm6\n\t"
				"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
				"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
				"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
				MOVNTQ" %%mm0, %0\n\t"
				MOVNTQ" %%mm3, 8%0\n\t"

				"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
				"movq 8%1, %%mm0\n\t"
				"movq %%mm0, %%mm3\n\t"
				"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
				"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
				MOVNTQ" %%mm0, 16%0\n\t"
				MOVNTQ" %%mm3, 24%0\n\t"

				"movq %%mm4, %%mm6\n\t"
				"movq 16%1, %%mm0\n\t"
				"movq %%mm0, %%mm3\n\t"
				"punpcklbw %%mm5, %%mm4\n\t"
				"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
				"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
				MOVNTQ" %%mm0, 32%0\n\t"
				MOVNTQ" %%mm3, 40%0\n\t"

				"punpckhbw %%mm5, %%mm6\n\t"
				"movq 24%1, %%mm0\n\t"
				"movq %%mm0, %%mm3\n\t"
				"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
				"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
				MOVNTQ" %%mm0, 48%0\n\t"
				MOVNTQ" %%mm3, 56%0\n\t"

				:"=m"(d[8*x])
				:"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
				:"memory");
		}
#endif
		/* Scalar remainder (whole row when MMX is unavailable). */
		for(;x<w;x++,x2+=4)
		{
			d[8*x+0]=yp[x2];
			d[8*x+1]=up[x];
			d[8*x+2]=yp[x2+1];
			d[8*x+3]=vp[x];
			d[8*x+4]=yp[x2+2];
			d[8*x+5]=up[x];
			d[8*x+6]=yp[x2+3];
			d[8*x+7]=vp[x];
		}
	}
#ifdef HAVE_MMX
	/* Leave MMX state and order the non-temporal stores issued above. */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}