fixing RGB32->RGB16 on big endian patch by (Colin Leroy <colin at colino dot net>)
[libav.git] / postproc / rgb2rgb_template.c
/*
 *
 * rgb2rgb.c, software RGB to RGB converter
 * plus: software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
47 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
48 #define EMMS "femms"
49 #else
50 #define EMMS "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63 uint8_t *dest = dst;
64 const uint8_t *s = src;
65 const uint8_t *end;
66 #ifdef HAVE_MMX
67 const uint8_t *mm_end;
68 #endif
69 end = s + src_size;
70 #ifdef HAVE_MMX
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
72 mm_end = end - 23;
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
74 while(s < mm_end)
75 {
76 __asm __volatile(
77 PREFETCH" 32%1\n\t"
78 "movd %1, %%mm0\n\t"
79 "punpckldq 3%1, %%mm0\n\t"
80 "movd 6%1, %%mm1\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
93 MOVNTQ" %%mm3, 24%0"
94 :"=m"(*dest)
95 :"m"(*s)
96 :"memory");
97 dest += 32;
98 s += 24;
99 }
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
102 #endif
103 while(s < end)
104 {
105 *dest++ = *s++;
106 *dest++ = *s++;
107 *dest++ = *s++;
108 *dest++ = 0;
109 }
110 }
111
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113 {
114 uint8_t *dest = dst;
115 const uint8_t *s = src;
116 const uint8_t *end;
117 #ifdef HAVE_MMX
118 const uint8_t *mm_end;
119 #endif
120 end = s + src_size;
121 #ifdef HAVE_MMX
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
123 mm_end = end - 31;
124 while(s < mm_end)
125 {
126 __asm __volatile(
127 PREFETCH" 32%1\n\t"
128 "movq %1, %%mm0\n\t"
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
140 "pand %2, %%mm0\n\t"
141 "pand %2, %%mm1\n\t"
142 "pand %2, %%mm4\n\t"
143 "pand %2, %%mm5\n\t"
144 "pand %3, %%mm2\n\t"
145 "pand %3, %%mm3\n\t"
146 "pand %3, %%mm6\n\t"
147 "pand %3, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
152
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
157 "pand %4, %%mm2\n\t"
158 "pand %5, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
164 "pand %6, %%mm5\n\t"
165 "por %%mm5, %%mm4\n\t"
166
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
169 MOVNTQ" %%mm4, 16%0"
170 :"=m"(*dest)
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173 :"memory");
174 dest += 24;
175 s += 32;
176 }
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
179 #endif
180 while(s < end)
181 {
182 *dest++ = *s++;
183 *dest++ = *s++;
184 *dest++ = *s++;
185 s++;
186 }
187 }
188
189 /*
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
194 */
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196 {
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
201 end = s + src_size;
202 #ifdef HAVE_MMX
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
205 mm_end = end - 15;
206 while(s<mm_end)
207 {
208 __asm __volatile(
209 PREFETCH" 32%1\n\t"
210 "movq %1, %%mm0\n\t"
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
219 MOVNTQ" %%mm2, 8%0"
220 :"=m"(*d)
221 :"m"(*s)
222 );
223 d+=16;
224 s+=16;
225 }
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
228 #endif
229 mm_end = end - 3;
230 while(s < mm_end)
231 {
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234 d+=4;
235 s+=4;
236 }
237 if(s < end)
238 {
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241 }
242 }
243
244 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
245 {
246 unsigned j,i,num_pixels=src_size/3;
247 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
248 {
249 dst[j+0] = src[i+2];
250 dst[j+1] = src[i+1];
251 dst[j+2] = src[i+0];
252 }
253 }
254
255 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
256 {
257 register const uint8_t* s=src;
258 register uint8_t* d=dst;
259 register const uint8_t *end;
260 const uint8_t *mm_end;
261 end = s + src_size;
262 #ifdef HAVE_MMX
263 __asm __volatile(PREFETCH" %0"::"m"(*s));
264 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
265 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
266 mm_end = end - 15;
267 while(s<mm_end)
268 {
269 __asm __volatile(
270 PREFETCH" 32%1\n\t"
271 "movq %1, %%mm0\n\t"
272 "movq 8%1, %%mm2\n\t"
273 "movq %%mm0, %%mm1\n\t"
274 "movq %%mm2, %%mm3\n\t"
275 "psrlq $1, %%mm0\n\t"
276 "psrlq $1, %%mm2\n\t"
277 "pand %%mm7, %%mm0\n\t"
278 "pand %%mm7, %%mm2\n\t"
279 "pand %%mm6, %%mm1\n\t"
280 "pand %%mm6, %%mm3\n\t"
281 "por %%mm1, %%mm0\n\t"
282 "por %%mm3, %%mm2\n\t"
283 MOVNTQ" %%mm0, %0\n\t"
284 MOVNTQ" %%mm2, 8%0"
285 :"=m"(*d)
286 :"m"(*s)
287 );
288 d+=16;
289 s+=16;
290 }
291 __asm __volatile(SFENCE:::"memory");
292 __asm __volatile(EMMS:::"memory");
293 #endif
294 mm_end = end - 3;
295 while(s < mm_end)
296 {
297 register uint32_t x= *((uint32_t *)s);
298 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
299 s+=4;
300 d+=4;
301 }
302 if(s < end)
303 {
304 register uint16_t x= *((uint16_t *)s);
305 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
306 s+=2;
307 d+=2;
308 }
309 }
310
311 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
312 {
313 const uint8_t *s = src;
314 const uint8_t *end;
315 #ifdef HAVE_MMX
316 const uint8_t *mm_end;
317 #endif
318 uint16_t *d = (uint16_t *)dst;
319 end = s + src_size;
320 #ifdef HAVE_MMX
321 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
322 __asm __volatile(
323 "movq %0, %%mm7\n\t"
324 "movq %1, %%mm6\n\t"
325 ::"m"(red_16mask),"m"(green_16mask));
326 mm_end = end - 15;
327 while(s < mm_end)
328 {
329 __asm __volatile(
330 PREFETCH" 32%1\n\t"
331 "movd %1, %%mm0\n\t"
332 "movd 4%1, %%mm3\n\t"
333 "punpckldq 8%1, %%mm0\n\t"
334 "punpckldq 12%1, %%mm3\n\t"
335 "movq %%mm0, %%mm1\n\t"
336 "movq %%mm0, %%mm2\n\t"
337 "movq %%mm3, %%mm4\n\t"
338 "movq %%mm3, %%mm5\n\t"
339 "psrlq $3, %%mm0\n\t"
340 "psrlq $3, %%mm3\n\t"
341 "pand %2, %%mm0\n\t"
342 "pand %2, %%mm3\n\t"
343 "psrlq $5, %%mm1\n\t"
344 "psrlq $5, %%mm4\n\t"
345 "pand %%mm6, %%mm1\n\t"
346 "pand %%mm6, %%mm4\n\t"
347 "psrlq $8, %%mm2\n\t"
348 "psrlq $8, %%mm5\n\t"
349 "pand %%mm7, %%mm2\n\t"
350 "pand %%mm7, %%mm5\n\t"
351 "por %%mm1, %%mm0\n\t"
352 "por %%mm4, %%mm3\n\t"
353 "por %%mm2, %%mm0\n\t"
354 "por %%mm5, %%mm3\n\t"
355 "psllq $16, %%mm3\n\t"
356 "por %%mm3, %%mm0\n\t"
357 MOVNTQ" %%mm0, %0\n\t"
358 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
359 d += 4;
360 s += 16;
361 }
362 __asm __volatile(SFENCE:::"memory");
363 __asm __volatile(EMMS:::"memory");
364 #endif
365 while(s < end)
366 {
367 #ifndef WORDS_BIGENDIAN
368 const int b= *s++;
369 const int g= *s++;
370 const int r= *s++;
371 #else
372 const int a= *s++; /*skip*/
373 const int r= *s++;
374 const int g= *s++;
375 const int b= *s++;
376 #endif
377 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
378 #ifndef WORDS_BIGENDIAN
379 s++;
380 #endif
381 }
382 }
383
384 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
385 {
386 const uint8_t *s = src;
387 const uint8_t *end;
388 #ifdef HAVE_MMX
389 const uint8_t *mm_end;
390 #endif
391 uint16_t *d = (uint16_t *)dst;
392 end = s + src_size;
393 #ifdef HAVE_MMX
394 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
395 __asm __volatile(
396 "movq %0, %%mm7\n\t"
397 "movq %1, %%mm6\n\t"
398 ::"m"(red_16mask),"m"(green_16mask));
399 mm_end = end - 15;
400 while(s < mm_end)
401 {
402 __asm __volatile(
403 PREFETCH" 32%1\n\t"
404 "movd %1, %%mm0\n\t"
405 "movd 4%1, %%mm3\n\t"
406 "punpckldq 8%1, %%mm0\n\t"
407 "punpckldq 12%1, %%mm3\n\t"
408 "movq %%mm0, %%mm1\n\t"
409 "movq %%mm0, %%mm2\n\t"
410 "movq %%mm3, %%mm4\n\t"
411 "movq %%mm3, %%mm5\n\t"
412 "psllq $8, %%mm0\n\t"
413 "psllq $8, %%mm3\n\t"
414 "pand %%mm7, %%mm0\n\t"
415 "pand %%mm7, %%mm3\n\t"
416 "psrlq $5, %%mm1\n\t"
417 "psrlq $5, %%mm4\n\t"
418 "pand %%mm6, %%mm1\n\t"
419 "pand %%mm6, %%mm4\n\t"
420 "psrlq $19, %%mm2\n\t"
421 "psrlq $19, %%mm5\n\t"
422 "pand %2, %%mm2\n\t"
423 "pand %2, %%mm5\n\t"
424 "por %%mm1, %%mm0\n\t"
425 "por %%mm4, %%mm3\n\t"
426 "por %%mm2, %%mm0\n\t"
427 "por %%mm5, %%mm3\n\t"
428 "psllq $16, %%mm3\n\t"
429 "por %%mm3, %%mm0\n\t"
430 MOVNTQ" %%mm0, %0\n\t"
431 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
432 d += 4;
433 s += 16;
434 }
435 __asm __volatile(SFENCE:::"memory");
436 __asm __volatile(EMMS:::"memory");
437 #endif
438 while(s < end)
439 {
440 const int r= *s++;
441 const int g= *s++;
442 const int b= *s++;
443 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
444 s++;
445 }
446 }
447
448 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
449 {
450 const uint8_t *s = src;
451 const uint8_t *end;
452 #ifdef HAVE_MMX
453 const uint8_t *mm_end;
454 #endif
455 uint16_t *d = (uint16_t *)dst;
456 end = s + src_size;
457 #ifdef HAVE_MMX
458 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
459 __asm __volatile(
460 "movq %0, %%mm7\n\t"
461 "movq %1, %%mm6\n\t"
462 ::"m"(red_15mask),"m"(green_15mask));
463 mm_end = end - 15;
464 while(s < mm_end)
465 {
466 __asm __volatile(
467 PREFETCH" 32%1\n\t"
468 "movd %1, %%mm0\n\t"
469 "movd 4%1, %%mm3\n\t"
470 "punpckldq 8%1, %%mm0\n\t"
471 "punpckldq 12%1, %%mm3\n\t"
472 "movq %%mm0, %%mm1\n\t"
473 "movq %%mm0, %%mm2\n\t"
474 "movq %%mm3, %%mm4\n\t"
475 "movq %%mm3, %%mm5\n\t"
476 "psrlq $3, %%mm0\n\t"
477 "psrlq $3, %%mm3\n\t"
478 "pand %2, %%mm0\n\t"
479 "pand %2, %%mm3\n\t"
480 "psrlq $6, %%mm1\n\t"
481 "psrlq $6, %%mm4\n\t"
482 "pand %%mm6, %%mm1\n\t"
483 "pand %%mm6, %%mm4\n\t"
484 "psrlq $9, %%mm2\n\t"
485 "psrlq $9, %%mm5\n\t"
486 "pand %%mm7, %%mm2\n\t"
487 "pand %%mm7, %%mm5\n\t"
488 "por %%mm1, %%mm0\n\t"
489 "por %%mm4, %%mm3\n\t"
490 "por %%mm2, %%mm0\n\t"
491 "por %%mm5, %%mm3\n\t"
492 "psllq $16, %%mm3\n\t"
493 "por %%mm3, %%mm0\n\t"
494 MOVNTQ" %%mm0, %0\n\t"
495 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
496 d += 4;
497 s += 16;
498 }
499 __asm __volatile(SFENCE:::"memory");
500 __asm __volatile(EMMS:::"memory");
501 #endif
502 while(s < end)
503 {
504 const int b= *s++;
505 const int g= *s++;
506 const int r= *s++;
507 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
508 s++;
509 }
510 }
511
512 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
513 {
514 const uint8_t *s = src;
515 const uint8_t *end;
516 #ifdef HAVE_MMX
517 const uint8_t *mm_end;
518 #endif
519 uint16_t *d = (uint16_t *)dst;
520 end = s + src_size;
521 #ifdef HAVE_MMX
522 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
523 __asm __volatile(
524 "movq %0, %%mm7\n\t"
525 "movq %1, %%mm6\n\t"
526 ::"m"(red_15mask),"m"(green_15mask));
527 mm_end = end - 15;
528 while(s < mm_end)
529 {
530 __asm __volatile(
531 PREFETCH" 32%1\n\t"
532 "movd %1, %%mm0\n\t"
533 "movd 4%1, %%mm3\n\t"
534 "punpckldq 8%1, %%mm0\n\t"
535 "punpckldq 12%1, %%mm3\n\t"
536 "movq %%mm0, %%mm1\n\t"
537 "movq %%mm0, %%mm2\n\t"
538 "movq %%mm3, %%mm4\n\t"
539 "movq %%mm3, %%mm5\n\t"
540 "psllq $7, %%mm0\n\t"
541 "psllq $7, %%mm3\n\t"
542 "pand %%mm7, %%mm0\n\t"
543 "pand %%mm7, %%mm3\n\t"
544 "psrlq $6, %%mm1\n\t"
545 "psrlq $6, %%mm4\n\t"
546 "pand %%mm6, %%mm1\n\t"
547 "pand %%mm6, %%mm4\n\t"
548 "psrlq $19, %%mm2\n\t"
549 "psrlq $19, %%mm5\n\t"
550 "pand %2, %%mm2\n\t"
551 "pand %2, %%mm5\n\t"
552 "por %%mm1, %%mm0\n\t"
553 "por %%mm4, %%mm3\n\t"
554 "por %%mm2, %%mm0\n\t"
555 "por %%mm5, %%mm3\n\t"
556 "psllq $16, %%mm3\n\t"
557 "por %%mm3, %%mm0\n\t"
558 MOVNTQ" %%mm0, %0\n\t"
559 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
560 d += 4;
561 s += 16;
562 }
563 __asm __volatile(SFENCE:::"memory");
564 __asm __volatile(EMMS:::"memory");
565 #endif
566 while(s < end)
567 {
568 const int r= *s++;
569 const int g= *s++;
570 const int b= *s++;
571 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
572 s++;
573 }
574 }
575
576 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
577 {
578 const uint8_t *s = src;
579 const uint8_t *end;
580 #ifdef HAVE_MMX
581 const uint8_t *mm_end;
582 #endif
583 uint16_t *d = (uint16_t *)dst;
584 end = s + src_size;
585 #ifdef HAVE_MMX
586 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
587 __asm __volatile(
588 "movq %0, %%mm7\n\t"
589 "movq %1, %%mm6\n\t"
590 ::"m"(red_16mask),"m"(green_16mask));
591 mm_end = end - 11;
592 while(s < mm_end)
593 {
594 __asm __volatile(
595 PREFETCH" 32%1\n\t"
596 "movd %1, %%mm0\n\t"
597 "movd 3%1, %%mm3\n\t"
598 "punpckldq 6%1, %%mm0\n\t"
599 "punpckldq 9%1, %%mm3\n\t"
600 "movq %%mm0, %%mm1\n\t"
601 "movq %%mm0, %%mm2\n\t"
602 "movq %%mm3, %%mm4\n\t"
603 "movq %%mm3, %%mm5\n\t"
604 "psrlq $3, %%mm0\n\t"
605 "psrlq $3, %%mm3\n\t"
606 "pand %2, %%mm0\n\t"
607 "pand %2, %%mm3\n\t"
608 "psrlq $5, %%mm1\n\t"
609 "psrlq $5, %%mm4\n\t"
610 "pand %%mm6, %%mm1\n\t"
611 "pand %%mm6, %%mm4\n\t"
612 "psrlq $8, %%mm2\n\t"
613 "psrlq $8, %%mm5\n\t"
614 "pand %%mm7, %%mm2\n\t"
615 "pand %%mm7, %%mm5\n\t"
616 "por %%mm1, %%mm0\n\t"
617 "por %%mm4, %%mm3\n\t"
618 "por %%mm2, %%mm0\n\t"
619 "por %%mm5, %%mm3\n\t"
620 "psllq $16, %%mm3\n\t"
621 "por %%mm3, %%mm0\n\t"
622 MOVNTQ" %%mm0, %0\n\t"
623 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
624 d += 4;
625 s += 12;
626 }
627 __asm __volatile(SFENCE:::"memory");
628 __asm __volatile(EMMS:::"memory");
629 #endif
630 while(s < end)
631 {
632 const int b= *s++;
633 const int g= *s++;
634 const int r= *s++;
635 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
636 }
637 }
638
639 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
640 {
641 const uint8_t *s = src;
642 const uint8_t *end;
643 #ifdef HAVE_MMX
644 const uint8_t *mm_end;
645 #endif
646 uint16_t *d = (uint16_t *)dst;
647 end = s + src_size;
648 #ifdef HAVE_MMX
649 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
650 __asm __volatile(
651 "movq %0, %%mm7\n\t"
652 "movq %1, %%mm6\n\t"
653 ::"m"(red_16mask),"m"(green_16mask));
654 mm_end = end - 15;
655 while(s < mm_end)
656 {
657 __asm __volatile(
658 PREFETCH" 32%1\n\t"
659 "movd %1, %%mm0\n\t"
660 "movd 3%1, %%mm3\n\t"
661 "punpckldq 6%1, %%mm0\n\t"
662 "punpckldq 9%1, %%mm3\n\t"
663 "movq %%mm0, %%mm1\n\t"
664 "movq %%mm0, %%mm2\n\t"
665 "movq %%mm3, %%mm4\n\t"
666 "movq %%mm3, %%mm5\n\t"
667 "psllq $8, %%mm0\n\t"
668 "psllq $8, %%mm3\n\t"
669 "pand %%mm7, %%mm0\n\t"
670 "pand %%mm7, %%mm3\n\t"
671 "psrlq $5, %%mm1\n\t"
672 "psrlq $5, %%mm4\n\t"
673 "pand %%mm6, %%mm1\n\t"
674 "pand %%mm6, %%mm4\n\t"
675 "psrlq $19, %%mm2\n\t"
676 "psrlq $19, %%mm5\n\t"
677 "pand %2, %%mm2\n\t"
678 "pand %2, %%mm5\n\t"
679 "por %%mm1, %%mm0\n\t"
680 "por %%mm4, %%mm3\n\t"
681 "por %%mm2, %%mm0\n\t"
682 "por %%mm5, %%mm3\n\t"
683 "psllq $16, %%mm3\n\t"
684 "por %%mm3, %%mm0\n\t"
685 MOVNTQ" %%mm0, %0\n\t"
686 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
687 d += 4;
688 s += 12;
689 }
690 __asm __volatile(SFENCE:::"memory");
691 __asm __volatile(EMMS:::"memory");
692 #endif
693 while(s < end)
694 {
695 const int r= *s++;
696 const int g= *s++;
697 const int b= *s++;
698 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
699 }
700 }
701
702 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
703 {
704 const uint8_t *s = src;
705 const uint8_t *end;
706 #ifdef HAVE_MMX
707 const uint8_t *mm_end;
708 #endif
709 uint16_t *d = (uint16_t *)dst;
710 end = s + src_size;
711 #ifdef HAVE_MMX
712 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
713 __asm __volatile(
714 "movq %0, %%mm7\n\t"
715 "movq %1, %%mm6\n\t"
716 ::"m"(red_15mask),"m"(green_15mask));
717 mm_end = end - 11;
718 while(s < mm_end)
719 {
720 __asm __volatile(
721 PREFETCH" 32%1\n\t"
722 "movd %1, %%mm0\n\t"
723 "movd 3%1, %%mm3\n\t"
724 "punpckldq 6%1, %%mm0\n\t"
725 "punpckldq 9%1, %%mm3\n\t"
726 "movq %%mm0, %%mm1\n\t"
727 "movq %%mm0, %%mm2\n\t"
728 "movq %%mm3, %%mm4\n\t"
729 "movq %%mm3, %%mm5\n\t"
730 "psrlq $3, %%mm0\n\t"
731 "psrlq $3, %%mm3\n\t"
732 "pand %2, %%mm0\n\t"
733 "pand %2, %%mm3\n\t"
734 "psrlq $6, %%mm1\n\t"
735 "psrlq $6, %%mm4\n\t"
736 "pand %%mm6, %%mm1\n\t"
737 "pand %%mm6, %%mm4\n\t"
738 "psrlq $9, %%mm2\n\t"
739 "psrlq $9, %%mm5\n\t"
740 "pand %%mm7, %%mm2\n\t"
741 "pand %%mm7, %%mm5\n\t"
742 "por %%mm1, %%mm0\n\t"
743 "por %%mm4, %%mm3\n\t"
744 "por %%mm2, %%mm0\n\t"
745 "por %%mm5, %%mm3\n\t"
746 "psllq $16, %%mm3\n\t"
747 "por %%mm3, %%mm0\n\t"
748 MOVNTQ" %%mm0, %0\n\t"
749 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
750 d += 4;
751 s += 12;
752 }
753 __asm __volatile(SFENCE:::"memory");
754 __asm __volatile(EMMS:::"memory");
755 #endif
756 while(s < end)
757 {
758 const int b= *s++;
759 const int g= *s++;
760 const int r= *s++;
761 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
762 }
763 }
764
765 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
766 {
767 const uint8_t *s = src;
768 const uint8_t *end;
769 #ifdef HAVE_MMX
770 const uint8_t *mm_end;
771 #endif
772 uint16_t *d = (uint16_t *)dst;
773 end = s + src_size;
774 #ifdef HAVE_MMX
775 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
776 __asm __volatile(
777 "movq %0, %%mm7\n\t"
778 "movq %1, %%mm6\n\t"
779 ::"m"(red_15mask),"m"(green_15mask));
780 mm_end = end - 15;
781 while(s < mm_end)
782 {
783 __asm __volatile(
784 PREFETCH" 32%1\n\t"
785 "movd %1, %%mm0\n\t"
786 "movd 3%1, %%mm3\n\t"
787 "punpckldq 6%1, %%mm0\n\t"
788 "punpckldq 9%1, %%mm3\n\t"
789 "movq %%mm0, %%mm1\n\t"
790 "movq %%mm0, %%mm2\n\t"
791 "movq %%mm3, %%mm4\n\t"
792 "movq %%mm3, %%mm5\n\t"
793 "psllq $7, %%mm0\n\t"
794 "psllq $7, %%mm3\n\t"
795 "pand %%mm7, %%mm0\n\t"
796 "pand %%mm7, %%mm3\n\t"
797 "psrlq $6, %%mm1\n\t"
798 "psrlq $6, %%mm4\n\t"
799 "pand %%mm6, %%mm1\n\t"
800 "pand %%mm6, %%mm4\n\t"
801 "psrlq $19, %%mm2\n\t"
802 "psrlq $19, %%mm5\n\t"
803 "pand %2, %%mm2\n\t"
804 "pand %2, %%mm5\n\t"
805 "por %%mm1, %%mm0\n\t"
806 "por %%mm4, %%mm3\n\t"
807 "por %%mm2, %%mm0\n\t"
808 "por %%mm5, %%mm3\n\t"
809 "psllq $16, %%mm3\n\t"
810 "por %%mm3, %%mm0\n\t"
811 MOVNTQ" %%mm0, %0\n\t"
812 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
813 d += 4;
814 s += 12;
815 }
816 __asm __volatile(SFENCE:::"memory");
817 __asm __volatile(EMMS:::"memory");
818 #endif
819 while(s < end)
820 {
821 const int r= *s++;
822 const int g= *s++;
823 const int b= *s++;
824 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
825 }
826 }
827
/*
  Here a less accurate approximation is used: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since
  it never generates an all-ones maximum value; the net effect is to
  darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3 2 1 0
  ----------------
   1 1 0 1 1 1 1 0
  |=======|  |===|
      |      Leftmost Bits Repeated to Fill Open Bits
      |
  Original Bits
*/
851 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
852 {
853 const uint16_t *end;
854 #ifdef HAVE_MMX
855 const uint16_t *mm_end;
856 #endif
857 uint8_t *d = (uint8_t *)dst;
858 const uint16_t *s = (uint16_t *)src;
859 end = s + src_size/2;
860 #ifdef HAVE_MMX
861 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
862 mm_end = end - 7;
863 while(s < mm_end)
864 {
865 __asm __volatile(
866 PREFETCH" 32%1\n\t"
867 "movq %1, %%mm0\n\t"
868 "movq %1, %%mm1\n\t"
869 "movq %1, %%mm2\n\t"
870 "pand %2, %%mm0\n\t"
871 "pand %3, %%mm1\n\t"
872 "pand %4, %%mm2\n\t"
873 "psllq $3, %%mm0\n\t"
874 "psrlq $2, %%mm1\n\t"
875 "psrlq $7, %%mm2\n\t"
876 "movq %%mm0, %%mm3\n\t"
877 "movq %%mm1, %%mm4\n\t"
878 "movq %%mm2, %%mm5\n\t"
879 "punpcklwd %5, %%mm0\n\t"
880 "punpcklwd %5, %%mm1\n\t"
881 "punpcklwd %5, %%mm2\n\t"
882 "punpckhwd %5, %%mm3\n\t"
883 "punpckhwd %5, %%mm4\n\t"
884 "punpckhwd %5, %%mm5\n\t"
885 "psllq $8, %%mm1\n\t"
886 "psllq $16, %%mm2\n\t"
887 "por %%mm1, %%mm0\n\t"
888 "por %%mm2, %%mm0\n\t"
889 "psllq $8, %%mm4\n\t"
890 "psllq $16, %%mm5\n\t"
891 "por %%mm4, %%mm3\n\t"
892 "por %%mm5, %%mm3\n\t"
893
894 "movq %%mm0, %%mm6\n\t"
895 "movq %%mm3, %%mm7\n\t"
896
897 "movq 8%1, %%mm0\n\t"
898 "movq 8%1, %%mm1\n\t"
899 "movq 8%1, %%mm2\n\t"
900 "pand %2, %%mm0\n\t"
901 "pand %3, %%mm1\n\t"
902 "pand %4, %%mm2\n\t"
903 "psllq $3, %%mm0\n\t"
904 "psrlq $2, %%mm1\n\t"
905 "psrlq $7, %%mm2\n\t"
906 "movq %%mm0, %%mm3\n\t"
907 "movq %%mm1, %%mm4\n\t"
908 "movq %%mm2, %%mm5\n\t"
909 "punpcklwd %5, %%mm0\n\t"
910 "punpcklwd %5, %%mm1\n\t"
911 "punpcklwd %5, %%mm2\n\t"
912 "punpckhwd %5, %%mm3\n\t"
913 "punpckhwd %5, %%mm4\n\t"
914 "punpckhwd %5, %%mm5\n\t"
915 "psllq $8, %%mm1\n\t"
916 "psllq $16, %%mm2\n\t"
917 "por %%mm1, %%mm0\n\t"
918 "por %%mm2, %%mm0\n\t"
919 "psllq $8, %%mm4\n\t"
920 "psllq $16, %%mm5\n\t"
921 "por %%mm4, %%mm3\n\t"
922 "por %%mm5, %%mm3\n\t"
923
924 :"=m"(*d)
925 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
926 :"memory");
927 /* Borrowed 32 to 24 */
928 __asm __volatile(
929 "movq %%mm0, %%mm4\n\t"
930 "movq %%mm3, %%mm5\n\t"
931 "movq %%mm6, %%mm0\n\t"
932 "movq %%mm7, %%mm1\n\t"
933
934 "movq %%mm4, %%mm6\n\t"
935 "movq %%mm5, %%mm7\n\t"
936 "movq %%mm0, %%mm2\n\t"
937 "movq %%mm1, %%mm3\n\t"
938
939 "psrlq $8, %%mm2\n\t"
940 "psrlq $8, %%mm3\n\t"
941 "psrlq $8, %%mm6\n\t"
942 "psrlq $8, %%mm7\n\t"
943 "pand %2, %%mm0\n\t"
944 "pand %2, %%mm1\n\t"
945 "pand %2, %%mm4\n\t"
946 "pand %2, %%mm5\n\t"
947 "pand %3, %%mm2\n\t"
948 "pand %3, %%mm3\n\t"
949 "pand %3, %%mm6\n\t"
950 "pand %3, %%mm7\n\t"
951 "por %%mm2, %%mm0\n\t"
952 "por %%mm3, %%mm1\n\t"
953 "por %%mm6, %%mm4\n\t"
954 "por %%mm7, %%mm5\n\t"
955
956 "movq %%mm1, %%mm2\n\t"
957 "movq %%mm4, %%mm3\n\t"
958 "psllq $48, %%mm2\n\t"
959 "psllq $32, %%mm3\n\t"
960 "pand %4, %%mm2\n\t"
961 "pand %5, %%mm3\n\t"
962 "por %%mm2, %%mm0\n\t"
963 "psrlq $16, %%mm1\n\t"
964 "psrlq $32, %%mm4\n\t"
965 "psllq $16, %%mm5\n\t"
966 "por %%mm3, %%mm1\n\t"
967 "pand %6, %%mm5\n\t"
968 "por %%mm5, %%mm4\n\t"
969
970 MOVNTQ" %%mm0, %0\n\t"
971 MOVNTQ" %%mm1, 8%0\n\t"
972 MOVNTQ" %%mm4, 16%0"
973
974 :"=m"(*d)
975 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
976 :"memory");
977 d += 24;
978 s += 8;
979 }
980 __asm __volatile(SFENCE:::"memory");
981 __asm __volatile(EMMS:::"memory");
982 #endif
983 while(s < end)
984 {
985 register uint16_t bgr;
986 bgr = *s++;
987 *d++ = (bgr&0x1F)<<3;
988 *d++ = (bgr&0x3E0)>>2;
989 *d++ = (bgr&0x7C00)>>7;
990 }
991 }
992
993 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
994 {
995 const uint16_t *end;
996 #ifdef HAVE_MMX
997 const uint16_t *mm_end;
998 #endif
999 uint8_t *d = (uint8_t *)dst;
1000 const uint16_t *s = (const uint16_t *)src;
1001 end = s + src_size/2;
1002 #ifdef HAVE_MMX
1003 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1004 mm_end = end - 7;
1005 while(s < mm_end)
1006 {
1007 __asm __volatile(
1008 PREFETCH" 32%1\n\t"
1009 "movq %1, %%mm0\n\t"
1010 "movq %1, %%mm1\n\t"
1011 "movq %1, %%mm2\n\t"
1012 "pand %2, %%mm0\n\t"
1013 "pand %3, %%mm1\n\t"
1014 "pand %4, %%mm2\n\t"
1015 "psllq $3, %%mm0\n\t"
1016 "psrlq $3, %%mm1\n\t"
1017 "psrlq $8, %%mm2\n\t"
1018 "movq %%mm0, %%mm3\n\t"
1019 "movq %%mm1, %%mm4\n\t"
1020 "movq %%mm2, %%mm5\n\t"
1021 "punpcklwd %5, %%mm0\n\t"
1022 "punpcklwd %5, %%mm1\n\t"
1023 "punpcklwd %5, %%mm2\n\t"
1024 "punpckhwd %5, %%mm3\n\t"
1025 "punpckhwd %5, %%mm4\n\t"
1026 "punpckhwd %5, %%mm5\n\t"
1027 "psllq $8, %%mm1\n\t"
1028 "psllq $16, %%mm2\n\t"
1029 "por %%mm1, %%mm0\n\t"
1030 "por %%mm2, %%mm0\n\t"
1031 "psllq $8, %%mm4\n\t"
1032 "psllq $16, %%mm5\n\t"
1033 "por %%mm4, %%mm3\n\t"
1034 "por %%mm5, %%mm3\n\t"
1035
1036 "movq %%mm0, %%mm6\n\t"
1037 "movq %%mm3, %%mm7\n\t"
1038
1039 "movq 8%1, %%mm0\n\t"
1040 "movq 8%1, %%mm1\n\t"
1041 "movq 8%1, %%mm2\n\t"
1042 "pand %2, %%mm0\n\t"
1043 "pand %3, %%mm1\n\t"
1044 "pand %4, %%mm2\n\t"
1045 "psllq $3, %%mm0\n\t"
1046 "psrlq $3, %%mm1\n\t"
1047 "psrlq $8, %%mm2\n\t"
1048 "movq %%mm0, %%mm3\n\t"
1049 "movq %%mm1, %%mm4\n\t"
1050 "movq %%mm2, %%mm5\n\t"
1051 "punpcklwd %5, %%mm0\n\t"
1052 "punpcklwd %5, %%mm1\n\t"
1053 "punpcklwd %5, %%mm2\n\t"
1054 "punpckhwd %5, %%mm3\n\t"
1055 "punpckhwd %5, %%mm4\n\t"
1056 "punpckhwd %5, %%mm5\n\t"
1057 "psllq $8, %%mm1\n\t"
1058 "psllq $16, %%mm2\n\t"
1059 "por %%mm1, %%mm0\n\t"
1060 "por %%mm2, %%mm0\n\t"
1061 "psllq $8, %%mm4\n\t"
1062 "psllq $16, %%mm5\n\t"
1063 "por %%mm4, %%mm3\n\t"
1064 "por %%mm5, %%mm3\n\t"
1065 :"=m"(*d)
1066 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1067 :"memory");
1068 /* Borrowed 32 to 24 */
1069 __asm __volatile(
1070 "movq %%mm0, %%mm4\n\t"
1071 "movq %%mm3, %%mm5\n\t"
1072 "movq %%mm6, %%mm0\n\t"
1073 "movq %%mm7, %%mm1\n\t"
1074
1075 "movq %%mm4, %%mm6\n\t"
1076 "movq %%mm5, %%mm7\n\t"
1077 "movq %%mm0, %%mm2\n\t"
1078 "movq %%mm1, %%mm3\n\t"
1079
1080 "psrlq $8, %%mm2\n\t"
1081 "psrlq $8, %%mm3\n\t"
1082 "psrlq $8, %%mm6\n\t"
1083 "psrlq $8, %%mm7\n\t"
1084 "pand %2, %%mm0\n\t"
1085 "pand %2, %%mm1\n\t"
1086 "pand %2, %%mm4\n\t"
1087 "pand %2, %%mm5\n\t"
1088 "pand %3, %%mm2\n\t"
1089 "pand %3, %%mm3\n\t"
1090 "pand %3, %%mm6\n\t"
1091 "pand %3, %%mm7\n\t"
1092 "por %%mm2, %%mm0\n\t"
1093 "por %%mm3, %%mm1\n\t"
1094 "por %%mm6, %%mm4\n\t"
1095 "por %%mm7, %%mm5\n\t"
1096
1097 "movq %%mm1, %%mm2\n\t"
1098 "movq %%mm4, %%mm3\n\t"
1099 "psllq $48, %%mm2\n\t"
1100 "psllq $32, %%mm3\n\t"
1101 "pand %4, %%mm2\n\t"
1102 "pand %5, %%mm3\n\t"
1103 "por %%mm2, %%mm0\n\t"
1104 "psrlq $16, %%mm1\n\t"
1105 "psrlq $32, %%mm4\n\t"
1106 "psllq $16, %%mm5\n\t"
1107 "por %%mm3, %%mm1\n\t"
1108 "pand %6, %%mm5\n\t"
1109 "por %%mm5, %%mm4\n\t"
1110
1111 MOVNTQ" %%mm0, %0\n\t"
1112 MOVNTQ" %%mm1, 8%0\n\t"
1113 MOVNTQ" %%mm4, 16%0"
1114
1115 :"=m"(*d)
1116 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1117 :"memory");
1118 d += 24;
1119 s += 8;
1120 }
1121 __asm __volatile(SFENCE:::"memory");
1122 __asm __volatile(EMMS:::"memory");
1123 #endif
1124 while(s < end)
1125 {
1126 register uint16_t bgr;
1127 bgr = *s++;
1128 *d++ = (bgr&0x1F)<<3;
1129 *d++ = (bgr&0x7E0)>>3;
1130 *d++ = (bgr&0xF800)>>8;
1131 }
1132 }
1133
1134 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1135 {
1136 const uint16_t *end;
1137 #ifdef HAVE_MMX
1138 const uint16_t *mm_end;
1139 #endif
1140 uint8_t *d = (uint8_t *)dst;
1141 const uint16_t *s = (const uint16_t *)src;
1142 end = s + src_size/2;
1143 #ifdef HAVE_MMX
1144 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1145 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1146 mm_end = end - 3;
1147 while(s < mm_end)
1148 {
1149 __asm __volatile(
1150 PREFETCH" 32%1\n\t"
1151 "movq %1, %%mm0\n\t"
1152 "movq %1, %%mm1\n\t"
1153 "movq %1, %%mm2\n\t"
1154 "pand %2, %%mm0\n\t"
1155 "pand %3, %%mm1\n\t"
1156 "pand %4, %%mm2\n\t"
1157 "psllq $3, %%mm0\n\t"
1158 "psrlq $2, %%mm1\n\t"
1159 "psrlq $7, %%mm2\n\t"
1160 "movq %%mm0, %%mm3\n\t"
1161 "movq %%mm1, %%mm4\n\t"
1162 "movq %%mm2, %%mm5\n\t"
1163 "punpcklwd %%mm7, %%mm0\n\t"
1164 "punpcklwd %%mm7, %%mm1\n\t"
1165 "punpcklwd %%mm7, %%mm2\n\t"
1166 "punpckhwd %%mm7, %%mm3\n\t"
1167 "punpckhwd %%mm7, %%mm4\n\t"
1168 "punpckhwd %%mm7, %%mm5\n\t"
1169 "psllq $8, %%mm1\n\t"
1170 "psllq $16, %%mm2\n\t"
1171 "por %%mm1, %%mm0\n\t"
1172 "por %%mm2, %%mm0\n\t"
1173 "psllq $8, %%mm4\n\t"
1174 "psllq $16, %%mm5\n\t"
1175 "por %%mm4, %%mm3\n\t"
1176 "por %%mm5, %%mm3\n\t"
1177 MOVNTQ" %%mm0, %0\n\t"
1178 MOVNTQ" %%mm3, 8%0\n\t"
1179 :"=m"(*d)
1180 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1181 :"memory");
1182 d += 16;
1183 s += 4;
1184 }
1185 __asm __volatile(SFENCE:::"memory");
1186 __asm __volatile(EMMS:::"memory");
1187 #endif
1188 while(s < end)
1189 {
1190 register uint16_t bgr;
1191 bgr = *s++;
1192 *d++ = (bgr&0x1F)<<3;
1193 *d++ = (bgr&0x3E0)>>2;
1194 *d++ = (bgr&0x7C00)>>7;
1195 *d++ = 0;
1196 }
1197 }
1198
/*
 * Converts packed RGB16 (5-6-5) pixels to 32bit pixels: each 16bit word is
 * expanded to four bytes B, G, R, 0 (the 4th byte is always written as 0).
 * The MMX path converts 4 pixels per iteration; the scalar loop finishes the
 * remainder (and is the whole implementation without HAVE_MMX).
 * NOTE(review): the byte order produced matches the little-endian scalar
 * loop below — confirm before using on big-endian targets.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used below to zero-extend words to dwords */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"        /* isolate blue  (0x001F per word) */
            "pand %3, %%mm1\n\t"        /* isolate green (0x07E0 per word) */
            "pand %4, %%mm2\n\t"        /* isolate red   (0xF800 per word) */
            "psllq $3, %%mm0\n\t"       /* 5bit blue  -> 8bit, like (bgr&0x1F)<<3 */
            "psrlq $3, %%mm1\n\t"       /* 6bit green -> 8bit, like (bgr&0x7E0)>>3 */
            "psrlq $8, %%mm2\n\t"       /* 5bit red   -> 8bit, like (bgr&0xF800)>>8 */
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"    /* low two pixels: word -> dword */
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"    /* high two pixels: word -> dword */
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"           /* green into byte 1 */
            "psllq $16, %%mm2\n\t"          /* red into byte 2 */
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"          /* write pixels 0-1 */
            MOVNTQ" %%mm3, 8%0\n\t"         /* write pixels 2-3 */
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar tail (whole conversion without MMX) */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
    }
}
1263
/*
 * Swaps the R and B channels of a 32bit-per-pixel image (RGB32 <-> BGR32);
 * the byte between them (G) and the 4th byte are copied through unchanged.
 * NOTE(review): the MMX path rounds with "src_size-7" and processes whole
 * 8-byte quanta only — it appears to assume src_size is a multiple of 8
 * (trailing bytes are not handled there); confirm with callers.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"      /* 2 pixels */
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"             /* move B up to R's slot */
        "psrld $16, %%mm1 \n\t"             /* move R down to B's slot */
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
        : "%eax"
    );

    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
1304
/*
 * Swaps the R and B channels of a packed 24bit-per-pixel image.
 * MMX path: eax counts up from the negative remainder-adjusted offset
 * (mmx_size = 23 - src_size) toward 0, processing 8 pixels (24 bytes) per
 * iteration with mask/shift/or shuffles; the scalar loop then converts the
 * last (src_size mod 24 bytes worth of) pixels that the asm left over.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;    /* negative loop counter, see above */
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");

    if(mmx_size==23) return; // finished, size was a multiple of 8 pixels

    /* rewind src/dst to the start of the unconverted tail */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1371
/*
 * Interleaves planar YUV into packed YUY2 (Y U Y V byte order).
 * vertLumPerChroma = number of luma lines per chroma line (2 for YV12 input,
 * 1 for YUV422P input); the chroma pointers only advance every
 * vertLumPerChroma-th line.
 * NOTE(review): the C fallback composes pixels as little-endian 32/64bit
 * integers — confirm correctness on big-endian before relying on it there.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
#if __WORDSIZE >= 64
        /* build two YUYV pixels at a time into one 64bit store */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* one YUYV pixel pair per 32bit store */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only once per vertLumPerChroma luma lines */
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1458
/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
/* YV12 (4:2:0 planar) -> YUY2 (packed): one chroma line covers 2 luma lines. */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
//FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1471
1472 /**
1473 *
1474 * width should be a multiple of 16
1475 */
/* YUV422P (4:2:2 planar) -> YUY2 (packed): chroma and luma have the same line count. */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1482
/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
/*
 * Converts packed YUY2 (Y U Y V) to planar YV12.
 * Lines are processed in pairs: the first line of each pair yields both luma
 * and chroma (chroma of the second line is dropped — no averaging), the
 * second line yields luma only.  In the MMX path mm7 holds the 0x00FF word
 * mask used to separate Y (low bytes) from U/V (high bytes).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        /* even line: extract Y plus U/V */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );

        ydst += lumStride;
        src += srcStride;

        /* odd line: luma only, chroma of this line is discarded (mm7 still holds the mask) */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1607
1608 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1609 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1610 unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1611 {
1612 /* Y Plane */
1613 memcpy(ydst, ysrc, width*height);
1614
1615 /* XXX: implement upscaling for U,V */
1616 }
1617
/*
 * Doubles a plane in both directions with (approximate) bilinear
 * interpolation: each source pixel expands to 2x2 destination pixels with
 * 3:1 / 1:3 weighting against its neighbours.  First and last lines and the
 * edge columns are handled separately by replication.
 * NOTE(review): the chained PAVGB pairs approximate the 1:3 weighted
 * average; rounding differs slightly between pavgb (MMX2) and pavgusb
 * (3DNow) — presumed acceptable for this scaler.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    // first line: plain horizontal replication
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
    dst+= dstStride;

    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize= srcWidth;
        /* eax runs from -mmxSize to 0; the base pointers are pre-biased by
           mmxSize-1 / mmxSize*2, so the first iteration reads one byte
           before the line start — dst[0]/dst[dstStride] are patched below */
        asm volatile(
            "movl %4, %%eax \n\t"
            "1: \n\t"
            "movq (%0, %%eax), %%mm0 \n\t"
            "movq (%1, %%eax), %%mm1 \n\t"
            "movq 1(%0, %%eax), %%mm2 \n\t"
            "movq 1(%1, %%eax), %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm1, %%mm5 \n\t"
            PAVGB" %%mm3, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm0 \n\t"
            PAVGB" %%mm4, %%mm3 \n\t"
            PAVGB" %%mm4, %%mm3 \n\t"
            PAVGB" %%mm2, %%mm1 \n\t"
            PAVGB" %%mm2, %%mm1 \n\t"
            PAVGB" %%mm5, %%mm2 \n\t"
            PAVGB" %%mm5, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklbw %%mm1, %%mm3 \n\t"
            "punpckhbw %%mm1, %%mm4 \n\t"
            "punpcklbw %%mm0, %%mm2 \n\t"
            "punpckhbw %%mm0, %%mm5 \n\t"
#if 1
            MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
            MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
#else
            "movq %%mm3, (%2, %%eax, 2) \n\t"
            "movq %%mm4, 8(%2, %%eax, 2) \n\t"
            "movq %%mm2, (%3, %%eax, 2) \n\t"
            "movq %%mm5, 8(%3, %%eax, 2) \n\t"
#endif
            "addl $8, %%eax \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%eax"

        );
        /* left edge: replicate (the asm read one byte out of line range there) */
        dst[0]=
        dst[dstStride]= src[0];
#else
        dst[0]=
        dst[dstStride]= src[0];

        /* C fallback: 3:1 weighted bilinear, 2x2 output per input pixel */
        for(x=0; x<srcWidth-1; x++){
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
#endif
        /* right edge: replicate */
        dst[srcWidth*2 -1]=
        dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

        dst+=dstStride*2;
        src+=srcStride;
    }
    src-=srcStride;

    // last line: plain horizontal replication
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1706
/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: write HQ version.
 */
/*
 * Converts packed UYVY (U Y V Y) to planar YV12.
 * Same two-line scheme as yuy2toyv12, but with swapped byte roles:
 * here Y sits in the high byte of each word, so pand/mm7 extracts U/V and
 * psrlw $8 extracts Y.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        /* even line: extract Y plus U/V */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );

        ydst += lumStride;
        src += srcStride;

        /* odd line: luma only (Y = high byte of each word) */
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1832
/**
 * Height should be a multiple of 2 and width should be a multiple of 2
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version. FIXME: write HQ version.
 */
/*
 * Converts packed 24bit BGR to planar YV12.
 * MMX path: for each line pair it first runs the luma kernel twice (one
 * line each, pmaddwd against bgr2YCoeff, 8 Y samples per iteration), then a
 * chroma kernel that averages the two source lines (2x2 average on
 * MMX2/3DNow via PAVGB, plain adds + shift otherwise) and produces 4 U and
 * 4 V samples per iteration.  The last (height mod 2 plus remainder) lines
 * fall through to the C loop, which implements the same RY/GY/BY etc.
 * fixed-point conversion per pixel.
 * NOTE(review): the MMX kernels count upward from -width in eax/ebx, so
 * they assume width (and, for chroma, 2*width) pixels fit the unrolling —
 * see the width-multiple-of-2 requirement documented above.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        unsigned i;
        /* luma: two passes, one source line each */
        for(i=0; i<2; i++)
        {
            asm volatile(
                "movl %2, %%eax \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "leal (%%eax, %%eax, 2), %%ebx \n\t"    /* ebx = 3*eax, byte offset into BGR24 */
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%ebx) \n\t"
                "movd (%0, %%ebx), %%mm0 \n\t"
                "movd 3(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%ebx), %%mm2 \n\t"
                "movd 9(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%ebx), %%mm4 \n\t"
                "movd 15(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%ebx), %%mm2 \n\t"
                "movd 21(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "addl $24, %%ebx \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%eax) \n\t"
                "addl $8, %%eax \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%eax", "%ebx"
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2;     /* chroma pass re-reads both lines */
        asm volatile(
            "movl %4, %%eax \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "leal (%%eax, %%eax, 2), %%ebx \n\t"
            "addl %%ebx, %%ebx \n\t"            /* ebx = 6*eax, two BGR pixels per chroma sample */
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%ebx) \n\t"
            PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            /* 2x2 average via PAVGB */
            "movq (%0, %%ebx), %%mm0 \n\t"
            "movq (%1, %%ebx), %%mm1 \n\t"
            "movq 6(%0, %%ebx), %%mm2 \n\t"
            "movq 6(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            /* 2x2 average via word adds + shift */
            "movd (%0, %%ebx), %%mm0 \n\t"
            "movd (%1, %%ebx), %%mm1 \n\t"
            "movd 3(%0, %%ebx), %%mm2 \n\t"
            "movd 3(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%ebx), %%mm4 \n\t"
            "movd 6(%1, %%ebx), %%mm1 \n\t"
            "movd 9(%0, %%ebx), %%mm2 \n\t"
            "movd 9(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4 \n\t"
            "movq 12(%1, %%ebx), %%mm1 \n\t"
            "movq 18(%0, %%ebx), %%mm2 \n\t"
            "movq 18(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%ebx), %%mm4 \n\t"
            "movd 12(%1, %%ebx), %%mm1 \n\t"
            "movd 15(%0, %%ebx), %%mm2 \n\t"
            "movd 15(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%ebx), %%mm5 \n\t"
            "movd 18(%1, %%ebx), %%mm1 \n\t"
            "movd 21(%0, %%ebx), %%mm2 \n\t"
            "movd 21(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"  /* mm5 was clobbered above, reload */
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "addl $24, %%ebx \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

            "movd %%mm0, (%2, %%eax) \n\t"      /* 4 U samples */
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%eax) \n\t"      /* 4 V samples */
            "addl $4, %%eax \n\t"
            " js 1b \n\t"
            : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
            : "%eax", "%ebx"
        );

        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }

    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#else
    y=0;
#endif
    /* C fallback / remainder: same fixed-point conversion per pixel */
    for(; y<height; y+=2)
    {
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;

        /* odd line: luma only (chroma of this line is ignored) */
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
2135
/*
 * Interleaves two byte planes: dest[2w] = src1[w], dest[2w+1] = src2[w],
 * line by line with independent strides.
 * NOTE(review): the SIMD paths process 16 input bytes per iteration and run
 * while eax < width-15; a scalar loop afterwards handles the remaining
 * width mod 16 bytes of each line.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
    unsigned width, unsigned height, unsigned src1Stride,
    unsigned src2Stride, unsigned dstStride){
    unsigned h;

    for(h=0; h < height; h++)
    {
        unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movdqa (%1, %%eax), %%xmm0 \n\t"
            "movdqa (%1, %%eax), %%xmm1 \n\t"
            "movdqa (%2, %%eax), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#else
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movq (%1, %%eax), %%mm0 \n\t"
            "movq 8(%1, %%eax), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%eax), %%mm4 \n\t"
            "movq 8(%2, %%eax), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#endif
        /* scalar tail for the last width mod 16 bytes */
        for(w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
2216
/*
 * Upsamples two chroma planes from 4x4 subsampling (YVU9-style) toward
 * 2x2: each plane is doubled horizontally (byte duplication via
 * punpcklbw/punpckhbw with itself) and doubled vertically by reading every
 * source line twice (srcStride*(y>>1)).
 * NOTE(review): the MMX loops advance x by 32 while only guarded by
 * "if(w > 32)" — when w is not a multiple of 32 the last iteration reads
 * and writes past w; presumably callers pass suitably padded buffers —
 * TODO confirm.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
    uint8_t *dst1, uint8_t *dst2,
    unsigned width, unsigned height,
    unsigned srcStride1, unsigned srcStride2,
    unsigned dstStride1, unsigned dstStride2)
{
    unsigned y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* first plane */
    for(y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);   /* each src line used for 2 dst lines */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
        if(w > 32)
        for(;x<w;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq %1, %%mm0\n\t"
                "movq 8%1, %%mm2\n\t"
                "movq 16%1, %%mm4\n\t"
                "movq 24%1, %%mm6\n\t"
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "movq %%mm4, %%mm5\n\t"
                "movq %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"    /* duplicate each byte horizontally */
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane, identical scheme */
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        if(w > 32)
        for(;x<w;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq %1, %%mm0\n\t"
                "movq 8%1, %%mm2\n\t"
                "movq 16%1, %%mm4\n\t"
                "movq 24%1, %%mm6\n\t"
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "movq %%mm4, %%mm5\n\t"
                "movq %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
2321
/*
 * Convert planar YUV with 4x4-subsampled chroma (YVU9/YUV9-style layout:
 * one chroma sample per 4x4 block of luma) into packed YUY2
 * (byte order Y0 U Y1 V for each pair of pixels).
 *
 * src1          luma plane
 * src2          chroma plane copied into the "U" byte slots of YUY2
 * src3          chroma plane copied into the "V" byte slots of YUY2
 *               NOTE(review): despite the "yvu9" name, which of U/V ends up
 *               where is fixed purely by the caller's argument order --
 *               confirm against the call sites.
 * dst           packed YUY2 output, dstStride bytes per output row
 * width,height  image size.  NOTE(review): w = width/2 below, yet each
 *               scalar x step emits 4 luma samples (8 output bytes), so
 *               "width" appears to be counted in units of 2 luma samples --
 *               verify against callers.
 * srcStride1-3  byte stride of each source plane
 * dstStride     byte stride of the destination
 */
2322 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2323 uint8_t *dst,
2324 unsigned width, unsigned height,
2325 unsigned srcStride1, unsigned srcStride2,
2326 unsigned srcStride3, unsigned dstStride)
2327 {
2328 unsigned y,x,x2,w,h;
2329 w=width/2; h=height;
2330 #ifdef HAVE_MMX
/* Warm the caches: prefetch the second row of each source plane. */
2331 asm volatile(
2332 PREFETCH" %0\n\t"
2333 PREFETCH" %1\n\t"
2334 PREFETCH" %2\n\t"
2335 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2336 #endif
2337 for(y=0;y<h;y++){
/* y>>2: the chroma planes are vertically subsampled by 4. */
2338 const uint8_t* yp=src1+srcStride1*y;
2339 const uint8_t* up=src2+srcStride2*(y>>2);
2340 const uint8_t* vp=src3+srcStride3*(y>>2);
2341 uint8_t* d=dst+dstStride*y;
2342 x2=0;
2343 x=0;
2344 #ifdef HAVE_MMX
/*
 * MMX fast path: each iteration consumes 8 chroma samples from up/vp (%2/%3)
 * plus 32 luma samples from yp (%1), and writes 64 packed output bytes with
 * non-temporal stores (MOVNTQ) -- hence the SFENCE at function end.
 * NOTE(review): the loop runs to x < w in steps of 8 with no remainder
 * guard (unlike the `if(w > 32)` guard used elsewhere in this file), so
 * when w is not a multiple of 8 the final iteration reads and writes past
 * the nominal row width -- confirm callers pad their buffers accordingly.
 */
2345 for(;x<w;x+=8,x2+=32)
2346 {
2347 asm volatile(
2348 PREFETCH" 32%1\n\t"
2349 PREFETCH" 32%2\n\t"
2350 PREFETCH" 32%3\n\t"
2351 "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2352 "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2353 "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2354 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2355 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2356 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
/* Duplicate each chroma byte: 4x horizontal subsampling means every
 * chroma sample serves 4 luma samples (2 YUY2 pixel pairs). */
2357 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2358 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2359 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2360 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2361
2362 "movq %%mm1, %%mm6\n\t"
2363 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2364 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2365 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2366 MOVNTQ" %%mm0, %0\n\t"
2367 MOVNTQ" %%mm3, 8%0\n\t"
2368
2369 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2370 "movq 8%1, %%mm0\n\t"
2371 "movq %%mm0, %%mm3\n\t"
2372 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2373 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2374 MOVNTQ" %%mm0, 16%0\n\t"
2375 MOVNTQ" %%mm3, 24%0\n\t"
2376
2377 "movq %%mm4, %%mm6\n\t"
2378 "movq 16%1, %%mm0\n\t"
2379 "movq %%mm0, %%mm3\n\t"
2380 "punpcklbw %%mm5, %%mm4\n\t"
2381 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2382 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2383 MOVNTQ" %%mm0, 32%0\n\t"
2384 MOVNTQ" %%mm3, 40%0\n\t"
2385
2386 "punpckhbw %%mm5, %%mm6\n\t"
2387 "movq 24%1, %%mm0\n\t"
2388 "movq %%mm0, %%mm3\n\t"
2389 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2390 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2391 MOVNTQ" %%mm0, 48%0\n\t"
2392 MOVNTQ" %%mm3, 56%0\n\t"
2393
2394 :"=m"(d[8*x])
2395 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
2396 :"memory");
2397 }
2398 #endif
/* Scalar tail (and the whole row when MMX is unavailable): emit 4 pixels
 * per step, repeating each chroma sample across its 4 luma samples. */
2399 for(;x<w;x++,x2+=4)
2400 {
2401 d[8*x+0]=yp[x2];
2402 d[8*x+1]=up[x];
2403 d[8*x+2]=yp[x2+1];
2404 d[8*x+3]=vp[x];
2405 d[8*x+4]=yp[x2+2];
2406 d[8*x+5]=up[x];
2407 d[8*x+6]=yp[x2+3];
2408 d[8*x+7]=vp[x];
2409 }
2410 }
2411 #ifdef HAVE_MMX
/* Leave MMX state (EMMS, so the x87 FPU is usable again) and flush the
 * non-temporal MOVNTQ stores with SFENCE. */
2412 asm(
2413 EMMS" \n\t"
2414 SFENCE" \n\t"
2415 ::: "memory"
2416 );
2417 #endif
2418 }