 * rgb2rgb.c, Software RGB to RGB converter
 * plus Software PAL8 to RGB converter
 *      Software YUV to YUV converter
 *      Software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
#include <inttypes.h> /* for __WORDSIZE */
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define MOVNTQ "movntq"
#define SFENCE "sfence"

static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
        "punpckldq 3%1, %%mm0\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd 12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd 18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm1\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");

static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq 8%1, %%mm1\n\t"
        "movq 16%1, %%mm4\n\t"
        "movq 24%1, %%mm5\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm1, %%mm3\n\t"
        "movq %%mm4, %%mm6\n\t"
        "movq %%mm5, %%mm7\n\t"
        "psrlq $8, %%mm2\n\t"
        "psrlq $8, %%mm3\n\t"
        "psrlq $8, %%mm6\n\t"
        "psrlq $8, %%mm7\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm3, %%mm1\n\t"
        "por %%mm6, %%mm4\n\t"
        "por %%mm7, %%mm5\n\t"
        "movq %%mm1, %%mm2\n\t"
        "movq %%mm4, %%mm3\n\t"
        "psllq $48, %%mm2\n\t"
        "psllq $32, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "psrlq $16, %%mm1\n\t"
        "psrlq $32, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm3, %%mm1\n\t"
        "por %%mm5, %%mm4\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
  Original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
  32-bit C version, and the and&add trick by Michael Niedermayer
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
        "movq 8%1, %%mm2\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm2, %%mm3\n\t"
        "pand %%mm4, %%mm0\n\t"
        "pand %%mm4, %%mm2\n\t"
        "paddw %%mm1, %%mm0\n\t"
        "paddw %%mm3, %%mm2\n\t"
        MOVNTQ" %%mm0, %0\n\t"
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
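        /* The and&add trick mentioned in the credits above: (x&0x7FE0) selects
           the packed R and G fields of an RGB555 pixel; adding it to (x&0x7FFF)
           doubles those fields, i.e. shifts R and G left by one bit each, which
           is exactly the RGB555 -> RGB565 layout change (the new green LSB is
           left at 0). The 32-bit variant does the same for two pixels at once. */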
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    unsigned j,i,num_pixels=src_size/3;
    for(i=0,j=0; j<num_pixels; i+=3,j+=3)

static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
        "movq 8%1, %%mm2\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm2, %%mm3\n\t"
        "psrlq $1, %%mm0\n\t"
        "psrlq $1, %%mm2\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm3\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm3, %%mm2\n\t"
        MOVNTQ" %%mm0, %0\n\t"
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_16mask),"m"(green_16mask));
        "movd 4%1, %%mm3\n\t"
        "punpckldq 8%1, %%mm0\n\t"
        "punpckldq 12%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psrlq $3, %%mm0\n\t"
        "psrlq $3, %%mm3\n\t"
        "psrlq $5, %%mm1\n\t"
        "psrlq $5, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $8, %%mm2\n\t"
        "psrlq $8, %%mm5\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm7, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
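        /* Scalar RGB888 -> RGB565 packing: the top 5 bits of blue stay in bits
           0-4, the top 6 bits of green land in bits 5-10, and the top 5 bits of
           red in bits 11-15. */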
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_16mask),"m"(green_16mask));
        "movd 4%1, %%mm3\n\t"
        "punpckldq 8%1, %%mm0\n\t"
        "punpckldq 12%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psllq $8, %%mm0\n\t"
        "psllq $8, %%mm3\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm3\n\t"
        "psrlq $5, %%mm1\n\t"
        "psrlq $5, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $19, %%mm2\n\t"
        "psrlq $19, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_15mask),"m"(green_15mask));
        "movd 4%1, %%mm3\n\t"
        "punpckldq 8%1, %%mm0\n\t"
        "punpckldq 12%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psrlq $3, %%mm0\n\t"
        "psrlq $3, %%mm3\n\t"
        "psrlq $6, %%mm1\n\t"
        "psrlq $6, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $9, %%mm2\n\t"
        "psrlq $9, %%mm5\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm7, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
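        /* Scalar RGB888 -> RGB555 packing: top 5 bits of blue in bits 0-4, top
           5 bits of green in bits 5-9, top 5 bits of red in bits 10-14. */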
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_15mask),"m"(green_15mask));
        "movd 4%1, %%mm3\n\t"
        "punpckldq 8%1, %%mm0\n\t"
        "punpckldq 12%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psllq $7, %%mm0\n\t"
        "psllq $7, %%mm3\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm3\n\t"
        "psrlq $6, %%mm1\n\t"
        "psrlq $6, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $19, %%mm2\n\t"
        "psrlq $19, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_16mask),"m"(green_16mask));
        "movd 3%1, %%mm3\n\t"
        "punpckldq 6%1, %%mm0\n\t"
        "punpckldq 9%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psrlq $3, %%mm0\n\t"
        "psrlq $3, %%mm3\n\t"
        "psrlq $5, %%mm1\n\t"
        "psrlq $5, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $8, %%mm2\n\t"
        "psrlq $8, %%mm5\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm7, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_16mask),"m"(green_16mask));
        "movd 3%1, %%mm3\n\t"
        "punpckldq 6%1, %%mm0\n\t"
        "punpckldq 9%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psllq $8, %%mm0\n\t"
        "psllq $8, %%mm3\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm3\n\t"
        "psrlq $5, %%mm1\n\t"
        "psrlq $5, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $19, %%mm2\n\t"
        "psrlq $19, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_15mask),"m"(green_15mask));
        "movd 3%1, %%mm3\n\t"
        "punpckldq 6%1, %%mm0\n\t"
        "punpckldq 9%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psrlq $3, %%mm0\n\t"
        "psrlq $3, %%mm3\n\t"
        "psrlq $6, %%mm1\n\t"
        "psrlq $6, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $9, %%mm2\n\t"
        "psrlq $9, %%mm5\n\t"
        "pand %%mm7, %%mm2\n\t"
        "pand %%mm7, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    ::"m"(red_15mask),"m"(green_15mask));
        "movd 3%1, %%mm3\n\t"
        "punpckldq 6%1, %%mm0\n\t"
        "punpckldq 9%1, %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm3, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "psllq $7, %%mm0\n\t"
        "psllq $7, %%mm3\n\t"
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm7, %%mm3\n\t"
        "psrlq $6, %%mm1\n\t"
        "psrlq $6, %%mm4\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm6, %%mm4\n\t"
        "psrlq $19, %%mm2\n\t"
        "psrlq $19, %%mm5\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm5, %%mm3\n\t"
        "psllq $16, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
   I use here a less accurate approximation by simply
   left-shifting the input value and filling the low order bits with
   zeroes. This method improves PNG compression, but this scheme cannot
   reproduce white exactly, since it does not generate an all-ones maximum
   value; the net effect is to darken the image slightly.

   The better method would be "left bit replication":
   the leftmost bits are repeated to fill the open (low-order) bits.
*/
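/* Illustrative sketch only, not part of the original converters: hypothetical
   helpers showing the difference between the zero-fill used below and left bit
   replication when expanding a 5-bit channel to 8 bits. */
static inline uint8_t expand5_zero_fill(uint8_t v)
{
    return v<<3;            /* 0x1F -> 0xF8, full white cannot be reached */
}
static inline uint8_t expand5_bit_replicate(uint8_t v)
{
    return (v<<3) | (v>>2); /* top 2 bits refill the low bits, 0x1F -> 0xFF */
}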
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
        "psllq $3, %%mm0\n\t"
        "psrlq $2, %%mm1\n\t"
        "psrlq $7, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %5, %%mm0\n\t"
        "punpcklwd %5, %%mm1\n\t"
        "punpcklwd %5, %%mm2\n\t"
        "punpckhwd %5, %%mm3\n\t"
        "punpckhwd %5, %%mm4\n\t"
        "punpckhwd %5, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        "movq %%mm0, %%mm6\n\t"
        "movq %%mm3, %%mm7\n\t"
        "movq 8%1, %%mm0\n\t"
        "movq 8%1, %%mm1\n\t"
        "movq 8%1, %%mm2\n\t"
        "psllq $3, %%mm0\n\t"
        "psrlq $2, %%mm1\n\t"
        "psrlq $7, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %5, %%mm0\n\t"
        "punpcklwd %5, %%mm1\n\t"
        "punpcklwd %5, %%mm2\n\t"
        "punpckhwd %5, %%mm3\n\t"
        "punpckhwd %5, %%mm4\n\t"
        "punpckhwd %5, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        /* Borrowed 32 to 24 */
        "movq %%mm0, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "movq %%mm6, %%mm0\n\t"
        "movq %%mm7, %%mm1\n\t"
        "movq %%mm4, %%mm6\n\t"
        "movq %%mm5, %%mm7\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm1, %%mm3\n\t"
        "psrlq $8, %%mm2\n\t"
        "psrlq $8, %%mm3\n\t"
        "psrlq $8, %%mm6\n\t"
        "psrlq $8, %%mm7\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm3, %%mm1\n\t"
        "por %%mm6, %%mm4\n\t"
        "por %%mm7, %%mm5\n\t"
        "movq %%mm1, %%mm2\n\t"
        "movq %%mm4, %%mm3\n\t"
        "psllq $48, %%mm2\n\t"
        "psllq $32, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "psrlq $16, %%mm1\n\t"
        "psrlq $32, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm3, %%mm1\n\t"
        "por %%mm5, %%mm4\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
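        /* Scalar tail: this is the zero-fill expansion described in the comment
           above; each 5-bit field of the RGB555 word is shifted up to the top
           of an 8-bit byte and the low 3 bits are left at zero. */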
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq %1, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        "movq %1, %%mm2\n\t"
        "pand %2, %%mm0\n\t"
        "pand %3, %%mm1\n\t"
        "pand %4, %%mm2\n\t"
        "psllq $3, %%mm0\n\t"
        "psrlq $3, %%mm1\n\t"
        "psrlq $8, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %5, %%mm0\n\t"
        "punpcklwd %5, %%mm1\n\t"
        "punpcklwd %5, %%mm2\n\t"
        "punpckhwd %5, %%mm3\n\t"
        "punpckhwd %5, %%mm4\n\t"
        "punpckhwd %5, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        "movq %%mm0, %%mm6\n\t"
        "movq %%mm3, %%mm7\n\t"
        "movq 8%1, %%mm0\n\t"
        "movq 8%1, %%mm1\n\t"
        "movq 8%1, %%mm2\n\t"
        "pand %2, %%mm0\n\t"
        "pand %3, %%mm1\n\t"
        "pand %4, %%mm2\n\t"
        "psllq $3, %%mm0\n\t"
        "psrlq $3, %%mm1\n\t"
        "psrlq $8, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %5, %%mm0\n\t"
        "punpcklwd %5, %%mm1\n\t"
        "punpcklwd %5, %%mm2\n\t"
        "punpckhwd %5, %%mm3\n\t"
        "punpckhwd %5, %%mm4\n\t"
        "punpckhwd %5, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        /* Borrowed 32 to 24 */
        "movq %%mm0, %%mm4\n\t"
        "movq %%mm3, %%mm5\n\t"
        "movq %%mm6, %%mm0\n\t"
        "movq %%mm7, %%mm1\n\t"
        "movq %%mm4, %%mm6\n\t"
        "movq %%mm5, %%mm7\n\t"
        "movq %%mm0, %%mm2\n\t"
        "movq %%mm1, %%mm3\n\t"
        "psrlq $8, %%mm2\n\t"
        "psrlq $8, %%mm3\n\t"
        "psrlq $8, %%mm6\n\t"
        "psrlq $8, %%mm7\n\t"
        "pand %2, %%mm0\n\t"
        "pand %2, %%mm1\n\t"
        "pand %2, %%mm4\n\t"
        "pand %2, %%mm5\n\t"
        "pand %3, %%mm2\n\t"
        "pand %3, %%mm3\n\t"
        "pand %3, %%mm6\n\t"
        "pand %3, %%mm7\n\t"
        "por %%mm2, %%mm0\n\t"
        "por %%mm3, %%mm1\n\t"
        "por %%mm6, %%mm4\n\t"
        "por %%mm7, %%mm5\n\t"
        "movq %%mm1, %%mm2\n\t"
        "movq %%mm4, %%mm3\n\t"
        "psllq $48, %%mm2\n\t"
        "psllq $32, %%mm3\n\t"
        "pand %4, %%mm2\n\t"
        "pand %5, %%mm3\n\t"
        "por %%mm2, %%mm0\n\t"
        "psrlq $16, %%mm1\n\t"
        "psrlq $32, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm3, %%mm1\n\t"
        "pand %6, %%mm5\n\t"
        "por %%mm5, %%mm4\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
        "movq %1, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        "movq %1, %%mm2\n\t"
        "pand %2, %%mm0\n\t"
        "pand %3, %%mm1\n\t"
        "pand %4, %%mm2\n\t"
        "psllq $3, %%mm0\n\t"
        "psrlq $2, %%mm1\n\t"
        "psrlq $7, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %%mm7, %%mm0\n\t"
        "punpcklwd %%mm7, %%mm1\n\t"
        "punpcklwd %%mm7, %%mm2\n\t"
        "punpckhwd %%mm7, %%mm3\n\t"
        "punpckhwd %%mm7, %%mm4\n\t"
        "punpckhwd %%mm7, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm3, 8%0\n\t"
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
        "movq %1, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        "movq %1, %%mm2\n\t"
        "pand %2, %%mm0\n\t"
        "pand %3, %%mm1\n\t"
        "pand %4, %%mm2\n\t"
        "psllq $3, %%mm0\n\t"
        "psrlq $3, %%mm1\n\t"
        "psrlq $8, %%mm2\n\t"
        "movq %%mm0, %%mm3\n\t"
        "movq %%mm1, %%mm4\n\t"
        "movq %%mm2, %%mm5\n\t"
        "punpcklwd %%mm7, %%mm0\n\t"
        "punpcklwd %%mm7, %%mm1\n\t"
        "punpcklwd %%mm7, %%mm2\n\t"
        "punpckhwd %%mm7, %%mm3\n\t"
        "punpckhwd %%mm7, %%mm4\n\t"
        "punpckhwd %%mm7, %%mm5\n\t"
        "psllq $8, %%mm1\n\t"
        "psllq $16, %%mm2\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm2, %%mm0\n\t"
        "psllq $8, %%mm4\n\t"
        "psllq $16, %%mm5\n\t"
        "por %%mm4, %%mm3\n\t"
        "por %%mm5, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm3, 8%0\n\t"
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
    /* TODO: unroll this loop */
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
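        /* C fallback: swap the R and B bytes of each 32-bit pixel (bytes 0 and
           2) while byte 1 is copied unchanged. */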
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
    int mmx_size= 23 - src_size;
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; // finished, was a multiple of 8
    src_size= 23-mmx_size;
    for(i=0; i<src_size; i+=3)
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 32(%1, %%eax, 2) \n\t"
        PREFETCH" 32(%2, %%eax) \n\t"
        PREFETCH" 32(%3, %%eax) \n\t"
        "movq (%2, %%eax), %%mm0 \n\t" // U(0)
        "movq %%mm0, %%mm2 \n\t" // U(0)
        "movq (%3, %%eax), %%mm1 \n\t" // V(0)
        "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
        "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
        "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
        "movq %%mm3, %%mm4 \n\t" // Y(0)
        "movq %%mm5, %%mm6 \n\t" // Y(8)
        "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
        "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
        "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
        "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
        MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
        MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %4, %%eax \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
#if __WORDSIZE >= 64
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                      (yc[1] << 16) + (vc[0] << 24);
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
            usrc += chromStride;
            vsrc += chromStride;

 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
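/* Note: yv12toyuy2 passes vertLumPerChroma=2 because YV12 is 4:2:0 (one chroma
   line per two luma lines); yuv422ptoyuy2 below passes 1 for 4:2:2 input. */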
 * width should be a multiple of 16
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
        "xorl %%eax, %%eax \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
        PREFETCH" 64(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
        "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
        "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
        MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
        "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
        "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
        "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
        "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
        "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
        MOVNTQ" %%mm0, (%3, %%eax) \n\t"
        MOVNTQ" %%mm2, (%2, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %4, %%eax \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 64(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
        "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
        "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
        "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
        "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %4, %%eax \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
        for(i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        for(i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        udst += chromStride;
        vdst += chromStride;
    asm volatile( EMMS" \n\t"
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
    uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
    memcpy(ydst, ysrc, width*height);
    /* XXX: implement upscaling for U,V */

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
    for(x=0; x<srcWidth; x++){
    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize= srcWidth;
        "movl %4, %%eax \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq (%1, %%eax), %%mm1 \n\t"
        "movq 1(%0, %%eax), %%mm2 \n\t"
        "movq 1(%1, %%eax), %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm1, %%mm5 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm0 \n\t"
        PAVGB" %%mm4, %%mm3 \n\t"
        PAVGB" %%mm4, %%mm3 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        PAVGB" %%mm5, %%mm2 \n\t"
        PAVGB" %%mm5, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklbw %%mm1, %%mm3 \n\t"
        "punpckhbw %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpckhbw %%mm0, %%mm5 \n\t"
        MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
        MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
        MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
        MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
        "movq %%mm3, (%2, %%eax, 2) \n\t"
        "movq %%mm4, 8(%2, %%eax, 2) \n\t"
        "movq %%mm2, (%3, %%eax, 2) \n\t"
        "movq %%mm5, 8(%3, %%eax, 2) \n\t"
        "addl $8, %%eax \n\t"
        :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
        dst[dstStride]= src[0];
        dst[dstStride]= src[0];
        for(x=0; x<srcWidth-1; x++){
            dst[2*x+1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride])>>2;
            dst[2*x+2]= (3*src[x+1] + src[x+srcStride])>>2;
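            /* Each new pixel in this C path is a 3:1 weighted blend of two
               diagonally neighbouring source pixels, i.e. a simple
               bilinear-style 2x upscale. */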
    dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
    for(x=0; x<srcWidth; x++){
    asm volatile( EMMS" \n\t"
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
        "xorl %%eax, %%eax \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
        PREFETCH" 64(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
        "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
        "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
        "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
        "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
        "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
        MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
        "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
        "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
        "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
        "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
        "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
        "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
        MOVNTQ" %%mm0, (%3, %%eax) \n\t"
        MOVNTQ" %%mm2, (%2, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %4, %%eax \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 64(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
        "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
        "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
        "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
        "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
        "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
        "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %4, %%eax \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
        for(i=0; i<chromWidth; i++)
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        for(i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        udst += chromStride;
        vdst += chromStride;
    asm volatile( EMMS" \n\t"
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
    const unsigned chromWidth= width>>1;
    for(y=0; y<height-2; y+=2)
        "movl %2, %%eax \n\t"
        "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "leal (%%eax, %%eax, 2), %%ebx \n\t"
        PREFETCH" 64(%0, %%ebx) \n\t"
        "movd (%0, %%ebx), %%mm0 \n\t"
        "movd 3(%0, %%ebx), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%ebx), %%mm2 \n\t"
        "movd 9(%0, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"
        "movd 12(%0, %%ebx), %%mm4 \n\t"
        "movd 15(%0, %%ebx), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%ebx), %%mm2 \n\t"
        "movd 21(%0, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "addl $24, %%ebx \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"
        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
        MOVNTQ" %%mm0, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
        "movl %4, %%eax \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "leal (%%eax, %%eax, 2), %%ebx \n\t"
        "addl %%ebx, %%ebx \n\t"
        PREFETCH" 64(%0, %%ebx) \n\t"
        PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        "movq (%0, %%ebx), %%mm0 \n\t"
        "movq (%1, %%ebx), %%mm1 \n\t"
        "movq 6(%0, %%ebx), %%mm2 \n\t"
        "movq 6(%1, %%ebx), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm0 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "movd (%0, %%ebx), %%mm0 \n\t"
        "movd (%1, %%ebx), %%mm1 \n\t"
        "movd 3(%0, %%ebx), %%mm2 \n\t"
        "movd 3(%1, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "movd 6(%0, %%ebx), %%mm4 \n\t"
        "movd 6(%1, %%ebx), %%mm1 \n\t"
        "movd 9(%0, %%ebx), %%mm2 \n\t"
        "movd 9(%1, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm2 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
        "psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        "movq 12(%0, %%ebx), %%mm4 \n\t"
        "movq 12(%1, %%ebx), %%mm1 \n\t"
        "movq 18(%0, %%ebx), %%mm2 \n\t"
        "movq 18(%1, %%ebx), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm4, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm4 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "movd 12(%0, %%ebx), %%mm4 \n\t"
        "movd 12(%1, %%ebx), %%mm1 \n\t"
        "movd 15(%0, %%ebx), %%mm2 \n\t"
        "movd 15(%1, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm4 \n\t"
        "movd 18(%0, %%ebx), %%mm5 \n\t"
        "movd 18(%1, %%ebx), %%mm1 \n\t"
        "movd 21(%0, %%ebx), %%mm2 \n\t"
        "movd 21(%1, %%ebx), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm5 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm2 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
        "pmaddwd %%mm4, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "addl $24, %%ebx \n\t"
        "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
        "psraw $7, %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpckldq %%mm4, %%mm0 \n\t"
        "punpckhdq %%mm4, %%mm1 \n\t"
        "packsswb %%mm1, %%mm0 \n\t"
        "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
        "movd %%mm0, (%2, %%eax) \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%3, %%eax) \n\t"
        "addl $4, %%eax \n\t"
        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
        udst += chromStride;
        vdst += chromStride;
    asm volatile( EMMS" \n\t"
    for(; y<height; y+=2)
        for(i=0; i<chromWidth; i++)
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
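            /* RY/GY/BY, RU/GU/BU and RV/GV/BV are fixed-point RGB->YUV
               coefficients defined elsewhere in this code; >>RGB2YUV_SHIFT
               removes the fixed-point scaling, and the +16 / +128 terms add
               the usual luma and chroma offsets. */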
        for(i=0; i<chromWidth; i++)
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
        udst += chromStride;
        vdst += chromStride;
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
    unsigned width, unsigned height, unsigned src1Stride,
    unsigned src2Stride, unsigned dstStride){
    for(h=0; h < height; h++)
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 64(%1, %%eax) \n\t"
        PREFETCH" 64(%2, %%eax) \n\t"
        "movdqa (%1, %%eax), %%xmm0 \n\t"
        "movdqa (%1, %%eax), %%xmm1 \n\t"
        "movdqa (%2, %%eax), %%xmm2 \n\t"
        "punpcklbw %%xmm2, %%xmm0 \n\t"
        "punpckhbw %%xmm2, %%xmm1 \n\t"
        "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
        "addl $16, %%eax \n\t"
        "cmpl %3, %%eax \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
        "xorl %%eax, %%eax \n\t"
        PREFETCH" 64(%1, %%eax) \n\t"
        PREFETCH" 64(%2, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t"
        "movq 8(%1, %%eax), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq (%2, %%eax), %%mm4 \n\t"
        "movq 8(%2, %%eax), %%mm5 \n\t"
        "punpcklbw %%mm4, %%mm0 \n\t"
        "punpckhbw %%mm4, %%mm1 \n\t"
        "punpcklbw %%mm5, %%mm2 \n\t"
        "punpckhbw %%mm5, %%mm3 \n\t"
        MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
        MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
        MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
        "addl $16, %%eax \n\t"
        "cmpl %3, %%eax \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
        for(w= (width&(~15)); w < width; w++)
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        for(w=0; w < width; w++)
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
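        /* Both the SSE2/MMX paths and the C loops above interleave one byte
           from src1 with one byte from src2 into dest; the first C loop only
           handles leftover bytes past the last multiple of 16. */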
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
    uint8_t *dst1, uint8_t *dst2,
    unsigned width, unsigned height,
    unsigned srcStride1, unsigned srcStride2,
    unsigned dstStride1, unsigned dstStride2)
    w=width/2; h=height/2;
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
    unsigned width, unsigned height,
    unsigned srcStride1, unsigned srcStride2,
    unsigned srcStride3, unsigned dstStride)
    unsigned y,x,x2,w,h;
    w=width/2; h=height;
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        for(;x<w;x+=8,x2+=32)
            "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
            "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
            "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
            "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
            "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
            "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
            "movq %%mm1, %%mm6\n\t"
            "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
            "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
            "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
            "movq 8%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
            MOVNTQ" %%mm0, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq 16%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm5, %%mm4\n\t"
            "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
            "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
            MOVNTQ" %%mm0, 32%0\n\t"
            MOVNTQ" %%mm3, 40%0\n\t"
            "punpckhbw %%mm5, %%mm6\n\t"
            "movq 24%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
            MOVNTQ" %%mm0, 48%0\n\t"
            MOVNTQ" %%mm3, 56%0\n\t"
            :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])