2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
71 #define SFENCE " # nop"
/*
 * rgb24tobgr32: name and the per-file header comment at original line 119
 * ("RGB24 (= R,G,B) -> RGB32 (= A,B,G,R)") indicate a packed 24-bit to
 * 32-bit conversion; mask32a is OR-ed in as the alpha/filler byte.
 * RENAME() expands to the CPU-specific variant (MMX/MMX2/3DNOW).
 *
 * NOTE(review): this extract is missing interior lines -- the embedded
 * original line numbers jump (e.g. 86 -> 92, 105 -> 113), so the opening
 * brace, the pixel loop, the asm statement opener and the scalar tail are
 * not visible here. Do not treat this fragment as compilable; restore it
 * from the original file before editing the logic.
 */
74 static inline void RENAME(rgb24tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
77 const uint8_t *s
= src
;
80 const uint8_t *mm_end
;
/* Prime the cache for the first source bytes (PREFETCH is a macro
   selected per CPU elsewhere in the file). */
84 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* mm7 = mask32a: constant OR-ed into every output dword below. */
86 __asm__
volatile("movq %0, %%mm7"::"m"(mask32a
):"memory");
/* Gather four 3-byte source pixels per mmX register: each movd/punpckldq
   pair loads two pixels at 3-byte strides (offsets 0/3, 6/9, 12/15, 18/21). */
92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t"
95 "movd 12%1, %%mm2 \n\t"
96 "punpckldq 15%1, %%mm2 \n\t"
97 "movd 18%1, %%mm3 \n\t"
98 "punpckldq 21%1, %%mm3 \n\t"
/* OR in the alpha/filler mask held in mm7. */
99 "por %%mm7, %%mm0 \n\t"
100 "por %%mm7, %%mm1 \n\t"
101 "por %%mm7, %%mm2 \n\t"
102 "por %%mm7, %%mm3 \n\t"
/* Non-temporal stores of the 4-byte pixels (MOVNTQ is movntq or plain
   movq depending on CPU support, per the macros near the top of the file). */
103 MOVNTQ
" %%mm0, %0 \n\t"
104 MOVNTQ
" %%mm1, 8%0 \n\t"
105 MOVNTQ
" %%mm2, 16%0 \n\t"
/* Flush write-combining buffers and leave MMX state. */
113 __asm__
volatile(SFENCE
:::"memory");
114 __asm__
volatile(EMMS
:::"memory");
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * rgb32tobgr24: per the header comment at original line 205
 * ("RGB32 (= A,B,G,R) -> RGB24 (= R,G,B)") this drops the alpha byte and
 * repacks four 4-byte pixels into three 8-byte output qwords using the
 * mask24l/mask24h/... constants.
 *
 * NOTE(review): lossy extract -- original lines 135-136, 141-149, 174,
 * 188, 191-192, 195-198 etc. are missing (function braces, the loop, the
 * asm opener, the third MOVNTQ and the scalar tail). Not compilable as-is.
 */
134 static inline void RENAME(rgb32tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
137 const uint8_t *s
= src
;
140 const uint8_t *mm_end
;
144 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* Load 8 source pixels (32 bytes) into mm0/mm1/mm4/mm5 with working
   copies in mm2/mm3/mm6/mm7. */
150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t"
/* Shift the copies right by one byte, then mask and recombine so the
   unwanted (alpha) byte of each pixel is squeezed out. */
158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t"
/* Realign the 6-byte pixel pairs across qword boundaries so the four
   registers collapse into three contiguous output qwords (24 bytes). */
175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t"
189 MOVNTQ
" %%mm0, %0 \n\t"
190 MOVNTQ
" %%mm1, 8%0 \n\t"
/* Operand list: source plus the mask24* repacking constants. */
193 :"m"(*s
),"m"(mask24l
),
194 "m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
199 __asm__
volatile(SFENCE
:::"memory");
200 __asm__
volatile(EMMS
:::"memory");
205 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: RGB555 -> RGB565. The scalar tail (original lines 263-271)
 * shows the "and&add" trick credited in the comment above this function:
 *   out = (x & 0x7FFF) + (x & 0x7FE0)
 * i.e. keep all 15 bits and add the green+red field once more, which
 * shifts green/red up one bit while leaving the 5 blue bits in place.
 *
 * NOTE(review): lossy extract -- loop headers, braces and the asm opener
 * between the numbered lines are missing (e.g. jump 249 -> 257).
 */
226 static inline void RENAME(rgb15to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
228 register const uint8_t* s
=src
;
229 register uint8_t* d
=dst
;
230 register const uint8_t *end
;
231 const uint8_t *mm_end
;
234 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
/* mm4 = mask15s: the green+red field mask used by the add below. */
235 __asm__
volatile("movq %0, %%mm4"::"m"(mask15s
));
/* SIMD form of the same and&add trick, 8 pixels (16 bytes) per pass. */
241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ
" %%mm0, %0 \n\t"
257 __asm__
volatile(SFENCE
:::"memory");
258 __asm__
volatile(EMMS
:::"memory");
/* Scalar tail: two pixels at a time via 32-bit loads... */
263 register unsigned x
= *((const uint32_t *)s
);
264 *((uint32_t *)d
) = (x
&0x7FFF7FFF) + (x
&0x7FE07FE0);
/* ...then a final single pixel if src_size was not a multiple of 4. */
270 register unsigned short x
= *((const uint16_t *)s
);
271 *((uint16_t *)d
) = (x
&0x7FFF) + (x
&0x7FE0);
/*
 * rgb16to15: RGB565 -> RGB555. The scalar tail (original lines 317-325)
 * shows the exact transform:
 *   out = ((x >> 1) & 0x7FE0) | (x & 0x001F)
 * i.e. red and green move down one bit (dropping the 6th green bit),
 * blue's 5 bits stay put.
 *
 * NOTE(review): lossy extract -- braces, loop headers and the asm opener
 * between the numbered lines are missing (e.g. jump 303 -> 311).
 */
275 static inline void RENAME(rgb16to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
277 register const uint8_t* s
=src
;
278 register uint8_t* d
=dst
;
279 register const uint8_t *end
;
280 const uint8_t *mm_end
;
283 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
/* mm7 = mask15rg (red+green after the >>1), mm6 = mask15b (blue kept). */
284 __asm__
volatile("movq %0, %%mm7"::"m"(mask15rg
));
285 __asm__
volatile("movq %0, %%mm6"::"m"(mask15b
));
/* SIMD form of the same shift-and-mask, 8 pixels per pass. */
291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ
" %%mm0, %0 \n\t"
311 __asm__
volatile(SFENCE
:::"memory");
312 __asm__
volatile(EMMS
:::"memory");
/* Scalar tail: two pixels per 32-bit access... */
317 register uint32_t x
= *((const uint32_t*)s
);
318 *((uint32_t *)d
) = ((x
>>1)&0x7FE07FE0) | (x
&0x001F001F);
/* ...plus one trailing 16-bit pixel when needed. */
324 register uint16_t x
= *((const uint16_t*)s
);
325 *((uint16_t *)d
) = ((x
>>1)&0x7FE0) | (x
&0x001F);
329 static inline void RENAME(rgb32to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
331 const uint8_t *s
= src
;
334 const uint8_t *mm_end
;
336 uint16_t *d
= (uint16_t *)dst
;
340 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
342 "movq %3, %%mm5 \n\t"
343 "movq %4, %%mm6 \n\t"
344 "movq %5, %%mm7 \n\t"
348 PREFETCH
" 32(%1) \n\t"
349 "movd (%1), %%mm0 \n\t"
350 "movd 4(%1), %%mm3 \n\t"
351 "punpckldq 8(%1), %%mm0 \n\t"
352 "punpckldq 12(%1), %%mm3 \n\t"
353 "movq %%mm0, %%mm1 \n\t"
354 "movq %%mm3, %%mm4 \n\t"
355 "pand %%mm6, %%mm0 \n\t"
356 "pand %%mm6, %%mm3 \n\t"
357 "pmaddwd %%mm7, %%mm0 \n\t"
358 "pmaddwd %%mm7, %%mm3 \n\t"
359 "pand %%mm5, %%mm1 \n\t"
360 "pand %%mm5, %%mm4 \n\t"
361 "por %%mm1, %%mm0 \n\t"
362 "por %%mm4, %%mm3 \n\t"
363 "psrld $5, %%mm0 \n\t"
364 "pslld $11, %%mm3 \n\t"
365 "por %%mm3, %%mm0 \n\t"
366 MOVNTQ
" %%mm0, (%0) \n\t"
373 : "r" (mm_end
), "m" (mask3216g
), "m" (mask3216br
), "m" (mul3216
)
376 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
378 "movq %0, %%mm7 \n\t"
379 "movq %1, %%mm6 \n\t"
380 ::"m"(red_16mask
),"m"(green_16mask
));
385 "movd %1, %%mm0 \n\t"
386 "movd 4%1, %%mm3 \n\t"
387 "punpckldq 8%1, %%mm0 \n\t"
388 "punpckldq 12%1, %%mm3 \n\t"
389 "movq %%mm0, %%mm1 \n\t"
390 "movq %%mm0, %%mm2 \n\t"
391 "movq %%mm3, %%mm4 \n\t"
392 "movq %%mm3, %%mm5 \n\t"
393 "psrlq $3, %%mm0 \n\t"
394 "psrlq $3, %%mm3 \n\t"
395 "pand %2, %%mm0 \n\t"
396 "pand %2, %%mm3 \n\t"
397 "psrlq $5, %%mm1 \n\t"
398 "psrlq $5, %%mm4 \n\t"
399 "pand %%mm6, %%mm1 \n\t"
400 "pand %%mm6, %%mm4 \n\t"
401 "psrlq $8, %%mm2 \n\t"
402 "psrlq $8, %%mm5 \n\t"
403 "pand %%mm7, %%mm2 \n\t"
404 "pand %%mm7, %%mm5 \n\t"
405 "por %%mm1, %%mm0 \n\t"
406 "por %%mm4, %%mm3 \n\t"
407 "por %%mm2, %%mm0 \n\t"
408 "por %%mm5, %%mm3 \n\t"
409 "psllq $16, %%mm3 \n\t"
410 "por %%mm3, %%mm0 \n\t"
411 MOVNTQ
" %%mm0, %0 \n\t"
412 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
417 __asm__
volatile(SFENCE
:::"memory");
418 __asm__
volatile(EMMS
:::"memory");
422 register int rgb
= *(const uint32_t*)s
; s
+= 4;
423 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>8);
427 static inline void RENAME(rgb32tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
429 const uint8_t *s
= src
;
432 const uint8_t *mm_end
;
434 uint16_t *d
= (uint16_t *)dst
;
437 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
439 "movq %0, %%mm7 \n\t"
440 "movq %1, %%mm6 \n\t"
441 ::"m"(red_16mask
),"m"(green_16mask
));
447 "movd %1, %%mm0 \n\t"
448 "movd 4%1, %%mm3 \n\t"
449 "punpckldq 8%1, %%mm0 \n\t"
450 "punpckldq 12%1, %%mm3 \n\t"
451 "movq %%mm0, %%mm1 \n\t"
452 "movq %%mm0, %%mm2 \n\t"
453 "movq %%mm3, %%mm4 \n\t"
454 "movq %%mm3, %%mm5 \n\t"
455 "psllq $8, %%mm0 \n\t"
456 "psllq $8, %%mm3 \n\t"
457 "pand %%mm7, %%mm0 \n\t"
458 "pand %%mm7, %%mm3 \n\t"
459 "psrlq $5, %%mm1 \n\t"
460 "psrlq $5, %%mm4 \n\t"
461 "pand %%mm6, %%mm1 \n\t"
462 "pand %%mm6, %%mm4 \n\t"
463 "psrlq $19, %%mm2 \n\t"
464 "psrlq $19, %%mm5 \n\t"
465 "pand %2, %%mm2 \n\t"
466 "pand %2, %%mm5 \n\t"
467 "por %%mm1, %%mm0 \n\t"
468 "por %%mm4, %%mm3 \n\t"
469 "por %%mm2, %%mm0 \n\t"
470 "por %%mm5, %%mm3 \n\t"
471 "psllq $16, %%mm3 \n\t"
472 "por %%mm3, %%mm0 \n\t"
473 MOVNTQ
" %%mm0, %0 \n\t"
474 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
478 __asm__
volatile(SFENCE
:::"memory");
479 __asm__
volatile(EMMS
:::"memory");
483 register int rgb
= *(const uint32_t*)s
; s
+= 4;
484 *d
++ = ((rgb
&0xF8)<<8) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>19);
488 static inline void RENAME(rgb32to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
490 const uint8_t *s
= src
;
493 const uint8_t *mm_end
;
495 uint16_t *d
= (uint16_t *)dst
;
499 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
501 "movq %3, %%mm5 \n\t"
502 "movq %4, %%mm6 \n\t"
503 "movq %5, %%mm7 \n\t"
507 PREFETCH
" 32(%1) \n\t"
508 "movd (%1), %%mm0 \n\t"
509 "movd 4(%1), %%mm3 \n\t"
510 "punpckldq 8(%1), %%mm0 \n\t"
511 "punpckldq 12(%1), %%mm3 \n\t"
512 "movq %%mm0, %%mm1 \n\t"
513 "movq %%mm3, %%mm4 \n\t"
514 "pand %%mm6, %%mm0 \n\t"
515 "pand %%mm6, %%mm3 \n\t"
516 "pmaddwd %%mm7, %%mm0 \n\t"
517 "pmaddwd %%mm7, %%mm3 \n\t"
518 "pand %%mm5, %%mm1 \n\t"
519 "pand %%mm5, %%mm4 \n\t"
520 "por %%mm1, %%mm0 \n\t"
521 "por %%mm4, %%mm3 \n\t"
522 "psrld $6, %%mm0 \n\t"
523 "pslld $10, %%mm3 \n\t"
524 "por %%mm3, %%mm0 \n\t"
525 MOVNTQ
" %%mm0, (%0) \n\t"
532 : "r" (mm_end
), "m" (mask3215g
), "m" (mask3216br
), "m" (mul3215
)
535 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
537 "movq %0, %%mm7 \n\t"
538 "movq %1, %%mm6 \n\t"
539 ::"m"(red_15mask
),"m"(green_15mask
));
544 "movd %1, %%mm0 \n\t"
545 "movd 4%1, %%mm3 \n\t"
546 "punpckldq 8%1, %%mm0 \n\t"
547 "punpckldq 12%1, %%mm3 \n\t"
548 "movq %%mm0, %%mm1 \n\t"
549 "movq %%mm0, %%mm2 \n\t"
550 "movq %%mm3, %%mm4 \n\t"
551 "movq %%mm3, %%mm5 \n\t"
552 "psrlq $3, %%mm0 \n\t"
553 "psrlq $3, %%mm3 \n\t"
554 "pand %2, %%mm0 \n\t"
555 "pand %2, %%mm3 \n\t"
556 "psrlq $6, %%mm1 \n\t"
557 "psrlq $6, %%mm4 \n\t"
558 "pand %%mm6, %%mm1 \n\t"
559 "pand %%mm6, %%mm4 \n\t"
560 "psrlq $9, %%mm2 \n\t"
561 "psrlq $9, %%mm5 \n\t"
562 "pand %%mm7, %%mm2 \n\t"
563 "pand %%mm7, %%mm5 \n\t"
564 "por %%mm1, %%mm0 \n\t"
565 "por %%mm4, %%mm3 \n\t"
566 "por %%mm2, %%mm0 \n\t"
567 "por %%mm5, %%mm3 \n\t"
568 "psllq $16, %%mm3 \n\t"
569 "por %%mm3, %%mm0 \n\t"
570 MOVNTQ
" %%mm0, %0 \n\t"
571 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
576 __asm__
volatile(SFENCE
:::"memory");
577 __asm__
volatile(EMMS
:::"memory");
581 register int rgb
= *(const uint32_t*)s
; s
+= 4;
582 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>9);
586 static inline void RENAME(rgb32tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
588 const uint8_t *s
= src
;
591 const uint8_t *mm_end
;
593 uint16_t *d
= (uint16_t *)dst
;
596 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
598 "movq %0, %%mm7 \n\t"
599 "movq %1, %%mm6 \n\t"
600 ::"m"(red_15mask
),"m"(green_15mask
));
606 "movd %1, %%mm0 \n\t"
607 "movd 4%1, %%mm3 \n\t"
608 "punpckldq 8%1, %%mm0 \n\t"
609 "punpckldq 12%1, %%mm3 \n\t"
610 "movq %%mm0, %%mm1 \n\t"
611 "movq %%mm0, %%mm2 \n\t"
612 "movq %%mm3, %%mm4 \n\t"
613 "movq %%mm3, %%mm5 \n\t"
614 "psllq $7, %%mm0 \n\t"
615 "psllq $7, %%mm3 \n\t"
616 "pand %%mm7, %%mm0 \n\t"
617 "pand %%mm7, %%mm3 \n\t"
618 "psrlq $6, %%mm1 \n\t"
619 "psrlq $6, %%mm4 \n\t"
620 "pand %%mm6, %%mm1 \n\t"
621 "pand %%mm6, %%mm4 \n\t"
622 "psrlq $19, %%mm2 \n\t"
623 "psrlq $19, %%mm5 \n\t"
624 "pand %2, %%mm2 \n\t"
625 "pand %2, %%mm5 \n\t"
626 "por %%mm1, %%mm0 \n\t"
627 "por %%mm4, %%mm3 \n\t"
628 "por %%mm2, %%mm0 \n\t"
629 "por %%mm5, %%mm3 \n\t"
630 "psllq $16, %%mm3 \n\t"
631 "por %%mm3, %%mm0 \n\t"
632 MOVNTQ
" %%mm0, %0 \n\t"
633 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
637 __asm__
volatile(SFENCE
:::"memory");
638 __asm__
volatile(EMMS
:::"memory");
642 register int rgb
= *(const uint32_t*)s
; s
+= 4;
643 *d
++ = ((rgb
&0xF8)<<7) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>19);
647 static inline void RENAME(rgb24tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
649 const uint8_t *s
= src
;
652 const uint8_t *mm_end
;
654 uint16_t *d
= (uint16_t *)dst
;
657 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
659 "movq %0, %%mm7 \n\t"
660 "movq %1, %%mm6 \n\t"
661 ::"m"(red_16mask
),"m"(green_16mask
));
667 "movd %1, %%mm0 \n\t"
668 "movd 3%1, %%mm3 \n\t"
669 "punpckldq 6%1, %%mm0 \n\t"
670 "punpckldq 9%1, %%mm3 \n\t"
671 "movq %%mm0, %%mm1 \n\t"
672 "movq %%mm0, %%mm2 \n\t"
673 "movq %%mm3, %%mm4 \n\t"
674 "movq %%mm3, %%mm5 \n\t"
675 "psrlq $3, %%mm0 \n\t"
676 "psrlq $3, %%mm3 \n\t"
677 "pand %2, %%mm0 \n\t"
678 "pand %2, %%mm3 \n\t"
679 "psrlq $5, %%mm1 \n\t"
680 "psrlq $5, %%mm4 \n\t"
681 "pand %%mm6, %%mm1 \n\t"
682 "pand %%mm6, %%mm4 \n\t"
683 "psrlq $8, %%mm2 \n\t"
684 "psrlq $8, %%mm5 \n\t"
685 "pand %%mm7, %%mm2 \n\t"
686 "pand %%mm7, %%mm5 \n\t"
687 "por %%mm1, %%mm0 \n\t"
688 "por %%mm4, %%mm3 \n\t"
689 "por %%mm2, %%mm0 \n\t"
690 "por %%mm5, %%mm3 \n\t"
691 "psllq $16, %%mm3 \n\t"
692 "por %%mm3, %%mm0 \n\t"
693 MOVNTQ
" %%mm0, %0 \n\t"
694 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
698 __asm__
volatile(SFENCE
:::"memory");
699 __asm__
volatile(EMMS
:::"memory");
706 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
710 static inline void RENAME(rgb24to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
712 const uint8_t *s
= src
;
715 const uint8_t *mm_end
;
717 uint16_t *d
= (uint16_t *)dst
;
720 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
722 "movq %0, %%mm7 \n\t"
723 "movq %1, %%mm6 \n\t"
724 ::"m"(red_16mask
),"m"(green_16mask
));
730 "movd %1, %%mm0 \n\t"
731 "movd 3%1, %%mm3 \n\t"
732 "punpckldq 6%1, %%mm0 \n\t"
733 "punpckldq 9%1, %%mm3 \n\t"
734 "movq %%mm0, %%mm1 \n\t"
735 "movq %%mm0, %%mm2 \n\t"
736 "movq %%mm3, %%mm4 \n\t"
737 "movq %%mm3, %%mm5 \n\t"
738 "psllq $8, %%mm0 \n\t"
739 "psllq $8, %%mm3 \n\t"
740 "pand %%mm7, %%mm0 \n\t"
741 "pand %%mm7, %%mm3 \n\t"
742 "psrlq $5, %%mm1 \n\t"
743 "psrlq $5, %%mm4 \n\t"
744 "pand %%mm6, %%mm1 \n\t"
745 "pand %%mm6, %%mm4 \n\t"
746 "psrlq $19, %%mm2 \n\t"
747 "psrlq $19, %%mm5 \n\t"
748 "pand %2, %%mm2 \n\t"
749 "pand %2, %%mm5 \n\t"
750 "por %%mm1, %%mm0 \n\t"
751 "por %%mm4, %%mm3 \n\t"
752 "por %%mm2, %%mm0 \n\t"
753 "por %%mm5, %%mm3 \n\t"
754 "psllq $16, %%mm3 \n\t"
755 "por %%mm3, %%mm0 \n\t"
756 MOVNTQ
" %%mm0, %0 \n\t"
757 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
761 __asm__
volatile(SFENCE
:::"memory");
762 __asm__
volatile(EMMS
:::"memory");
769 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
773 static inline void RENAME(rgb24tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
775 const uint8_t *s
= src
;
778 const uint8_t *mm_end
;
780 uint16_t *d
= (uint16_t *)dst
;
783 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
785 "movq %0, %%mm7 \n\t"
786 "movq %1, %%mm6 \n\t"
787 ::"m"(red_15mask
),"m"(green_15mask
));
793 "movd %1, %%mm0 \n\t"
794 "movd 3%1, %%mm3 \n\t"
795 "punpckldq 6%1, %%mm0 \n\t"
796 "punpckldq 9%1, %%mm3 \n\t"
797 "movq %%mm0, %%mm1 \n\t"
798 "movq %%mm0, %%mm2 \n\t"
799 "movq %%mm3, %%mm4 \n\t"
800 "movq %%mm3, %%mm5 \n\t"
801 "psrlq $3, %%mm0 \n\t"
802 "psrlq $3, %%mm3 \n\t"
803 "pand %2, %%mm0 \n\t"
804 "pand %2, %%mm3 \n\t"
805 "psrlq $6, %%mm1 \n\t"
806 "psrlq $6, %%mm4 \n\t"
807 "pand %%mm6, %%mm1 \n\t"
808 "pand %%mm6, %%mm4 \n\t"
809 "psrlq $9, %%mm2 \n\t"
810 "psrlq $9, %%mm5 \n\t"
811 "pand %%mm7, %%mm2 \n\t"
812 "pand %%mm7, %%mm5 \n\t"
813 "por %%mm1, %%mm0 \n\t"
814 "por %%mm4, %%mm3 \n\t"
815 "por %%mm2, %%mm0 \n\t"
816 "por %%mm5, %%mm3 \n\t"
817 "psllq $16, %%mm3 \n\t"
818 "por %%mm3, %%mm0 \n\t"
819 MOVNTQ
" %%mm0, %0 \n\t"
820 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
824 __asm__
volatile(SFENCE
:::"memory");
825 __asm__
volatile(EMMS
:::"memory");
832 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
836 static inline void RENAME(rgb24to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
838 const uint8_t *s
= src
;
841 const uint8_t *mm_end
;
843 uint16_t *d
= (uint16_t *)dst
;
846 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
848 "movq %0, %%mm7 \n\t"
849 "movq %1, %%mm6 \n\t"
850 ::"m"(red_15mask
),"m"(green_15mask
));
856 "movd %1, %%mm0 \n\t"
857 "movd 3%1, %%mm3 \n\t"
858 "punpckldq 6%1, %%mm0 \n\t"
859 "punpckldq 9%1, %%mm3 \n\t"
860 "movq %%mm0, %%mm1 \n\t"
861 "movq %%mm0, %%mm2 \n\t"
862 "movq %%mm3, %%mm4 \n\t"
863 "movq %%mm3, %%mm5 \n\t"
864 "psllq $7, %%mm0 \n\t"
865 "psllq $7, %%mm3 \n\t"
866 "pand %%mm7, %%mm0 \n\t"
867 "pand %%mm7, %%mm3 \n\t"
868 "psrlq $6, %%mm1 \n\t"
869 "psrlq $6, %%mm4 \n\t"
870 "pand %%mm6, %%mm1 \n\t"
871 "pand %%mm6, %%mm4 \n\t"
872 "psrlq $19, %%mm2 \n\t"
873 "psrlq $19, %%mm5 \n\t"
874 "pand %2, %%mm2 \n\t"
875 "pand %2, %%mm5 \n\t"
876 "por %%mm1, %%mm0 \n\t"
877 "por %%mm4, %%mm3 \n\t"
878 "por %%mm2, %%mm0 \n\t"
879 "por %%mm5, %%mm3 \n\t"
880 "psllq $16, %%mm3 \n\t"
881 "por %%mm3, %%mm0 \n\t"
882 MOVNTQ
" %%mm0, %0 \n\t"
883 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
887 __asm__
volatile(SFENCE
:::"memory");
888 __asm__
volatile(EMMS
:::"memory");
895 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
900 I use less accurate approximation here by simply left-shifting the input
901 value and filling the low order bits with zeroes. This method improves PNG
902 compression but this scheme cannot reproduce white exactly, since it does
903 not generate an all-ones maximum value; the net effect is to darken the
906 The better method should be "left bit replication":
916 | leftmost bits repeated to fill open bits
920 static inline void RENAME(rgb15tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
924 const uint16_t *mm_end
;
927 const uint16_t *s
= (const uint16_t*)src
;
928 end
= s
+ src_size
/2;
930 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
936 "movq %1, %%mm0 \n\t"
937 "movq %1, %%mm1 \n\t"
938 "movq %1, %%mm2 \n\t"
939 "pand %2, %%mm0 \n\t"
940 "pand %3, %%mm1 \n\t"
941 "pand %4, %%mm2 \n\t"
942 "psllq $3, %%mm0 \n\t"
943 "psrlq $2, %%mm1 \n\t"
944 "psrlq $7, %%mm2 \n\t"
945 "movq %%mm0, %%mm3 \n\t"
946 "movq %%mm1, %%mm4 \n\t"
947 "movq %%mm2, %%mm5 \n\t"
948 "punpcklwd %5, %%mm0 \n\t"
949 "punpcklwd %5, %%mm1 \n\t"
950 "punpcklwd %5, %%mm2 \n\t"
951 "punpckhwd %5, %%mm3 \n\t"
952 "punpckhwd %5, %%mm4 \n\t"
953 "punpckhwd %5, %%mm5 \n\t"
954 "psllq $8, %%mm1 \n\t"
955 "psllq $16, %%mm2 \n\t"
956 "por %%mm1, %%mm0 \n\t"
957 "por %%mm2, %%mm0 \n\t"
958 "psllq $8, %%mm4 \n\t"
959 "psllq $16, %%mm5 \n\t"
960 "por %%mm4, %%mm3 \n\t"
961 "por %%mm5, %%mm3 \n\t"
963 "movq %%mm0, %%mm6 \n\t"
964 "movq %%mm3, %%mm7 \n\t"
966 "movq 8%1, %%mm0 \n\t"
967 "movq 8%1, %%mm1 \n\t"
968 "movq 8%1, %%mm2 \n\t"
969 "pand %2, %%mm0 \n\t"
970 "pand %3, %%mm1 \n\t"
971 "pand %4, %%mm2 \n\t"
972 "psllq $3, %%mm0 \n\t"
973 "psrlq $2, %%mm1 \n\t"
974 "psrlq $7, %%mm2 \n\t"
975 "movq %%mm0, %%mm3 \n\t"
976 "movq %%mm1, %%mm4 \n\t"
977 "movq %%mm2, %%mm5 \n\t"
978 "punpcklwd %5, %%mm0 \n\t"
979 "punpcklwd %5, %%mm1 \n\t"
980 "punpcklwd %5, %%mm2 \n\t"
981 "punpckhwd %5, %%mm3 \n\t"
982 "punpckhwd %5, %%mm4 \n\t"
983 "punpckhwd %5, %%mm5 \n\t"
984 "psllq $8, %%mm1 \n\t"
985 "psllq $16, %%mm2 \n\t"
986 "por %%mm1, %%mm0 \n\t"
987 "por %%mm2, %%mm0 \n\t"
988 "psllq $8, %%mm4 \n\t"
989 "psllq $16, %%mm5 \n\t"
990 "por %%mm4, %%mm3 \n\t"
991 "por %%mm5, %%mm3 \n\t"
994 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
), "m"(mmx_null
)
996 /* borrowed 32 to 24 */
998 "movq %%mm0, %%mm4 \n\t"
999 "movq %%mm3, %%mm5 \n\t"
1000 "movq %%mm6, %%mm0 \n\t"
1001 "movq %%mm7, %%mm1 \n\t"
1003 "movq %%mm4, %%mm6 \n\t"
1004 "movq %%mm5, %%mm7 \n\t"
1005 "movq %%mm0, %%mm2 \n\t"
1006 "movq %%mm1, %%mm3 \n\t"
1008 "psrlq $8, %%mm2 \n\t"
1009 "psrlq $8, %%mm3 \n\t"
1010 "psrlq $8, %%mm6 \n\t"
1011 "psrlq $8, %%mm7 \n\t"
1012 "pand %2, %%mm0 \n\t"
1013 "pand %2, %%mm1 \n\t"
1014 "pand %2, %%mm4 \n\t"
1015 "pand %2, %%mm5 \n\t"
1016 "pand %3, %%mm2 \n\t"
1017 "pand %3, %%mm3 \n\t"
1018 "pand %3, %%mm6 \n\t"
1019 "pand %3, %%mm7 \n\t"
1020 "por %%mm2, %%mm0 \n\t"
1021 "por %%mm3, %%mm1 \n\t"
1022 "por %%mm6, %%mm4 \n\t"
1023 "por %%mm7, %%mm5 \n\t"
1025 "movq %%mm1, %%mm2 \n\t"
1026 "movq %%mm4, %%mm3 \n\t"
1027 "psllq $48, %%mm2 \n\t"
1028 "psllq $32, %%mm3 \n\t"
1029 "pand %4, %%mm2 \n\t"
1030 "pand %5, %%mm3 \n\t"
1031 "por %%mm2, %%mm0 \n\t"
1032 "psrlq $16, %%mm1 \n\t"
1033 "psrlq $32, %%mm4 \n\t"
1034 "psllq $16, %%mm5 \n\t"
1035 "por %%mm3, %%mm1 \n\t"
1036 "pand %6, %%mm5 \n\t"
1037 "por %%mm5, %%mm4 \n\t"
1039 MOVNTQ
" %%mm0, %0 \n\t"
1040 MOVNTQ
" %%mm1, 8%0 \n\t"
1041 MOVNTQ
" %%mm4, 16%0"
1044 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1049 __asm__
volatile(SFENCE
:::"memory");
1050 __asm__
volatile(EMMS
:::"memory");
1054 register uint16_t bgr
;
1056 *d
++ = (bgr
&0x1F)<<3;
1057 *d
++ = (bgr
&0x3E0)>>2;
1058 *d
++ = (bgr
&0x7C00)>>7;
1062 static inline void RENAME(rgb16tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1064 const uint16_t *end
;
1066 const uint16_t *mm_end
;
1068 uint8_t *d
= (uint8_t *)dst
;
1069 const uint16_t *s
= (const uint16_t *)src
;
1070 end
= s
+ src_size
/2;
1072 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1077 PREFETCH
" 32%1 \n\t"
1078 "movq %1, %%mm0 \n\t"
1079 "movq %1, %%mm1 \n\t"
1080 "movq %1, %%mm2 \n\t"
1081 "pand %2, %%mm0 \n\t"
1082 "pand %3, %%mm1 \n\t"
1083 "pand %4, %%mm2 \n\t"
1084 "psllq $3, %%mm0 \n\t"
1085 "psrlq $3, %%mm1 \n\t"
1086 "psrlq $8, %%mm2 \n\t"
1087 "movq %%mm0, %%mm3 \n\t"
1088 "movq %%mm1, %%mm4 \n\t"
1089 "movq %%mm2, %%mm5 \n\t"
1090 "punpcklwd %5, %%mm0 \n\t"
1091 "punpcklwd %5, %%mm1 \n\t"
1092 "punpcklwd %5, %%mm2 \n\t"
1093 "punpckhwd %5, %%mm3 \n\t"
1094 "punpckhwd %5, %%mm4 \n\t"
1095 "punpckhwd %5, %%mm5 \n\t"
1096 "psllq $8, %%mm1 \n\t"
1097 "psllq $16, %%mm2 \n\t"
1098 "por %%mm1, %%mm0 \n\t"
1099 "por %%mm2, %%mm0 \n\t"
1100 "psllq $8, %%mm4 \n\t"
1101 "psllq $16, %%mm5 \n\t"
1102 "por %%mm4, %%mm3 \n\t"
1103 "por %%mm5, %%mm3 \n\t"
1105 "movq %%mm0, %%mm6 \n\t"
1106 "movq %%mm3, %%mm7 \n\t"
1108 "movq 8%1, %%mm0 \n\t"
1109 "movq 8%1, %%mm1 \n\t"
1110 "movq 8%1, %%mm2 \n\t"
1111 "pand %2, %%mm0 \n\t"
1112 "pand %3, %%mm1 \n\t"
1113 "pand %4, %%mm2 \n\t"
1114 "psllq $3, %%mm0 \n\t"
1115 "psrlq $3, %%mm1 \n\t"
1116 "psrlq $8, %%mm2 \n\t"
1117 "movq %%mm0, %%mm3 \n\t"
1118 "movq %%mm1, %%mm4 \n\t"
1119 "movq %%mm2, %%mm5 \n\t"
1120 "punpcklwd %5, %%mm0 \n\t"
1121 "punpcklwd %5, %%mm1 \n\t"
1122 "punpcklwd %5, %%mm2 \n\t"
1123 "punpckhwd %5, %%mm3 \n\t"
1124 "punpckhwd %5, %%mm4 \n\t"
1125 "punpckhwd %5, %%mm5 \n\t"
1126 "psllq $8, %%mm1 \n\t"
1127 "psllq $16, %%mm2 \n\t"
1128 "por %%mm1, %%mm0 \n\t"
1129 "por %%mm2, %%mm0 \n\t"
1130 "psllq $8, %%mm4 \n\t"
1131 "psllq $16, %%mm5 \n\t"
1132 "por %%mm4, %%mm3 \n\t"
1133 "por %%mm5, %%mm3 \n\t"
1135 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
),"m"(mmx_null
)
1137 /* borrowed 32 to 24 */
1139 "movq %%mm0, %%mm4 \n\t"
1140 "movq %%mm3, %%mm5 \n\t"
1141 "movq %%mm6, %%mm0 \n\t"
1142 "movq %%mm7, %%mm1 \n\t"
1144 "movq %%mm4, %%mm6 \n\t"
1145 "movq %%mm5, %%mm7 \n\t"
1146 "movq %%mm0, %%mm2 \n\t"
1147 "movq %%mm1, %%mm3 \n\t"
1149 "psrlq $8, %%mm2 \n\t"
1150 "psrlq $8, %%mm3 \n\t"
1151 "psrlq $8, %%mm6 \n\t"
1152 "psrlq $8, %%mm7 \n\t"
1153 "pand %2, %%mm0 \n\t"
1154 "pand %2, %%mm1 \n\t"
1155 "pand %2, %%mm4 \n\t"
1156 "pand %2, %%mm5 \n\t"
1157 "pand %3, %%mm2 \n\t"
1158 "pand %3, %%mm3 \n\t"
1159 "pand %3, %%mm6 \n\t"
1160 "pand %3, %%mm7 \n\t"
1161 "por %%mm2, %%mm0 \n\t"
1162 "por %%mm3, %%mm1 \n\t"
1163 "por %%mm6, %%mm4 \n\t"
1164 "por %%mm7, %%mm5 \n\t"
1166 "movq %%mm1, %%mm2 \n\t"
1167 "movq %%mm4, %%mm3 \n\t"
1168 "psllq $48, %%mm2 \n\t"
1169 "psllq $32, %%mm3 \n\t"
1170 "pand %4, %%mm2 \n\t"
1171 "pand %5, %%mm3 \n\t"
1172 "por %%mm2, %%mm0 \n\t"
1173 "psrlq $16, %%mm1 \n\t"
1174 "psrlq $32, %%mm4 \n\t"
1175 "psllq $16, %%mm5 \n\t"
1176 "por %%mm3, %%mm1 \n\t"
1177 "pand %6, %%mm5 \n\t"
1178 "por %%mm5, %%mm4 \n\t"
1180 MOVNTQ
" %%mm0, %0 \n\t"
1181 MOVNTQ
" %%mm1, 8%0 \n\t"
1182 MOVNTQ
" %%mm4, 16%0"
1185 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1190 __asm__
volatile(SFENCE
:::"memory");
1191 __asm__
volatile(EMMS
:::"memory");
1195 register uint16_t bgr
;
1197 *d
++ = (bgr
&0x1F)<<3;
1198 *d
++ = (bgr
&0x7E0)>>3;
1199 *d
++ = (bgr
&0xF800)>>8;
1204 * mm0 = 00 B3 00 B2 00 B1 00 B0
1205 * mm1 = 00 G3 00 G2 00 G1 00 G0
1206 * mm2 = 00 R3 00 R2 00 R1 00 R0
1207 * mm6 = FF FF FF FF FF FF FF FF
1208 * mm7 = 00 00 00 00 00 00 00 00
/* PACK_RGB32: given planar bytes in mm0 (B), mm1 (G), mm2 (R) as words
 * (see the register layout comment above), interleave them into two
 * 32-bit-per-pixel qwords (FF,R,G,B per pixel, alpha forced to FF via
 * mm6) and store them with MOVNTQ. Expects mm6 = all-ones, mm7 = zero. */
1210 #define PACK_RGB32 \
1211 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1212 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1213 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1214 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1215 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1216 "movq %%mm0, %%mm3 \n\t" \
1217 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1218 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1219 MOVNTQ" %%mm0, %0 \n\t" \
1220 MOVNTQ" %%mm3, 8%0 \n\t" \
1222 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1224 const uint16_t *end
;
1226 const uint16_t *mm_end
;
1229 const uint16_t *s
= (const uint16_t *)src
;
1230 end
= s
+ src_size
/2;
1232 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1233 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1234 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1239 PREFETCH
" 32%1 \n\t"
1240 "movq %1, %%mm0 \n\t"
1241 "movq %1, %%mm1 \n\t"
1242 "movq %1, %%mm2 \n\t"
1243 "pand %2, %%mm0 \n\t"
1244 "pand %3, %%mm1 \n\t"
1245 "pand %4, %%mm2 \n\t"
1246 "psllq $3, %%mm0 \n\t"
1247 "psrlq $2, %%mm1 \n\t"
1248 "psrlq $7, %%mm2 \n\t"
1251 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
)
1256 __asm__
volatile(SFENCE
:::"memory");
1257 __asm__
volatile(EMMS
:::"memory");
1261 register uint16_t bgr
;
1265 *d
++ = (bgr
&0x7C00)>>7;
1266 *d
++ = (bgr
&0x3E0)>>2;
1267 *d
++ = (bgr
&0x1F)<<3;
1269 *d
++ = (bgr
&0x1F)<<3;
1270 *d
++ = (bgr
&0x3E0)>>2;
1271 *d
++ = (bgr
&0x7C00)>>7;
1277 static inline void RENAME(rgb16to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1279 const uint16_t *end
;
1281 const uint16_t *mm_end
;
1284 const uint16_t *s
= (const uint16_t*)src
;
1285 end
= s
+ src_size
/2;
1287 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1288 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1289 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1294 PREFETCH
" 32%1 \n\t"
1295 "movq %1, %%mm0 \n\t"
1296 "movq %1, %%mm1 \n\t"
1297 "movq %1, %%mm2 \n\t"
1298 "pand %2, %%mm0 \n\t"
1299 "pand %3, %%mm1 \n\t"
1300 "pand %4, %%mm2 \n\t"
1301 "psllq $3, %%mm0 \n\t"
1302 "psrlq $3, %%mm1 \n\t"
1303 "psrlq $8, %%mm2 \n\t"
1306 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
)
1311 __asm__
volatile(SFENCE
:::"memory");
1312 __asm__
volatile(EMMS
:::"memory");
1316 register uint16_t bgr
;
1320 *d
++ = (bgr
&0xF800)>>8;
1321 *d
++ = (bgr
&0x7E0)>>3;
1322 *d
++ = (bgr
&0x1F)<<3;
1324 *d
++ = (bgr
&0x1F)<<3;
1325 *d
++ = (bgr
&0x7E0)>>3;
1326 *d
++ = (bgr
&0xF800)>>8;
/*
 * rgb32tobgr32: swap R and B within each 32-bit pixel. The scalar tail
 * (original lines 1389-1392) shows the exact per-pixel transform:
 *   out = (v >> 16) + (v & 0xff00ff00) + (v << 16)   (as uint32)
 * which exchanges bytes 0 and 2 while keeping bytes 1 and 3.
 * The loop index starts negative (idx = 15 - src_size) and counts up to
 * 15, with s/d pre-biased by -idx, so (s,d)[idx] walks the buffers.
 *
 * NOTE(review): lossy extract -- the asm opener, loop labels and braces
 * between the numbered lines are missing (e.g. jump 1345 -> 1348).
 */
1332 static inline void RENAME(rgb32tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1334 x86_reg idx
= 15 - src_size
;
1335 const uint8_t *s
= src
-idx
;
1336 uint8_t *d
= dst
-idx
;
/* Build the byte-select masks: mm7/mm6 derived from mask32b/mask32r
   and mmx_one via the pxor sequence below. */
1341 PREFETCH
" (%1, %0) \n\t"
1342 "movq %3, %%mm7 \n\t"
1343 "pxor %4, %%mm7 \n\t"
1344 "movq %%mm7, %%mm6 \n\t"
1345 "pxor %5, %%mm7 \n\t"
1348 PREFETCH
" 32(%1, %0) \n\t"
1349 "movq (%1, %0), %%mm0 \n\t"
1350 "movq 8(%1, %0), %%mm1 \n\t"
/* MMX2/SSE path: pshufw $177 (0b10110001) swaps adjacent 16-bit words,
   then masked merges finish the R<->B byte exchange. */
1352 "pshufw $177, %%mm0, %%mm3 \n\t"
1353 "pshufw $177, %%mm1, %%mm5 \n\t"
1354 "pand %%mm7, %%mm0 \n\t"
1355 "pand %%mm6, %%mm3 \n\t"
1356 "pand %%mm7, %%mm1 \n\t"
1357 "pand %%mm6, %%mm5 \n\t"
1358 "por %%mm3, %%mm0 \n\t"
1359 "por %%mm5, %%mm1 \n\t"
/* Alternate (non-pshufw) path: isolate the swap bytes and rotate them
   by 16 bits in both directions before merging back. */
1361 "movq %%mm0, %%mm2 \n\t"
1362 "movq %%mm1, %%mm4 \n\t"
1363 "pand %%mm7, %%mm0 \n\t"
1364 "pand %%mm6, %%mm2 \n\t"
1365 "pand %%mm7, %%mm1 \n\t"
1366 "pand %%mm6, %%mm4 \n\t"
1367 "movq %%mm2, %%mm3 \n\t"
1368 "movq %%mm4, %%mm5 \n\t"
1369 "pslld $16, %%mm2 \n\t"
1370 "psrld $16, %%mm3 \n\t"
1371 "pslld $16, %%mm4 \n\t"
1372 "psrld $16, %%mm5 \n\t"
1373 "por %%mm2, %%mm0 \n\t"
1374 "por %%mm4, %%mm1 \n\t"
1375 "por %%mm3, %%mm0 \n\t"
1376 "por %%mm5, %%mm1 \n\t"
1378 MOVNTQ
" %%mm0, (%2, %0) \n\t"
1379 MOVNTQ
" %%mm1, 8(%2, %0) \n\t"
1386 : "r" (s
), "r" (d
), "m" (mask32b
), "m" (mask32r
), "m" (mmx_one
)
/* Scalar cleanup: handles the final pixels (idx counts up in 4-byte
   steps until it reaches 15). */
1389 for (; idx
<15; idx
+=4) {
1390 register int v
= *(const uint32_t *)&s
[idx
], g
= v
& 0xff00ff00;
1392 *(uint32_t *)&d
[idx
] = (v
>>16) + g
+ (v
<<16);
1396 static inline void RENAME(rgb24tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1400 x86_reg mmx_size
= 23 - src_size
;
1402 "test %%"REG_a
", %%"REG_a
" \n\t"
1404 "movq "MANGLE(mask24r
)", %%mm5 \n\t"
1405 "movq "MANGLE(mask24g
)", %%mm6 \n\t"
1406 "movq "MANGLE(mask24b
)", %%mm7 \n\t"
1409 PREFETCH
" 32(%1, %%"REG_a
") \n\t"
1410 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1411 "movq (%1, %%"REG_a
"), %%mm1 \n\t" // BGR BGR BG
1412 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" // R BGR BGR B
1413 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1414 "pand %%mm5, %%mm0 \n\t"
1415 "pand %%mm6, %%mm1 \n\t"
1416 "pand %%mm7, %%mm2 \n\t"
1417 "por %%mm0, %%mm1 \n\t"
1418 "por %%mm2, %%mm1 \n\t"
1419 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1420 MOVNTQ
" %%mm1, (%2, %%"REG_a
") \n\t" // RGB RGB RG
1421 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" // R BGR BGR B
1422 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" // GR BGR BGR
1423 "pand %%mm7, %%mm0 \n\t"
1424 "pand %%mm5, %%mm1 \n\t"
1425 "pand %%mm6, %%mm2 \n\t"
1426 "por %%mm0, %%mm1 \n\t"
1427 "por %%mm2, %%mm1 \n\t"
1428 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" // R BGR BGR B
1429 MOVNTQ
" %%mm1, 8(%2, %%"REG_a
") \n\t" // B RGB RGB R
1430 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" // GR BGR BGR
1431 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" // BGR BGR BG
1432 "pand %%mm6, %%mm0 \n\t"
1433 "pand %%mm7, %%mm1 \n\t"
1434 "pand %%mm5, %%mm2 \n\t"
1435 "por %%mm0, %%mm1 \n\t"
1436 "por %%mm2, %%mm1 \n\t"
1437 MOVNTQ
" %%mm1, 16(%2, %%"REG_a
") \n\t"
1438 "add $24, %%"REG_a
" \n\t"
1442 : "r" (src
-mmx_size
), "r"(dst
-mmx_size
)
1445 __asm__
volatile(SFENCE
:::"memory");
1446 __asm__
volatile(EMMS
:::"memory");
1448 if (mmx_size
==23) return; //finished, was multiple of 8
1452 src_size
= 23-mmx_size
;
1456 for (i
=0; i
<src_size
; i
+=3)
1460 dst
[i
+ 1] = src
[i
+ 1];
1461 dst
[i
+ 2] = src
[i
+ 0];
1466 static inline void RENAME(yuvPlanartoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1467 long width
, long height
,
1468 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1471 const x86_reg chromWidth
= width
>>1;
1472 for (y
=0; y
<height
; y
++)
1475 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1477 "xor %%"REG_a
", %%"REG_a
" \n\t"
1480 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1481 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1482 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1483 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1484 "movq %%mm0, %%mm2 \n\t" // U(0)
1485 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1489 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1490 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1491 "movq %%mm3, %%mm4 \n\t" // Y(0)
1492 "movq %%mm5, %%mm6 \n\t" // Y(8)
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1498 MOVNTQ
" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1499 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1500 MOVNTQ
" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1501 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1503 "add $8, %%"REG_a
" \n\t"
1504 "cmp %4, %%"REG_a
" \n\t"
1506 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1511 #if ARCH_ALPHA && HAVE_MVI
1512 #define pl2yuy2(n) \
1517 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1518 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1519 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1520 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1521 yuv1 = (u << 8) + (v << 24); \
1528 uint64_t *qdst
= (uint64_t *) dst
;
1529 uint64_t *qdst2
= (uint64_t *) (dst
+ dstStride
);
1530 const uint32_t *yc
= (uint32_t *) ysrc
;
1531 const uint32_t *yc2
= (uint32_t *) (ysrc
+ lumStride
);
1532 const uint16_t *uc
= (uint16_t*) usrc
, *vc
= (uint16_t*) vsrc
;
1533 for (i
= 0; i
< chromWidth
; i
+= 8){
1534 uint64_t y1
, y2
, yuv1
, yuv2
;
1537 __asm__("ldq $31,64(%0)" :: "r"(yc
));
1538 __asm__("ldq $31,64(%0)" :: "r"(yc2
));
1539 __asm__("ldq $31,64(%0)" :: "r"(uc
));
1540 __asm__("ldq $31,64(%0)" :: "r"(vc
));
1558 #elif HAVE_FAST_64BIT
1560 uint64_t *ldst
= (uint64_t *) dst
;
1561 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1562 for (i
= 0; i
< chromWidth
; i
+= 2){
1564 k
= yc
[0] + (uc
[0] << 8) +
1565 (yc
[1] << 16) + (vc
[0] << 24);
1566 l
= yc
[2] + (uc
[1] << 8) +
1567 (yc
[3] << 16) + (vc
[1] << 24);
1568 *ldst
++ = k
+ (l
<< 32);
1575 int i
, *idst
= (int32_t *) dst
;
1576 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1577 for (i
= 0; i
< chromWidth
; i
++){
1579 *idst
++ = (yc
[0] << 24)+ (uc
[0] << 16) +
1580 (yc
[1] << 8) + (vc
[0] << 0);
1582 *idst
++ = yc
[0] + (uc
[0] << 8) +
1583 (yc
[1] << 16) + (vc
[0] << 24);
1591 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1593 usrc
+= chromStride
;
1594 vsrc
+= chromStride
;
1600 __asm__( EMMS
" \n\t"
1607 * Height should be a multiple of 2 and width should be a multiple of 16.
1608 * (If this is a problem for anyone then tell me, and I will fix it.)
1610 static inline void RENAME(yv12toyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1611 long width
, long height
,
1612 long lumStride
, long chromStride
, long dstStride
)
1614 //FIXME interpolate chroma
1615 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1618 static inline void RENAME(yuvPlanartouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1619 long width
, long height
,
1620 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1623 const x86_reg chromWidth
= width
>>1;
1624 for (y
=0; y
<height
; y
++)
1627 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1629 "xor %%"REG_a
", %%"REG_a
" \n\t"
1632 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1633 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1634 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1635 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1636 "movq %%mm0, %%mm2 \n\t" // U(0)
1637 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1638 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1639 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1641 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1642 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1643 "movq %%mm0, %%mm4 \n\t" // Y(0)
1644 "movq %%mm2, %%mm6 \n\t" // Y(8)
1645 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1646 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1647 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1648 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1650 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1651 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1652 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1653 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1655 "add $8, %%"REG_a
" \n\t"
1656 "cmp %4, %%"REG_a
" \n\t"
1658 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1662 //FIXME adapt the Alpha ASM code from yv12->yuy2
1666 uint64_t *ldst
= (uint64_t *) dst
;
1667 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1668 for (i
= 0; i
< chromWidth
; i
+= 2){
1670 k
= uc
[0] + (yc
[0] << 8) +
1671 (vc
[0] << 16) + (yc
[1] << 24);
1672 l
= uc
[1] + (yc
[2] << 8) +
1673 (vc
[1] << 16) + (yc
[3] << 24);
1674 *ldst
++ = k
+ (l
<< 32);
1681 int i
, *idst
= (int32_t *) dst
;
1682 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1683 for (i
= 0; i
< chromWidth
; i
++){
1685 *idst
++ = (uc
[0] << 24)+ (yc
[0] << 16) +
1686 (vc
[0] << 8) + (yc
[1] << 0);
1688 *idst
++ = uc
[0] + (yc
[0] << 8) +
1689 (vc
[0] << 16) + (yc
[1] << 24);
1697 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1699 usrc
+= chromStride
;
1700 vsrc
+= chromStride
;
1706 __asm__( EMMS
" \n\t"
1713 * Height should be a multiple of 2 and width should be a multiple of 16
1714 * (If this is a problem for anyone then tell me, and I will fix it.)
1716 static inline void RENAME(yv12touyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1717 long width
, long height
,
1718 long lumStride
, long chromStride
, long dstStride
)
1720 //FIXME interpolate chroma
1721 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1725 * Width should be a multiple of 16.
1727 static inline void RENAME(yuv422ptouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1728 long width
, long height
,
1729 long lumStride
, long chromStride
, long dstStride
)
1731 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1735 * Width should be a multiple of 16.
1737 static inline void RENAME(yuv422ptoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1738 long width
, long height
,
1739 long lumStride
, long chromStride
, long dstStride
)
1741 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1745 * Height should be a multiple of 2 and width should be a multiple of 16.
1746 * (If this is a problem for anyone then tell me, and I will fix it.)
1748 static inline void RENAME(yuy2toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1749 long width
, long height
,
1750 long lumStride
, long chromStride
, long srcStride
)
1753 const x86_reg chromWidth
= width
>>1;
1754 for (y
=0; y
<height
; y
+=2)
1758 "xor %%"REG_a
", %%"REG_a
" \n\t"
1759 "pcmpeqw %%mm7, %%mm7 \n\t"
1760 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1763 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1764 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1765 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1766 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1768 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1770 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1772 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1773 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1775 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1777 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(8)
1778 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(12)
1779 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1780 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1781 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1782 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1783 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1784 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1785 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1786 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1788 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1790 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1791 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1792 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1793 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1794 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1795 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1796 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1797 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1799 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1800 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1802 "add $8, %%"REG_a
" \n\t"
1803 "cmp %4, %%"REG_a
" \n\t"
1805 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1806 : "memory", "%"REG_a
1813 "xor %%"REG_a
", %%"REG_a
" \n\t"
1816 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1817 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1818 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1819 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
1820 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
1821 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1822 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1823 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1824 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1825 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1826 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1828 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1829 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1831 "add $8, %%"REG_a
" \n\t"
1832 "cmp %4, %%"REG_a
" \n\t"
1835 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1836 : "memory", "%"REG_a
1840 for (i
=0; i
<chromWidth
; i
++)
1842 ydst
[2*i
+0] = src
[4*i
+0];
1843 udst
[i
] = src
[4*i
+1];
1844 ydst
[2*i
+1] = src
[4*i
+2];
1845 vdst
[i
] = src
[4*i
+3];
1850 for (i
=0; i
<chromWidth
; i
++)
1852 ydst
[2*i
+0] = src
[4*i
+0];
1853 ydst
[2*i
+1] = src
[4*i
+2];
1856 udst
+= chromStride
;
1857 vdst
+= chromStride
;
1862 __asm__
volatile( EMMS
" \n\t"
1868 static inline void RENAME(yvu9toyv12
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
,
1869 uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1870 long width
, long height
, long lumStride
, long chromStride
)
1873 memcpy(ydst
, ysrc
, width
*height
);
1875 /* XXX: implement upscaling for U,V */
1878 static inline void RENAME(planar2x
)(const uint8_t *src
, uint8_t *dst
, long srcWidth
, long srcHeight
, long srcStride
, long dstStride
)
1885 for (x
=0; x
<srcWidth
-1; x
++){
1886 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1887 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1889 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1893 for (y
=1; y
<srcHeight
; y
++){
1894 #if HAVE_MMX2 || HAVE_AMD3DNOW
1895 const x86_reg mmxSize
= srcWidth
&~15;
1897 "mov %4, %%"REG_a
" \n\t"
1899 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1900 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1901 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1902 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1903 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1904 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1905 PAVGB
" %%mm0, %%mm5 \n\t"
1906 PAVGB
" %%mm0, %%mm3 \n\t"
1907 PAVGB
" %%mm0, %%mm5 \n\t"
1908 PAVGB
" %%mm0, %%mm3 \n\t"
1909 PAVGB
" %%mm1, %%mm4 \n\t"
1910 PAVGB
" %%mm1, %%mm2 \n\t"
1911 PAVGB
" %%mm1, %%mm4 \n\t"
1912 PAVGB
" %%mm1, %%mm2 \n\t"
1913 "movq %%mm5, %%mm7 \n\t"
1914 "movq %%mm4, %%mm6 \n\t"
1915 "punpcklbw %%mm3, %%mm5 \n\t"
1916 "punpckhbw %%mm3, %%mm7 \n\t"
1917 "punpcklbw %%mm2, %%mm4 \n\t"
1918 "punpckhbw %%mm2, %%mm6 \n\t"
1920 MOVNTQ
" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1921 MOVNTQ
" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1922 MOVNTQ
" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1923 MOVNTQ
" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1925 "movq %%mm5, (%2, %%"REG_a
", 2) \n\t"
1926 "movq %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1927 "movq %%mm4, (%3, %%"REG_a
", 2) \n\t"
1928 "movq %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1930 "add $8, %%"REG_a
" \n\t"
1932 :: "r" (src
+ mmxSize
), "r" (src
+ srcStride
+ mmxSize
),
1933 "r" (dst
+ mmxSize
*2), "r" (dst
+ dstStride
+ mmxSize
*2),
1939 const x86_reg mmxSize
=1;
1941 dst
[0 ]= (3*src
[0] + src
[srcStride
])>>2;
1942 dst
[dstStride
]= ( src
[0] + 3*src
[srcStride
])>>2;
1944 for (x
=mmxSize
-1; x
<srcWidth
-1; x
++){
1945 dst
[2*x
+1]= (3*src
[x
+0] + src
[x
+srcStride
+1])>>2;
1946 dst
[2*x
+dstStride
+2]= ( src
[x
+0] + 3*src
[x
+srcStride
+1])>>2;
1947 dst
[2*x
+dstStride
+1]= ( src
[x
+1] + 3*src
[x
+srcStride
])>>2;
1948 dst
[2*x
+2]= (3*src
[x
+1] + src
[x
+srcStride
])>>2;
1950 dst
[srcWidth
*2 -1 ]= (3*src
[srcWidth
-1] + src
[srcWidth
-1 + srcStride
])>>2;
1951 dst
[srcWidth
*2 -1 + dstStride
]= ( src
[srcWidth
-1] + 3*src
[srcWidth
-1 + srcStride
])>>2;
1961 for (x
=0; x
<srcWidth
-1; x
++){
1962 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1963 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1965 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1967 for (x
=0; x
<srcWidth
; x
++){
1974 __asm__
volatile( EMMS
" \n\t"
1981 * Height should be a multiple of 2 and width should be a multiple of 16.
1982 * (If this is a problem for anyone then tell me, and I will fix it.)
1983 * Chrominance data is only taken from every second line, others are ignored.
1984 * FIXME: Write HQ version.
1986 static inline void RENAME(uyvytoyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1987 long width
, long height
,
1988 long lumStride
, long chromStride
, long srcStride
)
1991 const x86_reg chromWidth
= width
>>1;
1992 for (y
=0; y
<height
; y
+=2)
1996 "xor %%"REG_a
", %%"REG_a
" \n\t"
1997 "pcmpeqw %%mm7, %%mm7 \n\t"
1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2001 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2002 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // UYVY UYVY(0)
2003 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(4)
2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2013 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
2015 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(8)
2016 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // UYVY UYVY(12)
2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2026 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2037 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
2038 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
2040 "add $8, %%"REG_a
" \n\t"
2041 "cmp %4, %%"REG_a
" \n\t"
2043 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2044 : "memory", "%"REG_a
2051 "xor %%"REG_a
", %%"REG_a
" \n\t"
2054 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2055 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
2056 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
2057 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
2058 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2066 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
2067 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
2069 "add $8, %%"REG_a
" \n\t"
2070 "cmp %4, %%"REG_a
" \n\t"
2073 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2074 : "memory", "%"REG_a
2078 for (i
=0; i
<chromWidth
; i
++)
2080 udst
[i
] = src
[4*i
+0];
2081 ydst
[2*i
+0] = src
[4*i
+1];
2082 vdst
[i
] = src
[4*i
+2];
2083 ydst
[2*i
+1] = src
[4*i
+3];
2088 for (i
=0; i
<chromWidth
; i
++)
2090 ydst
[2*i
+0] = src
[4*i
+1];
2091 ydst
[2*i
+1] = src
[4*i
+3];
2094 udst
+= chromStride
;
2095 vdst
+= chromStride
;
2100 __asm__
volatile( EMMS
" \n\t"
2107 * Height should be a multiple of 2 and width should be a multiple of 2.
2108 * (If this is a problem for anyone then tell me, and I will fix it.)
2109 * Chrominance data is only taken from every second line,
2110 * others are ignored in the C version.
2111 * FIXME: Write HQ version.
2113 static inline void RENAME(rgb24toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2114 long width
, long height
,
2115 long lumStride
, long chromStride
, long srcStride
)
2118 const x86_reg chromWidth
= width
>>1;
2120 for (y
=0; y
<height
-2; y
+=2)
2126 "mov %2, %%"REG_a
" \n\t"
2127 "movq "MANGLE(ff_bgr2YCoeff
)", %%mm6 \n\t"
2128 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2129 "pxor %%mm7, %%mm7 \n\t"
2130 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2133 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2134 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2135 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
2136 "punpcklbw %%mm7, %%mm0 \n\t"
2137 "punpcklbw %%mm7, %%mm1 \n\t"
2138 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
2139 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
2140 "punpcklbw %%mm7, %%mm2 \n\t"
2141 "punpcklbw %%mm7, %%mm3 \n\t"
2142 "pmaddwd %%mm6, %%mm0 \n\t"
2143 "pmaddwd %%mm6, %%mm1 \n\t"
2144 "pmaddwd %%mm6, %%mm2 \n\t"
2145 "pmaddwd %%mm6, %%mm3 \n\t"
2146 #ifndef FAST_BGR2YV12
2147 "psrad $8, %%mm0 \n\t"
2148 "psrad $8, %%mm1 \n\t"
2149 "psrad $8, %%mm2 \n\t"
2150 "psrad $8, %%mm3 \n\t"
2152 "packssdw %%mm1, %%mm0 \n\t"
2153 "packssdw %%mm3, %%mm2 \n\t"
2154 "pmaddwd %%mm5, %%mm0 \n\t"
2155 "pmaddwd %%mm5, %%mm2 \n\t"
2156 "packssdw %%mm2, %%mm0 \n\t"
2157 "psraw $7, %%mm0 \n\t"
2159 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2160 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
2161 "punpcklbw %%mm7, %%mm4 \n\t"
2162 "punpcklbw %%mm7, %%mm1 \n\t"
2163 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
2164 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
2165 "punpcklbw %%mm7, %%mm2 \n\t"
2166 "punpcklbw %%mm7, %%mm3 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "pmaddwd %%mm6, %%mm1 \n\t"
2169 "pmaddwd %%mm6, %%mm2 \n\t"
2170 "pmaddwd %%mm6, %%mm3 \n\t"
2171 #ifndef FAST_BGR2YV12
2172 "psrad $8, %%mm4 \n\t"
2173 "psrad $8, %%mm1 \n\t"
2174 "psrad $8, %%mm2 \n\t"
2175 "psrad $8, %%mm3 \n\t"
2177 "packssdw %%mm1, %%mm4 \n\t"
2178 "packssdw %%mm3, %%mm2 \n\t"
2179 "pmaddwd %%mm5, %%mm4 \n\t"
2180 "pmaddwd %%mm5, %%mm2 \n\t"
2181 "add $24, %%"REG_d
" \n\t"
2182 "packssdw %%mm2, %%mm4 \n\t"
2183 "psraw $7, %%mm4 \n\t"
2185 "packuswb %%mm4, %%mm0 \n\t"
2186 "paddusb "MANGLE(ff_bgr2YOffset
)", %%mm0 \n\t"
2188 MOVNTQ
" %%mm0, (%1, %%"REG_a
") \n\t"
2189 "add $8, %%"REG_a
" \n\t"
2191 : : "r" (src
+width
*3), "r" (ydst
+width
), "g" ((x86_reg
)-width
)
2192 : "%"REG_a
, "%"REG_d
2199 "mov %4, %%"REG_a
" \n\t"
2200 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2201 "movq "MANGLE(ff_bgr2UCoeff
)", %%mm6 \n\t"
2202 "pxor %%mm7, %%mm7 \n\t"
2203 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2204 "add %%"REG_d
", %%"REG_d
" \n\t"
2207 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2208 PREFETCH
" 64(%1, %%"REG_d
") \n\t"
2209 #if HAVE_MMX2 || HAVE_AMD3DNOW
2210 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
2211 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
2212 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
2213 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
2214 PAVGB
" %%mm1, %%mm0 \n\t"
2215 PAVGB
" %%mm3, %%mm2 \n\t"
2216 "movq %%mm0, %%mm1 \n\t"
2217 "movq %%mm2, %%mm3 \n\t"
2218 "psrlq $24, %%mm0 \n\t"
2219 "psrlq $24, %%mm2 \n\t"
2220 PAVGB
" %%mm1, %%mm0 \n\t"
2221 PAVGB
" %%mm3, %%mm2 \n\t"
2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2226 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
2227 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
2228 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
2229 "punpcklbw %%mm7, %%mm0 \n\t"
2230 "punpcklbw %%mm7, %%mm1 \n\t"
2231 "punpcklbw %%mm7, %%mm2 \n\t"
2232 "punpcklbw %%mm7, %%mm3 \n\t"
2233 "paddw %%mm1, %%mm0 \n\t"
2234 "paddw %%mm3, %%mm2 \n\t"
2235 "paddw %%mm2, %%mm0 \n\t"
2236 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
2237 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
2238 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
2239 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
2240 "punpcklbw %%mm7, %%mm4 \n\t"
2241 "punpcklbw %%mm7, %%mm1 \n\t"
2242 "punpcklbw %%mm7, %%mm2 \n\t"
2243 "punpcklbw %%mm7, %%mm3 \n\t"
2244 "paddw %%mm1, %%mm4 \n\t"
2245 "paddw %%mm3, %%mm2 \n\t"
2246 "paddw %%mm4, %%mm2 \n\t"
2247 "psrlw $2, %%mm0 \n\t"
2248 "psrlw $2, %%mm2 \n\t"
2250 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2251 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2253 "pmaddwd %%mm0, %%mm1 \n\t"
2254 "pmaddwd %%mm2, %%mm3 \n\t"
2255 "pmaddwd %%mm6, %%mm0 \n\t"
2256 "pmaddwd %%mm6, %%mm2 \n\t"
2257 #ifndef FAST_BGR2YV12
2258 "psrad $8, %%mm0 \n\t"
2259 "psrad $8, %%mm1 \n\t"
2260 "psrad $8, %%mm2 \n\t"
2261 "psrad $8, %%mm3 \n\t"
2263 "packssdw %%mm2, %%mm0 \n\t"
2264 "packssdw %%mm3, %%mm1 \n\t"
2265 "pmaddwd %%mm5, %%mm0 \n\t"
2266 "pmaddwd %%mm5, %%mm1 \n\t"
2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2268 "psraw $7, %%mm0 \n\t"
2270 #if HAVE_MMX2 || HAVE_AMD3DNOW
2271 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2272 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
2273 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2274 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
2275 PAVGB
" %%mm1, %%mm4 \n\t"
2276 PAVGB
" %%mm3, %%mm2 \n\t"
2277 "movq %%mm4, %%mm1 \n\t"
2278 "movq %%mm2, %%mm3 \n\t"
2279 "psrlq $24, %%mm4 \n\t"
2280 "psrlq $24, %%mm2 \n\t"
2281 PAVGB
" %%mm1, %%mm4 \n\t"
2282 PAVGB
" %%mm3, %%mm2 \n\t"
2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2287 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
2288 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2289 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
2290 "punpcklbw %%mm7, %%mm4 \n\t"
2291 "punpcklbw %%mm7, %%mm1 \n\t"
2292 "punpcklbw %%mm7, %%mm2 \n\t"
2293 "punpcklbw %%mm7, %%mm3 \n\t"
2294 "paddw %%mm1, %%mm4 \n\t"
2295 "paddw %%mm3, %%mm2 \n\t"
2296 "paddw %%mm2, %%mm4 \n\t"
2297 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2298 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
2299 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2300 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
2301 "punpcklbw %%mm7, %%mm5 \n\t"
2302 "punpcklbw %%mm7, %%mm1 \n\t"
2303 "punpcklbw %%mm7, %%mm2 \n\t"
2304 "punpcklbw %%mm7, %%mm3 \n\t"
2305 "paddw %%mm1, %%mm5 \n\t"
2306 "paddw %%mm3, %%mm2 \n\t"
2307 "paddw %%mm5, %%mm2 \n\t"
2308 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2309 "psrlw $2, %%mm4 \n\t"
2310 "psrlw $2, %%mm2 \n\t"
2312 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2313 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2315 "pmaddwd %%mm4, %%mm1 \n\t"
2316 "pmaddwd %%mm2, %%mm3 \n\t"
2317 "pmaddwd %%mm6, %%mm4 \n\t"
2318 "pmaddwd %%mm6, %%mm2 \n\t"
2319 #ifndef FAST_BGR2YV12
2320 "psrad $8, %%mm4 \n\t"
2321 "psrad $8, %%mm1 \n\t"
2322 "psrad $8, %%mm2 \n\t"
2323 "psrad $8, %%mm3 \n\t"
2325 "packssdw %%mm2, %%mm4 \n\t"
2326 "packssdw %%mm3, %%mm1 \n\t"
2327 "pmaddwd %%mm5, %%mm4 \n\t"
2328 "pmaddwd %%mm5, %%mm1 \n\t"
2329 "add $24, %%"REG_d
" \n\t"
2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2331 "psraw $7, %%mm4 \n\t"
2333 "movq %%mm0, %%mm1 \n\t"
2334 "punpckldq %%mm4, %%mm0 \n\t"
2335 "punpckhdq %%mm4, %%mm1 \n\t"
2336 "packsswb %%mm1, %%mm0 \n\t"
2337 "paddb "MANGLE(ff_bgr2UVOffset
)", %%mm0 \n\t"
2338 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2339 "punpckhdq %%mm0, %%mm0 \n\t"
2340 "movd %%mm0, (%3, %%"REG_a
") \n\t"
2341 "add $4, %%"REG_a
" \n\t"
2343 : : "r" (src
+chromWidth
*6), "r" (src
+srcStride
+chromWidth
*6), "r" (udst
+chromWidth
), "r" (vdst
+chromWidth
), "g" (-chromWidth
)
2344 : "%"REG_a
, "%"REG_d
2347 udst
+= chromStride
;
2348 vdst
+= chromStride
;
2352 __asm__
volatile( EMMS
" \n\t"
2358 for (; y
<height
; y
+=2)
2361 for (i
=0; i
<chromWidth
; i
++)
2363 unsigned int b
= src
[6*i
+0];
2364 unsigned int g
= src
[6*i
+1];
2365 unsigned int r
= src
[6*i
+2];
2367 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2368 unsigned int V
= ((RV
*r
+ GV
*g
+ BV
*b
)>>RGB2YUV_SHIFT
) + 128;
2369 unsigned int U
= ((RU
*r
+ GU
*g
+ BU
*b
)>>RGB2YUV_SHIFT
) + 128;
2379 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2385 for (i
=0; i
<chromWidth
; i
++)
2387 unsigned int b
= src
[6*i
+0];
2388 unsigned int g
= src
[6*i
+1];
2389 unsigned int r
= src
[6*i
+2];
2391 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2399 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2402 udst
+= chromStride
;
2403 vdst
+= chromStride
;
2409 static void RENAME(interleaveBytes
)(uint8_t *src1
, uint8_t *src2
, uint8_t *dest
,
2410 long width
, long height
, long src1Stride
,
2411 long src2Stride
, long dstStride
){
2414 for (h
=0; h
< height
; h
++)
2421 "xor %%"REG_a
", %%"REG_a
" \n\t"
2423 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2424 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2425 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
2426 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
2427 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
2428 "punpcklbw %%xmm2, %%xmm0 \n\t"
2429 "punpckhbw %%xmm2, %%xmm1 \n\t"
2430 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
2431 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
2432 "add $16, %%"REG_a
" \n\t"
2433 "cmp %3, %%"REG_a
" \n\t"
2435 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" ((x86_reg
)width
-15)
2436 : "memory", "%"REG_a
""
2440 "xor %%"REG_a
", %%"REG_a
" \n\t"
2442 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2443 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2444 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2445 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2446 "movq %%mm0, %%mm1 \n\t"
2447 "movq %%mm2, %%mm3 \n\t"
2448 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2449 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2450 "punpcklbw %%mm4, %%mm0 \n\t"
2451 "punpckhbw %%mm4, %%mm1 \n\t"
2452 "punpcklbw %%mm5, %%mm2 \n\t"
2453 "punpckhbw %%mm5, %%mm3 \n\t"
2454 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2455 MOVNTQ
" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2456 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2457 MOVNTQ
" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2458 "add $16, %%"REG_a
" \n\t"
2459 "cmp %3, %%"REG_a
" \n\t"
2461 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" ((x86_reg
)width
-15)
2462 : "memory", "%"REG_a
2465 for (w
= (width
&(~15)); w
< width
; w
++)
2467 dest
[2*w
+0] = src1
[w
];
2468 dest
[2*w
+1] = src2
[w
];
2471 for (w
=0; w
< width
; w
++)
2473 dest
[2*w
+0] = src1
[w
];
2474 dest
[2*w
+1] = src2
[w
];
2490 static inline void RENAME(vu9_to_vu12
)(const uint8_t *src1
, const uint8_t *src2
,
2491 uint8_t *dst1
, uint8_t *dst2
,
2492 long width
, long height
,
2493 long srcStride1
, long srcStride2
,
2494 long dstStride1
, long dstStride2
)
2498 w
=width
/2; h
=height
/2;
2503 ::"m"(*(src1
+srcStride1
)),"m"(*(src2
+srcStride2
)):"memory");
2506 const uint8_t* s1
=src1
+srcStride1
*(y
>>1);
2507 uint8_t* d
=dst1
+dstStride1
*y
;
2513 PREFETCH
" 32%1 \n\t"
2514 "movq %1, %%mm0 \n\t"
2515 "movq 8%1, %%mm2 \n\t"
2516 "movq 16%1, %%mm4 \n\t"
2517 "movq 24%1, %%mm6 \n\t"
2518 "movq %%mm0, %%mm1 \n\t"
2519 "movq %%mm2, %%mm3 \n\t"
2520 "movq %%mm4, %%mm5 \n\t"
2521 "movq %%mm6, %%mm7 \n\t"
2522 "punpcklbw %%mm0, %%mm0 \n\t"
2523 "punpckhbw %%mm1, %%mm1 \n\t"
2524 "punpcklbw %%mm2, %%mm2 \n\t"
2525 "punpckhbw %%mm3, %%mm3 \n\t"
2526 "punpcklbw %%mm4, %%mm4 \n\t"
2527 "punpckhbw %%mm5, %%mm5 \n\t"
2528 "punpcklbw %%mm6, %%mm6 \n\t"
2529 "punpckhbw %%mm7, %%mm7 \n\t"
2530 MOVNTQ
" %%mm0, %0 \n\t"
2531 MOVNTQ
" %%mm1, 8%0 \n\t"
2532 MOVNTQ
" %%mm2, 16%0 \n\t"
2533 MOVNTQ
" %%mm3, 24%0 \n\t"
2534 MOVNTQ
" %%mm4, 32%0 \n\t"
2535 MOVNTQ
" %%mm5, 40%0 \n\t"
2536 MOVNTQ
" %%mm6, 48%0 \n\t"
2537 MOVNTQ
" %%mm7, 56%0"
2543 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s1
[x
];
2546 const uint8_t* s2
=src2
+srcStride2
*(y
>>1);
2547 uint8_t* d
=dst2
+dstStride2
*y
;
2553 PREFETCH
" 32%1 \n\t"
2554 "movq %1, %%mm0 \n\t"
2555 "movq 8%1, %%mm2 \n\t"
2556 "movq 16%1, %%mm4 \n\t"
2557 "movq 24%1, %%mm6 \n\t"
2558 "movq %%mm0, %%mm1 \n\t"
2559 "movq %%mm2, %%mm3 \n\t"
2560 "movq %%mm4, %%mm5 \n\t"
2561 "movq %%mm6, %%mm7 \n\t"
2562 "punpcklbw %%mm0, %%mm0 \n\t"
2563 "punpckhbw %%mm1, %%mm1 \n\t"
2564 "punpcklbw %%mm2, %%mm2 \n\t"
2565 "punpckhbw %%mm3, %%mm3 \n\t"
2566 "punpcklbw %%mm4, %%mm4 \n\t"
2567 "punpckhbw %%mm5, %%mm5 \n\t"
2568 "punpcklbw %%mm6, %%mm6 \n\t"
2569 "punpckhbw %%mm7, %%mm7 \n\t"
2570 MOVNTQ
" %%mm0, %0 \n\t"
2571 MOVNTQ
" %%mm1, 8%0 \n\t"
2572 MOVNTQ
" %%mm2, 16%0 \n\t"
2573 MOVNTQ
" %%mm3, 24%0 \n\t"
2574 MOVNTQ
" %%mm4, 32%0 \n\t"
2575 MOVNTQ
" %%mm5, 40%0 \n\t"
2576 MOVNTQ
" %%mm6, 48%0 \n\t"
2577 MOVNTQ
" %%mm7, 56%0"
2583 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s2
[x
];
2594 static inline void RENAME(yvu9_to_yuy2
)(const uint8_t *src1
, const uint8_t *src2
, const uint8_t *src3
,
2596 long width
, long height
,
2597 long srcStride1
, long srcStride2
,
2598 long srcStride3
, long dstStride
)
2602 w
=width
/2; h
=height
;
2604 const uint8_t* yp
=src1
+srcStride1
*y
;
2605 const uint8_t* up
=src2
+srcStride2
*(y
>>2);
2606 const uint8_t* vp
=src3
+srcStride3
*(y
>>2);
2607 uint8_t* d
=dst
+dstStride
*y
;
2613 PREFETCH
" 32(%1, %0) \n\t"
2614 PREFETCH
" 32(%2, %0) \n\t"
2615 PREFETCH
" 32(%3, %0) \n\t"
2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2627 "movq %%mm1, %%mm6 \n\t"
2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2631 MOVNTQ
" %%mm0, (%4, %0, 8) \n\t"
2632 MOVNTQ
" %%mm3, 8(%4, %0, 8) \n\t"
2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2635 "movq 8(%1, %0, 4), %%mm0 \n\t"
2636 "movq %%mm0, %%mm3 \n\t"
2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2639 MOVNTQ
" %%mm0, 16(%4, %0, 8) \n\t"
2640 MOVNTQ
" %%mm3, 24(%4, %0, 8) \n\t"
2642 "movq %%mm4, %%mm6 \n\t"
2643 "movq 16(%1, %0, 4), %%mm0 \n\t"
2644 "movq %%mm0, %%mm3 \n\t"
2645 "punpcklbw %%mm5, %%mm4 \n\t"
2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2648 MOVNTQ
" %%mm0, 32(%4, %0, 8) \n\t"
2649 MOVNTQ
" %%mm3, 40(%4, %0, 8) \n\t"
2651 "punpckhbw %%mm5, %%mm6 \n\t"
2652 "movq 24(%1, %0, 4), %%mm0 \n\t"
2653 "movq %%mm0, %%mm3 \n\t"
2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2656 MOVNTQ
" %%mm0, 48(%4, %0, 8) \n\t"
2657 MOVNTQ
" %%mm3, 56(%4, %0, 8) \n\t"
2660 : "r"(yp
), "r" (up
), "r"(vp
), "r"(d
)
2666 const long x2
= x
<<2;
2669 d
[8*x
+2] = yp
[x2
+1];
2671 d
[8*x
+4] = yp
[x2
+2];
2673 d
[8*x
+6] = yp
[x2
+3];
2686 static void RENAME(extract_even
)(const uint8_t *src
, uint8_t *dst
, x86_reg count
)
2696 "pcmpeqw %%mm7, %%mm7 \n\t"
2697 "psrlw $8, %%mm7 \n\t"
2699 "movq -30(%1, %0, 2), %%mm0 \n\t"
2700 "movq -22(%1, %0, 2), %%mm1 \n\t"
2701 "movq -14(%1, %0, 2), %%mm2 \n\t"
2702 "movq -6(%1, %0, 2), %%mm3 \n\t"
2703 "pand %%mm7, %%mm0 \n\t"
2704 "pand %%mm7, %%mm1 \n\t"
2705 "pand %%mm7, %%mm2 \n\t"
2706 "pand %%mm7, %%mm3 \n\t"
2707 "packuswb %%mm1, %%mm0 \n\t"
2708 "packuswb %%mm3, %%mm2 \n\t"
2709 MOVNTQ
" %%mm0,-15(%2, %0) \n\t"
2710 MOVNTQ
" %%mm2,- 7(%2, %0) \n\t"
2714 : "r"(src
), "r"(dst
)
2720 dst
[count
]= src
[2*count
];
2725 static void RENAME(extract_even2
)(const uint8_t *src
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2735 "pcmpeqw %%mm7, %%mm7 \n\t"
2736 "psrlw $8, %%mm7 \n\t"
2738 "movq -28(%1, %0, 4), %%mm0 \n\t"
2739 "movq -20(%1, %0, 4), %%mm1 \n\t"
2740 "movq -12(%1, %0, 4), %%mm2 \n\t"
2741 "movq -4(%1, %0, 4), %%mm3 \n\t"
2742 "pand %%mm7, %%mm0 \n\t"
2743 "pand %%mm7, %%mm1 \n\t"
2744 "pand %%mm7, %%mm2 \n\t"
2745 "pand %%mm7, %%mm3 \n\t"
2746 "packuswb %%mm1, %%mm0 \n\t"
2747 "packuswb %%mm3, %%mm2 \n\t"
2748 "movq %%mm0, %%mm1 \n\t"
2749 "movq %%mm2, %%mm3 \n\t"
2750 "psrlw $8, %%mm0 \n\t"
2751 "psrlw $8, %%mm2 \n\t"
2752 "pand %%mm7, %%mm1 \n\t"
2753 "pand %%mm7, %%mm3 \n\t"
2754 "packuswb %%mm2, %%mm0 \n\t"
2755 "packuswb %%mm3, %%mm1 \n\t"
2756 MOVNTQ
" %%mm0,- 7(%3, %0) \n\t"
2757 MOVNTQ
" %%mm1,- 7(%2, %0) \n\t"
2761 : "r"(src
), "r"(dst0
), "r"(dst1
)
2767 dst0
[count
]= src
[4*count
+0];
2768 dst1
[count
]= src
[4*count
+2];
2773 static void RENAME(extract_even2avg
)(const uint8_t *src0
, const uint8_t *src1
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2784 "pcmpeqw %%mm7, %%mm7 \n\t"
2785 "psrlw $8, %%mm7 \n\t"
2787 "movq -28(%1, %0, 4), %%mm0 \n\t"
2788 "movq -20(%1, %0, 4), %%mm1 \n\t"
2789 "movq -12(%1, %0, 4), %%mm2 \n\t"
2790 "movq -4(%1, %0, 4), %%mm3 \n\t"
2791 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2792 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2793 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2794 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2795 "pand %%mm7, %%mm0 \n\t"
2796 "pand %%mm7, %%mm1 \n\t"
2797 "pand %%mm7, %%mm2 \n\t"
2798 "pand %%mm7, %%mm3 \n\t"
2799 "packuswb %%mm1, %%mm0 \n\t"
2800 "packuswb %%mm3, %%mm2 \n\t"
2801 "movq %%mm0, %%mm1 \n\t"
2802 "movq %%mm2, %%mm3 \n\t"
2803 "psrlw $8, %%mm0 \n\t"
2804 "psrlw $8, %%mm2 \n\t"
2805 "pand %%mm7, %%mm1 \n\t"
2806 "pand %%mm7, %%mm3 \n\t"
2807 "packuswb %%mm2, %%mm0 \n\t"
2808 "packuswb %%mm3, %%mm1 \n\t"
2809 MOVNTQ
" %%mm0,- 7(%4, %0) \n\t"
2810 MOVNTQ
" %%mm1,- 7(%3, %0) \n\t"
2814 : "r"(src0
), "r"(src1
), "r"(dst0
), "r"(dst1
)
2820 dst0
[count
]= (src0
[4*count
+0]+src1
[4*count
+0])>>1;
2821 dst1
[count
]= (src0
[4*count
+2]+src1
[4*count
+2])>>1;
2826 static void RENAME(extract_odd2
)(const uint8_t *src
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2836 "pcmpeqw %%mm7, %%mm7 \n\t"
2837 "psrlw $8, %%mm7 \n\t"
2839 "movq -28(%1, %0, 4), %%mm0 \n\t"
2840 "movq -20(%1, %0, 4), %%mm1 \n\t"
2841 "movq -12(%1, %0, 4), %%mm2 \n\t"
2842 "movq -4(%1, %0, 4), %%mm3 \n\t"
2843 "psrlw $8, %%mm0 \n\t"
2844 "psrlw $8, %%mm1 \n\t"
2845 "psrlw $8, %%mm2 \n\t"
2846 "psrlw $8, %%mm3 \n\t"
2847 "packuswb %%mm1, %%mm0 \n\t"
2848 "packuswb %%mm3, %%mm2 \n\t"
2849 "movq %%mm0, %%mm1 \n\t"
2850 "movq %%mm2, %%mm3 \n\t"
2851 "psrlw $8, %%mm0 \n\t"
2852 "psrlw $8, %%mm2 \n\t"
2853 "pand %%mm7, %%mm1 \n\t"
2854 "pand %%mm7, %%mm3 \n\t"
2855 "packuswb %%mm2, %%mm0 \n\t"
2856 "packuswb %%mm3, %%mm1 \n\t"
2857 MOVNTQ
" %%mm0,- 7(%3, %0) \n\t"
2858 MOVNTQ
" %%mm1,- 7(%2, %0) \n\t"
2862 : "r"(src
), "r"(dst0
), "r"(dst1
)
2869 dst0
[count
]= src
[4*count
+0];
2870 dst1
[count
]= src
[4*count
+2];
2875 static void RENAME(extract_odd2avg
)(const uint8_t *src0
, const uint8_t *src1
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2886 "pcmpeqw %%mm7, %%mm7 \n\t"
2887 "psrlw $8, %%mm7 \n\t"
2889 "movq -28(%1, %0, 4), %%mm0 \n\t"
2890 "movq -20(%1, %0, 4), %%mm1 \n\t"
2891 "movq -12(%1, %0, 4), %%mm2 \n\t"
2892 "movq -4(%1, %0, 4), %%mm3 \n\t"
2893 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2894 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2895 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2896 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2897 "psrlw $8, %%mm0 \n\t"
2898 "psrlw $8, %%mm1 \n\t"
2899 "psrlw $8, %%mm2 \n\t"
2900 "psrlw $8, %%mm3 \n\t"
2901 "packuswb %%mm1, %%mm0 \n\t"
2902 "packuswb %%mm3, %%mm2 \n\t"
2903 "movq %%mm0, %%mm1 \n\t"
2904 "movq %%mm2, %%mm3 \n\t"
2905 "psrlw $8, %%mm0 \n\t"
2906 "psrlw $8, %%mm2 \n\t"
2907 "pand %%mm7, %%mm1 \n\t"
2908 "pand %%mm7, %%mm3 \n\t"
2909 "packuswb %%mm2, %%mm0 \n\t"
2910 "packuswb %%mm3, %%mm1 \n\t"
2911 MOVNTQ
" %%mm0,- 7(%4, %0) \n\t"
2912 MOVNTQ
" %%mm1,- 7(%3, %0) \n\t"
2916 : "r"(src0
), "r"(src1
), "r"(dst0
), "r"(dst1
)
2924 dst0
[count
]= (src0
[4*count
+0]+src1
[4*count
+0])>>1;
2925 dst1
[count
]= (src0
[4*count
+2]+src1
[4*count
+2])>>1;
2930 static void RENAME(yuyvtoyuv420
)(uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
, const uint8_t *src
,
2931 long width
, long height
,
2932 long lumStride
, long chromStride
, long srcStride
)
2935 const long chromWidth
= -((-width
)>>1);
2937 for (y
=0; y
<height
; y
++){
2938 RENAME(extract_even
)(src
, ydst
, width
);