/* 44e764e66de2ae3982da6cfa5ad41b28edf00a45 — [libav.git] / postproc / rgb2rgb_template.c (gitweb extraction residue) */
/*
 *
 *  rgb2rgb.c, software RGB to RGB converter
 *  pluralize by software PAL8 to RGB converter
 *  software YUV to YUV converter
 *  software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
14 #ifndef __WORDSIZE
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
17 #endif
18
19 #undef PREFETCH
20 #undef MOVNTQ
21 #undef EMMS
22 #undef SFENCE
23 #undef MMREG_SIZE
24 #undef PREFETCHW
25 #undef PAVGB
26
27 #ifdef HAVE_SSE2
28 #define MMREG_SIZE 16
29 #else
30 #define MMREG_SIZE 8
31 #endif
32
33 #ifdef HAVE_3DNOW
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #define PAVGB "pavgb"
41 #else
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
44 #endif
45
46 #ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48 #define EMMS "femms"
49 #else
50 #define EMMS "emms"
51 #endif
52
53 #ifdef HAVE_MMX2
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
56 #else
57 #define MOVNTQ "movq"
58 #define SFENCE "/nop"
59 #endif
60
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62 {
63 uint8_t *dest = dst;
64 const uint8_t *s = src;
65 const uint8_t *end;
66 #ifdef HAVE_MMX
67 const uint8_t *mm_end;
68 #endif
69 end = s + src_size;
70 #ifdef HAVE_MMX
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
72 mm_end = end - 23;
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
74 while(s < mm_end)
75 {
76 __asm __volatile(
77 PREFETCH" 32%1\n\t"
78 "movd %1, %%mm0\n\t"
79 "punpckldq 3%1, %%mm0\n\t"
80 "movd 6%1, %%mm1\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
93 MOVNTQ" %%mm3, 24%0"
94 :"=m"(*dest)
95 :"m"(*s)
96 :"memory");
97 dest += 32;
98 s += 24;
99 }
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
102 #endif
103 while(s < end)
104 {
105 *dest++ = *s++;
106 *dest++ = *s++;
107 *dest++ = *s++;
108 *dest++ = 0;
109 }
110 }
111
/*
 * 32 bpp -> 24 bpp: drop the pad byte of each 4-byte pixel.
 * The MMX path repacks 8 pixels (32 source bytes -> 24 destination
 * bytes) per iteration using the mask24l/h/hh/hhh/hhhh constants
 * defined elsewhere; the scalar loop copies 3 bytes and skips the 4th
 * for the remaining pixels.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	mm_end = end - 31; /* loop reads 32 source bytes per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 16%1, %%mm4\n\t"
			"movq 24%1, %%mm5\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"
			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			/* shift the four 24-bit pixels together into 3 qwords */
			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: copy 3 bytes, skip the pad byte */
	while(s < end)
	{
		*dest++ = *s++;
		*dest++ = *s++;
		*dest++ = *s++;
		s++;
	}
}
188
189 /*
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
194 */
/*
 * 15 bit (x:5:5:5) -> 16 bit (5:6:5): shift the red and green fields
 * left by one bit, leaving blue in place.
 * Uses the and&add trick visible in the scalar code below:
 * x + (x & 0x7FE07FE0) adds each pixel's r/g bits to themselves, which
 * is a left shift of bits 5..14 by one.  mask15s (defined elsewhere) is
 * presumably the same 0x7FE07FE0 constant -- confirm in the including
 * file.  The scalar loops handle the tail (or everything without MMX).
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s));
	__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
	mm_end = end - 15; /* loop processes 16 bytes (8 pixels) per pass */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm2\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm2, %%mm3\n\t"
			"pand %%mm4, %%mm0\n\t"
			"pand %%mm4, %%mm2\n\t"
			"paddw %%mm1, %%mm0\n\t"
			"paddw %%mm3, %%mm2\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	mm_end = end - 3; /* 32-bit scalar path: 2 pixels at a time */
	while(s < mm_end)
	{
		register unsigned x= *((uint32_t *)s);
		*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
		d+=4;
		s+=4;
	}
	/* at most one 16-bit pixel can remain */
	if(s < end)
	{
		register unsigned short x= *((uint16_t *)s);
		*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
	}
}
243
244 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
245 {
246 unsigned j,i,num_pixels=src_size/3;
247 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
248 {
249 dst[j+0] = src[i+2];
250 dst[j+1] = src[i+1];
251 dst[j+2] = src[i+0];
252 }
253 }
254
/*
 * 16 bit (5:6:5) -> 15 bit (x:5:5:5): shift red and green right by one,
 * discarding the least significant green bit; blue is kept unchanged.
 * The scalar constants show the operation: ((x>>1)&0x7FE07FE0)|(x&0x001F001F).
 * mask15rg / mask15b (defined elsewhere) presumably equal 0x7FE07FE0 and
 * 0x001F001F respectively -- confirm in the including file.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	register const uint8_t* s=src;
	register uint8_t* d=dst;
	register const uint8_t *end;
	const uint8_t *mm_end;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s));
	__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
	__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
	mm_end = end - 15; /* loop processes 16 bytes (8 pixels) per pass */
	while(s<mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm2\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm2, %%mm3\n\t"
			"psrlq $1, %%mm0\n\t"
			"psrlq $1, %%mm2\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm3\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm3, %%mm2\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			);
		d+=16;
		s+=16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	mm_end = end - 3; /* 32-bit scalar path: 2 pixels at a time */
	while(s < mm_end)
	{
		register uint32_t x= *((uint32_t *)s);
		*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
		s+=4;
		d+=4;
	}
	/* at most one 16-bit pixel can remain */
	if(s < end)
	{
		register uint16_t x= *((uint16_t *)s);
		*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
		s+=2;
		d+=2;
	}
}
310
/*
 * 32 bpp -> 16 bpp (5:6:5).  Per the scalar loop, pixels are stored
 * B,G,R,pad in memory; each is packed as b>>3 | (g&0xFC)<<3 | (r&0xF8)<<8.
 * The MMX path packs 4 pixels per iteration using red_16mask /
 * green_16mask / blue_16mask (defined elsewhere); the scalar loop
 * finishes the tail.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 15; /* loop reads 16 source bytes (4 pixels) per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s++; /* skip the pad byte */
	}
}
374
/*
 * 32 bpp -> 16 bpp (5:6:5) with red and blue swapped relative to
 * rgb32to16: per the scalar loop, the FIRST source byte lands in the
 * high (red) field of the output word.
 * The MMX path packs 4 pixels per iteration using the *_16mask
 * constants defined elsewhere; the scalar loop finishes the tail.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 15; /* loop reads 16 source bytes (4 pixels) per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psllq $8, %%mm0\n\t"
			"psllq $8, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $19, %%mm2\n\t"
			"psrlq $19, %%mm5\n\t"
			"pand %2, %%mm2\n\t"
			"pand %2, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s++; /* skip the pad byte */
	}
}
438
/*
 * 32 bpp -> 15 bpp (x:5:5:5).  Per the scalar loop, each B,G,R,pad
 * pixel is packed as b>>3 | (g&0xF8)<<2 | (r&0xF8)<<7.
 * The MMX path packs 4 pixels per iteration using red_15mask /
 * green_15mask / blue_15mask (defined elsewhere); the scalar loop
 * finishes the tail.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 15; /* loop reads 16 source bytes (4 pixels) per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $9, %%mm2\n\t"
			"psrlq $9, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s++; /* skip the pad byte */
	}
}
502
/*
 * 32 bpp -> 15 bpp (x:5:5:5) with red and blue swapped relative to
 * rgb32to15: per the scalar loop, the FIRST source byte lands in the
 * high (red) field of the output word.
 * The MMX path packs 4 pixels per iteration using the *_15mask
 * constants defined elsewhere; the scalar loop finishes the tail.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 15; /* loop reads 16 source bytes (4 pixels) per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psllq $7, %%mm0\n\t"
			"psllq $7, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $19, %%mm2\n\t"
			"psrlq $19, %%mm5\n\t"
			"pand %2, %%mm2\n\t"
			"pand %2, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s++; /* skip the pad byte */
	}
}
566
/*
 * 24 bpp -> 16 bpp (5:6:5).  Per the scalar loop, pixels are stored
 * B,G,R in memory; each is packed as b>>3 | (g&0xFC)<<3 | (r&0xF8)<<8.
 * The MMX path packs 4 pixels (12 source bytes) per iteration using the
 * *_16mask constants defined elsewhere.
 * NOTE(review): "punpckldq 9%1" loads bytes 9..12, i.e. 13 bytes from
 * s, while mm_end = end - 11 only guarantees 12 -- a possible one-byte
 * overread on the final iteration (rgb24tobgr16 uses end - 15 instead).
 * Confirm before relying on exact buffer bounds.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 11; /* loop consumes 12 source bytes per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
629
/*
 * 24 bpp -> 16 bpp (5:6:5) with red and blue swapped relative to
 * rgb24to16: per the scalar loop, the FIRST source byte lands in the
 * high (red) field of the output word.
 * The MMX path packs 4 pixels (12 source bytes) per iteration; note the
 * more conservative mm_end = end - 15 here versus end - 11 in
 * rgb24to16, which leaves a longer scalar tail but avoids the boundary
 * overread of "punpckldq 9%1".
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	mm_end = end - 15;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psllq $8, %%mm0\n\t"
			"psllq $8, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm3\n\t"
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $19, %%mm2\n\t"
			"psrlq $19, %%mm5\n\t"
			"pand %2, %%mm2\n\t"
			"pand %2, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}
692
/*
 * 24 bpp -> 15 bpp (x:5:5:5).  Per the scalar loop, each B,G,R pixel is
 * packed as b>>3 | (g&0xF8)<<2 | (r&0xF8)<<7.
 * The MMX path packs 4 pixels (12 source bytes) per iteration using the
 * *_15mask constants defined elsewhere.
 * NOTE(review): as in rgb24to16, "punpckldq 9%1" reads 13 bytes from s
 * while mm_end = end - 11 only guarantees 12 -- a possible one-byte
 * overread on the final iteration; confirm buffer padding.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 11; /* loop consumes 12 source bytes per pass */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $9, %%mm2\n\t"
			"psrlq $9, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int b= *s++;
		const int g= *s++;
		const int r= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
755
/*
 * 24 bpp -> 15 bpp (x:5:5:5) with red and blue swapped relative to
 * rgb24to15: per the scalar loop, the FIRST source byte lands in the
 * high (red) field of the output word.
 * MMX path packs 4 pixels (12 source bytes) per iteration; uses the
 * conservative mm_end = end - 15 bound (cf. end - 11 in rgb24to15).
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end;
#ifdef HAVE_MMX
	const uint8_t *mm_end;
#endif
	uint16_t *d = (uint16_t *)dst;
	end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	mm_end = end - 15;
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			"psllq $7, %%mm0\n\t"
			"psllq $7, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm3\n\t"
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			"psrlq $19, %%mm2\n\t"
			"psrlq $19, %%mm5\n\t"
			"pand %2, %%mm2\n\t"
			"pand %2, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		const int r= *s++;
		const int g= *s++;
		const int b= *s++;
		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}
818
/*
   Here we use a less accurate approximation: simply left-shift the input
   value and fill the low-order bits with zeroes. This method improves PNG
   compression but cannot reproduce white exactly, since it never generates
   an all-ones maximum value; the net effect is to darken the image slightly.

   The more accurate method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3 2 1 0
   ----------------
   1 1 0 1 1 1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
/*
 * 15 bpp (x:5:5:5) -> 24 bpp.  Each 5-bit field is expanded by a left
 * shift into an 8-bit channel (low bits zero -- see the accuracy note
 * above).  Scalar tail emits B, G, R bytes per pixel.
 * The MMX path expands 8 pixels per iteration: the first asm statement
 * splats the fields into byte-per-channel dwords, the second repacks
 * 32-bit to 24-bit ("Borrowed 32 to 24").
 * NOTE(review): the first asm block leaves its results in mm0-mm7 for
 * the second asm statement to consume; this relies on the compiler not
 * emitting MMX-clobbering code between the two statements -- fragile
 * but evidently intended.  Confirm before reorganizing.
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	mm_end = end - 7; /* loop consumes 8 pixels (16 bytes) per pass */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $2, %%mm1\n\t"
		"psrlq $7, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"

		"movq %%mm0, %%mm6\n\t"
		"movq %%mm3, %%mm7\n\t"

		"movq 8%1, %%mm0\n\t"
		"movq 8%1, %%mm1\n\t"
		"movq 8%1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $2, %%mm1\n\t"
		"psrlq $7, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"

		:"=m"(*d)
		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
		:"memory");
	    /* Borrowed 32 to 24 */
	    __asm __volatile(
		"movq %%mm0, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"movq %%mm6, %%mm0\n\t"
		"movq %%mm7, %%mm1\n\t"

		"movq %%mm4, %%mm6\n\t"
		"movq %%mm5, %%mm7\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm1, %%mm3\n\t"

		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm3\n\t"
		"psrlq $8, %%mm6\n\t"
		"psrlq $8, %%mm7\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm1\n\t"
		"pand %2, %%mm4\n\t"
		"pand %2, %%mm5\n\t"
		"pand %3, %%mm2\n\t"
		"pand %3, %%mm3\n\t"
		"pand %3, %%mm6\n\t"
		"pand %3, %%mm7\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm3, %%mm1\n\t"
		"por %%mm6, %%mm4\n\t"
		"por %%mm7, %%mm5\n\t"

		"movq %%mm1, %%mm2\n\t"
		"movq %%mm4, %%mm3\n\t"
		"psllq $48, %%mm2\n\t"
		"psllq $32, %%mm3\n\t"
		"pand %4, %%mm2\n\t"
		"pand %5, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"psrlq $16, %%mm1\n\t"
		"psrlq $32, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm3, %%mm1\n\t"
		"pand %6, %%mm5\n\t"
		"por %%mm5, %%mm4\n\t"

		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm4, 16%0"

		:"=m"(*d)
		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
		:"memory");
	    d += 24;
	    s += 8;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;    /* blue:  5 bits -> high bits of byte */
		*d++ = (bgr&0x3E0)>>2;   /* green */
		*d++ = (bgr&0x7C00)>>7;  /* red */
	}
}
983
/*
 * 16 bpp (5:6:5) -> 24 bpp.  Like rgb15to24 but with 6-bit green (note
 * the different shifts: psrlq $3/$8 versus $2/$7 and the mask16*
 * constants).  The scalar tail emits B, G, R bytes per pixel.
 * NOTE(review): same cross-asm-statement register handoff as rgb15to24
 * (first asm leaves values in mm0-mm7 for the second) -- fragile but
 * evidently intended.
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	mm_end = end - 7; /* loop consumes 8 pixels (16 bytes) per pass */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"

		"movq %%mm0, %%mm6\n\t"
		"movq %%mm3, %%mm7\n\t"

		"movq 8%1, %%mm0\n\t"
		"movq 8%1, %%mm1\n\t"
		"movq 8%1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %5, %%mm0\n\t"
		"punpcklwd %5, %%mm1\n\t"
		"punpcklwd %5, %%mm2\n\t"
		"punpckhwd %5, %%mm3\n\t"
		"punpckhwd %5, %%mm4\n\t"
		"punpckhwd %5, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
		:"memory");
	    /* Borrowed 32 to 24 */
	    __asm __volatile(
		"movq %%mm0, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"movq %%mm6, %%mm0\n\t"
		"movq %%mm7, %%mm1\n\t"

		"movq %%mm4, %%mm6\n\t"
		"movq %%mm5, %%mm7\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm1, %%mm3\n\t"

		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm3\n\t"
		"psrlq $8, %%mm6\n\t"
		"psrlq $8, %%mm7\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm1\n\t"
		"pand %2, %%mm4\n\t"
		"pand %2, %%mm5\n\t"
		"pand %3, %%mm2\n\t"
		"pand %3, %%mm3\n\t"
		"pand %3, %%mm6\n\t"
		"pand %3, %%mm7\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm3, %%mm1\n\t"
		"por %%mm6, %%mm4\n\t"
		"por %%mm7, %%mm5\n\t"

		"movq %%mm1, %%mm2\n\t"
		"movq %%mm4, %%mm3\n\t"
		"psllq $48, %%mm2\n\t"
		"psllq $32, %%mm3\n\t"
		"pand %4, %%mm2\n\t"
		"pand %5, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"psrlq $16, %%mm1\n\t"
		"psrlq $32, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm3, %%mm1\n\t"
		"pand %6, %%mm5\n\t"
		"por %%mm5, %%mm4\n\t"

		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm4, 16%0"

		:"=m"(*d)
		:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
		:"memory");
	    d += 24;
	    s += 8;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;    /* blue:  5 bits */
		*d++ = (bgr&0x7E0)>>3;   /* green: 6 bits */
		*d++ = (bgr&0xF800)>>8;  /* red:   5 bits */
	}
}
1124
/*
 * 15 bpp (x:5:5:5) -> 32 bpp: expand each field by left shift and
 * append a zero pad byte.  MMX path unpacks 4 pixels per iteration
 * (mm7 is zeroed for the punpck*wd interleave); the scalar tail emits
 * B, G, R, 0 per pixel.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
	mm_end = end - 3; /* loop consumes 4 pixels (8 bytes) per pass */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $2, %%mm1\n\t"
		"psrlq $7, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %%mm7, %%mm0\n\t"
		"punpcklwd %%mm7, %%mm1\n\t"
		"punpcklwd %%mm7, %%mm2\n\t"
		"punpckhwd %%mm7, %%mm3\n\t"
		"punpckhwd %%mm7, %%mm4\n\t"
		"punpckhwd %%mm7, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm3, 8%0\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
		:"memory");
	    d += 16;
	    s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;    /* blue */
		*d++ = (bgr&0x3E0)>>2;   /* green */
		*d++ = (bgr&0x7C00)>>7;  /* red */
		*d++ = 0;                /* pad byte */
	}
}
1189
/*
 * 16 bpp (5:6:5) -> 32 bpp: expand each field by shift and append a
 * zero pad byte.  Like rgb15to32 but with 6-bit green (shifts $3/$8 and
 * the mask16* constants).  Scalar tail emits B, G, R, 0 per pixel.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
	mm_end = end - 3; /* loop consumes 4 pixels (8 bytes) per pass */
	while(s < mm_end)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq %1, %%mm1\n\t"
		"movq %1, %%mm2\n\t"
		"pand %2, %%mm0\n\t"
		"pand %3, %%mm1\n\t"
		"pand %4, %%mm2\n\t"
		"psllq $3, %%mm0\n\t"
		"psrlq $3, %%mm1\n\t"
		"psrlq $8, %%mm2\n\t"
		"movq %%mm0, %%mm3\n\t"
		"movq %%mm1, %%mm4\n\t"
		"movq %%mm2, %%mm5\n\t"
		"punpcklwd %%mm7, %%mm0\n\t"
		"punpcklwd %%mm7, %%mm1\n\t"
		"punpcklwd %%mm7, %%mm2\n\t"
		"punpckhwd %%mm7, %%mm3\n\t"
		"punpckhwd %%mm7, %%mm4\n\t"
		"punpckhwd %%mm7, %%mm5\n\t"
		"psllq $8, %%mm1\n\t"
		"psllq $16, %%mm2\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm2, %%mm0\n\t"
		"psllq $8, %%mm4\n\t"
		"psllq $16, %%mm5\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm5, %%mm3\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm3, 8%0\n\t"
		:"=m"(*d)
		:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
		:"memory");
	    d += 16;
	    s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
		*d++ = (bgr&0x1F)<<3;    /* blue */
		*d++ = (bgr&0x7E0)>>3;   /* green */
		*d++ = (bgr&0xF800)>>8;  /* red */
		*d++ = 0;                /* pad byte */
	}
}
1254
/*
 * 32 bpp: swap the first and third byte of every 4-byte pixel
 * (B,G,R,X <-> R,G,B,X).  MMX path: pslld/psrld by 16 moves each
 * pixel's low and high channel across, masked by mask32r/g/b (defined
 * elsewhere); green and the pad byte pass through via mask32g.
 * NOTE(review): the MMX loop bound is src_size-7 with an 8-byte step,
 * so if src_size is not a multiple of 8 the last few bytes are never
 * converted, and src_size < 8 makes the unsigned bound wrap (massive
 * overrun).  Confirm callers always pass a multiple of 8 bytes.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
	asm volatile (
		"xorl %%eax, %%eax \n\t"
		".balign 16 \n\t"
		"1: \n\t"
		PREFETCH" 32(%0, %%eax) \n\t"
		"movq (%0, %%eax), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"pslld $16, %%mm0 \n\t"
		"psrld $16, %%mm1 \n\t"
		"pand "MANGLE(mask32r)", %%mm0 \n\t"
		"pand "MANGLE(mask32g)", %%mm2 \n\t"
		"pand "MANGLE(mask32b)", %%mm1 \n\t"
		"por %%mm0, %%mm2 \n\t"
		"por %%mm1, %%mm2 \n\t"
		MOVNTQ" %%mm2, (%1, %%eax) \n\t"
		"addl $8, %%eax \n\t"
		"cmpl %2, %%eax \n\t"
		" jb 1b \n\t"
		:: "r" (src), "r"(dst), "r" (src_size-7)
		: "%eax"
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#else
	unsigned i;
	unsigned num_pixels = src_size >> 2;
	for(i=0; i<num_pixels; i++)
	{
		dst[4*i + 0] = src[4*i + 2]; /* swap first and third byte */
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
		/* byte 3 (pad/alpha) is left unwritten in the C path */
	}
#endif
}
1295
/*
 * Swaps the R and B channels of packed 24bpp pixels.  src_size is in bytes.
 * The MMX path processes 8 pixels (24 bytes) per iteration using a negative
 * index counted up toward zero; the scalar loop finishes any remainder.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	/* negative loop counter: starts at 23-src_size and is stepped by +24
	   until it becomes non-negative ("js 1b" below) */
	int mmx_size= 23 - src_size;
	asm volatile (
		"movq "MANGLE(mask24r)", %%mm5	\n\t"
		"movq "MANGLE(mask24g)", %%mm6	\n\t"
		"movq "MANGLE(mask24b)", %%mm7	\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%1, %%eax)		\n\t"
		"movq (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		"movq (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
		"movq 2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
		"psllq $16, %%mm0		\n\t" // 00 BGR BGR
		"pand %%mm5, %%mm0		\n\t"
		"pand %%mm6, %%mm1		\n\t"
		"pand %%mm7, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		MOVNTQ" %%mm1, (%2, %%eax)	\n\t" // RGB RGB RG
		"movq 8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
		"movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
		"pand %%mm7, %%mm0		\n\t"
		"pand %%mm5, %%mm1		\n\t"
		"pand %%mm6, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
		MOVNTQ" %%mm1, 8(%2, %%eax)	\n\t" // B RGB RGB R
		"movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
		"movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
		"pand %%mm6, %%mm0		\n\t"
		"pand %%mm7, %%mm1		\n\t"
		"pand %%mm5, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
		"addl $24, %%eax		\n\t"
		" js 1b				\n\t"
		: "+a" (mmx_size)
		: "r" (src-mmx_size), "r"(dst-mmx_size)
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");

	if(mmx_size==23) return; //finished, was multiple of 8

	/* rewind src/dst to the first pixel the MMX loop did not cover */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	for(i=0; i<src_size; i+=3)
	{
		register uint8_t x;
		x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1362
/*
 * Interleaves planar Y/U/V into packed YUY2 (Y0 U0 Y1 V0 ...).
 * vertLumPerChroma = number of luma lines sharing one chroma line (2 for
 * YV12, 1 for YUV422P); it must be a power of two (the advance test below
 * uses a bitmask).  width should be even; the MMX path consumes 16 luma
 * pixels per iteration.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			: "%eax"
		);
#else
#if __WORDSIZE >= 64
		/* pack two YUYV pixel pairs into one 64-bit store */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
			l = yc[2] + (uc[1] << 8) +
			    (yc[3] << 16) + (vc[1] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		/* one 32-bit store per Y-pair: Y|U<<8|Y<<16|V<<24 */
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
			*idst++ = yc[0] + (uc[0] << 8) +
				  (yc[1] << 16) + (vc[0] << 24);
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma pointers only every vertLumPerChroma luma lines */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1449
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	/* YV12 has one chroma line per two luma lines, hence vertLumPerChroma=2;
	   each chroma line is simply reused for both luma lines. */
	//FIXME interpolate chroma
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1462
1463 /**
1464 *
1465 * width should be a multiple of 16
1466 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	/* 4:2:2 planar: one chroma line per luma line, so vertLumPerChroma=1. */
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1473
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
/*
 * De-interleaves packed YUY2 into planar YV12.  Processes two source lines
 * per outer iteration: the first line yields Y plus the (sub-sampled) U/V
 * planes, the second line yields Y only — chroma of the second line is
 * dropped (see the header comment above this function).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* second line: Y only; relies on %%mm7 still holding the
		   00FF byte mask set up by the previous asm statement */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* second line: keep luma only, discard its chroma */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1598
1599 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1600 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1601 unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1602 {
1603 /* Y Plane */
1604 memcpy(ydst, ysrc, width*height);
1605
1606 /* XXX: implement upscaling for U,V */
1607 }
1608
/*
 * Doubles a plane in both dimensions (2x upscale).  The first and last
 * output line pairs replicate the edge source lines; interior output pixels
 * are interpolated with 3:1 weights between the diagonal source neighbours
 * (see the C fallback for the exact formula).
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;

	// first line
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* NOTE(review): negative-index loop assumes srcWidth is a
		   multiple of 8 — confirm callers guarantee this */
		const int mmxSize= srcWidth;
		asm volatile(
			"movl %4, %%eax			\n\t"
			"1:				\n\t"
			"movq (%0, %%eax), %%mm0	\n\t"
			"movq (%1, %%eax), %%mm1	\n\t"
			"movq 1(%0, %%eax), %%mm2	\n\t"
			"movq 1(%1, %%eax), %%mm3	\n\t"
			"movq %%mm0, %%mm4		\n\t"
			"movq %%mm1, %%mm5		\n\t"
			/* applying PAVGB twice with the same operand approximates
			   the (3*a + b)>>2 weighting of the C code below */
			PAVGB" %%mm3, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm0		\n\t"
			PAVGB" %%mm4, %%mm3		\n\t"
			PAVGB" %%mm4, %%mm3		\n\t"
			PAVGB" %%mm2, %%mm1		\n\t"
			PAVGB" %%mm2, %%mm1		\n\t"
			PAVGB" %%mm5, %%mm2		\n\t"
			PAVGB" %%mm5, %%mm2		\n\t"
			"movq %%mm3, %%mm4		\n\t"
			"movq %%mm2, %%mm5		\n\t"
			/* interleave to produce the two doubled output lines */
			"punpcklbw %%mm1, %%mm3		\n\t"
			"punpckhbw %%mm1, %%mm4		\n\t"
			"punpcklbw %%mm0, %%mm2		\n\t"
			"punpckhbw %%mm0, %%mm5		\n\t"
#if 1
			MOVNTQ" %%mm3, (%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm4, 8(%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, (%3, %%eax, 2)	\n\t"
			MOVNTQ" %%mm5, 8(%3, %%eax, 2)	\n\t"
#else
			"movq %%mm3, (%2, %%eax, 2)	\n\t"
			"movq %%mm4, 8(%2, %%eax, 2)	\n\t"
			"movq %%mm2, (%3, %%eax, 2)	\n\t"
			"movq %%mm5, 8(%3, %%eax, 2)	\n\t"
#endif
			"addl $8, %%eax			\n\t"
			" js 1b				\n\t"
			:: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"

		);
		/* leftmost column cannot be interpolated: replicate */
		dst[0]=
		dst[dstStride]= src[0];
#else
		dst[0]=
		dst[dstStride]= src[0];

		/* 3:1 weighted bilinear-ish interpolation of interior pixels */
		for(x=0; x<srcWidth-1; x++){
			dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
			dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
		}
#endif
		/* rightmost column: replicate */
		dst[srcWidth*2 -1]=
		dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

		dst+=dstStride*2;
		src+=srcStride;
	}
	src-=srcStride;

	// last line
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1697
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
 */
/*
 * De-interleaves packed UYVY into planar YV12.  Mirror of yuy2toyv12 with
 * the byte roles swapped: in UYVY the chroma bytes are the even (low)
 * bytes and luma the odd (high) bytes, hence pand/psrlw are exchanged
 * relative to the YUY2 version.  Chroma of every second line is dropped.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2		\n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0		\n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4		\n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* second line: Y only (high bytes of each word in UYVY) */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // UYVY UYVY(12)
			"psrlw $8, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			udst[i] 	= src[4*i+0];
			ydst[2*i+0] 	= src[4*i+1];
			vdst[i] 	= src[4*i+2];
			ydst[2*i+1] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* second line: keep luma only */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1823
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
 */
/*
 * Converts packed BGR24 to planar YV12.  The MMX path handles all but the
 * last two lines (pairs of lines: two Y-only passes, then one U/V pass that
 * averages the two lines); the C loop below finishes the remaining lines
 * (and does everything when MMX is unavailable).
 * NOTE(review): height is unsigned, so height<2 would make height-2 wrap
 * around in the MMX loop bound — presumably callers guarantee height>=2.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
	for(y=0; y<height-2; y+=2)
	{
		unsigned i;
		/* two luma-only passes, one per source line of the pair */
		for(i=0; i<2; i++)
		{
			asm volatile(
				"movl %2, %%eax			\n\t"
				"movq "MANGLE(bgr2YCoeff)", %%mm6		\n\t"
				"movq "MANGLE(w1111)", %%mm5		\n\t"
				"pxor %%mm7, %%mm7		\n\t"
				/* ebx = 3*eax: byte index into the 24bpp source */
				"leal (%%eax, %%eax, 2), %%ebx	\n\t"
				".balign 16			\n\t"
				"1:				\n\t"
				PREFETCH" 64(%0, %%ebx)		\n\t"
				"movd (%0, %%ebx), %%mm0	\n\t"
				"movd 3(%0, %%ebx), %%mm1	\n\t"
				"punpcklbw %%mm7, %%mm0		\n\t"
				"punpcklbw %%mm7, %%mm1		\n\t"
				"movd 6(%0, %%ebx), %%mm2	\n\t"
				"movd 9(%0, %%ebx), %%mm3	\n\t"
				"punpcklbw %%mm7, %%mm2		\n\t"
				"punpcklbw %%mm7, %%mm3		\n\t"
				"pmaddwd %%mm6, %%mm0		\n\t"
				"pmaddwd %%mm6, %%mm1		\n\t"
				"pmaddwd %%mm6, %%mm2		\n\t"
				"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
				"psrad $8, %%mm0		\n\t"
				"psrad $8, %%mm1		\n\t"
				"psrad $8, %%mm2		\n\t"
				"psrad $8, %%mm3		\n\t"
#endif
				"packssdw %%mm1, %%mm0		\n\t"
				"packssdw %%mm3, %%mm2		\n\t"
				"pmaddwd %%mm5, %%mm0		\n\t"
				"pmaddwd %%mm5, %%mm2		\n\t"
				"packssdw %%mm2, %%mm0		\n\t"
				"psraw $7, %%mm0		\n\t"

				"movd 12(%0, %%ebx), %%mm4	\n\t"
				"movd 15(%0, %%ebx), %%mm1	\n\t"
				"punpcklbw %%mm7, %%mm4		\n\t"
				"punpcklbw %%mm7, %%mm1		\n\t"
				"movd 18(%0, %%ebx), %%mm2	\n\t"
				"movd 21(%0, %%ebx), %%mm3	\n\t"
				"punpcklbw %%mm7, %%mm2		\n\t"
				"punpcklbw %%mm7, %%mm3		\n\t"
				"pmaddwd %%mm6, %%mm4		\n\t"
				"pmaddwd %%mm6, %%mm1		\n\t"
				"pmaddwd %%mm6, %%mm2		\n\t"
				"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
				"psrad $8, %%mm4		\n\t"
				"psrad $8, %%mm1		\n\t"
				"psrad $8, %%mm2		\n\t"
				"psrad $8, %%mm3		\n\t"
#endif
				"packssdw %%mm1, %%mm4		\n\t"
				"packssdw %%mm3, %%mm2		\n\t"
				"pmaddwd %%mm5, %%mm4		\n\t"
				"pmaddwd %%mm5, %%mm2		\n\t"
				"addl $24, %%ebx		\n\t"
				"packssdw %%mm2, %%mm4		\n\t"
				"psraw $7, %%mm4		\n\t"

				"packuswb %%mm4, %%mm0		\n\t"
				"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"

				MOVNTQ" %%mm0, (%1, %%eax)	\n\t"
				"addl $8, %%eax			\n\t"
				" js 1b				\n\t"
				: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
				: "%eax", "%ebx"
			);
			ydst += lumStride;
			src  += srcStride;
		}
		src -= srcStride*2;
		/* chroma pass: averages the two source lines, then 2:1
		   horizontally, and produces one U and one V line */
		asm volatile(
			"movl %4, %%eax			\n\t"
			"movq "MANGLE(w1111)", %%mm5		\n\t"
			"movq "MANGLE(bgr2UCoeff)", %%mm6		\n\t"
			"pxor %%mm7, %%mm7		\n\t"
			"leal (%%eax, %%eax, 2), %%ebx	\n\t"
			"addl %%ebx, %%ebx		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%ebx)		\n\t"
			PREFETCH" 64(%1, %%ebx)		\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
			"movq (%0, %%ebx), %%mm0	\n\t"
			"movq (%1, %%ebx), %%mm1	\n\t"
			"movq 6(%0, %%ebx), %%mm2	\n\t"
			"movq 6(%1, %%ebx), %%mm3	\n\t"
			PAVGB" %%mm1, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"psrlq $24, %%mm0		\n\t"
			"psrlq $24, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
#else
			"movd (%0, %%ebx), %%mm0	\n\t"
			"movd (%1, %%ebx), %%mm1	\n\t"
			"movd 3(%0, %%ebx), %%mm2	\n\t"
			"movd 3(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm0		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm2, %%mm0		\n\t"
			"movd 6(%0, %%ebx), %%mm4	\n\t"
			"movd 6(%1, %%ebx), %%mm1	\n\t"
			"movd 9(%0, %%ebx), %%mm2	\n\t"
			"movd 9(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm4		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm4, %%mm2		\n\t"
			"psrlw $2, %%mm0		\n\t"
			"psrlw $2, %%mm2		\n\t"
#endif
			"movq "MANGLE(bgr2VCoeff)", %%mm1		\n\t"
			"movq "MANGLE(bgr2VCoeff)", %%mm3		\n\t"

			"pmaddwd %%mm0, %%mm1		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm1		\n\t"
			"psrad $8, %%mm2		\n\t"
			"psrad $8, %%mm3		\n\t"
#endif
			"packssdw %%mm2, %%mm0		\n\t"
			"packssdw %%mm3, %%mm1		\n\t"
			"pmaddwd %%mm5, %%mm0		\n\t"
			"pmaddwd %%mm5, %%mm1		\n\t"
			"packssdw %%mm1, %%mm0		\n\t" // V1 V0 U1 U0
			"psraw $7, %%mm0		\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
			"movq 12(%0, %%ebx), %%mm4	\n\t"
			"movq 12(%1, %%ebx), %%mm1	\n\t"
			"movq 18(%0, %%ebx), %%mm2	\n\t"
			"movq 18(%1, %%ebx), %%mm3	\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"movq %%mm4, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"psrlq $24, %%mm4		\n\t"
			"psrlq $24, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
#else
			"movd 12(%0, %%ebx), %%mm4	\n\t"
			"movd 12(%1, %%ebx), %%mm1	\n\t"
			"movd 15(%0, %%ebx), %%mm2	\n\t"
			"movd 15(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm4		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm2, %%mm4		\n\t"
			"movd 18(%0, %%ebx), %%mm5	\n\t"
			"movd 18(%1, %%ebx), %%mm1	\n\t"
			"movd 21(%0, %%ebx), %%mm2	\n\t"
			"movd 21(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm5		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm5		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm5, %%mm2		\n\t"
			"movq "MANGLE(w1111)", %%mm5		\n\t"
			"psrlw $2, %%mm4		\n\t"
			"psrlw $2, %%mm2		\n\t"
#endif
			"movq "MANGLE(bgr2VCoeff)", %%mm1		\n\t"
			"movq "MANGLE(bgr2VCoeff)", %%mm3		\n\t"

			"pmaddwd %%mm4, %%mm1		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm1		\n\t"
			"psrad $8, %%mm2		\n\t"
			"psrad $8, %%mm3		\n\t"
#endif
			"packssdw %%mm2, %%mm4		\n\t"
			"packssdw %%mm3, %%mm1		\n\t"
			"pmaddwd %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm5, %%mm1		\n\t"
			"addl $24, %%ebx		\n\t"
			"packssdw %%mm1, %%mm4		\n\t" // V3 V2 U3 U2
			"psraw $7, %%mm4		\n\t"

			"movq %%mm0, %%mm1		\n\t"
			"punpckldq %%mm4, %%mm0		\n\t"
			"punpckhdq %%mm4, %%mm1		\n\t"
			"packsswb %%mm1, %%mm0		\n\t"
			"paddb "MANGLE(bgr2UVOffset)", %%mm0	\n\t"

			"movd %%mm0, (%2, %%eax)	\n\t"
			"punpckhdq %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%3, %%eax)	\n\t"
			"addl $4, %%eax			\n\t"
			" js 1b				\n\t"
			: : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
			: "%eax", "%ebx"
		);

		udst += chromStride;
		vdst += chromStride;
		src  += srcStride*2;
	}

	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#else
	y=0;
#endif
	/* scalar fallback / tail: integer BT-style matrix from the RGB2YUV_*
	   coefficient macros; chroma only from the first line of each pair */
	for(; y<height; y+=2)
	{
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			unsigned int b= src[6*i+0];
			unsigned int g= src[6*i+1];
			unsigned int r= src[6*i+2];

			unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
			unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

			udst[i] 	= U;
			vdst[i] 	= V;
			ydst[2*i] 	= Y;

			b= src[6*i+3];
			g= src[6*i+4];
			r= src[6*i+5];

			Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			ydst[2*i+1] 	= Y;
		}
		ydst += lumStride;
		src  += srcStride;

		for(i=0; i<chromWidth; i++)
		{
			unsigned int b= src[6*i+0];
			unsigned int g= src[6*i+1];
			unsigned int r= src[6*i+2];

			unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

			ydst[2*i] 	= Y;

			b= src[6*i+3];
			g= src[6*i+4];
			r= src[6*i+5];

			Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			ydst[2*i+1] 	= Y;
		}
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
}
2126
/*
 * Interleaves two byte planes into one: dest[2k]=src1[k], dest[2k+1]=src2[k]
 * for each of `height` rows.  Public (non-static) entry point.
 * NOTE(review): the SSE2 path uses movdqa (aligned loads) on src1/src2 —
 * assumes both are 16-byte aligned; confirm callers.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
			    unsigned width, unsigned height, unsigned src1Stride,
			    unsigned src2Stride, unsigned dstStride){
	unsigned h;

	for(h=0; h < height; h++)
	{
		unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
		asm(
			"xorl %%eax, %%eax		\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%eax)		\n\t"
			PREFETCH" 64(%2, %%eax)		\n\t"
			/* same 16 bytes twice: low half interleaved in xmm0,
			   high half in xmm1 */
			"movdqa (%1, %%eax), %%xmm0	\n\t"
			"movdqa (%1, %%eax), %%xmm1	\n\t"
			"movdqa (%2, %%eax), %%xmm2	\n\t"
			"punpcklbw %%xmm2, %%xmm0	\n\t"
			"punpckhbw %%xmm2, %%xmm1	\n\t"
			"movntdq %%xmm0, (%0, %%eax, 2)	\n\t"
			"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
			"addl $16, %%eax			\n\t"
			"cmpl %3, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#else
		asm(
			"xorl %%eax, %%eax		\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%eax)		\n\t"
			PREFETCH" 64(%2, %%eax)		\n\t"
			"movq (%1, %%eax), %%mm0	\n\t"
			"movq 8(%1, %%eax), %%mm2	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"movq (%2, %%eax), %%mm4	\n\t"
			"movq 8(%2, %%eax), %%mm5	\n\t"
			"punpcklbw %%mm4, %%mm0		\n\t"
			"punpckhbw %%mm4, %%mm1		\n\t"
			"punpcklbw %%mm5, %%mm2		\n\t"
			"punpckhbw %%mm5, %%mm3		\n\t"
			MOVNTQ" %%mm0, (%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm1, 8(%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 16(%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm3, 24(%0, %%eax, 2)	\n\t"
			"addl $16, %%eax			\n\t"
			"cmpl %3, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#endif
		/* scalar tail: the asm handled width rounded down to 16 */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
		);
#endif
}
2207
/*
 * Upscales two chroma planes from 1/4 to 1/2 resolution (VU9 -> VU12):
 * each source line is used for two destination lines and every source
 * byte is doubled horizontally (nearest-neighbour 2x in both directions).
 * NOTE(review): the MMX inner loops step x by 32 without a tail guard, so
 * when w is not a multiple of 32 they read/write up to 31 bytes past w —
 * presumably the buffers are padded; confirm.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
			uint8_t *dst1, uint8_t *dst2,
			unsigned width, unsigned height,
			unsigned srcStride1, unsigned srcStride2,
			unsigned dstStride1, unsigned dstStride2)
{
    unsigned y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
	PREFETCH" %0\n\t"
	PREFETCH" %1\n\t"
	::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* first plane */
    for(y=0;y<h;y++){
	const uint8_t* s1=src1+srcStride1*(y>>1);
	uint8_t* d=dst1+dstStride1*y;
	x=0;
#ifdef HAVE_MMX
	if(w > 32)
	for(;x<w;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
	        "movq	%1, %%mm0\n\t"
	        "movq	8%1, %%mm2\n\t"
	        "movq	16%1, %%mm4\n\t"
	        "movq	24%1, %%mm6\n\t"
	        "movq	%%mm0, %%mm1\n\t"
	        "movq	%%mm2, %%mm3\n\t"
	        "movq	%%mm4, %%mm5\n\t"
	        "movq	%%mm6, %%mm7\n\t"
		/* unpack each byte with itself: horizontal 2x duplication */
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm2, 16%0\n\t"
		MOVNTQ"	%%mm3, 24%0\n\t"
		MOVNTQ"	%%mm4, 32%0\n\t"
		MOVNTQ"	%%mm5, 40%0\n\t"
		MOVNTQ"	%%mm6, 48%0\n\t"
		MOVNTQ"	%%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s1[x])
		:"memory");
	}
#endif
	for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: identical processing */
    for(y=0;y<h;y++){
	const uint8_t* s2=src2+srcStride2*(y>>1);
	uint8_t* d=dst2+dstStride2*y;
	x=0;
#ifdef HAVE_MMX
	if(w > 32)
	for(;x<w;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
	        "movq	%1, %%mm0\n\t"
	        "movq	8%1, %%mm2\n\t"
	        "movq	16%1, %%mm4\n\t"
	        "movq	24%1, %%mm6\n\t"
	        "movq	%%mm0, %%mm1\n\t"
	        "movq	%%mm2, %%mm3\n\t"
	        "movq	%%mm4, %%mm5\n\t"
	        "movq	%%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm1, 8%0\n\t"
		MOVNTQ"	%%mm2, 16%0\n\t"
		MOVNTQ"	%%mm3, 24%0\n\t"
		MOVNTQ"	%%mm4, 32%0\n\t"
		MOVNTQ"	%%mm5, 40%0\n\t"
		MOVNTQ"	%%mm6, 48%0\n\t"
		MOVNTQ"	%%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s2[x])
		:"memory");
	}
#endif
	for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
		);
#endif
}
2312
2313 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2314 uint8_t *dst,
2315 unsigned width, unsigned height,
2316 unsigned srcStride1, unsigned srcStride2,
2317 unsigned srcStride3, unsigned dstStride)
2318 {
2319 unsigned y,x,x2,w,h;
2320 w=width/2; h=height;
2321 #ifdef HAVE_MMX
2322 asm volatile(
2323 PREFETCH" %0\n\t"
2324 PREFETCH" %1\n\t"
2325 PREFETCH" %2\n\t"
2326 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2327 #endif
2328 for(y=0;y<h;y++){
2329 const uint8_t* yp=src1+srcStride1*y;
2330 const uint8_t* up=src2+srcStride2*(y>>2);
2331 const uint8_t* vp=src3+srcStride3*(y>>2);
2332 uint8_t* d=dst+dstStride*y;
2333 x2=0;
2334 x=0;
2335 #ifdef HAVE_MMX
2336 for(;x<w;x+=8,x2+=32)
2337 {
2338 asm volatile(
2339 PREFETCH" 32%1\n\t"
2340 PREFETCH" 32%2\n\t"
2341 PREFETCH" 32%3\n\t"
2342 "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2343 "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2344 "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2345 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2346 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2347 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2348 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2349 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2350 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2351 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2352
2353 "movq %%mm1, %%mm6\n\t"
2354 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2355 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2356 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2357 MOVNTQ" %%mm0, %0\n\t"
2358 MOVNTQ" %%mm3, 8%0\n\t"
2359
2360 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2361 "movq 8%1, %%mm0\n\t"
2362 "movq %%mm0, %%mm3\n\t"
2363 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2364 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2365 MOVNTQ" %%mm0, 16%0\n\t"
2366 MOVNTQ" %%mm3, 24%0\n\t"
2367
2368 "movq %%mm4, %%mm6\n\t"
2369 "movq 16%1, %%mm0\n\t"
2370 "movq %%mm0, %%mm3\n\t"
2371 "punpcklbw %%mm5, %%mm4\n\t"
2372 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2373 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2374 MOVNTQ" %%mm0, 32%0\n\t"
2375 MOVNTQ" %%mm3, 40%0\n\t"
2376
2377 "punpckhbw %%mm5, %%mm6\n\t"
2378 "movq 24%1, %%mm0\n\t"
2379 "movq %%mm0, %%mm3\n\t"
2380 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2381 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2382 MOVNTQ" %%mm0, 48%0\n\t"
2383 MOVNTQ" %%mm3, 56%0\n\t"
2384
2385 :"=m"(d[8*x])
2386 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
2387 :"memory");
2388 }
2389 #endif
2390 for(;x<w;x++,x2+=4)
2391 {
2392 d[8*x+0]=yp[x2];
2393 d[8*x+1]=up[x];
2394 d[8*x+2]=yp[x2+1];
2395 d[8*x+3]=vp[x];
2396 d[8*x+4]=yp[x2+2];
2397 d[8*x+5]=up[x];
2398 d[8*x+6]=yp[x2+3];
2399 d[8*x+7]=vp[x];
2400 }
2401 }
2402 #ifdef HAVE_MMX
2403 asm(
2404 EMMS" \n\t"
2405 SFENCE" \n\t"
2406 ::: "memory"
2407 );
2408 #endif
2409 }