lot of bigendian fixes
[libav.git] / postproc / rgb2rgb_template.c
1 /*
2 *
 * rgb2rgb.c, Software RGB to RGB converter
 * pluralize by Software PAL8 to RGB converter
 * Software YUV to YUV converter
 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
10 */
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #ifndef __WORDSIZE
16 // #warning You have misconfigured system and probably will lose performance!
17 #define __WORDSIZE MP_WORDSIZE
18 #endif
19
20 #undef PREFETCH
21 #undef MOVNTQ
22 #undef EMMS
23 #undef SFENCE
24 #undef MMREG_SIZE
25 #undef PREFETCHW
26 #undef PAVGB
27
28 #ifdef HAVE_SSE2
29 #define MMREG_SIZE 16
30 #else
31 #define MMREG_SIZE 8
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
41 #define PAVGB "pavgb"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46
47 #ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
49 #define EMMS "femms"
50 #else
51 #define EMMS "emms"
52 #endif
53
54 #ifdef HAVE_MMX2
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
57 #else
58 #define MOVNTQ "movq"
59 #define SFENCE "/nop"
60 #endif
61
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
63 {
64 uint8_t *dest = dst;
65 const uint8_t *s = src;
66 const uint8_t *end;
67 #ifdef HAVE_MMX
68 const uint8_t *mm_end;
69 #endif
70 end = s + src_size;
71 #ifdef HAVE_MMX
72 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 mm_end = end - 23;
74 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
75 while(s < mm_end)
76 {
77 __asm __volatile(
78 PREFETCH" 32%1\n\t"
79 "movd %1, %%mm0\n\t"
80 "punpckldq 3%1, %%mm0\n\t"
81 "movd 6%1, %%mm1\n\t"
82 "punpckldq 9%1, %%mm1\n\t"
83 "movd 12%1, %%mm2\n\t"
84 "punpckldq 15%1, %%mm2\n\t"
85 "movd 18%1, %%mm3\n\t"
86 "punpckldq 21%1, %%mm3\n\t"
87 "pand %%mm7, %%mm0\n\t"
88 "pand %%mm7, %%mm1\n\t"
89 "pand %%mm7, %%mm2\n\t"
90 "pand %%mm7, %%mm3\n\t"
91 MOVNTQ" %%mm0, %0\n\t"
92 MOVNTQ" %%mm1, 8%0\n\t"
93 MOVNTQ" %%mm2, 16%0\n\t"
94 MOVNTQ" %%mm3, 24%0"
95 :"=m"(*dest)
96 :"m"(*s)
97 :"memory");
98 dest += 32;
99 s += 24;
100 }
101 __asm __volatile(SFENCE:::"memory");
102 __asm __volatile(EMMS:::"memory");
103 #endif
104 while(s < end)
105 {
106 #ifdef WORDS_BIGENDIAN
107 *dest++ = 0;
108 *dest++ = *s++;
109 *dest++ = *s++;
110 *dest++ = *s++;
111 #else
112 *dest++ = *s++;
113 *dest++ = *s++;
114 *dest++ = *s++;
115 *dest++ = 0;
116 #endif
117 }
118 }
119
120 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
121 {
122 uint8_t *dest = dst;
123 const uint8_t *s = src;
124 const uint8_t *end;
125 #ifdef HAVE_MMX
126 const uint8_t *mm_end;
127 #endif
128 end = s + src_size;
129 #ifdef HAVE_MMX
130 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
131 mm_end = end - 31;
132 while(s < mm_end)
133 {
134 __asm __volatile(
135 PREFETCH" 32%1\n\t"
136 "movq %1, %%mm0\n\t"
137 "movq 8%1, %%mm1\n\t"
138 "movq 16%1, %%mm4\n\t"
139 "movq 24%1, %%mm5\n\t"
140 "movq %%mm0, %%mm2\n\t"
141 "movq %%mm1, %%mm3\n\t"
142 "movq %%mm4, %%mm6\n\t"
143 "movq %%mm5, %%mm7\n\t"
144 "psrlq $8, %%mm2\n\t"
145 "psrlq $8, %%mm3\n\t"
146 "psrlq $8, %%mm6\n\t"
147 "psrlq $8, %%mm7\n\t"
148 "pand %2, %%mm0\n\t"
149 "pand %2, %%mm1\n\t"
150 "pand %2, %%mm4\n\t"
151 "pand %2, %%mm5\n\t"
152 "pand %3, %%mm2\n\t"
153 "pand %3, %%mm3\n\t"
154 "pand %3, %%mm6\n\t"
155 "pand %3, %%mm7\n\t"
156 "por %%mm2, %%mm0\n\t"
157 "por %%mm3, %%mm1\n\t"
158 "por %%mm6, %%mm4\n\t"
159 "por %%mm7, %%mm5\n\t"
160
161 "movq %%mm1, %%mm2\n\t"
162 "movq %%mm4, %%mm3\n\t"
163 "psllq $48, %%mm2\n\t"
164 "psllq $32, %%mm3\n\t"
165 "pand %4, %%mm2\n\t"
166 "pand %5, %%mm3\n\t"
167 "por %%mm2, %%mm0\n\t"
168 "psrlq $16, %%mm1\n\t"
169 "psrlq $32, %%mm4\n\t"
170 "psllq $16, %%mm5\n\t"
171 "por %%mm3, %%mm1\n\t"
172 "pand %6, %%mm5\n\t"
173 "por %%mm5, %%mm4\n\t"
174
175 MOVNTQ" %%mm0, %0\n\t"
176 MOVNTQ" %%mm1, 8%0\n\t"
177 MOVNTQ" %%mm4, 16%0"
178 :"=m"(*dest)
179 :"m"(*s),"m"(mask24l),
180 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
181 :"memory");
182 dest += 24;
183 s += 32;
184 }
185 __asm __volatile(SFENCE:::"memory");
186 __asm __volatile(EMMS:::"memory");
187 #endif
188 while(s < end)
189 {
190 #ifdef WORDS_BIGENDIAN
191 s++;
192 *dest++ = *s++;
193 *dest++ = *s++;
194 *dest++ = *s++;
195 #else
196 *dest++ = *s++;
197 *dest++ = *s++;
198 *dest++ = *s++;
199 s++;
200 #endif
201 }
202 }
203
204 /*
205 Original by Strepto/Astral
206 ported to gcc & bugfixed : A'rpi
207 MMX2, 3DNOW optimization by Nick Kurshev
208 32bit c version, and and&add trick by Michael Niedermayer
209 */
210 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
211 {
212 register const uint8_t* s=src;
213 register uint8_t* d=dst;
214 register const uint8_t *end;
215 const uint8_t *mm_end;
216 end = s + src_size;
217 #ifdef HAVE_MMX
218 __asm __volatile(PREFETCH" %0"::"m"(*s));
219 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
220 mm_end = end - 15;
221 while(s<mm_end)
222 {
223 __asm __volatile(
224 PREFETCH" 32%1\n\t"
225 "movq %1, %%mm0\n\t"
226 "movq 8%1, %%mm2\n\t"
227 "movq %%mm0, %%mm1\n\t"
228 "movq %%mm2, %%mm3\n\t"
229 "pand %%mm4, %%mm0\n\t"
230 "pand %%mm4, %%mm2\n\t"
231 "paddw %%mm1, %%mm0\n\t"
232 "paddw %%mm3, %%mm2\n\t"
233 MOVNTQ" %%mm0, %0\n\t"
234 MOVNTQ" %%mm2, 8%0"
235 :"=m"(*d)
236 :"m"(*s)
237 );
238 d+=16;
239 s+=16;
240 }
241 __asm __volatile(SFENCE:::"memory");
242 __asm __volatile(EMMS:::"memory");
243 #endif
244 mm_end = end - 3;
245 while(s < mm_end)
246 {
247 register unsigned x= *((uint32_t *)s);
248 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
249 d+=4;
250 s+=4;
251 }
252 if(s < end)
253 {
254 register unsigned short x= *((uint16_t *)s);
255 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
256 }
257 }
258
259 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
260 {
261 register const uint8_t* s=src;
262 register uint8_t* d=dst;
263 register const uint8_t *end;
264 const uint8_t *mm_end;
265 end = s + src_size;
266 #ifdef HAVE_MMX
267 __asm __volatile(PREFETCH" %0"::"m"(*s));
268 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
269 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
270 mm_end = end - 15;
271 while(s<mm_end)
272 {
273 __asm __volatile(
274 PREFETCH" 32%1\n\t"
275 "movq %1, %%mm0\n\t"
276 "movq 8%1, %%mm2\n\t"
277 "movq %%mm0, %%mm1\n\t"
278 "movq %%mm2, %%mm3\n\t"
279 "psrlq $1, %%mm0\n\t"
280 "psrlq $1, %%mm2\n\t"
281 "pand %%mm7, %%mm0\n\t"
282 "pand %%mm7, %%mm2\n\t"
283 "pand %%mm6, %%mm1\n\t"
284 "pand %%mm6, %%mm3\n\t"
285 "por %%mm1, %%mm0\n\t"
286 "por %%mm3, %%mm2\n\t"
287 MOVNTQ" %%mm0, %0\n\t"
288 MOVNTQ" %%mm2, 8%0"
289 :"=m"(*d)
290 :"m"(*s)
291 );
292 d+=16;
293 s+=16;
294 }
295 __asm __volatile(SFENCE:::"memory");
296 __asm __volatile(EMMS:::"memory");
297 #endif
298 mm_end = end - 3;
299 while(s < mm_end)
300 {
301 register uint32_t x= *((uint32_t *)s);
302 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
303 s+=4;
304 d+=4;
305 }
306 if(s < end)
307 {
308 register uint16_t x= *((uint16_t *)s);
309 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
310 s+=2;
311 d+=2;
312 }
313 }
314
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
316 {
317 const uint8_t *s = src;
318 const uint8_t *end;
319 #ifdef HAVE_MMX
320 const uint8_t *mm_end;
321 #endif
322 uint16_t *d = (uint16_t *)dst;
323 end = s + src_size;
324 #ifdef HAVE_MMX
325 mm_end = end - 15;
326 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
327 asm volatile(
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
331 ".balign 16 \n\t"
332 "1: \n\t"
333 PREFETCH" 32(%1) \n\t"
334 "movd (%1), %%mm0 \n\t"
335 "movd 4(%1), %%mm3 \n\t"
336 "punpckldq 8(%1), %%mm0 \n\t"
337 "punpckldq 12(%1), %%mm3 \n\t"
338 "movq %%mm0, %%mm1 \n\t"
339 "movq %%mm3, %%mm4 \n\t"
340 "pand %%mm6, %%mm0 \n\t"
341 "pand %%mm6, %%mm3 \n\t"
342 "pmaddwd %%mm7, %%mm0 \n\t"
343 "pmaddwd %%mm7, %%mm3 \n\t"
344 "pand %%mm5, %%mm1 \n\t"
345 "pand %%mm5, %%mm4 \n\t"
346 "por %%mm1, %%mm0 \n\t"
347 "por %%mm4, %%mm3 \n\t"
348 "psrld $5, %%mm0 \n\t"
349 "pslld $11, %%mm3 \n\t"
350 "por %%mm3, %%mm0 \n\t"
351 MOVNTQ" %%mm0, (%0) \n\t"
352 "addl $16, %1 \n\t"
353 "addl $8, %0 \n\t"
354 "cmpl %2, %1 \n\t"
355 " jb 1b \n\t"
356 : "+r" (d), "+r"(s)
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
358 );
359 #else
360 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
361 __asm __volatile(
362 "movq %0, %%mm7\n\t"
363 "movq %1, %%mm6\n\t"
364 ::"m"(red_16mask),"m"(green_16mask));
365 while(s < mm_end)
366 {
367 __asm __volatile(
368 PREFETCH" 32%1\n\t"
369 "movd %1, %%mm0\n\t"
370 "movd 4%1, %%mm3\n\t"
371 "punpckldq 8%1, %%mm0\n\t"
372 "punpckldq 12%1, %%mm3\n\t"
373 "movq %%mm0, %%mm1\n\t"
374 "movq %%mm0, %%mm2\n\t"
375 "movq %%mm3, %%mm4\n\t"
376 "movq %%mm3, %%mm5\n\t"
377 "psrlq $3, %%mm0\n\t"
378 "psrlq $3, %%mm3\n\t"
379 "pand %2, %%mm0\n\t"
380 "pand %2, %%mm3\n\t"
381 "psrlq $5, %%mm1\n\t"
382 "psrlq $5, %%mm4\n\t"
383 "pand %%mm6, %%mm1\n\t"
384 "pand %%mm6, %%mm4\n\t"
385 "psrlq $8, %%mm2\n\t"
386 "psrlq $8, %%mm5\n\t"
387 "pand %%mm7, %%mm2\n\t"
388 "pand %%mm7, %%mm5\n\t"
389 "por %%mm1, %%mm0\n\t"
390 "por %%mm4, %%mm3\n\t"
391 "por %%mm2, %%mm0\n\t"
392 "por %%mm5, %%mm3\n\t"
393 "psllq $16, %%mm3\n\t"
394 "por %%mm3, %%mm0\n\t"
395 MOVNTQ" %%mm0, %0\n\t"
396 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
397 d += 4;
398 s += 16;
399 }
400 #endif
401 __asm __volatile(SFENCE:::"memory");
402 __asm __volatile(EMMS:::"memory");
403 #endif
404 while(s < end)
405 {
406 // FIXME on bigendian
407 const int src= *s; s += 4;
408 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
409 // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
410 }
411 }
412
413 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
414 {
415 const uint8_t *s = src;
416 const uint8_t *end;
417 #ifdef HAVE_MMX
418 const uint8_t *mm_end;
419 #endif
420 uint16_t *d = (uint16_t *)dst;
421 end = s + src_size;
422 #ifdef HAVE_MMX
423 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
424 __asm __volatile(
425 "movq %0, %%mm7\n\t"
426 "movq %1, %%mm6\n\t"
427 ::"m"(red_16mask),"m"(green_16mask));
428 mm_end = end - 15;
429 while(s < mm_end)
430 {
431 __asm __volatile(
432 PREFETCH" 32%1\n\t"
433 "movd %1, %%mm0\n\t"
434 "movd 4%1, %%mm3\n\t"
435 "punpckldq 8%1, %%mm0\n\t"
436 "punpckldq 12%1, %%mm3\n\t"
437 "movq %%mm0, %%mm1\n\t"
438 "movq %%mm0, %%mm2\n\t"
439 "movq %%mm3, %%mm4\n\t"
440 "movq %%mm3, %%mm5\n\t"
441 "psllq $8, %%mm0\n\t"
442 "psllq $8, %%mm3\n\t"
443 "pand %%mm7, %%mm0\n\t"
444 "pand %%mm7, %%mm3\n\t"
445 "psrlq $5, %%mm1\n\t"
446 "psrlq $5, %%mm4\n\t"
447 "pand %%mm6, %%mm1\n\t"
448 "pand %%mm6, %%mm4\n\t"
449 "psrlq $19, %%mm2\n\t"
450 "psrlq $19, %%mm5\n\t"
451 "pand %2, %%mm2\n\t"
452 "pand %2, %%mm5\n\t"
453 "por %%mm1, %%mm0\n\t"
454 "por %%mm4, %%mm3\n\t"
455 "por %%mm2, %%mm0\n\t"
456 "por %%mm5, %%mm3\n\t"
457 "psllq $16, %%mm3\n\t"
458 "por %%mm3, %%mm0\n\t"
459 MOVNTQ" %%mm0, %0\n\t"
460 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461 d += 4;
462 s += 16;
463 }
464 __asm __volatile(SFENCE:::"memory");
465 __asm __volatile(EMMS:::"memory");
466 #endif
467 while(s < end)
468 {
469 // FIXME on bigendian
470 const int src= *s; s += 4;
471 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
472 }
473 }
474
475 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
476 {
477 const uint8_t *s = src;
478 const uint8_t *end;
479 #ifdef HAVE_MMX
480 const uint8_t *mm_end;
481 #endif
482 uint16_t *d = (uint16_t *)dst;
483 end = s + src_size;
484 #ifdef HAVE_MMX
485 mm_end = end - 15;
486 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
487 asm volatile(
488 "movq %3, %%mm5 \n\t"
489 "movq %4, %%mm6 \n\t"
490 "movq %5, %%mm7 \n\t"
491 ".balign 16 \n\t"
492 "1: \n\t"
493 PREFETCH" 32(%1) \n\t"
494 "movd (%1), %%mm0 \n\t"
495 "movd 4(%1), %%mm3 \n\t"
496 "punpckldq 8(%1), %%mm0 \n\t"
497 "punpckldq 12(%1), %%mm3 \n\t"
498 "movq %%mm0, %%mm1 \n\t"
499 "movq %%mm3, %%mm4 \n\t"
500 "pand %%mm6, %%mm0 \n\t"
501 "pand %%mm6, %%mm3 \n\t"
502 "pmaddwd %%mm7, %%mm0 \n\t"
503 "pmaddwd %%mm7, %%mm3 \n\t"
504 "pand %%mm5, %%mm1 \n\t"
505 "pand %%mm5, %%mm4 \n\t"
506 "por %%mm1, %%mm0 \n\t"
507 "por %%mm4, %%mm3 \n\t"
508 "psrld $6, %%mm0 \n\t"
509 "pslld $10, %%mm3 \n\t"
510 "por %%mm3, %%mm0 \n\t"
511 MOVNTQ" %%mm0, (%0) \n\t"
512 "addl $16, %1 \n\t"
513 "addl $8, %0 \n\t"
514 "cmpl %2, %1 \n\t"
515 " jb 1b \n\t"
516 : "+r" (d), "+r"(s)
517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518 );
519 #else
520 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
521 __asm __volatile(
522 "movq %0, %%mm7\n\t"
523 "movq %1, %%mm6\n\t"
524 ::"m"(red_15mask),"m"(green_15mask));
525 while(s < mm_end)
526 {
527 __asm __volatile(
528 PREFETCH" 32%1\n\t"
529 "movd %1, %%mm0\n\t"
530 "movd 4%1, %%mm3\n\t"
531 "punpckldq 8%1, %%mm0\n\t"
532 "punpckldq 12%1, %%mm3\n\t"
533 "movq %%mm0, %%mm1\n\t"
534 "movq %%mm0, %%mm2\n\t"
535 "movq %%mm3, %%mm4\n\t"
536 "movq %%mm3, %%mm5\n\t"
537 "psrlq $3, %%mm0\n\t"
538 "psrlq $3, %%mm3\n\t"
539 "pand %2, %%mm0\n\t"
540 "pand %2, %%mm3\n\t"
541 "psrlq $6, %%mm1\n\t"
542 "psrlq $6, %%mm4\n\t"
543 "pand %%mm6, %%mm1\n\t"
544 "pand %%mm6, %%mm4\n\t"
545 "psrlq $9, %%mm2\n\t"
546 "psrlq $9, %%mm5\n\t"
547 "pand %%mm7, %%mm2\n\t"
548 "pand %%mm7, %%mm5\n\t"
549 "por %%mm1, %%mm0\n\t"
550 "por %%mm4, %%mm3\n\t"
551 "por %%mm2, %%mm0\n\t"
552 "por %%mm5, %%mm3\n\t"
553 "psllq $16, %%mm3\n\t"
554 "por %%mm3, %%mm0\n\t"
555 MOVNTQ" %%mm0, %0\n\t"
556 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557 d += 4;
558 s += 16;
559 }
560 #endif
561 __asm __volatile(SFENCE:::"memory");
562 __asm __volatile(EMMS:::"memory");
563 #endif
564 while(s < end)
565 {
566 // FIXME on bigendian
567 const int src= *s; s += 4;
568 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
569 }
570 }
571
572 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
573 {
574 const uint8_t *s = src;
575 const uint8_t *end;
576 #ifdef HAVE_MMX
577 const uint8_t *mm_end;
578 #endif
579 uint16_t *d = (uint16_t *)dst;
580 end = s + src_size;
581 #ifdef HAVE_MMX
582 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
583 __asm __volatile(
584 "movq %0, %%mm7\n\t"
585 "movq %1, %%mm6\n\t"
586 ::"m"(red_15mask),"m"(green_15mask));
587 mm_end = end - 15;
588 while(s < mm_end)
589 {
590 __asm __volatile(
591 PREFETCH" 32%1\n\t"
592 "movd %1, %%mm0\n\t"
593 "movd 4%1, %%mm3\n\t"
594 "punpckldq 8%1, %%mm0\n\t"
595 "punpckldq 12%1, %%mm3\n\t"
596 "movq %%mm0, %%mm1\n\t"
597 "movq %%mm0, %%mm2\n\t"
598 "movq %%mm3, %%mm4\n\t"
599 "movq %%mm3, %%mm5\n\t"
600 "psllq $7, %%mm0\n\t"
601 "psllq $7, %%mm3\n\t"
602 "pand %%mm7, %%mm0\n\t"
603 "pand %%mm7, %%mm3\n\t"
604 "psrlq $6, %%mm1\n\t"
605 "psrlq $6, %%mm4\n\t"
606 "pand %%mm6, %%mm1\n\t"
607 "pand %%mm6, %%mm4\n\t"
608 "psrlq $19, %%mm2\n\t"
609 "psrlq $19, %%mm5\n\t"
610 "pand %2, %%mm2\n\t"
611 "pand %2, %%mm5\n\t"
612 "por %%mm1, %%mm0\n\t"
613 "por %%mm4, %%mm3\n\t"
614 "por %%mm2, %%mm0\n\t"
615 "por %%mm5, %%mm3\n\t"
616 "psllq $16, %%mm3\n\t"
617 "por %%mm3, %%mm0\n\t"
618 MOVNTQ" %%mm0, %0\n\t"
619 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
620 d += 4;
621 s += 16;
622 }
623 __asm __volatile(SFENCE:::"memory");
624 __asm __volatile(EMMS:::"memory");
625 #endif
626 while(s < end)
627 {
628 // FIXME on bigendian
629 const int src= *s; s += 4;
630 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
631 }
632 }
633
634 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
635 {
636 const uint8_t *s = src;
637 const uint8_t *end;
638 #ifdef HAVE_MMX
639 const uint8_t *mm_end;
640 #endif
641 uint16_t *d = (uint16_t *)dst;
642 end = s + src_size;
643 #ifdef HAVE_MMX
644 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
645 __asm __volatile(
646 "movq %0, %%mm7\n\t"
647 "movq %1, %%mm6\n\t"
648 ::"m"(red_16mask),"m"(green_16mask));
649 mm_end = end - 11;
650 while(s < mm_end)
651 {
652 __asm __volatile(
653 PREFETCH" 32%1\n\t"
654 "movd %1, %%mm0\n\t"
655 "movd 3%1, %%mm3\n\t"
656 "punpckldq 6%1, %%mm0\n\t"
657 "punpckldq 9%1, %%mm3\n\t"
658 "movq %%mm0, %%mm1\n\t"
659 "movq %%mm0, %%mm2\n\t"
660 "movq %%mm3, %%mm4\n\t"
661 "movq %%mm3, %%mm5\n\t"
662 "psrlq $3, %%mm0\n\t"
663 "psrlq $3, %%mm3\n\t"
664 "pand %2, %%mm0\n\t"
665 "pand %2, %%mm3\n\t"
666 "psrlq $5, %%mm1\n\t"
667 "psrlq $5, %%mm4\n\t"
668 "pand %%mm6, %%mm1\n\t"
669 "pand %%mm6, %%mm4\n\t"
670 "psrlq $8, %%mm2\n\t"
671 "psrlq $8, %%mm5\n\t"
672 "pand %%mm7, %%mm2\n\t"
673 "pand %%mm7, %%mm5\n\t"
674 "por %%mm1, %%mm0\n\t"
675 "por %%mm4, %%mm3\n\t"
676 "por %%mm2, %%mm0\n\t"
677 "por %%mm5, %%mm3\n\t"
678 "psllq $16, %%mm3\n\t"
679 "por %%mm3, %%mm0\n\t"
680 MOVNTQ" %%mm0, %0\n\t"
681 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
682 d += 4;
683 s += 12;
684 }
685 __asm __volatile(SFENCE:::"memory");
686 __asm __volatile(EMMS:::"memory");
687 #endif
688 while(s < end)
689 {
690 const int b= *s++;
691 const int g= *s++;
692 const int r= *s++;
693 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
694 }
695 }
696
697 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
698 {
699 const uint8_t *s = src;
700 const uint8_t *end;
701 #ifdef HAVE_MMX
702 const uint8_t *mm_end;
703 #endif
704 uint16_t *d = (uint16_t *)dst;
705 end = s + src_size;
706 #ifdef HAVE_MMX
707 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
708 __asm __volatile(
709 "movq %0, %%mm7\n\t"
710 "movq %1, %%mm6\n\t"
711 ::"m"(red_16mask),"m"(green_16mask));
712 mm_end = end - 15;
713 while(s < mm_end)
714 {
715 __asm __volatile(
716 PREFETCH" 32%1\n\t"
717 "movd %1, %%mm0\n\t"
718 "movd 3%1, %%mm3\n\t"
719 "punpckldq 6%1, %%mm0\n\t"
720 "punpckldq 9%1, %%mm3\n\t"
721 "movq %%mm0, %%mm1\n\t"
722 "movq %%mm0, %%mm2\n\t"
723 "movq %%mm3, %%mm4\n\t"
724 "movq %%mm3, %%mm5\n\t"
725 "psllq $8, %%mm0\n\t"
726 "psllq $8, %%mm3\n\t"
727 "pand %%mm7, %%mm0\n\t"
728 "pand %%mm7, %%mm3\n\t"
729 "psrlq $5, %%mm1\n\t"
730 "psrlq $5, %%mm4\n\t"
731 "pand %%mm6, %%mm1\n\t"
732 "pand %%mm6, %%mm4\n\t"
733 "psrlq $19, %%mm2\n\t"
734 "psrlq $19, %%mm5\n\t"
735 "pand %2, %%mm2\n\t"
736 "pand %2, %%mm5\n\t"
737 "por %%mm1, %%mm0\n\t"
738 "por %%mm4, %%mm3\n\t"
739 "por %%mm2, %%mm0\n\t"
740 "por %%mm5, %%mm3\n\t"
741 "psllq $16, %%mm3\n\t"
742 "por %%mm3, %%mm0\n\t"
743 MOVNTQ" %%mm0, %0\n\t"
744 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
745 d += 4;
746 s += 12;
747 }
748 __asm __volatile(SFENCE:::"memory");
749 __asm __volatile(EMMS:::"memory");
750 #endif
751 while(s < end)
752 {
753 const int r= *s++;
754 const int g= *s++;
755 const int b= *s++;
756 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
757 }
758 }
759
760 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
761 {
762 const uint8_t *s = src;
763 const uint8_t *end;
764 #ifdef HAVE_MMX
765 const uint8_t *mm_end;
766 #endif
767 uint16_t *d = (uint16_t *)dst;
768 end = s + src_size;
769 #ifdef HAVE_MMX
770 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
771 __asm __volatile(
772 "movq %0, %%mm7\n\t"
773 "movq %1, %%mm6\n\t"
774 ::"m"(red_15mask),"m"(green_15mask));
775 mm_end = end - 11;
776 while(s < mm_end)
777 {
778 __asm __volatile(
779 PREFETCH" 32%1\n\t"
780 "movd %1, %%mm0\n\t"
781 "movd 3%1, %%mm3\n\t"
782 "punpckldq 6%1, %%mm0\n\t"
783 "punpckldq 9%1, %%mm3\n\t"
784 "movq %%mm0, %%mm1\n\t"
785 "movq %%mm0, %%mm2\n\t"
786 "movq %%mm3, %%mm4\n\t"
787 "movq %%mm3, %%mm5\n\t"
788 "psrlq $3, %%mm0\n\t"
789 "psrlq $3, %%mm3\n\t"
790 "pand %2, %%mm0\n\t"
791 "pand %2, %%mm3\n\t"
792 "psrlq $6, %%mm1\n\t"
793 "psrlq $6, %%mm4\n\t"
794 "pand %%mm6, %%mm1\n\t"
795 "pand %%mm6, %%mm4\n\t"
796 "psrlq $9, %%mm2\n\t"
797 "psrlq $9, %%mm5\n\t"
798 "pand %%mm7, %%mm2\n\t"
799 "pand %%mm7, %%mm5\n\t"
800 "por %%mm1, %%mm0\n\t"
801 "por %%mm4, %%mm3\n\t"
802 "por %%mm2, %%mm0\n\t"
803 "por %%mm5, %%mm3\n\t"
804 "psllq $16, %%mm3\n\t"
805 "por %%mm3, %%mm0\n\t"
806 MOVNTQ" %%mm0, %0\n\t"
807 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
808 d += 4;
809 s += 12;
810 }
811 __asm __volatile(SFENCE:::"memory");
812 __asm __volatile(EMMS:::"memory");
813 #endif
814 while(s < end)
815 {
816 const int b= *s++;
817 const int g= *s++;
818 const int r= *s++;
819 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
820 }
821 }
822
823 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
824 {
825 const uint8_t *s = src;
826 const uint8_t *end;
827 #ifdef HAVE_MMX
828 const uint8_t *mm_end;
829 #endif
830 uint16_t *d = (uint16_t *)dst;
831 end = s + src_size;
832 #ifdef HAVE_MMX
833 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
834 __asm __volatile(
835 "movq %0, %%mm7\n\t"
836 "movq %1, %%mm6\n\t"
837 ::"m"(red_15mask),"m"(green_15mask));
838 mm_end = end - 15;
839 while(s < mm_end)
840 {
841 __asm __volatile(
842 PREFETCH" 32%1\n\t"
843 "movd %1, %%mm0\n\t"
844 "movd 3%1, %%mm3\n\t"
845 "punpckldq 6%1, %%mm0\n\t"
846 "punpckldq 9%1, %%mm3\n\t"
847 "movq %%mm0, %%mm1\n\t"
848 "movq %%mm0, %%mm2\n\t"
849 "movq %%mm3, %%mm4\n\t"
850 "movq %%mm3, %%mm5\n\t"
851 "psllq $7, %%mm0\n\t"
852 "psllq $7, %%mm3\n\t"
853 "pand %%mm7, %%mm0\n\t"
854 "pand %%mm7, %%mm3\n\t"
855 "psrlq $6, %%mm1\n\t"
856 "psrlq $6, %%mm4\n\t"
857 "pand %%mm6, %%mm1\n\t"
858 "pand %%mm6, %%mm4\n\t"
859 "psrlq $19, %%mm2\n\t"
860 "psrlq $19, %%mm5\n\t"
861 "pand %2, %%mm2\n\t"
862 "pand %2, %%mm5\n\t"
863 "por %%mm1, %%mm0\n\t"
864 "por %%mm4, %%mm3\n\t"
865 "por %%mm2, %%mm0\n\t"
866 "por %%mm5, %%mm3\n\t"
867 "psllq $16, %%mm3\n\t"
868 "por %%mm3, %%mm0\n\t"
869 MOVNTQ" %%mm0, %0\n\t"
870 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
871 d += 4;
872 s += 12;
873 }
874 __asm __volatile(SFENCE:::"memory");
875 __asm __volatile(EMMS:::"memory");
876 #endif
877 while(s < end)
878 {
879 const int r= *s++;
880 const int g= *s++;
881 const int b= *s++;
882 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
883 }
884 }
885
886 /*
887 I use here less accurate approximation by simply
888 left-shifting the input
889 value and filling the low order bits with
890 zeroes. This method improves png's
891 compression but this scheme cannot reproduce white exactly, since it does not
892 generate an all-ones maximum value; the net effect is to darken the
893 image slightly.
894
895 The better method should be "left bit replication":
896
897 4 3 2 1 0
898 ---------
899 1 1 0 1 1
900
901 7 6 5 4 3 2 1 0
902 ----------------
903 1 1 0 1 1 1 1 0
904 |=======| |===|
905 | Leftmost Bits Repeated to Fill Open Bits
906 |
907 Original Bits
908 */
909 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
910 {
911 const uint16_t *end;
912 #ifdef HAVE_MMX
913 const uint16_t *mm_end;
914 #endif
915 uint8_t *d = (uint8_t *)dst;
916 const uint16_t *s = (uint16_t *)src;
917 end = s + src_size/2;
918 #ifdef HAVE_MMX
919 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
920 mm_end = end - 7;
921 while(s < mm_end)
922 {
923 __asm __volatile(
924 PREFETCH" 32%1\n\t"
925 "movq %1, %%mm0\n\t"
926 "movq %1, %%mm1\n\t"
927 "movq %1, %%mm2\n\t"
928 "pand %2, %%mm0\n\t"
929 "pand %3, %%mm1\n\t"
930 "pand %4, %%mm2\n\t"
931 "psllq $3, %%mm0\n\t"
932 "psrlq $2, %%mm1\n\t"
933 "psrlq $7, %%mm2\n\t"
934 "movq %%mm0, %%mm3\n\t"
935 "movq %%mm1, %%mm4\n\t"
936 "movq %%mm2, %%mm5\n\t"
937 "punpcklwd %5, %%mm0\n\t"
938 "punpcklwd %5, %%mm1\n\t"
939 "punpcklwd %5, %%mm2\n\t"
940 "punpckhwd %5, %%mm3\n\t"
941 "punpckhwd %5, %%mm4\n\t"
942 "punpckhwd %5, %%mm5\n\t"
943 "psllq $8, %%mm1\n\t"
944 "psllq $16, %%mm2\n\t"
945 "por %%mm1, %%mm0\n\t"
946 "por %%mm2, %%mm0\n\t"
947 "psllq $8, %%mm4\n\t"
948 "psllq $16, %%mm5\n\t"
949 "por %%mm4, %%mm3\n\t"
950 "por %%mm5, %%mm3\n\t"
951
952 "movq %%mm0, %%mm6\n\t"
953 "movq %%mm3, %%mm7\n\t"
954
955 "movq 8%1, %%mm0\n\t"
956 "movq 8%1, %%mm1\n\t"
957 "movq 8%1, %%mm2\n\t"
958 "pand %2, %%mm0\n\t"
959 "pand %3, %%mm1\n\t"
960 "pand %4, %%mm2\n\t"
961 "psllq $3, %%mm0\n\t"
962 "psrlq $2, %%mm1\n\t"
963 "psrlq $7, %%mm2\n\t"
964 "movq %%mm0, %%mm3\n\t"
965 "movq %%mm1, %%mm4\n\t"
966 "movq %%mm2, %%mm5\n\t"
967 "punpcklwd %5, %%mm0\n\t"
968 "punpcklwd %5, %%mm1\n\t"
969 "punpcklwd %5, %%mm2\n\t"
970 "punpckhwd %5, %%mm3\n\t"
971 "punpckhwd %5, %%mm4\n\t"
972 "punpckhwd %5, %%mm5\n\t"
973 "psllq $8, %%mm1\n\t"
974 "psllq $16, %%mm2\n\t"
975 "por %%mm1, %%mm0\n\t"
976 "por %%mm2, %%mm0\n\t"
977 "psllq $8, %%mm4\n\t"
978 "psllq $16, %%mm5\n\t"
979 "por %%mm4, %%mm3\n\t"
980 "por %%mm5, %%mm3\n\t"
981
982 :"=m"(*d)
983 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
984 :"memory");
985 /* Borrowed 32 to 24 */
986 __asm __volatile(
987 "movq %%mm0, %%mm4\n\t"
988 "movq %%mm3, %%mm5\n\t"
989 "movq %%mm6, %%mm0\n\t"
990 "movq %%mm7, %%mm1\n\t"
991
992 "movq %%mm4, %%mm6\n\t"
993 "movq %%mm5, %%mm7\n\t"
994 "movq %%mm0, %%mm2\n\t"
995 "movq %%mm1, %%mm3\n\t"
996
997 "psrlq $8, %%mm2\n\t"
998 "psrlq $8, %%mm3\n\t"
999 "psrlq $8, %%mm6\n\t"
1000 "psrlq $8, %%mm7\n\t"
1001 "pand %2, %%mm0\n\t"
1002 "pand %2, %%mm1\n\t"
1003 "pand %2, %%mm4\n\t"
1004 "pand %2, %%mm5\n\t"
1005 "pand %3, %%mm2\n\t"
1006 "pand %3, %%mm3\n\t"
1007 "pand %3, %%mm6\n\t"
1008 "pand %3, %%mm7\n\t"
1009 "por %%mm2, %%mm0\n\t"
1010 "por %%mm3, %%mm1\n\t"
1011 "por %%mm6, %%mm4\n\t"
1012 "por %%mm7, %%mm5\n\t"
1013
1014 "movq %%mm1, %%mm2\n\t"
1015 "movq %%mm4, %%mm3\n\t"
1016 "psllq $48, %%mm2\n\t"
1017 "psllq $32, %%mm3\n\t"
1018 "pand %4, %%mm2\n\t"
1019 "pand %5, %%mm3\n\t"
1020 "por %%mm2, %%mm0\n\t"
1021 "psrlq $16, %%mm1\n\t"
1022 "psrlq $32, %%mm4\n\t"
1023 "psllq $16, %%mm5\n\t"
1024 "por %%mm3, %%mm1\n\t"
1025 "pand %6, %%mm5\n\t"
1026 "por %%mm5, %%mm4\n\t"
1027
1028 MOVNTQ" %%mm0, %0\n\t"
1029 MOVNTQ" %%mm1, 8%0\n\t"
1030 MOVNTQ" %%mm4, 16%0"
1031
1032 :"=m"(*d)
1033 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1034 :"memory");
1035 d += 24;
1036 s += 8;
1037 }
1038 __asm __volatile(SFENCE:::"memory");
1039 __asm __volatile(EMMS:::"memory");
1040 #endif
1041 while(s < end)
1042 {
1043 register uint16_t bgr;
1044 bgr = *s++;
1045 *d++ = (bgr&0x1F)<<3;
1046 *d++ = (bgr&0x3E0)>>2;
1047 *d++ = (bgr&0x7C00)>>7;
1048 }
1049 }
1050
1051 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1052 {
1053 const uint16_t *end;
1054 #ifdef HAVE_MMX
1055 const uint16_t *mm_end;
1056 #endif
1057 uint8_t *d = (uint8_t *)dst;
1058 const uint16_t *s = (const uint16_t *)src;
1059 end = s + src_size/2;
1060 #ifdef HAVE_MMX
1061 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1062 mm_end = end - 7;
1063 while(s < mm_end)
1064 {
1065 __asm __volatile(
1066 PREFETCH" 32%1\n\t"
1067 "movq %1, %%mm0\n\t"
1068 "movq %1, %%mm1\n\t"
1069 "movq %1, %%mm2\n\t"
1070 "pand %2, %%mm0\n\t"
1071 "pand %3, %%mm1\n\t"
1072 "pand %4, %%mm2\n\t"
1073 "psllq $3, %%mm0\n\t"
1074 "psrlq $3, %%mm1\n\t"
1075 "psrlq $8, %%mm2\n\t"
1076 "movq %%mm0, %%mm3\n\t"
1077 "movq %%mm1, %%mm4\n\t"
1078 "movq %%mm2, %%mm5\n\t"
1079 "punpcklwd %5, %%mm0\n\t"
1080 "punpcklwd %5, %%mm1\n\t"
1081 "punpcklwd %5, %%mm2\n\t"
1082 "punpckhwd %5, %%mm3\n\t"
1083 "punpckhwd %5, %%mm4\n\t"
1084 "punpckhwd %5, %%mm5\n\t"
1085 "psllq $8, %%mm1\n\t"
1086 "psllq $16, %%mm2\n\t"
1087 "por %%mm1, %%mm0\n\t"
1088 "por %%mm2, %%mm0\n\t"
1089 "psllq $8, %%mm4\n\t"
1090 "psllq $16, %%mm5\n\t"
1091 "por %%mm4, %%mm3\n\t"
1092 "por %%mm5, %%mm3\n\t"
1093
1094 "movq %%mm0, %%mm6\n\t"
1095 "movq %%mm3, %%mm7\n\t"
1096
1097 "movq 8%1, %%mm0\n\t"
1098 "movq 8%1, %%mm1\n\t"
1099 "movq 8%1, %%mm2\n\t"
1100 "pand %2, %%mm0\n\t"
1101 "pand %3, %%mm1\n\t"
1102 "pand %4, %%mm2\n\t"
1103 "psllq $3, %%mm0\n\t"
1104 "psrlq $3, %%mm1\n\t"
1105 "psrlq $8, %%mm2\n\t"
1106 "movq %%mm0, %%mm3\n\t"
1107 "movq %%mm1, %%mm4\n\t"
1108 "movq %%mm2, %%mm5\n\t"
1109 "punpcklwd %5, %%mm0\n\t"
1110 "punpcklwd %5, %%mm1\n\t"
1111 "punpcklwd %5, %%mm2\n\t"
1112 "punpckhwd %5, %%mm3\n\t"
1113 "punpckhwd %5, %%mm4\n\t"
1114 "punpckhwd %5, %%mm5\n\t"
1115 "psllq $8, %%mm1\n\t"
1116 "psllq $16, %%mm2\n\t"
1117 "por %%mm1, %%mm0\n\t"
1118 "por %%mm2, %%mm0\n\t"
1119 "psllq $8, %%mm4\n\t"
1120 "psllq $16, %%mm5\n\t"
1121 "por %%mm4, %%mm3\n\t"
1122 "por %%mm5, %%mm3\n\t"
1123 :"=m"(*d)
1124 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1125 :"memory");
1126 /* Borrowed 32 to 24 */
1127 __asm __volatile(
1128 "movq %%mm0, %%mm4\n\t"
1129 "movq %%mm3, %%mm5\n\t"
1130 "movq %%mm6, %%mm0\n\t"
1131 "movq %%mm7, %%mm1\n\t"
1132
1133 "movq %%mm4, %%mm6\n\t"
1134 "movq %%mm5, %%mm7\n\t"
1135 "movq %%mm0, %%mm2\n\t"
1136 "movq %%mm1, %%mm3\n\t"
1137
1138 "psrlq $8, %%mm2\n\t"
1139 "psrlq $8, %%mm3\n\t"
1140 "psrlq $8, %%mm6\n\t"
1141 "psrlq $8, %%mm7\n\t"
1142 "pand %2, %%mm0\n\t"
1143 "pand %2, %%mm1\n\t"
1144 "pand %2, %%mm4\n\t"
1145 "pand %2, %%mm5\n\t"
1146 "pand %3, %%mm2\n\t"
1147 "pand %3, %%mm3\n\t"
1148 "pand %3, %%mm6\n\t"
1149 "pand %3, %%mm7\n\t"
1150 "por %%mm2, %%mm0\n\t"
1151 "por %%mm3, %%mm1\n\t"
1152 "por %%mm6, %%mm4\n\t"
1153 "por %%mm7, %%mm5\n\t"
1154
1155 "movq %%mm1, %%mm2\n\t"
1156 "movq %%mm4, %%mm3\n\t"
1157 "psllq $48, %%mm2\n\t"
1158 "psllq $32, %%mm3\n\t"
1159 "pand %4, %%mm2\n\t"
1160 "pand %5, %%mm3\n\t"
1161 "por %%mm2, %%mm0\n\t"
1162 "psrlq $16, %%mm1\n\t"
1163 "psrlq $32, %%mm4\n\t"
1164 "psllq $16, %%mm5\n\t"
1165 "por %%mm3, %%mm1\n\t"
1166 "pand %6, %%mm5\n\t"
1167 "por %%mm5, %%mm4\n\t"
1168
1169 MOVNTQ" %%mm0, %0\n\t"
1170 MOVNTQ" %%mm1, 8%0\n\t"
1171 MOVNTQ" %%mm4, 16%0"
1172
1173 :"=m"(*d)
1174 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1175 :"memory");
1176 d += 24;
1177 s += 8;
1178 }
1179 __asm __volatile(SFENCE:::"memory");
1180 __asm __volatile(EMMS:::"memory");
1181 #endif
1182 while(s < end)
1183 {
1184 register uint16_t bgr;
1185 bgr = *s++;
1186 *d++ = (bgr&0x1F)<<3;
1187 *d++ = (bgr&0x7E0)>>3;
1188 *d++ = (bgr&0xF800)>>8;
1189 }
1190 }
1191
/*
 * Convert 15bpp RGB (xRRRRRGG GGGBBBBB, 2 bytes/pixel) to 32bpp (4 bytes/pixel,
 * byte order B,G,R,0 on little-endian). src_size is in BYTES, so src_size/2
 * pixels are converted. The MMX path handles 4 pixels per iteration; any
 * remainder (and the whole job without MMX) is done by the scalar tail loop.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (const uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0, zero source for the unpacks below */
	mm_end = end - 3; /* stop while >=4 pixels remain; tail loop finishes the rest */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t" /* 4 input pixels in each of mm0..mm2 */
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t" /* isolate the blue field  (mask15b) */
			"pand %3, %%mm1\n\t" /* isolate the green field (mask15g) */
			"pand %4, %%mm2\n\t" /* isolate the red field   (mask15r) */
			"psllq $3, %%mm0\n\t" /* scale each 5-bit component into the top of a byte */
			"psrlq $2, %%mm1\n\t"
			"psrlq $7, %%mm2\n\t"
			"movq %%mm0, %%mm3\n\t" /* keep copies for the high 2 pixels */
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t" /* widen low 2 pixels: word -> dword */
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t" /* widen high 2 pixels */
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"  /* green into byte 1 of each dword */
			"psllq $16, %%mm2\n\t" /* red into byte 2 of each dword */
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
			:"memory");
		d += 16;
		s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: expand each 5-bit component to 8 bits, alpha/pad byte = 0 */
	while(s < end)
	{
#if 0 //slightly slower on athlon
		int bgr= *s++;
		*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
		//FIXME this is very likely wrong for bigendian (and the following converters too)
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		*d++ = 0;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x3E0)>>2;
		*d++ = (bgr&0x7C00)>>7;
		*d++ = 0;
#endif

#endif
	}
}
1270
/*
 * Convert 16bpp RGB (RRRRRGGG GGGBBBBB, 2 bytes/pixel) to 32bpp (4 bytes/pixel).
 * Identical in structure to rgb15to32 above, but green is 6 bits wide, hence
 * the different masks and shift counts. src_size is in bytes (src_size/2 pixels).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint16_t *end;
#ifdef HAVE_MMX
	const uint16_t *mm_end;
#endif
	uint8_t *d = (uint8_t *)dst;
	const uint16_t *s = (uint16_t *)src;
	end = s + src_size/2;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0 for the unpacks */
	mm_end = end - 3; /* 4 pixels per iteration; remainder handled by the scalar loop */
	while(s < mm_end)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t" /* 4 input pixels */
			"movq %1, %%mm1\n\t"
			"movq %1, %%mm2\n\t"
			"pand %2, %%mm0\n\t" /* blue  (mask16b) */
			"pand %3, %%mm1\n\t" /* green (mask16g) */
			"pand %4, %%mm2\n\t" /* red   (mask16r) */
			"psllq $3, %%mm0\n\t" /* 5-bit blue -> 8 bit */
			"psrlq $3, %%mm1\n\t" /* 6-bit green -> 8 bit */
			"psrlq $8, %%mm2\n\t" /* 5-bit red -> 8 bit */
			"movq %%mm0, %%mm3\n\t"
			"movq %%mm1, %%mm4\n\t"
			"movq %%mm2, %%mm5\n\t"
			"punpcklwd %%mm7, %%mm0\n\t" /* widen low 2 pixels to dwords */
			"punpcklwd %%mm7, %%mm1\n\t"
			"punpcklwd %%mm7, %%mm2\n\t"
			"punpckhwd %%mm7, %%mm3\n\t" /* widen high 2 pixels */
			"punpckhwd %%mm7, %%mm4\n\t"
			"punpckhwd %%mm7, %%mm5\n\t"
			"psllq $8, %%mm1\n\t"  /* green -> byte 1 */
			"psllq $16, %%mm2\n\t" /* red -> byte 2 */
			"por %%mm1, %%mm0\n\t"
			"por %%mm2, %%mm0\n\t"
			"psllq $8, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm5, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm3, 8%0\n\t"
			:"=m"(*d)
			:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
			:"memory");
		d += 16;
		s += 4;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* scalar tail: B = 5 bits, G = 6 bits, R = 5 bits; pad byte = 0 */
	while(s < end)
	{
		register uint16_t bgr;
		bgr = *s++;
#ifdef WORDS_BIGENDIAN
		*d++ = 0;
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
#else
		*d++ = (bgr&0x1F)<<3;
		*d++ = (bgr&0x7E0)>>3;
		*d++ = (bgr&0xF800)>>8;
		*d++ = 0;
#endif
	}
}
1342
/*
 * Swap the R and B channels of 32bpp pixels in place-order (RGBx <-> BGRx);
 * the G and the 4th (alpha/pad) byte keep their positions.
 * NOTE(review): in the C fallback the 4th byte of dst is never written, so it
 * keeps whatever was there before -- verify callers pre-clear or overwrite it.
 * NOTE(review): the MMX loop bound (src_size-7) with 8-byte steps assumes
 * src_size is a multiple of 8; a trailing single pixel (4 bytes) would be
 * left unconverted -- confirm callers guarantee this.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
	asm volatile (
		"xorl %%eax, %%eax		\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%0, %%eax)		\n\t"
		"movq (%0, %%eax), %%mm0	\n\t" /* 2 pixels */
		"movq %%mm0, %%mm1		\n\t"
		"movq %%mm0, %%mm2		\n\t"
		"pslld $16, %%mm0		\n\t" /* move B up to R's position */
		"psrld $16, %%mm1		\n\t" /* move R down to B's position */
		"pand "MANGLE(mask32r)", %%mm0	\n\t"
		"pand "MANGLE(mask32g)", %%mm2	\n\t" /* keep G (and pad byte) in place */
		"pand "MANGLE(mask32b)", %%mm1	\n\t"
		"por %%mm0, %%mm2		\n\t"
		"por %%mm1, %%mm2		\n\t"
		MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"
		:: "r" (src), "r"(dst), "r" (src_size-7)
		: "%eax"
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#else
	unsigned i;
	unsigned num_pixels = src_size >> 2;
	for(i=0; i<num_pixels; i++)
	{
#ifdef WORDS_BIGENDIAN
		/* pad byte is at offset 0 on big-endian layouts */
		dst[4*i + 1] = src[4*i + 3];
		dst[4*i + 2] = src[4*i + 2];
		dst[4*i + 3] = src[4*i + 1];
#else
		dst[4*i + 0] = src[4*i + 2];
		dst[4*i + 1] = src[4*i + 1];
		dst[4*i + 2] = src[4*i + 0];
#endif
	}
#endif
}
1389
/*
 * Swap the R and B channels of packed 24bpp pixels (RGB24 <-> BGR24).
 * The MMX path works on 24 source bytes (8 pixels) per iteration using a
 * negative index that counts up to zero: mmx_size = 23 - src_size, and the
 * base pointers are biased by -mmx_size so (base, %%eax) starts at the
 * beginning of the buffers. It deliberately stops at least 23 bytes before
 * the end (the last movq reads up to 8 bytes past the current pixel); the
 * remaining <=23 bytes are converted by the scalar loop.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
	unsigned i;
#ifdef HAVE_MMX
	int mmx_size= 23 - src_size;
	asm volatile (
		"movq "MANGLE(mask24r)", %%mm5	\n\t"
		"movq "MANGLE(mask24g)", %%mm6	\n\t"
		"movq "MANGLE(mask24b)", %%mm7	\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 32(%1, %%eax)		\n\t"
		"movq (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		"movq (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
		"movq 2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
		"psllq $16, %%mm0		\n\t" // 00 BGR BGR
		"pand %%mm5, %%mm0		\n\t"
		"pand %%mm6, %%mm1		\n\t"
		"pand %%mm7, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
		MOVNTQ" %%mm1, (%2, %%eax)	\n\t" // RGB RGB RG
		"movq 8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
		"movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
		"pand %%mm7, %%mm0		\n\t"
		"pand %%mm5, %%mm1		\n\t"
		"pand %%mm6, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		"movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
		MOVNTQ" %%mm1, 8(%2, %%eax)	\n\t" // B RGB RGB R
		"movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
		"movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
		"pand %%mm6, %%mm0		\n\t"
		"pand %%mm7, %%mm1		\n\t"
		"pand %%mm5, %%mm2		\n\t"
		"por %%mm0, %%mm1		\n\t"
		"por %%mm2, %%mm1		\n\t"
		MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
		"addl $24, %%eax		\n\t"
		" js 1b				\n\t" /* loop while the index is still negative */
		: "+a" (mmx_size)
		: "r" (src-mmx_size), "r"(dst-mmx_size)
	);

	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");

	if(mmx_size==23) return; //finihsed, was multiple of 8

	/* rewind to the first unconverted pixel for the scalar tail */
	src+= src_size;
	dst+= src_size;
	src_size= 23-mmx_size;
	src-= src_size;
	dst-= src_size;
#endif
	/* scalar path: swap bytes 0 and 2 of each 3-byte pixel */
	for(i=0; i<src_size; i+=3)
	{
		register uint8_t x;
		x = src[i + 2];
		dst[i + 1] = src[i + 1];
		dst[i + 2] = src[i + 0];
		dst[i + 0] = x;
	}
}
1456
/*
 * Interleave planar YUV into packed YUY2 (Y0 U0 Y1 V0 ...).
 * vertLumPerChroma = number of luma lines per chroma line (2 for YV12 input,
 * 1 for YUV422P input); the chroma pointers only advance every
 * vertLumPerChroma-th line. width should be a multiple of 16 for the MMX path
 * (16 luma pixels / 8 chroma samples per inner iteration).
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%eax"
		);
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
		/* Alpha MVI path: builds two output lines at once (qdst/qdst2),
		   4 pixels per macro invocation using byte-unpack instructions. */
#define pl2yuy2(n)					\
	y1 = yc[n];					\
	y2 = yc2[n];					\
	u = uc[n];					\
	v = vc[n];					\
	asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));	\
	asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));	\
	asm("unpkbl %1, %0" : "=r"(u) : "r"(u));	\
	asm("unpkbl %1, %0" : "=r"(v) : "r"(v));	\
	yuv1 = (u << 8) + (v << 24);			\
	yuv2 = yuv1 + y2;				\
	yuv1 += y1;					\
	qdst[n] = yuv1;					\
	qdst2[n] = yuv2;

		int i;
		uint64_t *qdst = (uint64_t *) dst;
		uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
		const uint32_t *yc = (uint32_t *) ysrc;
		const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
		const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
		for(i = 0; i < chromWidth; i += 8){
			uint64_t y1, y2, yuv1, yuv2;
			uint64_t u, v;
			/* Prefetch */
			asm("ldq $31,64(%0)" :: "r"(yc));
			asm("ldq $31,64(%0)" :: "r"(yc2));
			asm("ldq $31,64(%0)" :: "r"(uc));
			asm("ldq $31,64(%0)" :: "r"(vc));

			pl2yuy2(0);
			pl2yuy2(1);
			pl2yuy2(2);
			pl2yuy2(3);

			yc += 4;
			yc2 += 4;
			uc += 4;
			vc += 4;
			qdst += 4;
			qdst2 += 4;
		}
		/* this path consumed two luma lines, so advance one extra line here */
		y++;
		ysrc += lumStride;
		dst += dstStride;

#elif __WORDSIZE >= 64
		/* 64-bit scalar path: pack 4 pixels (2 YUYV dwords) per store */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
			l = yc[2] + (uc[1] << 8) +
			    (yc[3] << 16) + (vc[1] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		/* generic 32-bit scalar path: one YUYV dword per chroma sample */
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
				(yc[1] << 8) + (vc[0] << 0);
#else
			*idst++ = yc[0] + (uc[0] << 8) +
				(yc[1] << 16) + (vc[0] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* advance chroma only once every vertLumPerChroma luma lines
		   (vertLumPerChroma must be a power of two for this mask test) */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm(    EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1596
1597 /**
1598 *
1599 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1600 * problem for anyone then tell me, and ill fix it)
1601 */
1602 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1603 unsigned int width, unsigned int height,
1604 int lumStride, int chromStride, int dstStride)
1605 {
1606 //FIXME interpolate chroma
1607 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1608 }
1609
/*
 * Interleave planar YUV into packed UYVY (U0 Y0 V0 Y1 ...).
 * Same structure as yuvPlanartoyuy2 above, only the byte order of the output
 * differs (chroma first). vertLumPerChroma = luma lines per chroma line.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
	unsigned y;
	const unsigned chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm0, %%mm4		\n\t" // UVUV UVUV(0)
			"movq %%mm2, %%mm6		\n\t" // UVUV UVUV(8)
			"punpcklbw %%mm3, %%mm0		\n\t" // UYVY UYVY(0)
			"punpckhbw %%mm3, %%mm4		\n\t" // UYVY UYVY(4)
			"punpcklbw %%mm5, %%mm2		\n\t" // UYVY UYVY(8)
			"punpckhbw %%mm5, %%mm6		\n\t" // UYVY UYVY(12)

			MOVNTQ" %%mm0, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm2, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%eax"
		);
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
		/* 64-bit scalar path: pack 4 pixels (2 UYVY dwords) per store */
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = uc[0] + (yc[0] << 8) +
			    (vc[0] << 16) + (yc[1] << 24);
			l = uc[1] + (yc[2] << 8) +
			    (vc[1] << 16) + (yc[3] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		/* generic 32-bit scalar path: one UYVY dword per chroma sample */
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
				(vc[0] << 8) + (yc[1] << 0);
#else
			*idst++ = uc[0] + (yc[0] << 8) +
				(vc[0] << 16) + (yc[1] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* chroma advances once every vertLumPerChroma luma lines (power of 2) */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
	asm(    EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1703
1704 /**
1705 *
1706 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1707 * problem for anyone then tell me, and ill fix it)
1708 */
1709 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1710 unsigned int width, unsigned int height,
1711 int lumStride, int chromStride, int dstStride)
1712 {
1713 //FIXME interpolate chroma
1714 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1715 }
1716
1717 /**
1718 *
1719 * width should be a multiple of 16
1720 */
1721 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1722 unsigned int width, unsigned int height,
1723 int lumStride, int chromStride, int dstStride)
1724 {
1725 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1726 }
1727
1728 /**
1729 *
1730 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1731 * problem for anyone then tell me, and ill fix it)
1732 */
1733 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1734 unsigned int width, unsigned int height,
1735 int lumStride, int chromStride, int srcStride)
1736 {
1737 unsigned y;
1738 const unsigned chromWidth= width>>1;
1739 for(y=0; y<height; y+=2)
1740 {
1741 #ifdef HAVE_MMX
1742 asm volatile(
1743 "xorl %%eax, %%eax \n\t"
1744 "pcmpeqw %%mm7, %%mm7 \n\t"
1745 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1746 ".balign 16 \n\t"
1747 "1: \n\t"
1748 PREFETCH" 64(%0, %%eax, 4) \n\t"
1749 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1750 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1751 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1752 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1753 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1754 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1755 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1756 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1757 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1758 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1759
1760 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1761
1762 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1763 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1764 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1765 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1766 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1767 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1768 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1769 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1770 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1771 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1772
1773 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1774
1775 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1776 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1777 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1778 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1779 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1780 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1781 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1782 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1783
1784 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1785 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1786
1787 "addl $8, %%eax \n\t"
1788 "cmpl %4, %%eax \n\t"
1789 " jb 1b \n\t"
1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1791 : "memory", "%eax"
1792 );
1793
1794 ydst += lumStride;
1795 src += srcStride;
1796
1797 asm volatile(
1798 "xorl %%eax, %%eax \n\t"
1799 ".balign 16 \n\t"
1800 "1: \n\t"
1801 PREFETCH" 64(%0, %%eax, 4) \n\t"
1802 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1803 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1804 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1805 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1806 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1807 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1808 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1809 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1810 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1811 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1812
1813 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1814 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1815
1816 "addl $8, %%eax \n\t"
1817 "cmpl %4, %%eax \n\t"
1818 " jb 1b \n\t"
1819
1820 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1821 : "memory", "%eax"
1822 );
1823 #else
1824 unsigned i;
1825 for(i=0; i<chromWidth; i++)
1826 {
1827 ydst[2*i+0] = src[4*i+0];
1828 udst[i] = src[4*i+1];
1829 ydst[2*i+1] = src[4*i+2];
1830 vdst[i] = src[4*i+3];
1831 }
1832 ydst += lumStride;
1833 src += srcStride;
1834
1835 for(i=0; i<chromWidth; i++)
1836 {
1837 ydst[2*i+0] = src[4*i+0];
1838 ydst[2*i+1] = src[4*i+2];
1839 }
1840 #endif
1841 udst += chromStride;
1842 vdst += chromStride;
1843 ydst += lumStride;
1844 src += srcStride;
1845 }
1846 #ifdef HAVE_MMX
1847 asm volatile( EMMS" \n\t"
1848 SFENCE" \n\t"
1849 :::"memory");
1850 #endif
1851 }
1852
/*
 * Partial YVU9 -> YV12 conversion: only the luma plane is copied; the
 * U and V destination planes are left completely untouched (the required
 * 2x chroma upscaling is not implemented, see XXX below), so usrc/vsrc,
 * udst/vdst and chromStride are currently unused.
 * NOTE(review): the memcpy assumes the luma planes are contiguous, i.e.
 * lumStride == width for both src and dst -- confirm with callers.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
	uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height, int lumStride, int chromStride)
{
	/* Y Plane */
	memcpy(ydst, ysrc, width*height);

	/* XXX: implement upscaling for U,V */
}
1862
/*
 * Upscale one plane by 2x in both dimensions with bilinear-style
 * interpolation: every output pixel is a (3*a + b)/4 blend of its two
 * nearest source pixels. First and last output rows/columns just replicate
 * the edge samples. The MMX2/3DNow path uses chained PAVGB instructions:
 * avg(avg(b,a),a) approximates (3*a + b + rounding)/4 without multiplies.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;
	
	dst[0]= src[0];
        
	// first line
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
	
	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const int mmxSize= srcWidth&~15; /* multiple of 16 handled by asm, rest by C below */
		asm volatile(
			"movl %4, %%eax			\n\t" /* negative index counting up to 0 */
			"1:				\n\t"
			"movq (%0, %%eax), %%mm0	\n\t" /* current line */
			"movq (%1, %%eax), %%mm1	\n\t" /* next line */
			"movq 1(%0, %%eax), %%mm2	\n\t" /* current line, shifted by 1 */
			"movq 1(%1, %%eax), %%mm3	\n\t" /* next line, shifted by 1 */
			"movq -1(%0, %%eax), %%mm4	\n\t"
			"movq -1(%1, %%eax), %%mm5	\n\t"
			PAVGB" %%mm0, %%mm5		\n\t" /* two chained pavg: mm5 ~= (3*mm0 + mm5)/4 */
			PAVGB" %%mm0, %%mm3		\n\t"
			PAVGB" %%mm0, %%mm5		\n\t"
			PAVGB" %%mm0, %%mm3		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t" /* same weighting toward the next line */
			PAVGB" %%mm1, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm1, %%mm2		\n\t"
			"movq %%mm5, %%mm7		\n\t"
			"movq %%mm4, %%mm6		\n\t"
			"punpcklbw %%mm3, %%mm5		\n\t" /* interleave into 2x-wide output */
			"punpckhbw %%mm3, %%mm7		\n\t"
			"punpcklbw %%mm2, %%mm4		\n\t"
			"punpckhbw %%mm2, %%mm6		\n\t"
#if 1
			MOVNTQ" %%mm5, (%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm7, 8(%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm4, (%3, %%eax, 2)	\n\t"
			MOVNTQ" %%mm6, 8(%3, %%eax, 2)	\n\t"
#else
			"movq %%mm5, (%2, %%eax, 2)	\n\t"
			"movq %%mm7, 8(%2, %%eax, 2)	\n\t"
			"movq %%mm4, (%3, %%eax, 2)	\n\t"
			"movq %%mm6, 8(%3, %%eax, 2)	\n\t"
#endif
			"addl $8, %%eax			\n\t"
			" js 1b				\n\t"
			:: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"

		);
#else
		const int mmxSize=1;
#endif
		/* left edge column: vertical blend only */
		dst[0        ]= (3*src[0] +   src[srcStride])>>2;
		dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

		for(x=mmxSize-1; x<srcWidth-1; x++){
			dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
			dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
		}
		/* right edge column */
		dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
		dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

		dst+=dstStride*2;
		src+=srcStride;
	}

	// last line
#if 1
	dst[0]= src[0];
        
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];
#else
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#endif

#ifdef HAVE_MMX
	asm volatile(   EMMS" \n\t"
			SFENCE" \n\t"
			:::"memory");
#endif
}
1964
1965 /**
1966 *
1967 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1968 * problem for anyone then tell me, and ill fix it)
1969 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1970 */
1971 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1972 unsigned int width, unsigned int height,
1973 int lumStride, int chromStride, int srcStride)
1974 {
1975 unsigned y;
1976 const unsigned chromWidth= width>>1;
1977 for(y=0; y<height; y+=2)
1978 {
1979 #ifdef HAVE_MMX
1980 asm volatile(
1981 "xorl %%eax, %%eax \n\t"
1982 "pcmpeqw %%mm7, %%mm7 \n\t"
1983 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1984 ".balign 16 \n\t"
1985 "1: \n\t"
1986 PREFETCH" 64(%0, %%eax, 4) \n\t"
1987 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1988 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1989 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1990 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1991 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1992 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1993 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1994 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1995 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1996 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1997
1998 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1999
2000 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2001 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2002 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2003 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2004 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2005 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2006 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2007 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2008 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2009 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2010
2011 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2012
2013 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2014 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2015 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2016 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2017 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2018 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2019 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2020 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2021
2022 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2023 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2024
2025 "addl $8, %%eax \n\t"
2026 "cmpl %4, %%eax \n\t"
2027 " jb 1b \n\t"
2028 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2029 : "memory", "%eax"
2030 );
2031
2032 ydst += lumStride;
2033 src += srcStride;
2034
2035 asm volatile(
2036 "xorl %%eax, %%eax \n\t"
2037 ".balign 16 \n\t"
2038 "1: \n\t"
2039 PREFETCH" 64(%0, %%eax, 4) \n\t"
2040 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2041 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2042 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2043 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2044 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2045 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2046 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2047 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2048 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2049 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2050
2051 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2052 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2053
2054 "addl $8, %%eax \n\t"
2055 "cmpl %4, %%eax \n\t"
2056 " jb 1b \n\t"
2057
2058 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2059 : "memory", "%eax"
2060 );
2061 #else
2062 unsigned i;
2063 for(i=0; i<chromWidth; i++)
2064 {
2065 udst[i] = src[4*i+0];
2066 ydst[2*i+0] = src[4*i+1];
2067 vdst[i] = src[4*i+2];
2068 ydst[2*i+1] = src[4*i+3];
2069 }
2070 ydst += lumStride;
2071 src += srcStride;
2072
2073 for(i=0; i<chromWidth; i++)
2074 {
2075 ydst[2*i+0] = src[4*i+1];
2076 ydst[2*i+1] = src[4*i+3];
2077 }
2078 #endif
2079 udst += chromStride;
2080 vdst += chromStride;
2081 ydst += lumStride;
2082 src += srcStride;
2083 }
2084 #ifdef HAVE_MMX
2085 asm volatile( EMMS" \n\t"
2086 SFENCE" \n\t"
2087 :::"memory");
2088 #endif
2089 }
2090
2091 /**
2092 *
2093 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2094 * problem for anyone then tell me, and ill fix it)
2095 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
2096 */
2097 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2098 unsigned int width, unsigned int height,
2099 int lumStride, int chromStride, int srcStride)
2100 {
2101 unsigned y;
2102 const unsigned chromWidth= width>>1;
2103 #ifdef HAVE_MMX
2104 for(y=0; y<height-2; y+=2)
2105 {
2106 unsigned i;
2107 for(i=0; i<2; i++)
2108 {
2109 asm volatile(
2110 "movl %2, %%eax \n\t"
2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2112 "movq "MANGLE(w1111)", %%mm5 \n\t"
2113 "pxor %%mm7, %%mm7 \n\t"
2114 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2115 ".balign 16 \n\t"
2116 "1: \n\t"
2117 PREFETCH" 64(%0, %%ebx) \n\t"
2118 "movd (%0, %%ebx), %%mm0 \n\t"
2119 "movd 3(%0, %%ebx), %%mm1 \n\t"
2120 "punpcklbw %%mm7, %%mm0 \n\t"
2121 "punpcklbw %%mm7, %%mm1 \n\t"
2122 "movd 6(%0, %%ebx), %%mm2 \n\t"
2123 "movd 9(%0, %%ebx), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm3 \n\t"
2126 "pmaddwd %%mm6, %%mm0 \n\t"
2127 "pmaddwd %%mm6, %%mm1 \n\t"
2128 "pmaddwd %%mm6, %%mm2 \n\t"
2129 "pmaddwd %%mm6, %%mm3 \n\t"
2130 #ifndef FAST_BGR2YV12
2131 "psrad $8, %%mm0 \n\t"
2132 "psrad $8, %%mm1 \n\t"
2133 "psrad $8, %%mm2 \n\t"
2134 "psrad $8, %%mm3 \n\t"
2135 #endif
2136 "packssdw %%mm1, %%mm0 \n\t"
2137 "packssdw %%mm3, %%mm2 \n\t"
2138 "pmaddwd %%mm5, %%mm0 \n\t"
2139 "pmaddwd %%mm5, %%mm2 \n\t"
2140 "packssdw %%mm2, %%mm0 \n\t"
2141 "psraw $7, %%mm0 \n\t"
2142
2143 "movd 12(%0, %%ebx), %%mm4 \n\t"
2144 "movd 15(%0, %%ebx), %%mm1 \n\t"
2145 "punpcklbw %%mm7, %%mm4 \n\t"
2146 "punpcklbw %%mm7, %%mm1 \n\t"
2147 "movd 18(%0, %%ebx), %%mm2 \n\t"
2148 "movd 21(%0, %%ebx), %%mm3 \n\t"
2149 "punpcklbw %%mm7, %%mm2 \n\t"
2150 "punpcklbw %%mm7, %%mm3 \n\t"
2151 "pmaddwd %%mm6, %%mm4 \n\t"
2152 "pmaddwd %%mm6, %%mm1 \n\t"
2153 "pmaddwd %%mm6, %%mm2 \n\t"
2154 "pmaddwd %%mm6, %%mm3 \n\t"
2155 #ifndef FAST_BGR2YV12
2156 "psrad $8, %%mm4 \n\t"
2157 "psrad $8, %%mm1 \n\t"
2158 "psrad $8, %%mm2 \n\t"
2159 "psrad $8, %%mm3 \n\t"
2160 #endif
2161 "packssdw %%mm1, %%mm4 \n\t"
2162 "packssdw %%mm3, %%mm2 \n\t"
2163 "pmaddwd %%mm5, %%mm4 \n\t"
2164 "pmaddwd %%mm5, %%mm2 \n\t"
2165 "addl $24, %%ebx \n\t"
2166 "packssdw %%mm2, %%mm4 \n\t"
2167 "psraw $7, %%mm4 \n\t"
2168
2169 "packuswb %%mm4, %%mm0 \n\t"
2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2171
2172 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2173 "addl $8, %%eax \n\t"
2174 " js 1b \n\t"
2175 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2176 : "%eax", "%ebx"
2177 );
2178 ydst += lumStride;
2179 src += srcStride;
2180 }
2181 src -= srcStride*2;
2182 asm volatile(
2183 "movl %4, %%eax \n\t"
2184 "movq "MANGLE(w1111)", %%mm5 \n\t"
2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2186 "pxor %%mm7, %%mm7 \n\t"
2187 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2188 "addl %%ebx, %%ebx \n\t"
2189 ".balign 16 \n\t"
2190 "1: \n\t"
2191 PREFETCH" 64(%0, %%ebx) \n\t"
2192 PREFETCH" 64(%1, %%ebx) \n\t"
2193 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2194 "movq (%0, %%ebx), %%mm0 \n\t"
2195 "movq (%1, %%ebx), %%mm1 \n\t"
2196 "movq 6(%0, %%ebx), %%mm2 \n\t"
2197 "movq 6(%1, %%ebx), %%mm3 \n\t"
2198 PAVGB" %%mm1, %%mm0 \n\t"
2199 PAVGB" %%mm3, %%mm2 \n\t"
2200 "movq %%mm0, %%mm1 \n\t"
2201 "movq %%mm2, %%mm3 \n\t"
2202 "psrlq $24, %%mm0 \n\t"
2203 "psrlq $24, %%mm2 \n\t"
2204 PAVGB" %%mm1, %%mm0 \n\t"
2205 PAVGB" %%mm3, %%mm2 \n\t"
2206 "punpcklbw %%mm7, %%mm0 \n\t"
2207 "punpcklbw %%mm7, %%mm2 \n\t"
2208 #else
2209 "movd (%0, %%ebx), %%mm0 \n\t"
2210 "movd (%1, %%ebx), %%mm1 \n\t"
2211 "movd 3(%0, %%ebx), %%mm2 \n\t"
2212 "movd 3(%1, %%ebx), %%mm3 \n\t"
2213 "punpcklbw %%mm7, %%mm0 \n\t"
2214 "punpcklbw %%mm7, %%mm1 \n\t"
2215 "punpcklbw %%mm7, %%mm2 \n\t"
2216 "punpcklbw %%mm7, %%mm3 \n\t"
2217 "paddw %%mm1, %%mm0 \n\t"
2218 "paddw %%mm3, %%mm2 \n\t"
2219 "paddw %%mm2, %%mm0 \n\t"
2220 "movd 6(%0, %%ebx), %%mm4 \n\t"
2221 "movd 6(%1, %%ebx), %%mm1 \n\t"
2222 "movd 9(%0, %%ebx), %%mm2 \n\t"
2223 "movd 9(%1, %%ebx), %%mm3 \n\t"
2224 "punpcklbw %%mm7, %%mm4 \n\t"
2225 "punpcklbw %%mm7, %%mm1 \n\t"
2226 "punpcklbw %%mm7, %%mm2 \n\t"
2227 "punpcklbw %%mm7, %%mm3 \n\t"
2228 "paddw %%mm1, %%mm4 \n\t"
2229 "paddw %%mm3, %%mm2 \n\t"
2230 "paddw %%mm4, %%mm2 \n\t"
2231 "psrlw $2, %%mm0 \n\t"
2232 "psrlw $2, %%mm2 \n\t"
2233 #endif
2234 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2235 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2236
2237 "pmaddwd %%mm0, %%mm1 \n\t"
2238 "pmaddwd %%mm2, %%mm3 \n\t"
2239 "pmaddwd %%mm6, %%mm0 \n\t"
2240 "pmaddwd %%mm6, %%mm2 \n\t"
2241 #ifndef FAST_BGR2YV12
2242 "psrad $8, %%mm0 \n\t"
2243 "psrad $8, %%mm1 \n\t"
2244 "psrad $8, %%mm2 \n\t"
2245 "psrad $8, %%mm3 \n\t"
2246 #endif
2247 "packssdw %%mm2, %%mm0 \n\t"
2248 "packssdw %%mm3, %%mm1 \n\t"
2249 "pmaddwd %%mm5, %%mm0 \n\t"
2250 "pmaddwd %%mm5, %%mm1 \n\t"
2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2252 "psraw $7, %%mm0 \n\t"
2253
2254 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2255 "movq 12(%0, %%ebx), %%mm4 \n\t"
2256 "movq 12(%1, %%ebx), %%mm1 \n\t"
2257 "movq 18(%0, %%ebx), %%mm2 \n\t"
2258 "movq 18(%1, %%ebx), %%mm3 \n\t"
2259 PAVGB" %%mm1, %%mm4 \n\t"
2260 PAVGB" %%mm3, %%mm2 \n\t"
2261 "movq %%mm4, %%mm1 \n\t"
2262 "movq %%mm2, %%mm3 \n\t"
2263 "psrlq $24, %%mm4 \n\t"
2264 "psrlq $24, %%mm2 \n\t"
2265 PAVGB" %%mm1, %%mm4 \n\t"
2266 PAVGB" %%mm3, %%mm2 \n\t"
2267 "punpcklbw %%mm7, %%mm4 \n\t"
2268 "punpcklbw %%mm7, %%mm2 \n\t"
2269 #else
2270 "movd 12(%0, %%ebx), %%mm4 \n\t"
2271 "movd 12(%1, %%ebx), %%mm1 \n\t"
2272 "movd 15(%0, %%ebx), %%mm2 \n\t"
2273 "movd 15(%1, %%ebx), %%mm3 \n\t"
2274 "punpcklbw %%mm7, %%mm4 \n\t"
2275 "punpcklbw %%mm7, %%mm1 \n\t"
2276 "punpcklbw %%mm7, %%mm2 \n\t"
2277 "punpcklbw %%mm7, %%mm3 \n\t"
2278 "paddw %%mm1, %%mm4 \n\t"
2279 "paddw %%mm3, %%mm2 \n\t"
2280 "paddw %%mm2, %%mm4 \n\t"
2281 "movd 18(%0, %%ebx), %%mm5 \n\t"
2282 "movd 18(%1, %%ebx), %%mm1 \n\t"
2283 "movd 21(%0, %%ebx), %%mm2 \n\t"
2284 "movd 21(%1, %%ebx), %%mm3 \n\t"
2285 "punpcklbw %%mm7, %%mm5 \n\t"
2286 "punpcklbw %%mm7, %%mm1 \n\t"
2287 "punpcklbw %%mm7, %%mm2 \n\t"
2288 "punpcklbw %%mm7, %%mm3 \n\t"
2289 "paddw %%mm1, %%mm5 \n\t"
2290 "paddw %%mm3, %%mm2 \n\t"
2291 "paddw %%mm5, %%mm2 \n\t"
2292 "movq "MANGLE(w1111)", %%mm5 \n\t"
2293 "psrlw $2, %%mm4 \n\t"
2294 "psrlw $2, %%mm2 \n\t"
2295 #endif
2296 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2297 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2298
2299 "pmaddwd %%mm4, %%mm1 \n\t"
2300 "pmaddwd %%mm2, %%mm3 \n\t"
2301 "pmaddwd %%mm6, %%mm4 \n\t"
2302 "pmaddwd %%mm6, %%mm2 \n\t"
2303 #ifndef FAST_BGR2YV12
2304 "psrad $8, %%mm4 \n\t"
2305 "psrad $8, %%mm1 \n\t"
2306 "psrad $8, %%mm2 \n\t"
2307 "psrad $8, %%mm3 \n\t"
2308 #endif
2309 "packssdw %%mm2, %%mm4 \n\t"
2310 "packssdw %%mm3, %%mm1 \n\t"
2311 "pmaddwd %%mm5, %%mm4 \n\t"
2312 "pmaddwd %%mm5, %%mm1 \n\t"
2313 "addl $24, %%ebx \n\t"
2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2315 "psraw $7, %%mm4 \n\t"
2316
2317 "movq %%mm0, %%mm1 \n\t"
2318 "punpckldq %%mm4, %%mm0 \n\t"
2319 "punpckhdq %%mm4, %%mm1 \n\t"
2320 "packsswb %%mm1, %%mm0 \n\t"
2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2322
2323 "movd %%mm0, (%2, %%eax) \n\t"
2324 "punpckhdq %%mm0, %%mm0 \n\t"
2325 "movd %%mm0, (%3, %%eax) \n\t"
2326 "addl $4, %%eax \n\t"
2327 " js 1b \n\t"
2328 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2329 : "%eax", "%ebx"
2330 );
2331
2332 udst += chromStride;
2333 vdst += chromStride;
2334 src += srcStride*2;
2335 }
2336
2337 asm volatile( EMMS" \n\t"
2338 SFENCE" \n\t"
2339 :::"memory");
2340 #else
2341 y=0;
2342 #endif
2343 for(; y<height; y+=2)
2344 {
2345 unsigned i;
2346 for(i=0; i<chromWidth; i++)
2347 {
2348 unsigned int b= src[6*i+0];
2349 unsigned int g= src[6*i+1];
2350 unsigned int r= src[6*i+2];
2351
2352 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2353 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2354 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2355
2356 udst[i] = U;
2357 vdst[i] = V;
2358 ydst[2*i] = Y;
2359
2360 b= src[6*i+3];
2361 g= src[6*i+4];
2362 r= src[6*i+5];
2363
2364 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2365 ydst[2*i+1] = Y;
2366 }
2367 ydst += lumStride;
2368 src += srcStride;
2369
2370 for(i=0; i<chromWidth; i++)
2371 {
2372 unsigned int b= src[6*i+0];
2373 unsigned int g= src[6*i+1];
2374 unsigned int r= src[6*i+2];
2375
2376 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2377
2378 ydst[2*i] = Y;
2379
2380 b= src[6*i+3];
2381 g= src[6*i+4];
2382 r= src[6*i+5];
2383
2384 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2385 ydst[2*i+1] = Y;
2386 }
2387 udst += chromStride;
2388 vdst += chromStride;
2389 ydst += lumStride;
2390 src += srcStride;
2391 }
2392 }
2393
/*
 * Byte-interleave two planes into one: for every row,
 *   dest[2*w+0] = src1[w], dest[2*w+1] = src2[w]   for w in [0,width).
 * Typically used to merge two separate planes into one interleaved plane.
 * All strides are in bytes; dest rows are 2*width bytes long.
 *
 * NOTE(review): the SIMD loop bound "width-15" is unsigned and wraps for
 * width < 16 -- presumably callers always pass width >= 16; confirm.
 * NOTE(review): the SSE2 path loads with movdqa, which faults unless the
 * src1/src2 rows are 16-byte aligned -- verify against callers.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
	unsigned width, unsigned height, int src1Stride,
	int src2Stride, int dstStride){
    unsigned h;

    for(h=0; h < height; h++)
    {
	unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
	/* SSE2: 16 bytes from each source per iteration; punpck{l,h}bw
	   interleaves them byte-wise and movntdq streams the 32 result
	   bytes to dest, bypassing the cache.  (%%xmm1 is a second load of
	   the same src1 data, used for the high-half unpack.) */
	asm(
		"xorl %%eax, %%eax \n\t"
		"1: \n\t"
		PREFETCH" 64(%1, %%eax) \n\t"
		PREFETCH" 64(%2, %%eax) \n\t"
		"movdqa (%1, %%eax), %%xmm0 \n\t"
		"movdqa (%1, %%eax), %%xmm1 \n\t"
		"movdqa (%2, %%eax), %%xmm2 \n\t"
		"punpcklbw %%xmm2, %%xmm0 \n\t"
		"punpckhbw %%xmm2, %%xmm1 \n\t"
		"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
		"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
		"addl $16, %%eax \n\t"
		"cmpl %3, %%eax \n\t"
		" jb 1b \n\t"
		::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
		: "memory", "%eax"
	);
#else
	/* MMX: same interleave, 16 source bytes per iteration spread over
	   four 8-byte mm registers; MOVNTQ is movntq (streaming) on MMX2,
	   plain movq otherwise. */
	asm(
		"xorl %%eax, %%eax \n\t"
		"1: \n\t"
		PREFETCH" 64(%1, %%eax) \n\t"
		PREFETCH" 64(%2, %%eax) \n\t"
		"movq (%1, %%eax), %%mm0 \n\t"
		"movq 8(%1, %%eax), %%mm2 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"movq (%2, %%eax), %%mm4 \n\t"
		"movq 8(%2, %%eax), %%mm5 \n\t"
		"punpcklbw %%mm4, %%mm0 \n\t"
		"punpckhbw %%mm4, %%mm1 \n\t"
		"punpcklbw %%mm5, %%mm2 \n\t"
		"punpckhbw %%mm5, %%mm3 \n\t"
		MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
		MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
		MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
		MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
		"addl $16, %%eax \n\t"
		"cmpl %3, %%eax \n\t"
		" jb 1b \n\t"
		::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
		: "memory", "%eax"
	);
#endif
	/* scalar tail: the remaining width%16 bytes of the row */
	for(w= (width&(~15)); w < width; w++)
	{
		dest[2*w+0] = src1[w];
		dest[2*w+1] = src2[w];
	}
#else
	/* pure C fallback when no MMX is available */
	for(w=0; w < width; w++)
	{
		dest[2*w+0] = src1[w];
		dest[2*w+1] = src2[w];
	}
#endif
	dest += dstStride;
	src1 += src1Stride;
	src2 += src2Stride;
    }
#ifdef HAVE_MMX
	/* leave MMX state (emms/femms) and flush write-combining stores
	   (sfence on MMX2) before returning to FPU-using code */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}
2474
/*
 * Nearest-neighbour 2x upscale of two planes (name suggests the chroma
 * planes when converting VU9-style quarter-resolution chroma to VU12-style
 * half-resolution chroma -- confirm against callers).
 * For each plane: w = width/2 source bytes per row are each written twice
 * horizontally (d[2*x] = d[2*x+1] = s[x]), and each source row is reused
 * for two consecutive output rows (source row index y>>1); h = height/2
 * output rows are produced.  src1->dst1 and src2->dst2 are processed
 * independently, each with its own stride pair (strides in bytes).
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
			uint8_t *dst1, uint8_t *dst2,
			unsigned width, unsigned height,
			int srcStride1, int srcStride2,
			int dstStride1, int dstStride2)
{
	unsigned int y,x,h;
	int w;
	w=width/2; h=height/2;
#ifdef HAVE_MMX
	/* warm the cache with the second row of each source plane */
	asm volatile(
		PREFETCH" %0\n\t"
		PREFETCH" %1\n\t"
		::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
	for(y=0;y<h;y++){
	const uint8_t* s1=src1+srcStride1*(y>>1);
	uint8_t* d=dst1+dstStride1*y;
	x=0;
#ifdef HAVE_MMX
	/* 32 source bytes -> 64 output bytes per iteration; punpcklbw /
	   punpckhbw of a register with itself duplicates each byte */
	for(;x<w-31;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq 8%1, %%mm2\n\t"
		"movq 16%1, %%mm4\n\t"
		"movq 24%1, %%mm6\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm2, %%mm3\n\t"
		"movq %%mm4, %%mm5\n\t"
		"movq %%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm2, 16%0\n\t"
		MOVNTQ" %%mm3, 24%0\n\t"
		MOVNTQ" %%mm4, 32%0\n\t"
		MOVNTQ" %%mm5, 40%0\n\t"
		MOVNTQ" %%mm6, 48%0\n\t"
		MOVNTQ" %%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s1[x])
		:"memory");
	}
#endif
	/* scalar tail (or full row without MMX): duplicate each byte */
	for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
	}
	for(y=0;y<h;y++){
	const uint8_t* s2=src2+srcStride2*(y>>1);
	uint8_t* d=dst2+dstStride2*y;
	x=0;
#ifdef HAVE_MMX
	/* identical duplication loop for the second plane */
	for(;x<w-31;x+=32)
	{
		asm volatile(
		PREFETCH" 32%1\n\t"
		"movq %1, %%mm0\n\t"
		"movq 8%1, %%mm2\n\t"
		"movq 16%1, %%mm4\n\t"
		"movq 24%1, %%mm6\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm2, %%mm3\n\t"
		"movq %%mm4, %%mm5\n\t"
		"movq %%mm6, %%mm7\n\t"
		"punpcklbw %%mm0, %%mm0\n\t"
		"punpckhbw %%mm1, %%mm1\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"
		"punpckhbw %%mm3, %%mm3\n\t"
		"punpcklbw %%mm4, %%mm4\n\t"
		"punpckhbw %%mm5, %%mm5\n\t"
		"punpcklbw %%mm6, %%mm6\n\t"
		"punpckhbw %%mm7, %%mm7\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		MOVNTQ" %%mm1, 8%0\n\t"
		MOVNTQ" %%mm2, 16%0\n\t"
		MOVNTQ" %%mm3, 24%0\n\t"
		MOVNTQ" %%mm4, 32%0\n\t"
		MOVNTQ" %%mm5, 40%0\n\t"
		MOVNTQ" %%mm6, 48%0\n\t"
		MOVNTQ" %%mm7, 56%0"
		:"=m"(d[2*x])
		:"m"(s2[x])
		:"memory");
	}
#endif
	for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
	}
#ifdef HAVE_MMX
	/* restore FPU state and drain streaming stores */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}
2578
/*
 * Pack three planes into an interleaved Y,U,Y,V byte stream (YUY2-style).
 * For output row y: luma comes from src1 row y, while src2/src3 rows y>>2
 * are used (each chroma row serves 4 output rows); horizontally each
 * chroma sample is repeated across 4 luma pixels.  Concretely, for each x:
 *   d[8x..8x+7] = yp[4x], up[x], yp[4x+1], vp[x], yp[4x+2], up[x], yp[4x+3], vp[x]
 * i.e. src2 bytes land in the U positions and src3 bytes in the V
 * positions of each Y,U,Y,V quad.  This matches YVU9/YUV410-type input
 * (one chroma sample per 4x4 luma block) -- confirm plane order with
 * callers.  Strides are in bytes.
 *
 * NOTE(review): the MMX loop bound "w-7" is unsigned and wraps for w < 8
 * (i.e. width < 16) -- presumably callers guarantee larger widths; confirm.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
			uint8_t *dst,
			unsigned width, unsigned height,
			int srcStride1, int srcStride2,
			int srcStride3, int dstStride)
{
	unsigned y,x,w,h;
	w=width/2; h=height;
	for(y=0;y<h;y++){
	const uint8_t* yp=src1+srcStride1*y;
	const uint8_t* up=src2+srcStride2*(y>>2);
	const uint8_t* vp=src3+srcStride3*(y>>2);
	uint8_t* d=dst+dstStride*y;
	x=0;
#ifdef HAVE_MMX
	/* 8 chroma samples (= 32 luma bytes) per iteration, producing 64
	   interleaved output bytes via MOVNTQ streaming stores */
	for(;x<w-7;x+=8)
	{
		asm volatile(
		PREFETCH" 32(%1, %0)\n\t"
		PREFETCH" 32(%2, %0)\n\t"
		PREFETCH" 32(%3, %0)\n\t"
		"movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
		"movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
		"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
		"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
		"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
		"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
		"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
		"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
		"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

		"movq %%mm1, %%mm6\n\t"
		"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
		"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
		"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
		MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"

		"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
		"movq 8(%1, %0, 4), %%mm0\n\t"
		"movq %%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
		"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
		MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"

		"movq %%mm4, %%mm6\n\t"
		"movq 16(%1, %0, 4), %%mm0\n\t"
		"movq %%mm0, %%mm3\n\t"
		"punpcklbw %%mm5, %%mm4\n\t"
		"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
		"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
		MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"

		"punpckhbw %%mm5, %%mm6\n\t"
		"movq 24(%1, %0, 4), %%mm0\n\t"
		"movq %%mm0, %%mm3\n\t"
		"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
		"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
		MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
		MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"

		: "+r" (x)
		: "r"(yp), "r" (up), "r"(vp), "r"(d)
		:"memory");
	}
#endif
	/* scalar tail (or full row without MMX): one Y,U,Y,V,Y,U,Y,V octet
	   per chroma sample x, luma taken from 4*x..4*x+3 */
	for(; x<w; x++)
	{
		const int x2= x<<2;
		d[8*x+0]=yp[x2];
		d[8*x+1]=up[x];
		d[8*x+2]=yp[x2+1];
		d[8*x+3]=vp[x];
		d[8*x+4]=yp[x2+2];
		d[8*x+5]=up[x];
		d[8*x+6]=yp[x2+3];
		d[8*x+7]=vp[x];
	}
	}
#ifdef HAVE_MMX
	/* restore FPU state and drain streaming stores */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}