Add official GPL header to make license explicit as discussed on ffmpeg-devel.
libswscale/rgb2rgb_template.c
1 /*
2 *
3 * rgb2rgb.c, Software RGB to RGB converter,
4 * plus Software PAL8 to RGB converter,
5 * Software YUV to YUV converter,
6 * Software YUV to RGB converter.
7 * Written by Nick Kurshev.
8 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 */
25
26 #include <stddef.h>
27 #include <inttypes.h> /* for __WORDSIZE */
28
29 #ifndef __WORDSIZE
30 // #warning You have a misconfigured system and will probably lose performance!
31 #define __WORDSIZE MP_WORDSIZE
32 #endif
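/*
 * A note on the fallback above (an assumption, based on this file's
 * MPlayer origins): MP_WORDSIZE is the word size detected by the build
 * system. __WORDSIZE is only used further below to select the 64-bit
 * packed C versions of the YUV interleaving loops.
 */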
33
34 #undef PREFETCH
35 #undef MOVNTQ
36 #undef EMMS
37 #undef SFENCE
38 #undef MMREG_SIZE
39 #undef PREFETCHW
40 #undef PAVGB
41
42 #ifdef HAVE_SSE2
43 #define MMREG_SIZE 16
44 #else
45 #define MMREG_SIZE 8
46 #endif
47
48 #ifdef HAVE_3DNOW
49 #define PREFETCH "prefetch"
50 #define PREFETCHW "prefetchw"
51 #define PAVGB "pavgusb"
52 #elif defined ( HAVE_MMX2 )
53 #define PREFETCH "prefetchnta"
54 #define PREFETCHW "prefetcht0"
55 #define PAVGB "pavgb"
56 #else
57 #ifdef __APPLE__
58 #define PREFETCH "#"
59 #define PREFETCHW "#"
60 #else
61 #define PREFETCH "/nop"
62 #define PREFETCHW "/nop"
63 #endif
64 #endif
65
66 #ifdef HAVE_3DNOW
67 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
68 #define EMMS "femms"
69 #else
70 #define EMMS "emms"
71 #endif
72
73 #ifdef HAVE_MMX2
74 #define MOVNTQ "movntq"
75 #define SFENCE "sfence"
76 #else
77 #define MOVNTQ "movq"
78 #ifdef __APPLE__
79 #define SFENCE "#"
80 #else
81 #define SFENCE "/nop"
82 #endif
83 #endif
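/*
 * Why SFENCE: MOVNTQ is a non-temporal store, i.e. it bypasses the cache
 * and is weakly ordered, so every MMX2 loop below is followed by SFENCE
 * before the written data may safely be read back. Without MMX2 the
 * macros degrade to a plain movq and a nop, and no fence is needed.
 */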
84
85 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
86 {
87 uint8_t *dest = dst;
88 const uint8_t *s = src;
89 const uint8_t *end;
90 #ifdef HAVE_MMX
91 const uint8_t *mm_end;
92 #endif
93 end = s + src_size;
94 #ifdef HAVE_MMX
95 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
96 mm_end = end - 23;
97 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
98 while(s < mm_end)
99 {
100 __asm __volatile(
101 PREFETCH" 32%1\n\t"
102 "movd %1, %%mm0\n\t"
103 "punpckldq 3%1, %%mm0\n\t"
104 "movd 6%1, %%mm1\n\t"
105 "punpckldq 9%1, %%mm1\n\t"
106 "movd 12%1, %%mm2\n\t"
107 "punpckldq 15%1, %%mm2\n\t"
108 "movd 18%1, %%mm3\n\t"
109 "punpckldq 21%1, %%mm3\n\t"
110 "pand %%mm7, %%mm0\n\t"
111 "pand %%mm7, %%mm1\n\t"
112 "pand %%mm7, %%mm2\n\t"
113 "pand %%mm7, %%mm3\n\t"
114 MOVNTQ" %%mm0, %0\n\t"
115 MOVNTQ" %%mm1, 8%0\n\t"
116 MOVNTQ" %%mm2, 16%0\n\t"
117 MOVNTQ" %%mm3, 24%0"
118 :"=m"(*dest)
119 :"m"(*s)
120 :"memory");
121 dest += 32;
122 s += 24;
123 }
124 __asm __volatile(SFENCE:::"memory");
125 __asm __volatile(EMMS:::"memory");
126 #endif
127 while(s < end)
128 {
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131 *dest++ = 0;
132 *dest++ = s[2];
133 *dest++ = s[1];
134 *dest++ = s[0];
135 s+=3;
136 #else
137 *dest++ = *s++;
138 *dest++ = *s++;
139 *dest++ = *s++;
140 *dest++ = 0;
141 #endif
142 }
143 }
144
145 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
146 {
147 uint8_t *dest = dst;
148 const uint8_t *s = src;
149 const uint8_t *end;
150 #ifdef HAVE_MMX
151 const uint8_t *mm_end;
152 #endif
153 end = s + src_size;
154 #ifdef HAVE_MMX
155 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
156 mm_end = end - 31;
157 while(s < mm_end)
158 {
159 __asm __volatile(
160 PREFETCH" 32%1\n\t"
161 "movq %1, %%mm0\n\t"
162 "movq 8%1, %%mm1\n\t"
163 "movq 16%1, %%mm4\n\t"
164 "movq 24%1, %%mm5\n\t"
165 "movq %%mm0, %%mm2\n\t"
166 "movq %%mm1, %%mm3\n\t"
167 "movq %%mm4, %%mm6\n\t"
168 "movq %%mm5, %%mm7\n\t"
169 "psrlq $8, %%mm2\n\t"
170 "psrlq $8, %%mm3\n\t"
171 "psrlq $8, %%mm6\n\t"
172 "psrlq $8, %%mm7\n\t"
173 "pand %2, %%mm0\n\t"
174 "pand %2, %%mm1\n\t"
175 "pand %2, %%mm4\n\t"
176 "pand %2, %%mm5\n\t"
177 "pand %3, %%mm2\n\t"
178 "pand %3, %%mm3\n\t"
179 "pand %3, %%mm6\n\t"
180 "pand %3, %%mm7\n\t"
181 "por %%mm2, %%mm0\n\t"
182 "por %%mm3, %%mm1\n\t"
183 "por %%mm6, %%mm4\n\t"
184 "por %%mm7, %%mm5\n\t"
185
186 "movq %%mm1, %%mm2\n\t"
187 "movq %%mm4, %%mm3\n\t"
188 "psllq $48, %%mm2\n\t"
189 "psllq $32, %%mm3\n\t"
190 "pand %4, %%mm2\n\t"
191 "pand %5, %%mm3\n\t"
192 "por %%mm2, %%mm0\n\t"
193 "psrlq $16, %%mm1\n\t"
194 "psrlq $32, %%mm4\n\t"
195 "psllq $16, %%mm5\n\t"
196 "por %%mm3, %%mm1\n\t"
197 "pand %6, %%mm5\n\t"
198 "por %%mm5, %%mm4\n\t"
199
200 MOVNTQ" %%mm0, %0\n\t"
201 MOVNTQ" %%mm1, 8%0\n\t"
202 MOVNTQ" %%mm4, 16%0"
203 :"=m"(*dest)
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206 :"memory");
207 dest += 24;
208 s += 32;
209 }
210 __asm __volatile(SFENCE:::"memory");
211 __asm __volatile(EMMS:::"memory");
212 #endif
213 while(s < end)
214 {
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
217 s++;
218 dest[2] = *s++;
219 dest[1] = *s++;
220 dest[0] = *s++;
221 dest += 3;
222 #else
223 *dest++ = *s++;
224 *dest++ = *s++;
225 *dest++ = *s++;
226 s++;
227 #endif
228 }
229 }
230
231 /*
232 Original by Strepto/Astral
233 ported to gcc & bugfixed by A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32-bit C version and the and&add trick by Michael Niedermayer
236 */
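/*
 * How the and&add trick in the C fallback below works: RGB15 -> RGB16
 * means shifting the green and red fields (bits 5-14) up by one bit
 * while blue (bits 0-4) stays put. Adding a value to itself is a left
 * shift by one, so
 *
 *     (x & 0x7FFF7FFF) + (x & 0x7FE07FE0)
 *
 * doubles bits 5-14 of two packed pixels at once; the first mask clears
 * bit 15 of each pixel so the carry cannot spill into its neighbour.
 * Example: two white RGB15 pixels 0x7FFF7FFF become 0xFFDFFFDF (green
 * just gains a low zero bit).
 */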
237 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
238 {
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
243 end = s + src_size;
244 #ifdef HAVE_MMX
245 __asm __volatile(PREFETCH" %0"::"m"(*s));
246 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
247 mm_end = end - 15;
248 while(s<mm_end)
249 {
250 __asm __volatile(
251 PREFETCH" 32%1\n\t"
252 "movq %1, %%mm0\n\t"
253 "movq 8%1, %%mm2\n\t"
254 "movq %%mm0, %%mm1\n\t"
255 "movq %%mm2, %%mm3\n\t"
256 "pand %%mm4, %%mm0\n\t"
257 "pand %%mm4, %%mm2\n\t"
258 "paddw %%mm1, %%mm0\n\t"
259 "paddw %%mm3, %%mm2\n\t"
260 MOVNTQ" %%mm0, %0\n\t"
261 MOVNTQ" %%mm2, 8%0"
262 :"=m"(*d)
263 :"m"(*s)
264 );
265 d+=16;
266 s+=16;
267 }
268 __asm __volatile(SFENCE:::"memory");
269 __asm __volatile(EMMS:::"memory");
270 #endif
271 mm_end = end - 3;
272 while(s < mm_end)
273 {
274 register unsigned x= *((uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
276 d+=4;
277 s+=4;
278 }
279 if(s < end)
280 {
281 register unsigned short x= *((uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
283 }
284 }
285
286 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
287 {
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
292 end = s + src_size;
293 #ifdef HAVE_MMX
294 __asm __volatile(PREFETCH" %0"::"m"(*s));
295 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
296 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
297 mm_end = end - 15;
298 while(s<mm_end)
299 {
300 __asm __volatile(
301 PREFETCH" 32%1\n\t"
302 "movq %1, %%mm0\n\t"
303 "movq 8%1, %%mm2\n\t"
304 "movq %%mm0, %%mm1\n\t"
305 "movq %%mm2, %%mm3\n\t"
306 "psrlq $1, %%mm0\n\t"
307 "psrlq $1, %%mm2\n\t"
308 "pand %%mm7, %%mm0\n\t"
309 "pand %%mm7, %%mm2\n\t"
310 "pand %%mm6, %%mm1\n\t"
311 "pand %%mm6, %%mm3\n\t"
312 "por %%mm1, %%mm0\n\t"
313 "por %%mm3, %%mm2\n\t"
314 MOVNTQ" %%mm0, %0\n\t"
315 MOVNTQ" %%mm2, 8%0"
316 :"=m"(*d)
317 :"m"(*s)
318 );
319 d+=16;
320 s+=16;
321 }
322 __asm __volatile(SFENCE:::"memory");
323 __asm __volatile(EMMS:::"memory");
324 #endif
325 mm_end = end - 3;
326 while(s < mm_end)
327 {
328 register uint32_t x= *((uint32_t *)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
330 s+=4;
331 d+=4;
332 }
333 if(s < end)
334 {
335 register uint16_t x= *((uint16_t *)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
337 s+=2;
338 d+=2;
339 }
340 }
341
342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
343 {
344 const uint8_t *s = src;
345 const uint8_t *end;
346 #ifdef HAVE_MMX
347 const uint8_t *mm_end;
348 #endif
349 uint16_t *d = (uint16_t *)dst;
350 end = s + src_size;
351 #ifdef HAVE_MMX
352 mm_end = end - 15;
353 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
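/*
 * Sketch of the idea (the constants are assumed to be defined in
 * rgb2rgb.c): blue and red are isolated with mask3216br, and a single
 * pmaddwd by mul3216 scales both fields into their 5:6:5 slots at once,
 * while green only needs the mask3216g mask; a final 5-bit shift then
 * aligns the packed result.
 */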
354 asm volatile(
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
358 ASMALIGN(4)
359 "1: \n\t"
360 PREFETCH" 32(%1) \n\t"
361 "movd (%1), %%mm0 \n\t"
362 "movd 4(%1), %%mm3 \n\t"
363 "punpckldq 8(%1), %%mm0 \n\t"
364 "punpckldq 12(%1), %%mm3 \n\t"
365 "movq %%mm0, %%mm1 \n\t"
366 "movq %%mm3, %%mm4 \n\t"
367 "pand %%mm6, %%mm0 \n\t"
368 "pand %%mm6, %%mm3 \n\t"
369 "pmaddwd %%mm7, %%mm0 \n\t"
370 "pmaddwd %%mm7, %%mm3 \n\t"
371 "pand %%mm5, %%mm1 \n\t"
372 "pand %%mm5, %%mm4 \n\t"
373 "por %%mm1, %%mm0 \n\t"
374 "por %%mm4, %%mm3 \n\t"
375 "psrld $5, %%mm0 \n\t"
376 "pslld $11, %%mm3 \n\t"
377 "por %%mm3, %%mm0 \n\t"
378 MOVNTQ" %%mm0, (%0) \n\t"
379 "add $16, %1 \n\t"
380 "add $8, %0 \n\t"
381 "cmp %2, %1 \n\t"
382 " jb 1b \n\t"
383 : "+r" (d), "+r"(s)
384 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
385 );
386 #else
387 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
388 __asm __volatile(
389 "movq %0, %%mm7\n\t"
390 "movq %1, %%mm6\n\t"
391 ::"m"(red_16mask),"m"(green_16mask));
392 while(s < mm_end)
393 {
394 __asm __volatile(
395 PREFETCH" 32%1\n\t"
396 "movd %1, %%mm0\n\t"
397 "movd 4%1, %%mm3\n\t"
398 "punpckldq 8%1, %%mm0\n\t"
399 "punpckldq 12%1, %%mm3\n\t"
400 "movq %%mm0, %%mm1\n\t"
401 "movq %%mm0, %%mm2\n\t"
402 "movq %%mm3, %%mm4\n\t"
403 "movq %%mm3, %%mm5\n\t"
404 "psrlq $3, %%mm0\n\t"
405 "psrlq $3, %%mm3\n\t"
406 "pand %2, %%mm0\n\t"
407 "pand %2, %%mm3\n\t"
408 "psrlq $5, %%mm1\n\t"
409 "psrlq $5, %%mm4\n\t"
410 "pand %%mm6, %%mm1\n\t"
411 "pand %%mm6, %%mm4\n\t"
412 "psrlq $8, %%mm2\n\t"
413 "psrlq $8, %%mm5\n\t"
414 "pand %%mm7, %%mm2\n\t"
415 "pand %%mm7, %%mm5\n\t"
416 "por %%mm1, %%mm0\n\t"
417 "por %%mm4, %%mm3\n\t"
418 "por %%mm2, %%mm0\n\t"
419 "por %%mm5, %%mm3\n\t"
420 "psllq $16, %%mm3\n\t"
421 "por %%mm3, %%mm0\n\t"
422 MOVNTQ" %%mm0, %0\n\t"
423 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
424 d += 4;
425 s += 16;
426 }
427 #endif
428 __asm __volatile(SFENCE:::"memory");
429 __asm __volatile(EMMS:::"memory");
430 #endif
431 while(s < end)
432 {
433 register int rgb = *(uint32_t*)s; s += 4;
434 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
435 }
436 }
437
438 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
439 {
440 const uint8_t *s = src;
441 const uint8_t *end;
442 #ifdef HAVE_MMX
443 const uint8_t *mm_end;
444 #endif
445 uint16_t *d = (uint16_t *)dst;
446 end = s + src_size;
447 #ifdef HAVE_MMX
448 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
449 __asm __volatile(
450 "movq %0, %%mm7\n\t"
451 "movq %1, %%mm6\n\t"
452 ::"m"(red_16mask),"m"(green_16mask));
453 mm_end = end - 15;
454 while(s < mm_end)
455 {
456 __asm __volatile(
457 PREFETCH" 32%1\n\t"
458 "movd %1, %%mm0\n\t"
459 "movd 4%1, %%mm3\n\t"
460 "punpckldq 8%1, %%mm0\n\t"
461 "punpckldq 12%1, %%mm3\n\t"
462 "movq %%mm0, %%mm1\n\t"
463 "movq %%mm0, %%mm2\n\t"
464 "movq %%mm3, %%mm4\n\t"
465 "movq %%mm3, %%mm5\n\t"
466 "psllq $8, %%mm0\n\t"
467 "psllq $8, %%mm3\n\t"
468 "pand %%mm7, %%mm0\n\t"
469 "pand %%mm7, %%mm3\n\t"
470 "psrlq $5, %%mm1\n\t"
471 "psrlq $5, %%mm4\n\t"
472 "pand %%mm6, %%mm1\n\t"
473 "pand %%mm6, %%mm4\n\t"
474 "psrlq $19, %%mm2\n\t"
475 "psrlq $19, %%mm5\n\t"
476 "pand %2, %%mm2\n\t"
477 "pand %2, %%mm5\n\t"
478 "por %%mm1, %%mm0\n\t"
479 "por %%mm4, %%mm3\n\t"
480 "por %%mm2, %%mm0\n\t"
481 "por %%mm5, %%mm3\n\t"
482 "psllq $16, %%mm3\n\t"
483 "por %%mm3, %%mm0\n\t"
484 MOVNTQ" %%mm0, %0\n\t"
485 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
486 d += 4;
487 s += 16;
488 }
489 __asm __volatile(SFENCE:::"memory");
490 __asm __volatile(EMMS:::"memory");
491 #endif
492 while(s < end)
493 {
494 register int rgb = *(uint32_t*)s; s += 4;
495 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
496 }
497 }
498
499 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
500 {
501 const uint8_t *s = src;
502 const uint8_t *end;
503 #ifdef HAVE_MMX
504 const uint8_t *mm_end;
505 #endif
506 uint16_t *d = (uint16_t *)dst;
507 end = s + src_size;
508 #ifdef HAVE_MMX
509 mm_end = end - 15;
510 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
511 asm volatile(
512 "movq %3, %%mm5 \n\t"
513 "movq %4, %%mm6 \n\t"
514 "movq %5, %%mm7 \n\t"
515 ASMALIGN(4)
516 "1: \n\t"
517 PREFETCH" 32(%1) \n\t"
518 "movd (%1), %%mm0 \n\t"
519 "movd 4(%1), %%mm3 \n\t"
520 "punpckldq 8(%1), %%mm0 \n\t"
521 "punpckldq 12(%1), %%mm3 \n\t"
522 "movq %%mm0, %%mm1 \n\t"
523 "movq %%mm3, %%mm4 \n\t"
524 "pand %%mm6, %%mm0 \n\t"
525 "pand %%mm6, %%mm3 \n\t"
526 "pmaddwd %%mm7, %%mm0 \n\t"
527 "pmaddwd %%mm7, %%mm3 \n\t"
528 "pand %%mm5, %%mm1 \n\t"
529 "pand %%mm5, %%mm4 \n\t"
530 "por %%mm1, %%mm0 \n\t"
531 "por %%mm4, %%mm3 \n\t"
532 "psrld $6, %%mm0 \n\t"
533 "pslld $10, %%mm3 \n\t"
534 "por %%mm3, %%mm0 \n\t"
535 MOVNTQ" %%mm0, (%0) \n\t"
536 "add $16, %1 \n\t"
537 "add $8, %0 \n\t"
538 "cmp %2, %1 \n\t"
539 " jb 1b \n\t"
540 : "+r" (d), "+r"(s)
541 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
542 );
543 #else
544 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
545 __asm __volatile(
546 "movq %0, %%mm7\n\t"
547 "movq %1, %%mm6\n\t"
548 ::"m"(red_15mask),"m"(green_15mask));
549 while(s < mm_end)
550 {
551 __asm __volatile(
552 PREFETCH" 32%1\n\t"
553 "movd %1, %%mm0\n\t"
554 "movd 4%1, %%mm3\n\t"
555 "punpckldq 8%1, %%mm0\n\t"
556 "punpckldq 12%1, %%mm3\n\t"
557 "movq %%mm0, %%mm1\n\t"
558 "movq %%mm0, %%mm2\n\t"
559 "movq %%mm3, %%mm4\n\t"
560 "movq %%mm3, %%mm5\n\t"
561 "psrlq $3, %%mm0\n\t"
562 "psrlq $3, %%mm3\n\t"
563 "pand %2, %%mm0\n\t"
564 "pand %2, %%mm3\n\t"
565 "psrlq $6, %%mm1\n\t"
566 "psrlq $6, %%mm4\n\t"
567 "pand %%mm6, %%mm1\n\t"
568 "pand %%mm6, %%mm4\n\t"
569 "psrlq $9, %%mm2\n\t"
570 "psrlq $9, %%mm5\n\t"
571 "pand %%mm7, %%mm2\n\t"
572 "pand %%mm7, %%mm5\n\t"
573 "por %%mm1, %%mm0\n\t"
574 "por %%mm4, %%mm3\n\t"
575 "por %%mm2, %%mm0\n\t"
576 "por %%mm5, %%mm3\n\t"
577 "psllq $16, %%mm3\n\t"
578 "por %%mm3, %%mm0\n\t"
579 MOVNTQ" %%mm0, %0\n\t"
580 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
581 d += 4;
582 s += 16;
583 }
584 #endif
585 __asm __volatile(SFENCE:::"memory");
586 __asm __volatile(EMMS:::"memory");
587 #endif
588 while(s < end)
589 {
590 register int rgb = *(uint32_t*)s; s += 4;
591 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
592 }
593 }
594
595 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
596 {
597 const uint8_t *s = src;
598 const uint8_t *end;
599 #ifdef HAVE_MMX
600 const uint8_t *mm_end;
601 #endif
602 uint16_t *d = (uint16_t *)dst;
603 end = s + src_size;
604 #ifdef HAVE_MMX
605 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
606 __asm __volatile(
607 "movq %0, %%mm7\n\t"
608 "movq %1, %%mm6\n\t"
609 ::"m"(red_15mask),"m"(green_15mask));
610 mm_end = end - 15;
611 while(s < mm_end)
612 {
613 __asm __volatile(
614 PREFETCH" 32%1\n\t"
615 "movd %1, %%mm0\n\t"
616 "movd 4%1, %%mm3\n\t"
617 "punpckldq 8%1, %%mm0\n\t"
618 "punpckldq 12%1, %%mm3\n\t"
619 "movq %%mm0, %%mm1\n\t"
620 "movq %%mm0, %%mm2\n\t"
621 "movq %%mm3, %%mm4\n\t"
622 "movq %%mm3, %%mm5\n\t"
623 "psllq $7, %%mm0\n\t"
624 "psllq $7, %%mm3\n\t"
625 "pand %%mm7, %%mm0\n\t"
626 "pand %%mm7, %%mm3\n\t"
627 "psrlq $6, %%mm1\n\t"
628 "psrlq $6, %%mm4\n\t"
629 "pand %%mm6, %%mm1\n\t"
630 "pand %%mm6, %%mm4\n\t"
631 "psrlq $19, %%mm2\n\t"
632 "psrlq $19, %%mm5\n\t"
633 "pand %2, %%mm2\n\t"
634 "pand %2, %%mm5\n\t"
635 "por %%mm1, %%mm0\n\t"
636 "por %%mm4, %%mm3\n\t"
637 "por %%mm2, %%mm0\n\t"
638 "por %%mm5, %%mm3\n\t"
639 "psllq $16, %%mm3\n\t"
640 "por %%mm3, %%mm0\n\t"
641 MOVNTQ" %%mm0, %0\n\t"
642 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
643 d += 4;
644 s += 16;
645 }
646 __asm __volatile(SFENCE:::"memory");
647 __asm __volatile(EMMS:::"memory");
648 #endif
649 while(s < end)
650 {
651 register int rgb = *(uint32_t*)s; s += 4;
652 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
653 }
654 }
655
656 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
657 {
658 const uint8_t *s = src;
659 const uint8_t *end;
660 #ifdef HAVE_MMX
661 const uint8_t *mm_end;
662 #endif
663 uint16_t *d = (uint16_t *)dst;
664 end = s + src_size;
665 #ifdef HAVE_MMX
666 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
667 __asm __volatile(
668 "movq %0, %%mm7\n\t"
669 "movq %1, %%mm6\n\t"
670 ::"m"(red_16mask),"m"(green_16mask));
671 mm_end = end - 11;
672 while(s < mm_end)
673 {
674 __asm __volatile(
675 PREFETCH" 32%1\n\t"
676 "movd %1, %%mm0\n\t"
677 "movd 3%1, %%mm3\n\t"
678 "punpckldq 6%1, %%mm0\n\t"
679 "punpckldq 9%1, %%mm3\n\t"
680 "movq %%mm0, %%mm1\n\t"
681 "movq %%mm0, %%mm2\n\t"
682 "movq %%mm3, %%mm4\n\t"
683 "movq %%mm3, %%mm5\n\t"
684 "psrlq $3, %%mm0\n\t"
685 "psrlq $3, %%mm3\n\t"
686 "pand %2, %%mm0\n\t"
687 "pand %2, %%mm3\n\t"
688 "psrlq $5, %%mm1\n\t"
689 "psrlq $5, %%mm4\n\t"
690 "pand %%mm6, %%mm1\n\t"
691 "pand %%mm6, %%mm4\n\t"
692 "psrlq $8, %%mm2\n\t"
693 "psrlq $8, %%mm5\n\t"
694 "pand %%mm7, %%mm2\n\t"
695 "pand %%mm7, %%mm5\n\t"
696 "por %%mm1, %%mm0\n\t"
697 "por %%mm4, %%mm3\n\t"
698 "por %%mm2, %%mm0\n\t"
699 "por %%mm5, %%mm3\n\t"
700 "psllq $16, %%mm3\n\t"
701 "por %%mm3, %%mm0\n\t"
702 MOVNTQ" %%mm0, %0\n\t"
703 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
704 d += 4;
705 s += 12;
706 }
707 __asm __volatile(SFENCE:::"memory");
708 __asm __volatile(EMMS:::"memory");
709 #endif
710 while(s < end)
711 {
712 const int b= *s++;
713 const int g= *s++;
714 const int r= *s++;
715 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
716 }
717 }
718
719 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
720 {
721 const uint8_t *s = src;
722 const uint8_t *end;
723 #ifdef HAVE_MMX
724 const uint8_t *mm_end;
725 #endif
726 uint16_t *d = (uint16_t *)dst;
727 end = s + src_size;
728 #ifdef HAVE_MMX
729 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
730 __asm __volatile(
731 "movq %0, %%mm7\n\t"
732 "movq %1, %%mm6\n\t"
733 ::"m"(red_16mask),"m"(green_16mask));
734 mm_end = end - 15;
735 while(s < mm_end)
736 {
737 __asm __volatile(
738 PREFETCH" 32%1\n\t"
739 "movd %1, %%mm0\n\t"
740 "movd 3%1, %%mm3\n\t"
741 "punpckldq 6%1, %%mm0\n\t"
742 "punpckldq 9%1, %%mm3\n\t"
743 "movq %%mm0, %%mm1\n\t"
744 "movq %%mm0, %%mm2\n\t"
745 "movq %%mm3, %%mm4\n\t"
746 "movq %%mm3, %%mm5\n\t"
747 "psllq $8, %%mm0\n\t"
748 "psllq $8, %%mm3\n\t"
749 "pand %%mm7, %%mm0\n\t"
750 "pand %%mm7, %%mm3\n\t"
751 "psrlq $5, %%mm1\n\t"
752 "psrlq $5, %%mm4\n\t"
753 "pand %%mm6, %%mm1\n\t"
754 "pand %%mm6, %%mm4\n\t"
755 "psrlq $19, %%mm2\n\t"
756 "psrlq $19, %%mm5\n\t"
757 "pand %2, %%mm2\n\t"
758 "pand %2, %%mm5\n\t"
759 "por %%mm1, %%mm0\n\t"
760 "por %%mm4, %%mm3\n\t"
761 "por %%mm2, %%mm0\n\t"
762 "por %%mm5, %%mm3\n\t"
763 "psllq $16, %%mm3\n\t"
764 "por %%mm3, %%mm0\n\t"
765 MOVNTQ" %%mm0, %0\n\t"
766 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
767 d += 4;
768 s += 12;
769 }
770 __asm __volatile(SFENCE:::"memory");
771 __asm __volatile(EMMS:::"memory");
772 #endif
773 while(s < end)
774 {
775 const int r= *s++;
776 const int g= *s++;
777 const int b= *s++;
778 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
779 }
780 }
781
782 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
783 {
784 const uint8_t *s = src;
785 const uint8_t *end;
786 #ifdef HAVE_MMX
787 const uint8_t *mm_end;
788 #endif
789 uint16_t *d = (uint16_t *)dst;
790 end = s + src_size;
791 #ifdef HAVE_MMX
792 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
793 __asm __volatile(
794 "movq %0, %%mm7\n\t"
795 "movq %1, %%mm6\n\t"
796 ::"m"(red_15mask),"m"(green_15mask));
797 mm_end = end - 11;
798 while(s < mm_end)
799 {
800 __asm __volatile(
801 PREFETCH" 32%1\n\t"
802 "movd %1, %%mm0\n\t"
803 "movd 3%1, %%mm3\n\t"
804 "punpckldq 6%1, %%mm0\n\t"
805 "punpckldq 9%1, %%mm3\n\t"
806 "movq %%mm0, %%mm1\n\t"
807 "movq %%mm0, %%mm2\n\t"
808 "movq %%mm3, %%mm4\n\t"
809 "movq %%mm3, %%mm5\n\t"
810 "psrlq $3, %%mm0\n\t"
811 "psrlq $3, %%mm3\n\t"
812 "pand %2, %%mm0\n\t"
813 "pand %2, %%mm3\n\t"
814 "psrlq $6, %%mm1\n\t"
815 "psrlq $6, %%mm4\n\t"
816 "pand %%mm6, %%mm1\n\t"
817 "pand %%mm6, %%mm4\n\t"
818 "psrlq $9, %%mm2\n\t"
819 "psrlq $9, %%mm5\n\t"
820 "pand %%mm7, %%mm2\n\t"
821 "pand %%mm7, %%mm5\n\t"
822 "por %%mm1, %%mm0\n\t"
823 "por %%mm4, %%mm3\n\t"
824 "por %%mm2, %%mm0\n\t"
825 "por %%mm5, %%mm3\n\t"
826 "psllq $16, %%mm3\n\t"
827 "por %%mm3, %%mm0\n\t"
828 MOVNTQ" %%mm0, %0\n\t"
829 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
830 d += 4;
831 s += 12;
832 }
833 __asm __volatile(SFENCE:::"memory");
834 __asm __volatile(EMMS:::"memory");
835 #endif
836 while(s < end)
837 {
838 const int b= *s++;
839 const int g= *s++;
840 const int r= *s++;
841 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
842 }
843 }
844
845 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
846 {
847 const uint8_t *s = src;
848 const uint8_t *end;
849 #ifdef HAVE_MMX
850 const uint8_t *mm_end;
851 #endif
852 uint16_t *d = (uint16_t *)dst;
853 end = s + src_size;
854 #ifdef HAVE_MMX
855 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
856 __asm __volatile(
857 "movq %0, %%mm7\n\t"
858 "movq %1, %%mm6\n\t"
859 ::"m"(red_15mask),"m"(green_15mask));
860 mm_end = end - 15;
861 while(s < mm_end)
862 {
863 __asm __volatile(
864 PREFETCH" 32%1\n\t"
865 "movd %1, %%mm0\n\t"
866 "movd 3%1, %%mm3\n\t"
867 "punpckldq 6%1, %%mm0\n\t"
868 "punpckldq 9%1, %%mm3\n\t"
869 "movq %%mm0, %%mm1\n\t"
870 "movq %%mm0, %%mm2\n\t"
871 "movq %%mm3, %%mm4\n\t"
872 "movq %%mm3, %%mm5\n\t"
873 "psllq $7, %%mm0\n\t"
874 "psllq $7, %%mm3\n\t"
875 "pand %%mm7, %%mm0\n\t"
876 "pand %%mm7, %%mm3\n\t"
877 "psrlq $6, %%mm1\n\t"
878 "psrlq $6, %%mm4\n\t"
879 "pand %%mm6, %%mm1\n\t"
880 "pand %%mm6, %%mm4\n\t"
881 "psrlq $19, %%mm2\n\t"
882 "psrlq $19, %%mm5\n\t"
883 "pand %2, %%mm2\n\t"
884 "pand %2, %%mm5\n\t"
885 "por %%mm1, %%mm0\n\t"
886 "por %%mm4, %%mm3\n\t"
887 "por %%mm2, %%mm0\n\t"
888 "por %%mm5, %%mm3\n\t"
889 "psllq $16, %%mm3\n\t"
890 "por %%mm3, %%mm0\n\t"
891 MOVNTQ" %%mm0, %0\n\t"
892 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
893 d += 4;
894 s += 12;
895 }
896 __asm __volatile(SFENCE:::"memory");
897 __asm __volatile(EMMS:::"memory");
898 #endif
899 while(s < end)
900 {
901 const int r= *s++;
902 const int g= *s++;
903 const int b= *s++;
904 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
905 }
906 }
907
908 /*
909 Here a less accurate approximation is used, by simply
910 left-shifting the input
911 value and filling the low-order bits with
912 zeroes. This method improves PNG
913 compression, but this scheme cannot reproduce white exactly, since it does
914 not generate an all-ones maximum value; the net effect is to darken the
915 image slightly.
916
917 The better method would be "left bit replication":
918
919 4 3 2 1 0
920 ---------
921 1 1 0 1 1
922
923 7 6 5 4 3 2 1 0
924 ----------------
925 1 1 0 1 1 1 1 0
926 |=======| |===|
927 | leftmost bits repeated to fill open bits
928 |
929 original bits
930 */
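/*
 * A minimal illustration of left bit replication (the helper name is
 * ours and it is not used below): the top bits of the 5-bit value are
 * repeated into the freshly opened low bits, so 0x1F expands to 0xFF
 * and white stays white.
 */
static inline uint8_t rgb15_replicate5to8(uint8_t v)
{
    /* 11011 -> 11011110: shift up by 3, refill with the top 3 bits */
    return (uint8_t)((v << 3) | (v >> 2));
}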
931 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
932 {
933 const uint16_t *end;
934 #ifdef HAVE_MMX
935 const uint16_t *mm_end;
936 #endif
937 uint8_t *d = (uint8_t *)dst;
938 const uint16_t *s = (uint16_t *)src;
939 end = s + src_size/2;
940 #ifdef HAVE_MMX
941 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
942 mm_end = end - 7;
943 while(s < mm_end)
944 {
945 __asm __volatile(
946 PREFETCH" 32%1\n\t"
947 "movq %1, %%mm0\n\t"
948 "movq %1, %%mm1\n\t"
949 "movq %1, %%mm2\n\t"
950 "pand %2, %%mm0\n\t"
951 "pand %3, %%mm1\n\t"
952 "pand %4, %%mm2\n\t"
953 "psllq $3, %%mm0\n\t"
954 "psrlq $2, %%mm1\n\t"
955 "psrlq $7, %%mm2\n\t"
956 "movq %%mm0, %%mm3\n\t"
957 "movq %%mm1, %%mm4\n\t"
958 "movq %%mm2, %%mm5\n\t"
959 "punpcklwd %5, %%mm0\n\t"
960 "punpcklwd %5, %%mm1\n\t"
961 "punpcklwd %5, %%mm2\n\t"
962 "punpckhwd %5, %%mm3\n\t"
963 "punpckhwd %5, %%mm4\n\t"
964 "punpckhwd %5, %%mm5\n\t"
965 "psllq $8, %%mm1\n\t"
966 "psllq $16, %%mm2\n\t"
967 "por %%mm1, %%mm0\n\t"
968 "por %%mm2, %%mm0\n\t"
969 "psllq $8, %%mm4\n\t"
970 "psllq $16, %%mm5\n\t"
971 "por %%mm4, %%mm3\n\t"
972 "por %%mm5, %%mm3\n\t"
973
974 "movq %%mm0, %%mm6\n\t"
975 "movq %%mm3, %%mm7\n\t"
976
977 "movq 8%1, %%mm0\n\t"
978 "movq 8%1, %%mm1\n\t"
979 "movq 8%1, %%mm2\n\t"
980 "pand %2, %%mm0\n\t"
981 "pand %3, %%mm1\n\t"
982 "pand %4, %%mm2\n\t"
983 "psllq $3, %%mm0\n\t"
984 "psrlq $2, %%mm1\n\t"
985 "psrlq $7, %%mm2\n\t"
986 "movq %%mm0, %%mm3\n\t"
987 "movq %%mm1, %%mm4\n\t"
988 "movq %%mm2, %%mm5\n\t"
989 "punpcklwd %5, %%mm0\n\t"
990 "punpcklwd %5, %%mm1\n\t"
991 "punpcklwd %5, %%mm2\n\t"
992 "punpckhwd %5, %%mm3\n\t"
993 "punpckhwd %5, %%mm4\n\t"
994 "punpckhwd %5, %%mm5\n\t"
995 "psllq $8, %%mm1\n\t"
996 "psllq $16, %%mm2\n\t"
997 "por %%mm1, %%mm0\n\t"
998 "por %%mm2, %%mm0\n\t"
999 "psllq $8, %%mm4\n\t"
1000 "psllq $16, %%mm5\n\t"
1001 "por %%mm4, %%mm3\n\t"
1002 "por %%mm5, %%mm3\n\t"
1003
1004 :"=m"(*d)
1005 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1006 :"memory");
1007 /* borrowed from the 32-bit to 24-bit conversion above */
1008 __asm __volatile(
1009 "movq %%mm0, %%mm4\n\t"
1010 "movq %%mm3, %%mm5\n\t"
1011 "movq %%mm6, %%mm0\n\t"
1012 "movq %%mm7, %%mm1\n\t"
1013
1014 "movq %%mm4, %%mm6\n\t"
1015 "movq %%mm5, %%mm7\n\t"
1016 "movq %%mm0, %%mm2\n\t"
1017 "movq %%mm1, %%mm3\n\t"
1018
1019 "psrlq $8, %%mm2\n\t"
1020 "psrlq $8, %%mm3\n\t"
1021 "psrlq $8, %%mm6\n\t"
1022 "psrlq $8, %%mm7\n\t"
1023 "pand %2, %%mm0\n\t"
1024 "pand %2, %%mm1\n\t"
1025 "pand %2, %%mm4\n\t"
1026 "pand %2, %%mm5\n\t"
1027 "pand %3, %%mm2\n\t"
1028 "pand %3, %%mm3\n\t"
1029 "pand %3, %%mm6\n\t"
1030 "pand %3, %%mm7\n\t"
1031 "por %%mm2, %%mm0\n\t"
1032 "por %%mm3, %%mm1\n\t"
1033 "por %%mm6, %%mm4\n\t"
1034 "por %%mm7, %%mm5\n\t"
1035
1036 "movq %%mm1, %%mm2\n\t"
1037 "movq %%mm4, %%mm3\n\t"
1038 "psllq $48, %%mm2\n\t"
1039 "psllq $32, %%mm3\n\t"
1040 "pand %4, %%mm2\n\t"
1041 "pand %5, %%mm3\n\t"
1042 "por %%mm2, %%mm0\n\t"
1043 "psrlq $16, %%mm1\n\t"
1044 "psrlq $32, %%mm4\n\t"
1045 "psllq $16, %%mm5\n\t"
1046 "por %%mm3, %%mm1\n\t"
1047 "pand %6, %%mm5\n\t"
1048 "por %%mm5, %%mm4\n\t"
1049
1050 MOVNTQ" %%mm0, %0\n\t"
1051 MOVNTQ" %%mm1, 8%0\n\t"
1052 MOVNTQ" %%mm4, 16%0"
1053
1054 :"=m"(*d)
1055 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1056 :"memory");
1057 d += 24;
1058 s += 8;
1059 }
1060 __asm __volatile(SFENCE:::"memory");
1061 __asm __volatile(EMMS:::"memory");
1062 #endif
1063 while(s < end)
1064 {
1065 register uint16_t bgr;
1066 bgr = *s++;
1067 *d++ = (bgr&0x1F)<<3;
1068 *d++ = (bgr&0x3E0)>>2;
1069 *d++ = (bgr&0x7C00)>>7;
1070 }
1071 }
1072
1073 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1074 {
1075 const uint16_t *end;
1076 #ifdef HAVE_MMX
1077 const uint16_t *mm_end;
1078 #endif
1079 uint8_t *d = (uint8_t *)dst;
1080 const uint16_t *s = (const uint16_t *)src;
1081 end = s + src_size/2;
1082 #ifdef HAVE_MMX
1083 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1084 mm_end = end - 7;
1085 while(s < mm_end)
1086 {
1087 __asm __volatile(
1088 PREFETCH" 32%1\n\t"
1089 "movq %1, %%mm0\n\t"
1090 "movq %1, %%mm1\n\t"
1091 "movq %1, %%mm2\n\t"
1092 "pand %2, %%mm0\n\t"
1093 "pand %3, %%mm1\n\t"
1094 "pand %4, %%mm2\n\t"
1095 "psllq $3, %%mm0\n\t"
1096 "psrlq $3, %%mm1\n\t"
1097 "psrlq $8, %%mm2\n\t"
1098 "movq %%mm0, %%mm3\n\t"
1099 "movq %%mm1, %%mm4\n\t"
1100 "movq %%mm2, %%mm5\n\t"
1101 "punpcklwd %5, %%mm0\n\t"
1102 "punpcklwd %5, %%mm1\n\t"
1103 "punpcklwd %5, %%mm2\n\t"
1104 "punpckhwd %5, %%mm3\n\t"
1105 "punpckhwd %5, %%mm4\n\t"
1106 "punpckhwd %5, %%mm5\n\t"
1107 "psllq $8, %%mm1\n\t"
1108 "psllq $16, %%mm2\n\t"
1109 "por %%mm1, %%mm0\n\t"
1110 "por %%mm2, %%mm0\n\t"
1111 "psllq $8, %%mm4\n\t"
1112 "psllq $16, %%mm5\n\t"
1113 "por %%mm4, %%mm3\n\t"
1114 "por %%mm5, %%mm3\n\t"
1115
1116 "movq %%mm0, %%mm6\n\t"
1117 "movq %%mm3, %%mm7\n\t"
1118
1119 "movq 8%1, %%mm0\n\t"
1120 "movq 8%1, %%mm1\n\t"
1121 "movq 8%1, %%mm2\n\t"
1122 "pand %2, %%mm0\n\t"
1123 "pand %3, %%mm1\n\t"
1124 "pand %4, %%mm2\n\t"
1125 "psllq $3, %%mm0\n\t"
1126 "psrlq $3, %%mm1\n\t"
1127 "psrlq $8, %%mm2\n\t"
1128 "movq %%mm0, %%mm3\n\t"
1129 "movq %%mm1, %%mm4\n\t"
1130 "movq %%mm2, %%mm5\n\t"
1131 "punpcklwd %5, %%mm0\n\t"
1132 "punpcklwd %5, %%mm1\n\t"
1133 "punpcklwd %5, %%mm2\n\t"
1134 "punpckhwd %5, %%mm3\n\t"
1135 "punpckhwd %5, %%mm4\n\t"
1136 "punpckhwd %5, %%mm5\n\t"
1137 "psllq $8, %%mm1\n\t"
1138 "psllq $16, %%mm2\n\t"
1139 "por %%mm1, %%mm0\n\t"
1140 "por %%mm2, %%mm0\n\t"
1141 "psllq $8, %%mm4\n\t"
1142 "psllq $16, %%mm5\n\t"
1143 "por %%mm4, %%mm3\n\t"
1144 "por %%mm5, %%mm3\n\t"
1145 :"=m"(*d)
1146 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1147 :"memory");
1148 /* borrowed from the 32-bit to 24-bit conversion above */
1149 __asm __volatile(
1150 "movq %%mm0, %%mm4\n\t"
1151 "movq %%mm3, %%mm5\n\t"
1152 "movq %%mm6, %%mm0\n\t"
1153 "movq %%mm7, %%mm1\n\t"
1154
1155 "movq %%mm4, %%mm6\n\t"
1156 "movq %%mm5, %%mm7\n\t"
1157 "movq %%mm0, %%mm2\n\t"
1158 "movq %%mm1, %%mm3\n\t"
1159
1160 "psrlq $8, %%mm2\n\t"
1161 "psrlq $8, %%mm3\n\t"
1162 "psrlq $8, %%mm6\n\t"
1163 "psrlq $8, %%mm7\n\t"
1164 "pand %2, %%mm0\n\t"
1165 "pand %2, %%mm1\n\t"
1166 "pand %2, %%mm4\n\t"
1167 "pand %2, %%mm5\n\t"
1168 "pand %3, %%mm2\n\t"
1169 "pand %3, %%mm3\n\t"
1170 "pand %3, %%mm6\n\t"
1171 "pand %3, %%mm7\n\t"
1172 "por %%mm2, %%mm0\n\t"
1173 "por %%mm3, %%mm1\n\t"
1174 "por %%mm6, %%mm4\n\t"
1175 "por %%mm7, %%mm5\n\t"
1176
1177 "movq %%mm1, %%mm2\n\t"
1178 "movq %%mm4, %%mm3\n\t"
1179 "psllq $48, %%mm2\n\t"
1180 "psllq $32, %%mm3\n\t"
1181 "pand %4, %%mm2\n\t"
1182 "pand %5, %%mm3\n\t"
1183 "por %%mm2, %%mm0\n\t"
1184 "psrlq $16, %%mm1\n\t"
1185 "psrlq $32, %%mm4\n\t"
1186 "psllq $16, %%mm5\n\t"
1187 "por %%mm3, %%mm1\n\t"
1188 "pand %6, %%mm5\n\t"
1189 "por %%mm5, %%mm4\n\t"
1190
1191 MOVNTQ" %%mm0, %0\n\t"
1192 MOVNTQ" %%mm1, 8%0\n\t"
1193 MOVNTQ" %%mm4, 16%0"
1194
1195 :"=m"(*d)
1196 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1197 :"memory");
1198 d += 24;
1199 s += 8;
1200 }
1201 __asm __volatile(SFENCE:::"memory");
1202 __asm __volatile(EMMS:::"memory");
1203 #endif
1204 while(s < end)
1205 {
1206 register uint16_t bgr;
1207 bgr = *s++;
1208 *d++ = (bgr&0x1F)<<3;
1209 *d++ = (bgr&0x7E0)>>3;
1210 *d++ = (bgr&0xF800)>>8;
1211 }
1212 }
1213
1214 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1215 {
1216 const uint16_t *end;
1217 #ifdef HAVE_MMX
1218 const uint16_t *mm_end;
1219 #endif
1220 uint8_t *d = (uint8_t *)dst;
1221 const uint16_t *s = (const uint16_t *)src;
1222 end = s + src_size/2;
1223 #ifdef HAVE_MMX
1224 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1225 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1226 mm_end = end - 3;
1227 while(s < mm_end)
1228 {
1229 __asm __volatile(
1230 PREFETCH" 32%1\n\t"
1231 "movq %1, %%mm0\n\t"
1232 "movq %1, %%mm1\n\t"
1233 "movq %1, %%mm2\n\t"
1234 "pand %2, %%mm0\n\t"
1235 "pand %3, %%mm1\n\t"
1236 "pand %4, %%mm2\n\t"
1237 "psllq $3, %%mm0\n\t"
1238 "psrlq $2, %%mm1\n\t"
1239 "psrlq $7, %%mm2\n\t"
1240 "movq %%mm0, %%mm3\n\t"
1241 "movq %%mm1, %%mm4\n\t"
1242 "movq %%mm2, %%mm5\n\t"
1243 "punpcklwd %%mm7, %%mm0\n\t"
1244 "punpcklwd %%mm7, %%mm1\n\t"
1245 "punpcklwd %%mm7, %%mm2\n\t"
1246 "punpckhwd %%mm7, %%mm3\n\t"
1247 "punpckhwd %%mm7, %%mm4\n\t"
1248 "punpckhwd %%mm7, %%mm5\n\t"
1249 "psllq $8, %%mm1\n\t"
1250 "psllq $16, %%mm2\n\t"
1251 "por %%mm1, %%mm0\n\t"
1252 "por %%mm2, %%mm0\n\t"
1253 "psllq $8, %%mm4\n\t"
1254 "psllq $16, %%mm5\n\t"
1255 "por %%mm4, %%mm3\n\t"
1256 "por %%mm5, %%mm3\n\t"
1257 MOVNTQ" %%mm0, %0\n\t"
1258 MOVNTQ" %%mm3, 8%0\n\t"
1259 :"=m"(*d)
1260 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1261 :"memory");
1262 d += 16;
1263 s += 4;
1264 }
1265 __asm __volatile(SFENCE:::"memory");
1266 __asm __volatile(EMMS:::"memory");
1267 #endif
1268 while(s < end)
1269 {
1270 #if 0 // slightly slower on Athlon
1271 int bgr= *s++;
1272 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1273 #else
1274 register uint16_t bgr;
1275 bgr = *s++;
1276 #ifdef WORDS_BIGENDIAN
1277 *d++ = 0;
1278 *d++ = (bgr&0x7C00)>>7;
1279 *d++ = (bgr&0x3E0)>>2;
1280 *d++ = (bgr&0x1F)<<3;
1281 #else
1282 *d++ = (bgr&0x1F)<<3;
1283 *d++ = (bgr&0x3E0)>>2;
1284 *d++ = (bgr&0x7C00)>>7;
1285 *d++ = 0;
1286 #endif
1287
1288 #endif
1289 }
1290 }
1291
1292 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1293 {
1294 const uint16_t *end;
1295 #ifdef HAVE_MMX
1296 const uint16_t *mm_end;
1297 #endif
1298 uint8_t *d = (uint8_t *)dst;
1299 const uint16_t *s = (uint16_t *)src;
1300 end = s + src_size/2;
1301 #ifdef HAVE_MMX
1302 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1303 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1304 mm_end = end - 3;
1305 while(s < mm_end)
1306 {
1307 __asm __volatile(
1308 PREFETCH" 32%1\n\t"
1309 "movq %1, %%mm0\n\t"
1310 "movq %1, %%mm1\n\t"
1311 "movq %1, %%mm2\n\t"
1312 "pand %2, %%mm0\n\t"
1313 "pand %3, %%mm1\n\t"
1314 "pand %4, %%mm2\n\t"
1315 "psllq $3, %%mm0\n\t"
1316 "psrlq $3, %%mm1\n\t"
1317 "psrlq $8, %%mm2\n\t"
1318 "movq %%mm0, %%mm3\n\t"
1319 "movq %%mm1, %%mm4\n\t"
1320 "movq %%mm2, %%mm5\n\t"
1321 "punpcklwd %%mm7, %%mm0\n\t"
1322 "punpcklwd %%mm7, %%mm1\n\t"
1323 "punpcklwd %%mm7, %%mm2\n\t"
1324 "punpckhwd %%mm7, %%mm3\n\t"
1325 "punpckhwd %%mm7, %%mm4\n\t"
1326 "punpckhwd %%mm7, %%mm5\n\t"
1327 "psllq $8, %%mm1\n\t"
1328 "psllq $16, %%mm2\n\t"
1329 "por %%mm1, %%mm0\n\t"
1330 "por %%mm2, %%mm0\n\t"
1331 "psllq $8, %%mm4\n\t"
1332 "psllq $16, %%mm5\n\t"
1333 "por %%mm4, %%mm3\n\t"
1334 "por %%mm5, %%mm3\n\t"
1335 MOVNTQ" %%mm0, %0\n\t"
1336 MOVNTQ" %%mm3, 8%0\n\t"
1337 :"=m"(*d)
1338 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1339 :"memory");
1340 d += 16;
1341 s += 4;
1342 }
1343 __asm __volatile(SFENCE:::"memory");
1344 __asm __volatile(EMMS:::"memory");
1345 #endif
1346 while(s < end)
1347 {
1348 register uint16_t bgr;
1349 bgr = *s++;
1350 #ifdef WORDS_BIGENDIAN
1351 *d++ = 0;
1352 *d++ = (bgr&0xF800)>>8;
1353 *d++ = (bgr&0x7E0)>>3;
1354 *d++ = (bgr&0x1F)<<3;
1355 #else
1356 *d++ = (bgr&0x1F)<<3;
1357 *d++ = (bgr&0x7E0)>>3;
1358 *d++ = (bgr&0xF800)>>8;
1359 *d++ = 0;
1360 #endif
1361 }
1362 }
1363
1364 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1365 {
1366 #ifdef HAVE_MMX
1367 /* TODO: unroll this loop */
1368 asm volatile (
1369 "xor %%"REG_a", %%"REG_a" \n\t"
1370 ASMALIGN(4)
1371 "1: \n\t"
1372 PREFETCH" 32(%0, %%"REG_a") \n\t"
1373 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1374 "movq %%mm0, %%mm1 \n\t"
1375 "movq %%mm0, %%mm2 \n\t"
1376 "pslld $16, %%mm0 \n\t"
1377 "psrld $16, %%mm1 \n\t"
1378 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1379 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1380 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1381 "por %%mm0, %%mm2 \n\t"
1382 "por %%mm1, %%mm2 \n\t"
1383 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1384 "add $8, %%"REG_a" \n\t"
1385 "cmp %2, %%"REG_a" \n\t"
1386 " jb 1b \n\t"
1387 :: "r" (src), "r"(dst), "r" (src_size-7)
1388 : "%"REG_a
1389 );
1390
1391 __asm __volatile(SFENCE:::"memory");
1392 __asm __volatile(EMMS:::"memory");
1393 #else
1394 unsigned i;
1395 unsigned num_pixels = src_size >> 2;
1396 for(i=0; i<num_pixels; i++)
1397 {
1398 #ifdef WORDS_BIGENDIAN
1399 dst[4*i + 1] = src[4*i + 3];
1400 dst[4*i + 2] = src[4*i + 2];
1401 dst[4*i + 3] = src[4*i + 1];
1402 #else
1403 dst[4*i + 0] = src[4*i + 2];
1404 dst[4*i + 1] = src[4*i + 1];
1405 dst[4*i + 2] = src[4*i + 0];
1406 #endif
1407 }
1408 #endif
1409 }
1410
1411 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1412 {
1413 unsigned i;
1414 #ifdef HAVE_MMX
1415 long mmx_size= 23 - src_size;
1416 asm volatile (
1417 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1418 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1419 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1420 ASMALIGN(4)
1421 "1: \n\t"
1422 PREFETCH" 32(%1, %%"REG_a") \n\t"
1423 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1424 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1425 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1426 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1427 "pand %%mm5, %%mm0 \n\t"
1428 "pand %%mm6, %%mm1 \n\t"
1429 "pand %%mm7, %%mm2 \n\t"
1430 "por %%mm0, %%mm1 \n\t"
1431 "por %%mm2, %%mm1 \n\t"
1432 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1433 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1434 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1435 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1436 "pand %%mm7, %%mm0 \n\t"
1437 "pand %%mm5, %%mm1 \n\t"
1438 "pand %%mm6, %%mm2 \n\t"
1439 "por %%mm0, %%mm1 \n\t"
1440 "por %%mm2, %%mm1 \n\t"
1441 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1442 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1443 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1444 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1445 "pand %%mm6, %%mm0 \n\t"
1446 "pand %%mm7, %%mm1 \n\t"
1447 "pand %%mm5, %%mm2 \n\t"
1448 "por %%mm0, %%mm1 \n\t"
1449 "por %%mm2, %%mm1 \n\t"
1450 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1451 "add $24, %%"REG_a" \n\t"
1452 " js 1b \n\t"
1453 : "+a" (mmx_size)
1454 : "r" (src-mmx_size), "r"(dst-mmx_size)
1455 );
1456
1457 __asm __volatile(SFENCE:::"memory");
1458 __asm __volatile(EMMS:::"memory");
1459
1460 if(mmx_size==23) return; // finished, was a multiple of 8
1461
1462 src+= src_size;
1463 dst+= src_size;
1464 src_size= 23-mmx_size;
1465 src-= src_size;
1466 dst-= src_size;
1467 #endif
1468 for(i=0; i<src_size; i+=3)
1469 {
1470 register uint8_t x;
1471 x = src[i + 2];
1472 dst[i + 1] = src[i + 1];
1473 dst[i + 2] = src[i + 0];
1474 dst[i + 0] = x;
1475 }
1476 }
1477
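/*
 * Interleaves planar Y, U and V into packed YUY2 (Y0 U0 Y1 V0 per pair
 * of pixels). vertLumPerChroma is the number of luma lines that share
 * one chroma line: 2 for YV12 input, 1 for YUV422P input.
 */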
1478 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1479 long width, long height,
1480 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1481 {
1482 long y;
1483 const long chromWidth= width>>1;
1484 for(y=0; y<height; y++)
1485 {
1486 #ifdef HAVE_MMX
1487 // FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1488 asm volatile(
1489 "xor %%"REG_a", %%"REG_a" \n\t"
1490 ASMALIGN(4)
1491 "1: \n\t"
1492 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1493 PREFETCH" 32(%2, %%"REG_a") \n\t"
1494 PREFETCH" 32(%3, %%"REG_a") \n\t"
1495 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1496 "movq %%mm0, %%mm2 \n\t" // U(0)
1497 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1498 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1499 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1500
1501 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1502 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1503 "movq %%mm3, %%mm4 \n\t" // Y(0)
1504 "movq %%mm5, %%mm6 \n\t" // Y(8)
1505 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1506 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1507 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1508 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1509
1510 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1511 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1512 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1513 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1514
1515 "add $8, %%"REG_a" \n\t"
1516 "cmp %4, %%"REG_a" \n\t"
1517 " jb 1b \n\t"
1518 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1519 : "%"REG_a
1520 );
1521 #else
1522
1523 #if defined ARCH_ALPHA && defined HAVE_MVI
1524 #define pl2yuy2(n) \
1525 y1 = yc[n]; \
1526 y2 = yc2[n]; \
1527 u = uc[n]; \
1528 v = vc[n]; \
1529 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1530 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1531 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1532 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1533 yuv1 = (u << 8) + (v << 24); \
1534 yuv2 = yuv1 + y2; \
1535 yuv1 += y1; \
1536 qdst[n] = yuv1; \
1537 qdst2[n] = yuv2;
1538
1539 int i;
1540 uint64_t *qdst = (uint64_t *) dst;
1541 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1542 const uint32_t *yc = (uint32_t *) ysrc;
1543 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1544 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1545 for(i = 0; i < chromWidth; i += 8){
1546 uint64_t y1, y2, yuv1, yuv2;
1547 uint64_t u, v;
1548 /* Prefetch */
1549 asm("ldq $31,64(%0)" :: "r"(yc));
1550 asm("ldq $31,64(%0)" :: "r"(yc2));
1551 asm("ldq $31,64(%0)" :: "r"(uc));
1552 asm("ldq $31,64(%0)" :: "r"(vc));
1553
1554 pl2yuy2(0);
1555 pl2yuy2(1);
1556 pl2yuy2(2);
1557 pl2yuy2(3);
1558
1559 yc += 4;
1560 yc2 += 4;
1561 uc += 4;
1562 vc += 4;
1563 qdst += 4;
1564 qdst2 += 4;
1565 }
1566 y++;
1567 ysrc += lumStride;
1568 dst += dstStride;
1569
1570 #elif __WORDSIZE >= 64
1571 int i;
1572 uint64_t *ldst = (uint64_t *) dst;
1573 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1574 for(i = 0; i < chromWidth; i += 2){
1575 uint64_t k, l;
1576 k = yc[0] + (uc[0] << 8) +
1577 (yc[1] << 16) + (vc[0] << 24);
1578 l = yc[2] + (uc[1] << 8) +
1579 (yc[3] << 16) + (vc[1] << 24);
1580 *ldst++ = k + (l << 32);
1581 yc += 4;
1582 uc += 2;
1583 vc += 2;
1584 }
1585
1586 #else
1587 int i, *idst = (int32_t *) dst;
1588 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1589 for(i = 0; i < chromWidth; i++){
1590 #ifdef WORDS_BIGENDIAN
1591 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1592 (yc[1] << 8) + (vc[0] << 0);
1593 #else
1594 *idst++ = yc[0] + (uc[0] << 8) +
1595 (yc[1] << 16) + (vc[0] << 24);
1596 #endif
1597 yc += 2;
1598 uc++;
1599 vc++;
1600 }
1601 #endif
1602 #endif
1603 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1604 {
1605 usrc += chromStride;
1606 vsrc += chromStride;
1607 }
1608 ysrc += lumStride;
1609 dst += dstStride;
1610 }
1611 #ifdef HAVE_MMX
1612 asm( EMMS" \n\t"
1613 SFENCE" \n\t"
1614 :::"memory");
1615 #endif
1616 }
1617
1618 /**
1619 *
1620 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1621 * problem for anyone then tell me, and I'll fix it)
1622 */
1623 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1624 long width, long height,
1625 long lumStride, long chromStride, long dstStride)
1626 {
1627 //FIXME interpolate chroma
1628 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1629 }
1630
1631 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1632 long width, long height,
1633 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1634 {
1635 long y;
1636 const long chromWidth= width>>1;
1637 for(y=0; y<height; y++)
1638 {
1639 #ifdef HAVE_MMX
1640 // FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1641 asm volatile(
1642 "xor %%"REG_a", %%"REG_a" \n\t"
1643 ASMALIGN(4)
1644 "1: \n\t"
1645 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1646 PREFETCH" 32(%2, %%"REG_a") \n\t"
1647 PREFETCH" 32(%3, %%"REG_a") \n\t"
1648 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1649 "movq %%mm0, %%mm2 \n\t" // U(0)
1650 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1651 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1652 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1653
1654 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1655 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1656 "movq %%mm0, %%mm4 \n\t" // Y(0)
1657 "movq %%mm2, %%mm6 \n\t" // Y(8)
1658 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1659 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1660 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1661 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1662
1663 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1664 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1665 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1666 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1667
1668 "add $8, %%"REG_a" \n\t"
1669 "cmp %4, %%"REG_a" \n\t"
1670 " jb 1b \n\t"
1671 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1672 : "%"REG_a
1673 );
1674 #else
1675 //FIXME adapt the alpha asm code from yv12->yuy2
1676
1677 #if __WORDSIZE >= 64
1678 int i;
1679 uint64_t *ldst = (uint64_t *) dst;
1680 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1681 for(i = 0; i < chromWidth; i += 2){
1682 uint64_t k, l;
1683 k = uc[0] + (yc[0] << 8) +
1684 (vc[0] << 16) + (yc[1] << 24);
1685 l = uc[1] + (yc[2] << 8) +
1686 (vc[1] << 16) + (yc[3] << 24);
1687 *ldst++ = k + (l << 32);
1688 yc += 4;
1689 uc += 2;
1690 vc += 2;
1691 }
1692
1693 #else
1694 int i, *idst = (int32_t *) dst;
1695 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1696 for(i = 0; i < chromWidth; i++){
1697 #ifdef WORDS_BIGENDIAN
1698 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1699 (vc[0] << 8) + (yc[1] << 0);
1700 #else
1701 *idst++ = uc[0] + (yc[0] << 8) +
1702 (vc[0] << 16) + (yc[1] << 24);
1703 #endif
1704 yc += 2;
1705 uc++;
1706 vc++;
1707 }
1708 #endif
1709 #endif
1710 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1711 {
1712 usrc += chromStride;
1713 vsrc += chromStride;
1714 }
1715 ysrc += lumStride;
1716 dst += dstStride;
1717 }
1718 #ifdef HAVE_MMX
1719 asm( EMMS" \n\t"
1720 SFENCE" \n\t"
1721 :::"memory");
1722 #endif
1723 }
1724
1725 /**
1726 *
1727 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1728 * problem for anyone then tell me, and I'll fix it)
1729 */
1730 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731 long width, long height,
1732 long lumStride, long chromStride, long dstStride)
1733 {
1734 //FIXME interpolate chroma
1735 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1736 }
1737
1738 /**
1739 *
1740 * width should be a multiple of 16
1741 */
1742 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1743 long width, long height,
1744 long lumStride, long chromStride, long dstStride)
1745 {
1746 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1747 }
1748
1749 /**
1750 *
1751 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1752 * problem for anyone then tell me, and I'll fix it)
1753 */
1754 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1755 long width, long height,
1756 long lumStride, long chromStride, long srcStride)
1757 {
1758 long y;
1759 const long chromWidth= width>>1;
1760 for(y=0; y<height; y+=2)
1761 {
1762 #ifdef HAVE_MMX
1763 asm volatile(
1764 "xor %%"REG_a", %%"REG_a" \n\t"
1765 "pcmpeqw %%mm7, %%mm7 \n\t"
1766 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1767 ASMALIGN(4)
1768 "1: \n\t"
1769 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1770 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1771 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1772 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1774 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1776 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1778 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1779 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1780
1781 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1782
1783 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1784 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1785 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1786 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1787 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1788 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1789 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1790 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1791 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1792 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1793
1794 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1795
1796 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1797 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1798 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1799 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1800 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1801 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1802 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1803 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1804
1805 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1806 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1807
1808 "add $8, %%"REG_a" \n\t"
1809 "cmp %4, %%"REG_a" \n\t"
1810 " jb 1b \n\t"
1811 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1812 : "memory", "%"REG_a
1813 );
1814
1815 ydst += lumStride;
1816 src += srcStride;
1817
1818 asm volatile(
1819 "xor %%"REG_a", %%"REG_a" \n\t"
1820 ASMALIGN(4)
1821 "1: \n\t"
1822 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1823 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1824 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1825 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1826 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1827 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1828 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1829 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1830 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1831 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1832 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1833
1834 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1835 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1836
1837 "add $8, %%"REG_a" \n\t"
1838 "cmp %4, %%"REG_a" \n\t"
1839 " jb 1b \n\t"
1840
1841 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1842 : "memory", "%"REG_a
1843 );
1844 #else
1845 long i;
1846 for(i=0; i<chromWidth; i++)
1847 {
1848 ydst[2*i+0] = src[4*i+0];
1849 udst[i] = src[4*i+1];
1850 ydst[2*i+1] = src[4*i+2];
1851 vdst[i] = src[4*i+3];
1852 }
1853 ydst += lumStride;
1854 src += srcStride;
1855
1856 for(i=0; i<chromWidth; i++)
1857 {
1858 ydst[2*i+0] = src[4*i+0];
1859 ydst[2*i+1] = src[4*i+2];
1860 }
1861 #endif
1862 udst += chromStride;
1863 vdst += chromStride;
1864 ydst += lumStride;
1865 src += srcStride;
1866 }
1867 #ifdef HAVE_MMX
1868 asm volatile( EMMS" \n\t"
1869 SFENCE" \n\t"
1870 :::"memory");
1871 #endif
1872 }
1873
1874 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1875 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1876 long width, long height, long lumStride, long chromStride)
1877 {
1878 /* Y Plane */
1879 memcpy(ydst, ysrc, width*height);
1880
1881 /* XXX: implement upscaling for U,V */
1882 }
1883
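/*
 * Upscales one plane by 2x in both directions using bilinear
 * interpolation: each output sample is a (3*near + far)/4 blend of the
 * two nearest source samples, with plain replication at the borders.
 */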
1884 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1885 {
1886 long x,y;
1887
1888 dst[0]= src[0];
1889
1890 // first line
1891 for(x=0; x<srcWidth-1; x++){
1892 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1893 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1894 }
1895 dst[2*srcWidth-1]= src[srcWidth-1];
1896
1897 dst+= dstStride;
1898
1899 for(y=1; y<srcHeight; y++){
1900 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1901 const long mmxSize= srcWidth&~15;
1902 asm volatile(
1903 "mov %4, %%"REG_a" \n\t"
1904 "1: \n\t"
1905 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1906 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1907 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1908 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1909 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1910 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1911 PAVGB" %%mm0, %%mm5 \n\t"
1912 PAVGB" %%mm0, %%mm3 \n\t"
1913 PAVGB" %%mm0, %%mm5 \n\t"
1914 PAVGB" %%mm0, %%mm3 \n\t"
1915 PAVGB" %%mm1, %%mm4 \n\t"
1916 PAVGB" %%mm1, %%mm2 \n\t"
1917 PAVGB" %%mm1, %%mm4 \n\t"
1918 PAVGB" %%mm1, %%mm2 \n\t"
1919 "movq %%mm5, %%mm7 \n\t"
1920 "movq %%mm4, %%mm6 \n\t"
1921 "punpcklbw %%mm3, %%mm5 \n\t"
1922 "punpckhbw %%mm3, %%mm7 \n\t"
1923 "punpcklbw %%mm2, %%mm4 \n\t"
1924 "punpckhbw %%mm2, %%mm6 \n\t"
1925 #if 1
1926 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1927 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1928 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1929 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1930 #else
1931 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1932 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1933 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1934 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1935 #endif
1936 "add $8, %%"REG_a" \n\t"
1937 " js 1b \n\t"
1938 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1939 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1940 "g" (-mmxSize)
1941 : "%"REG_a
1942
1943 );
1944 #else
1945 const long mmxSize=1;
1946 #endif
1947 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1948 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1949
1950 for(x=mmxSize-1; x<srcWidth-1; x++){
1951 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1952 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1953 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1954 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1955 }
1956 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1957 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1958
1959 dst+=dstStride*2;
1960 src+=srcStride;
1961 }
1962
1963 // last line
1964 #if 1
1965 dst[0]= src[0];
1966
1967 for(x=0; x<srcWidth-1; x++){
1968 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1969 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1970 }
1971 dst[2*srcWidth-1]= src[srcWidth-1];
1972 #else
1973 for(x=0; x<srcWidth; x++){
1974 dst[2*x+0]=
1975 dst[2*x+1]= src[x];
1976 }
1977 #endif
1978
1979 #ifdef HAVE_MMX
1980 asm volatile( EMMS" \n\t"
1981 SFENCE" \n\t"
1982 :::"memory");
1983 #endif
1984 }
1985
1986 /**
1987 *
1988 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1989 * problem for anyone then tell me, and I'll fix it)
1990 * chrominance data is only taken from every second line, others are ignored; FIXME: write an HQ version
1991 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"

            "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"

            "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
            MOVNTQ" %%mm2, (%2, %%eax)      \n\t"

            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
        );

        ydst += lumStride;
        src  += srcStride;

        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"

            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
        );
#else
        long i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
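
/*
 * Usage sketch (illustrative only; buffer names are hypothetical and the
 * strides assume tightly packed planes): converting one packed UYVY frame to
 * planar YV12. As noted above, width should be a multiple of 16 and height a
 * multiple of 2.
 *
 *     uyvytoyv12(uyvy, y_plane, u_plane, v_plane,
 *                width, height,
 *                width,       // lumStride: 1 byte of Y per pixel
 *                width/2,     // chromStride: U and V are subsampled 2x2
 *                width*2);    // srcStride: UYVY packs 2 bytes per pixel
 */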

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line; others are ignored
 * in the C version. FIXME: write HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= width>>1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        long i;
        for(i=0; i<2; i++)
        {
            asm volatile(
                "mov %2, %%"REG_a"              \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5    \n\t"
                "pxor %%mm7, %%mm7              \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
                ASMALIGN(4)
                "1:                             \n\t"
                PREFETCH" 64(%0, %%"REG_d")     \n\t"
                "movd (%0, %%"REG_d"), %%mm0    \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1   \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpcklbw %%mm7, %%mm1         \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2   \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3   \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpcklbw %%mm7, %%mm3         \n\t"
                "pmaddwd %%mm6, %%mm0           \n\t"
                "pmaddwd %%mm6, %%mm1           \n\t"
                "pmaddwd %%mm6, %%mm2           \n\t"
                "pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm0          \n\t"
                "packssdw %%mm3, %%mm2          \n\t"
                "pmaddwd %%mm5, %%mm0           \n\t"
                "pmaddwd %%mm5, %%mm2           \n\t"
                "packssdw %%mm2, %%mm0          \n\t"
                "psraw $7, %%mm0                \n\t"

                "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1  \n\t"
                "punpcklbw %%mm7, %%mm4         \n\t"
                "punpcklbw %%mm7, %%mm1         \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2  \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3  \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpcklbw %%mm7, %%mm3         \n\t"
                "pmaddwd %%mm6, %%mm4           \n\t"
                "pmaddwd %%mm6, %%mm1           \n\t"
                "pmaddwd %%mm6, %%mm2           \n\t"
                "pmaddwd %%mm6, %%mm3           \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm4          \n\t"
                "packssdw %%mm3, %%mm2          \n\t"
                "pmaddwd %%mm5, %%mm4           \n\t"
                "pmaddwd %%mm5, %%mm2           \n\t"
                "add $24, %%"REG_d"             \n\t"
                "packssdw %%mm2, %%mm4          \n\t"
                "psraw $7, %%mm4                \n\t"

                "packuswb %%mm4, %%mm0          \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
                "add $8, %%"REG_a"              \n\t"
                " js 1b                         \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        asm volatile(
            "mov %4, %%"REG_a"              \n\t"
            "movq "MANGLE(w1111)", %%mm5    \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7              \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d"       \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            PREFETCH" 64(%0, %%"REG_d")     \n\t"
            PREFETCH" 64(%1, %%"REG_d")     \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%"REG_d"), %%mm0    \n\t"
            "movq (%1, %%"REG_d"), %%mm1    \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2   \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3   \n\t"
            PAVGB" %%mm1, %%mm0             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlq $24, %%mm0               \n\t"
            "psrlq $24, %%mm2               \n\t"
            PAVGB" %%mm1, %%mm0             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
#else
            "movd (%0, %%"REG_d"), %%mm0    \n\t"
            "movd (%1, %%"REG_d"), %%mm1    \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2   \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3   \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm0             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm2, %%mm0             \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4   \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1   \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2   \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3   \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm4             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm4, %%mm2             \n\t"
            "psrlw $2, %%mm0                \n\t"
            "psrlw $2, %%mm2                \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "pmaddwd %%mm6, %%mm2           \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm1                \n\t"
            "psrad $8, %%mm2                \n\t"
            "psrad $8, %%mm3                \n\t"
#endif
            "packssdw %%mm2, %%mm0          \n\t"
            "packssdw %%mm3, %%mm1          \n\t"
            "pmaddwd %%mm5, %%mm0           \n\t"
            "pmaddwd %%mm5, %%mm1           \n\t"
            "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0                \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%"REG_d"), %%mm4  \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1  \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2  \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3  \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "movq %%mm4, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlq $24, %%mm4               \n\t"
            "psrlq $24, %%mm2               \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4  \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1  \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2  \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3  \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm4             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm2, %%mm4             \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5  \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1  \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2  \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3  \n\t"
            "punpcklbw %%mm7, %%mm5         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm5             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm5, %%mm2             \n\t"
            "movq "MANGLE(w1111)", %%mm5    \n\t"
            "psrlw $2, %%mm4                \n\t"
            "psrlw $2, %%mm2                \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "pmaddwd %%mm6, %%mm4           \n\t"
            "pmaddwd %%mm6, %%mm2           \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4                \n\t"
            "psrad $8, %%mm1                \n\t"
            "psrad $8, %%mm2                \n\t"
            "psrad $8, %%mm3                \n\t"
#endif
            "packssdw %%mm2, %%mm4          \n\t"
            "packssdw %%mm3, %%mm1          \n\t"
            "pmaddwd %%mm5, %%mm4           \n\t"
            "pmaddwd %%mm5, %%mm1           \n\t"
            "add $24, %%"REG_d"             \n\t"
            "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4                \n\t"

            "movq %%mm0, %%mm1              \n\t"
            "punpckldq %%mm4, %%mm0         \n\t"
            "punpckhdq %%mm4, %%mm1         \n\t"
            "packsswb %%mm1, %%mm0          \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a")    \n\t"
            "punpckhdq %%mm0, %%mm0         \n\t"
            "movd %%mm0, (%3, %%"REG_a")    \n\t"
            "add $4, %%"REG_a"              \n\t"
            " js 1b                         \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#else
    y=0;
#endif
    for(; y<height; y+=2)
    {
        long i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]   = U;
            vdst[i]   = V;
            ydst[2*i] = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
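
/*
 * Worked example for the C fallback above (assuming the usual BT.601
 * luma coefficients scaled by 1<<RGB2YUV_SHIFT, so that RY+GY+BY is
 * approximately (219/255)<<RGB2YUV_SHIFT): for a pure white pixel r=g=b=255,
 *
 *     Y = ((RY + GY + BY)*255 >> RGB2YUV_SHIFT) + 16  ~=  219 + 16  =  235
 *
 * i.e. full-range RGB is compressed into the 16..235 luma range, while U and
 * V are centered on 128.
 */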

void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride){
    long h;

    for(h=0; h < height; h++)
    {
        long w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        asm(
            "xor %%"REG_a", %%"REG_a"       \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%"REG_a")     \n\t"
            PREFETCH" 64(%2, %%"REG_a")     \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0       \n\t"
            "punpckhbw %%xmm2, %%xmm1       \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2)   \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"             \n\t"
            "cmp %3, %%"REG_a"              \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%"REG_a""
        );
#else
        asm(
            "xor %%"REG_a", %%"REG_a"       \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%"REG_a")     \n\t"
            PREFETCH" 64(%2, %%"REG_a")     \n\t"
            "movq (%1, %%"REG_a"), %%mm0    \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "movq (%2, %%"REG_a"), %%mm4    \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
            "punpcklbw %%mm4, %%mm0         \n\t"
            "punpckhbw %%mm4, %%mm1         \n\t"
            "punpcklbw %%mm5, %%mm2         \n\t"
            "punpckhbw %%mm5, %%mm3         \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2)     \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)    \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
            "add $16, %%"REG_a"             \n\t"
            "cmp %3, %%"REG_a"              \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%"REG_a
        );
#endif
        for(w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
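
/*
 * Usage sketch (illustrative; plane names are hypothetical): interleaveBytes
 * merges two planes byte by byte, e.g. planar U and V into an NV12-style
 * interleaved chroma plane:
 *
 *     interleaveBytes(u_plane, v_plane, uv_plane,
 *                     chromWidth, chromHeight,
 *                     u_stride, v_stride, uv_stride);
 *
 * Each output line is u[0],v[0],u[1],v[1],..., so dstStride must leave room
 * for 2*width bytes per line.
 */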

static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    long y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}

static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq (%1, %0, 4), %%mm0        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1           \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2           \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3              \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4              \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5              \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1         \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2         \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4         \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5         \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6              \n\t"
                "punpcklbw %%mm2, %%mm1         \n\t" /* U0V0 U0V0 U1V1 U1V1 */
                "punpcklbw %%mm1, %%mm0         \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0 */
                "punpckhbw %%mm1, %%mm3         \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1 */
                MOVNTQ" %%mm0, (%4, %0, 8)      \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)     \n\t"

                "punpckhbw %%mm2, %%mm6         \n\t" /* U2V2 U2V2 U3V3 U3V3 */
                "movq 8(%1, %0, 4), %%mm0       \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U2 Y V2 Y U2 Y V2 */
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U3 Y V3 Y U3 Y V3 */
                MOVNTQ" %%mm0, 16(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)    \n\t"

                "movq %%mm4, %%mm6              \n\t"
                "movq 16(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm5, %%mm4         \n\t"
                "punpcklbw %%mm4, %%mm0         \n\t" /* Y U4 Y V4 Y U4 Y V4 */
                "punpckhbw %%mm4, %%mm3         \n\t" /* Y U5 Y V5 Y U5 Y V5 */
                MOVNTQ" %%mm0, 32(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)    \n\t"

                "punpckhbw %%mm5, %%mm6         \n\t"
                "movq 24(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U6 Y V6 Y U6 Y V6 */
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U7 Y V7 Y U7 Y V7 */
                MOVNTQ" %%mm0, 48(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)    \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for(; x<w; x++)
        {
            const long x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}