libswscale/rgb2rgb_template.c
/*
 * software RGB to RGB converter
 * software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif
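
/* Note (explanatory, not part of the original source): on plain MMX the
 * prefetch macros expand to an assembler comment (" # nop"), so the
 * prefetch simply vanishes from the emitted code instead of costing an
 * instruction. */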

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and the and&add trick by Michael Niedermayer
*/
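
/* Explanatory note on the and&add trick used below (an addition to the
 * original comments): RGB15 is 0RRRRRGGGGGBBBBB and RGB16 is
 * RRRRRGGGGGGBBBBB, so R and G must move up one bit while B stays put.
 * Adding the masked R+G bits back onto the value doubles exactly those
 * fields, i.e. shifts them left by one, in a single add:
 *     (x & 0x7FFF) + (x & 0x7FE0)
 * e.g. x = 0x7FFF (white) gives 0x7FFF + 0x7FE0 = 0xFFDF; the new green
 * LSB is left zero. */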
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
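    /* Explanatory note (not in the original): each 32-bit pixel holds two
     * 16-bit words, with the top 5 bits of B in the low word and of R in the
     * high word after masking with mask3216br. pmaddwd by mul3216 scales
     * both words and sums them, so B and R land in one dword beside the
     * separately masked G, leaving only shifts and ORs to finish the 565
     * packing. */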
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
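
/* A scalar sketch of the bit replication described above (illustrative only,
 * not used by this file; the helper name is made up):
 *
 *     static inline uint8_t expand5to8(uint8_t v)
 *     {
 *         return (v << 3) | (v >> 2); // repeat the 3 high bits in the low bits
 *     }
 *
 * 0x1F maps to 0xFF, so full white is reproduced exactly, unlike the plain
 * left shift used below. */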
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
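    /* Explanatory note (not in the original): idx starts negative and the
     * pointers are biased by -idx so that s[idx]/d[idx] address the buffers
     * from the start; both the MMX loop ("js 1b") and the C tail below can
     * then use a plain sign/limit test on one counter instead of separate
     * end pointers. */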
#if HAVE_MMX
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    long mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, src_size was a multiple of 8 pixels (24 bytes)

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
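
/* Illustrative use (an added sketch; the buffer names and setup are
 * assumptions, since callers reach the template-expanded functions through
 * the rgb2rgb dispatch pointers): for a YV12 frame of width w (multiple of
 * 16) and height h (multiple of 2),
 *     yv12toyuy2(y, u, v, out, w, h, w, w/2, 2*w);
 * packs it into a YUY2 buffer with a stride of 2*w bytes. */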

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
        //FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

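        /* Note (added comment): the asm block below reuses the FF,00,FF,00
         * byte mask still sitting in %%mm7 from the block above, so the two
         * asm statements must stay back to back. */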
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const long mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
            "g" (-mmxSize)
            : "%"REG_a

        );
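        /* Note (added comment): applying PAVGB twice with the same operand
         * weights that operand 3:1, approximating the (3*a + b)>>2 filter of
         * the C code below, with a slight round-up bias. */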
#else
        const long mmxSize=1;
#endif
        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}
1987
1988 /**
1989 * Height should be a multiple of 2 and width should be a multiple of 16.
1990 * (If this is a problem for anyone then tell me, and I will fix it.)
1991 * Chrominance data is only taken from every second line, others are ignored.
1992 * FIXME: Write HQ version.
1993 */
1994 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1995 long width, long height,
1996 long lumStride, long chromStride, long srcStride)
1997 {
1998 long y;
1999 const long chromWidth= width>>1;
2000 for (y=0; y<height; y+=2)
2001 {
2002 #if HAVE_MMX
2003 __asm__ volatile(
2004 "xor %%"REG_a", %%"REG_a" \n\t"
2005 "pcmpeqw %%mm7, %%mm7 \n\t"
2006 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2007 ASMALIGN(4)
2008 "1: \n\t"
2009 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2010 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2011 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2012 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2013 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2014 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2015 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2016 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2017 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2018 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2019 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2020
2021 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2022
2023 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2024 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2025 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2026 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2027 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2028 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2029 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2030 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2031 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2032 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2033
2034 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
2035
2036 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2037 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2038 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2039 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2040 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2041 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2042 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2043 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2044
2045 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2046 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2047
2048 "add $8, %%"REG_a" \n\t"
2049 "cmp %4, %%"REG_a" \n\t"
2050 " jb 1b \n\t"
2051 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2052 : "memory", "%"REG_a
2053 );
2054
2055 ydst += lumStride;
2056 src += srcStride;
2057
2058 __asm__ volatile(
2059 "xor %%"REG_a", %%"REG_a" \n\t"
2060 ASMALIGN(4)
2061 "1: \n\t"
2062 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2063 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2064 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2065 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2066 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2067 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2068 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2069 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2070 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2071 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2072 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2073
2074 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2075 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2076
2077 "add $8, %%"REG_a" \n\t"
2078 "cmp %4, %%"REG_a" \n\t"
2079 " jb 1b \n\t"
2080
2081 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2082 : "memory", "%"REG_a
2083 );
2084 #else
2085 long i;
2086 for (i=0; i<chromWidth; i++)
2087 {
2088 udst[i] = src[4*i+0];
2089 ydst[2*i+0] = src[4*i+1];
2090 vdst[i] = src[4*i+2];
2091 ydst[2*i+1] = src[4*i+3];
2092 }
2093 ydst += lumStride;
2094 src += srcStride;
2095
2096 for (i=0; i<chromWidth; i++)
2097 {
2098 ydst[2*i+0] = src[4*i+1];
2099 ydst[2*i+1] = src[4*i+3];
2100 }
2101 #endif
2102 udst += chromStride;
2103 vdst += chromStride;
2104 ydst += lumStride;
2105 src += srcStride;
2106 }
2107 #if HAVE_MMX
2108 __asm__ volatile( EMMS" \n\t"
2109 SFENCE" \n\t"
2110 :::"memory");
2111 #endif
2112 }
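/* Usage sketch (illustrative, not part of the original file): converting a
 * packed UYVY frame of imgW x imgH pixels (imgW a multiple of 16, imgH even,
 * per the comment above) into planar YV12; all names here are assumptions:
 *
 *     // plane sizes: ydst imgW*imgH bytes, udst/vdst imgW*imgH/4 bytes each
 *     RENAME(uyvytoyv12)(src, ydst, udst, vdst, imgW, imgH,
 *                        imgW,     // lumStride: one luma row per line
 *                        imgW/2,   // chromStride: half-width chroma rows
 *                        imgW*2);  // srcStride: UYVY packs 2 bytes per pixel
 */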
2113
2114 /**
2115 * Height should be a multiple of 2 and width should be a multiple of 2.
2116 * (If this is a problem for anyone then tell me, and I will fix it.)
2117 * Chrominance data is only taken from every second line;
2118 * the other lines are ignored in the C version.
2119 * FIXME: Write HQ version.
2120 */
2121 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2122 long width, long height,
2123 long lumStride, long chromStride, long srcStride)
2124 {
2125 long y;
2126 const long chromWidth= width>>1;
2127 #if HAVE_MMX
2128 for (y=0; y<height-2; y+=2)
2129 {
2130 long i;
2131 for (i=0; i<2; i++)
2132 {
2133 __asm__ volatile(
2134 "mov %2, %%"REG_a" \n\t"
2135 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2136 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2137 "pxor %%mm7, %%mm7 \n\t"
2138 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2139 ASMALIGN(4)
2140 "1: \n\t"
2141 PREFETCH" 64(%0, %%"REG_d") \n\t"
2142 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2143 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2144 "punpcklbw %%mm7, %%mm0 \n\t"
2145 "punpcklbw %%mm7, %%mm1 \n\t"
2146 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2147 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2148 "punpcklbw %%mm7, %%mm2 \n\t"
2149 "punpcklbw %%mm7, %%mm3 \n\t"
2150 "pmaddwd %%mm6, %%mm0 \n\t"
2151 "pmaddwd %%mm6, %%mm1 \n\t"
2152 "pmaddwd %%mm6, %%mm2 \n\t"
2153 "pmaddwd %%mm6, %%mm3 \n\t"
2154 #ifndef FAST_BGR2YV12
2155 "psrad $8, %%mm0 \n\t"
2156 "psrad $8, %%mm1 \n\t"
2157 "psrad $8, %%mm2 \n\t"
2158 "psrad $8, %%mm3 \n\t"
2159 #endif
2160 "packssdw %%mm1, %%mm0 \n\t"
2161 "packssdw %%mm3, %%mm2 \n\t"
2162 "pmaddwd %%mm5, %%mm0 \n\t"
2163 "pmaddwd %%mm5, %%mm2 \n\t"
2164 "packssdw %%mm2, %%mm0 \n\t"
2165 "psraw $7, %%mm0 \n\t"
2166
2167 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2168 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2169 "punpcklbw %%mm7, %%mm4 \n\t"
2170 "punpcklbw %%mm7, %%mm1 \n\t"
2171 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2172 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "punpcklbw %%mm7, %%mm3 \n\t"
2175 "pmaddwd %%mm6, %%mm4 \n\t"
2176 "pmaddwd %%mm6, %%mm1 \n\t"
2177 "pmaddwd %%mm6, %%mm2 \n\t"
2178 "pmaddwd %%mm6, %%mm3 \n\t"
2179 #ifndef FAST_BGR2YV12
2180 "psrad $8, %%mm4 \n\t"
2181 "psrad $8, %%mm1 \n\t"
2182 "psrad $8, %%mm2 \n\t"
2183 "psrad $8, %%mm3 \n\t"
2184 #endif
2185 "packssdw %%mm1, %%mm4 \n\t"
2186 "packssdw %%mm3, %%mm2 \n\t"
2187 "pmaddwd %%mm5, %%mm4 \n\t"
2188 "pmaddwd %%mm5, %%mm2 \n\t"
2189 "add $24, %%"REG_d" \n\t"
2190 "packssdw %%mm2, %%mm4 \n\t"
2191 "psraw $7, %%mm4 \n\t"
2192
2193 "packuswb %%mm4, %%mm0 \n\t"
2194 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2195
2196 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2197 "add $8, %%"REG_a" \n\t"
2198 " js 1b \n\t"
2199 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2200 : "%"REG_a, "%"REG_d
2201 );
2202 ydst += lumStride;
2203 src += srcStride;
2204 }
2205 src -= srcStride*2;
2206 __asm__ volatile(
2207 "mov %4, %%"REG_a" \n\t"
2208 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2209 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2210 "pxor %%mm7, %%mm7 \n\t"
2211 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2212 "add %%"REG_d", %%"REG_d" \n\t"
2213 ASMALIGN(4)
2214 "1: \n\t"
2215 PREFETCH" 64(%0, %%"REG_d") \n\t"
2216 PREFETCH" 64(%1, %%"REG_d") \n\t"
2217 #if HAVE_MMX2 || HAVE_AMD3DNOW
2218 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2219 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2220 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2221 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2222 PAVGB" %%mm1, %%mm0 \n\t"
2223 PAVGB" %%mm3, %%mm2 \n\t"
2224 "movq %%mm0, %%mm1 \n\t"
2225 "movq %%mm2, %%mm3 \n\t"
2226 "psrlq $24, %%mm0 \n\t"
2227 "psrlq $24, %%mm2 \n\t"
2228 PAVGB" %%mm1, %%mm0 \n\t"
2229 PAVGB" %%mm3, %%mm2 \n\t"
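            /* The first PAVGB pair averages the two source rows vertically;
               after the copy, psrlq $24 shifts by one BGR24 pixel (3 bytes),
               so the second PAVGB pair also averages horizontal neighbors -
               an approximate 2x2 box filter feeding the U/V computation. */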
2230 "punpcklbw %%mm7, %%mm0 \n\t"
2231 "punpcklbw %%mm7, %%mm2 \n\t"
2232 #else
2233 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2234 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2235 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2236 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2237 "punpcklbw %%mm7, %%mm0 \n\t"
2238 "punpcklbw %%mm7, %%mm1 \n\t"
2239 "punpcklbw %%mm7, %%mm2 \n\t"
2240 "punpcklbw %%mm7, %%mm3 \n\t"
2241 "paddw %%mm1, %%mm0 \n\t"
2242 "paddw %%mm3, %%mm2 \n\t"
2243 "paddw %%mm2, %%mm0 \n\t"
2244 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2245 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2246 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2247 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2248 "punpcklbw %%mm7, %%mm4 \n\t"
2249 "punpcklbw %%mm7, %%mm1 \n\t"
2250 "punpcklbw %%mm7, %%mm2 \n\t"
2251 "punpcklbw %%mm7, %%mm3 \n\t"
2252 "paddw %%mm1, %%mm4 \n\t"
2253 "paddw %%mm3, %%mm2 \n\t"
2254 "paddw %%mm4, %%mm2 \n\t"
2255 "psrlw $2, %%mm0 \n\t"
2256 "psrlw $2, %%mm2 \n\t"
2257 #endif
2258 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2259 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2260
2261 "pmaddwd %%mm0, %%mm1 \n\t"
2262 "pmaddwd %%mm2, %%mm3 \n\t"
2263 "pmaddwd %%mm6, %%mm0 \n\t"
2264 "pmaddwd %%mm6, %%mm2 \n\t"
2265 #ifndef FAST_BGR2YV12
2266 "psrad $8, %%mm0 \n\t"
2267 "psrad $8, %%mm1 \n\t"
2268 "psrad $8, %%mm2 \n\t"
2269 "psrad $8, %%mm3 \n\t"
2270 #endif
2271 "packssdw %%mm2, %%mm0 \n\t"
2272 "packssdw %%mm3, %%mm1 \n\t"
2273 "pmaddwd %%mm5, %%mm0 \n\t"
2274 "pmaddwd %%mm5, %%mm1 \n\t"
2275 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2276 "psraw $7, %%mm0 \n\t"
2277
2278 #if HAVE_MMX2 || HAVE_AMD3DNOW
2279 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2280 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2281 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2282 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2283 PAVGB" %%mm1, %%mm4 \n\t"
2284 PAVGB" %%mm3, %%mm2 \n\t"
2285 "movq %%mm4, %%mm1 \n\t"
2286 "movq %%mm2, %%mm3 \n\t"
2287 "psrlq $24, %%mm4 \n\t"
2288 "psrlq $24, %%mm2 \n\t"
2289 PAVGB" %%mm1, %%mm4 \n\t"
2290 PAVGB" %%mm3, %%mm2 \n\t"
2291 "punpcklbw %%mm7, %%mm4 \n\t"
2292 "punpcklbw %%mm7, %%mm2 \n\t"
2293 #else
2294 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2295 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2296 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2297 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2298 "punpcklbw %%mm7, %%mm4 \n\t"
2299 "punpcklbw %%mm7, %%mm1 \n\t"
2300 "punpcklbw %%mm7, %%mm2 \n\t"
2301 "punpcklbw %%mm7, %%mm3 \n\t"
2302 "paddw %%mm1, %%mm4 \n\t"
2303 "paddw %%mm3, %%mm2 \n\t"
2304 "paddw %%mm2, %%mm4 \n\t"
2305 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2306 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2307 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2308 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2309 "punpcklbw %%mm7, %%mm5 \n\t"
2310 "punpcklbw %%mm7, %%mm1 \n\t"
2311 "punpcklbw %%mm7, %%mm2 \n\t"
2312 "punpcklbw %%mm7, %%mm3 \n\t"
2313 "paddw %%mm1, %%mm5 \n\t"
2314 "paddw %%mm3, %%mm2 \n\t"
2315 "paddw %%mm5, %%mm2 \n\t"
2316 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2317 "psrlw $2, %%mm4 \n\t"
2318 "psrlw $2, %%mm2 \n\t"
2319 #endif
2320 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2321 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2322
2323 "pmaddwd %%mm4, %%mm1 \n\t"
2324 "pmaddwd %%mm2, %%mm3 \n\t"
2325 "pmaddwd %%mm6, %%mm4 \n\t"
2326 "pmaddwd %%mm6, %%mm2 \n\t"
2327 #ifndef FAST_BGR2YV12
2328 "psrad $8, %%mm4 \n\t"
2329 "psrad $8, %%mm1 \n\t"
2330 "psrad $8, %%mm2 \n\t"
2331 "psrad $8, %%mm3 \n\t"
2332 #endif
2333 "packssdw %%mm2, %%mm4 \n\t"
2334 "packssdw %%mm3, %%mm1 \n\t"
2335 "pmaddwd %%mm5, %%mm4 \n\t"
2336 "pmaddwd %%mm5, %%mm1 \n\t"
2337 "add $24, %%"REG_d" \n\t"
2338 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2339 "psraw $7, %%mm4 \n\t"
2340
2341 "movq %%mm0, %%mm1 \n\t"
2342 "punpckldq %%mm4, %%mm0 \n\t"
2343 "punpckhdq %%mm4, %%mm1 \n\t"
2344 "packsswb %%mm1, %%mm0 \n\t"
2345 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2346 "movd %%mm0, (%2, %%"REG_a") \n\t"
2347 "punpckhdq %%mm0, %%mm0 \n\t"
2348 "movd %%mm0, (%3, %%"REG_a") \n\t"
2349 "add $4, %%"REG_a" \n\t"
2350 " js 1b \n\t"
2351 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2352 : "%"REG_a, "%"REG_d
2353 );
2354
2355 udst += chromStride;
2356 vdst += chromStride;
2357 src += srcStride*2;
2358 }
2359
2360 __asm__ volatile( EMMS" \n\t"
2361 SFENCE" \n\t"
2362 :::"memory");
2363 #else
2364 y=0;
2365 #endif
2366 for (; y<height; y+=2)
2367 {
2368 long i;
2369 for (i=0; i<chromWidth; i++)
2370 {
2371 unsigned int b = src[6*i+0];
2372 unsigned int g = src[6*i+1];
2373 unsigned int r = src[6*i+2];
2374
2375 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2376 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2377 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
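            /* RY/GY/BY (and the U/V coefficients) are fixed-point BT.601
               factors scaled by 1<<RGB2YUV_SHIFT, presumably including the
               limited-range scaling, so r=g=b=0 maps to Y=16 and r=g=b=255
               to roughly Y=235. */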
2378
2379 udst[i] = U;
2380 vdst[i] = V;
2381 ydst[2*i] = Y;
2382
2383 b = src[6*i+3];
2384 g = src[6*i+4];
2385 r = src[6*i+5];
2386
2387 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2388 ydst[2*i+1] = Y;
2389 }
2390 ydst += lumStride;
2391 src += srcStride;
2392
2393 for (i=0; i<chromWidth; i++)
2394 {
2395 unsigned int b = src[6*i+0];
2396 unsigned int g = src[6*i+1];
2397 unsigned int r = src[6*i+2];
2398
2399 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2400
2401 ydst[2*i] = Y;
2402
2403 b = src[6*i+3];
2404 g = src[6*i+4];
2405 r = src[6*i+5];
2406
2407 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2408 ydst[2*i+1] = Y;
2409 }
2410 udst += chromStride;
2411 vdst += chromStride;
2412 ydst += lumStride;
2413 src += srcStride;
2414 }
2415 }
2416
2417 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2418 long width, long height, long src1Stride,
2419 long src2Stride, long dstStride){
2420 long h;
2421
2422 for (h=0; h < height; h++)
2423 {
2424 long w;
2425
2426 #if HAVE_MMX
2427 #if HAVE_SSE2
2428 __asm__(
2429 "xor %%"REG_a", %%"REG_a" \n\t"
2430 "1: \n\t"
2431 PREFETCH" 64(%1, %%"REG_a") \n\t"
2432 PREFETCH" 64(%2, %%"REG_a") \n\t"
2433 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2434 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2435 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2436 "punpcklbw %%xmm2, %%xmm0 \n\t"
2437 "punpckhbw %%xmm2, %%xmm1 \n\t"
2438 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2439 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2440 "add $16, %%"REG_a" \n\t"
2441 "cmp %3, %%"REG_a" \n\t"
2442 " jb 1b \n\t"
2443 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2444 : "memory", "%"REG_a""
2445 );
2446 #else
2447 __asm__(
2448 "xor %%"REG_a", %%"REG_a" \n\t"
2449 "1: \n\t"
2450 PREFETCH" 64(%1, %%"REG_a") \n\t"
2451 PREFETCH" 64(%2, %%"REG_a") \n\t"
2452 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2453 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2454 "movq %%mm0, %%mm1 \n\t"
2455 "movq %%mm2, %%mm3 \n\t"
2456 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2457 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2458 "punpcklbw %%mm4, %%mm0 \n\t"
2459 "punpckhbw %%mm4, %%mm1 \n\t"
2460 "punpcklbw %%mm5, %%mm2 \n\t"
2461 "punpckhbw %%mm5, %%mm3 \n\t"
2462 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2463 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2464 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2465 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2466 "add $16, %%"REG_a" \n\t"
2467 "cmp %3, %%"REG_a" \n\t"
2468 " jb 1b \n\t"
2469 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2470 : "memory", "%"REG_a
2471 );
2472 #endif
2473 for (w= (width&(~15)); w < width; w++)
2474 {
2475 dest[2*w+0] = src1[w];
2476 dest[2*w+1] = src2[w];
2477 }
2478 #else
2479 for (w=0; w < width; w++)
2480 {
2481 dest[2*w+0] = src1[w];
2482 dest[2*w+1] = src2[w];
2483 }
2484 #endif
2485 dest += dstStride;
2486 src1 += src1Stride;
2487 src2 += src2Stride;
2488 }
2489 #if HAVE_MMX
2490 __asm__(
2491 EMMS" \n\t"
2492 SFENCE" \n\t"
2493 ::: "memory"
2494 );
2495 #endif
2496 }
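/* Usage sketch (illustrative, not part of the original file): per row,
 * interleaveBytes() merges two byte planes as dest[2*i] = src1[i],
 * dest[2*i+1] = src2[i]. For example, packing the separate U and V planes of
 * an imgW x imgH YV12 image into one UV plane (an NV12-style layout) could
 * look like this, with imgW, imgH, u, v and uv being assumed variables:
 *
 *     interleaveBytes(u, v, uv, imgW/2, imgH/2, imgW/2, imgW/2, imgW);
 */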
2497
2498 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2499 uint8_t *dst1, uint8_t *dst2,
2500 long width, long height,
2501 long srcStride1, long srcStride2,
2502 long dstStride1, long dstStride2)
2503 {
2504 long y,x,w,h;
2505 w=width/2; h=height/2;
2506 #if HAVE_MMX
2507 __asm__ volatile(
2508 PREFETCH" %0 \n\t"
2509 PREFETCH" %1 \n\t"
2510 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2511 #endif
2512 for (y=0;y<h;y++){
2513 const uint8_t* s1=src1+srcStride1*(y>>1);
2514 uint8_t* d=dst1+dstStride1*y;
2515 x=0;
2516 #if HAVE_MMX
2517 for (;x<w-31;x+=32)
2518 {
2519 __asm__ volatile(
2520 PREFETCH" 32%1 \n\t"
2521 "movq %1, %%mm0 \n\t"
2522 "movq 8%1, %%mm2 \n\t"
2523 "movq 16%1, %%mm4 \n\t"
2524 "movq 24%1, %%mm6 \n\t"
2525 "movq %%mm0, %%mm1 \n\t"
2526 "movq %%mm2, %%mm3 \n\t"
2527 "movq %%mm4, %%mm5 \n\t"
2528 "movq %%mm6, %%mm7 \n\t"
2529 "punpcklbw %%mm0, %%mm0 \n\t"
2530 "punpckhbw %%mm1, %%mm1 \n\t"
2531 "punpcklbw %%mm2, %%mm2 \n\t"
2532 "punpckhbw %%mm3, %%mm3 \n\t"
2533 "punpcklbw %%mm4, %%mm4 \n\t"
2534 "punpckhbw %%mm5, %%mm5 \n\t"
2535 "punpcklbw %%mm6, %%mm6 \n\t"
2536 "punpckhbw %%mm7, %%mm7 \n\t"
2537 MOVNTQ" %%mm0, %0 \n\t"
2538 MOVNTQ" %%mm1, 8%0 \n\t"
2539 MOVNTQ" %%mm2, 16%0 \n\t"
2540 MOVNTQ" %%mm3, 24%0 \n\t"
2541 MOVNTQ" %%mm4, 32%0 \n\t"
2542 MOVNTQ" %%mm5, 40%0 \n\t"
2543 MOVNTQ" %%mm6, 48%0 \n\t"
2544 MOVNTQ" %%mm7, 56%0"
2545 :"=m"(d[2*x])
2546 :"m"(s1[x])
2547 :"memory");
2548 }
2549 #endif
2550 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2551 }
2552 for (y=0;y<h;y++){
2553 const uint8_t* s2=src2+srcStride2*(y>>1);
2554 uint8_t* d=dst2+dstStride2*y;
2555 x=0;
2556 #if HAVE_MMX
2557 for (;x<w-31;x+=32)
2558 {
2559 __asm__ volatile(
2560 PREFETCH" 32%1 \n\t"
2561 "movq %1, %%mm0 \n\t"
2562 "movq 8%1, %%mm2 \n\t"
2563 "movq 16%1, %%mm4 \n\t"
2564 "movq 24%1, %%mm6 \n\t"
2565 "movq %%mm0, %%mm1 \n\t"
2566 "movq %%mm2, %%mm3 \n\t"
2567 "movq %%mm4, %%mm5 \n\t"
2568 "movq %%mm6, %%mm7 \n\t"
2569 "punpcklbw %%mm0, %%mm0 \n\t"
2570 "punpckhbw %%mm1, %%mm1 \n\t"
2571 "punpcklbw %%mm2, %%mm2 \n\t"
2572 "punpckhbw %%mm3, %%mm3 \n\t"
2573 "punpcklbw %%mm4, %%mm4 \n\t"
2574 "punpckhbw %%mm5, %%mm5 \n\t"
2575 "punpcklbw %%mm6, %%mm6 \n\t"
2576 "punpckhbw %%mm7, %%mm7 \n\t"
2577 MOVNTQ" %%mm0, %0 \n\t"
2578 MOVNTQ" %%mm1, 8%0 \n\t"
2579 MOVNTQ" %%mm2, 16%0 \n\t"
2580 MOVNTQ" %%mm3, 24%0 \n\t"
2581 MOVNTQ" %%mm4, 32%0 \n\t"
2582 MOVNTQ" %%mm5, 40%0 \n\t"
2583 MOVNTQ" %%mm6, 48%0 \n\t"
2584 MOVNTQ" %%mm7, 56%0"
2585 :"=m"(d[2*x])
2586 :"m"(s2[x])
2587 :"memory");
2588 }
2589 #endif
2590 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2591 }
2592 #if HAVE_MMX
2593 __asm__(
2594 EMMS" \n\t"
2595 SFENCE" \n\t"
2596 ::: "memory"
2597 );
2598 #endif
2599 }
2600
2601 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2602 uint8_t *dst,
2603 long width, long height,
2604 long srcStride1, long srcStride2,
2605 long srcStride3, long dstStride)
2606 {
2607 long y,x,w,h;
2608 w=width/2; h=height;
2609 for (y=0;y<h;y++){
2610 const uint8_t* yp=src1+srcStride1*y;
2611 const uint8_t* up=src2+srcStride2*(y>>2); // 4:1:0: one chroma row per four luma rows
2612 const uint8_t* vp=src3+srcStride3*(y>>2);
2613 uint8_t* d=dst+dstStride*y;
2614 x=0;
2615 #if HAVE_MMX
2616 for (;x<w-7;x+=8)
2617 {
2618 __asm__ volatile(
2619 PREFETCH" 32(%1, %0) \n\t"
2620 PREFETCH" 32(%2, %0) \n\t"
2621 PREFETCH" 32(%3, %0) \n\t"
2622 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2623 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2624 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2625 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2626 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2627 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2628 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2629 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2630 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2631 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2632
2633 "movq %%mm1, %%mm6 \n\t"
2634 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2635 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2636 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2637 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2638 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2639
2640 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2641 "movq 8(%1, %0, 4), %%mm0 \n\t"
2642 "movq %%mm0, %%mm3 \n\t"
2643 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2644 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2645 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2646 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2647
2648 "movq %%mm4, %%mm6 \n\t"
2649 "movq 16(%1, %0, 4), %%mm0 \n\t"
2650 "movq %%mm0, %%mm3 \n\t"
2651 "punpcklbw %%mm5, %%mm4 \n\t"
2652 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2653 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2654 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2655 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2656
2657 "punpckhbw %%mm5, %%mm6 \n\t"
2658 "movq 24(%1, %0, 4), %%mm0 \n\t"
2659 "movq %%mm0, %%mm3 \n\t"
2660 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2661 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2662 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2663 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2664
2665 : "+r" (x)
2666 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2667 :"memory");
2668 }
2669 #endif
2670 for (; x<w; x++)
2671 {
2672 const long x2 = x<<2;
2673 d[8*x+0] = yp[x2];
2674 d[8*x+1] = up[x];
2675 d[8*x+2] = yp[x2+1];
2676 d[8*x+3] = vp[x];
2677 d[8*x+4] = yp[x2+2];
2678 d[8*x+5] = up[x];
2679 d[8*x+6] = yp[x2+3];
2680 d[8*x+7] = vp[x];
2681 }
2682 }
2683 #if HAVE_MMX
2684 __asm__(
2685 EMMS" \n\t"
2686 SFENCE" \n\t"
2687 ::: "memory"
2688 );
2689 #endif
2690 }
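/* Layout note: as the scalar tail shows, each chroma pair up[x]/vp[x] is
 * reused across four consecutive luma samples, emitting the YUY2 byte pattern
 * Y0 U Y1 V Y2 U Y3 V (8 bytes) per x; together with the y>>2 row indexing
 * this upsamples the 4:1:0 source chroma into the 4:2:2 output. */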
2691
2692 static inline void RENAME(rgb2rgb_init)(void){
2693 rgb15to16 = RENAME(rgb15to16);
2694 rgb15tobgr24 = RENAME(rgb15tobgr24);
2695 rgb15to32 = RENAME(rgb15to32);
2696 rgb16tobgr24 = RENAME(rgb16tobgr24);
2697 rgb16to32 = RENAME(rgb16to32);
2698 rgb16to15 = RENAME(rgb16to15);
2699 rgb24tobgr16 = RENAME(rgb24tobgr16);
2700 rgb24tobgr15 = RENAME(rgb24tobgr15);
2701 rgb24tobgr32 = RENAME(rgb24tobgr32);
2702 rgb32to16 = RENAME(rgb32to16);
2703 rgb32to15 = RENAME(rgb32to15);
2704 rgb32tobgr24 = RENAME(rgb32tobgr24);
2705 rgb24to15 = RENAME(rgb24to15);
2706 rgb24to16 = RENAME(rgb24to16);
2707 rgb24tobgr24 = RENAME(rgb24tobgr24);
2708 rgb32tobgr32 = RENAME(rgb32tobgr32);
2709 rgb32tobgr16 = RENAME(rgb32tobgr16);
2710 rgb32tobgr15 = RENAME(rgb32tobgr15);
2711 yv12toyuy2 = RENAME(yv12toyuy2);
2712 yv12touyvy = RENAME(yv12touyvy);
2713 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2714 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2715 yuy2toyv12 = RENAME(yuy2toyv12);
2716 // uyvytoyv12 = RENAME(uyvytoyv12);
2717 // yvu9toyv12 = RENAME(yvu9toyv12);
2718 planar2x = RENAME(planar2x);
2719 rgb24toyv12 = RENAME(rgb24toyv12);
2720 interleaveBytes = RENAME(interleaveBytes);
2721 vu9_to_vu12 = RENAME(vu9_to_vu12);
2722 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2723 }
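/* Note: this template is compiled several times with different RENAME()
 * expansions to produce C, MMX, MMX2, 3DNow!, etc. variants of the converters
 * above; the matching rgb2rgb_init variant installs that build's functions on
 * the public function pointers declared in rgb2rgb.h. */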