Change semantic of CONFIG_*, HAVE_* and ARCH_*.
[libav.git] / libswscale / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 *
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
28 */
29
30 #include <stddef.h>
31
/*
 * Instruction mnemonics pasted into the inline assembly of this template.
 * Any earlier definitions are dropped first, then each macro is chosen
 * from the HAVE_* feature flags.
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* 16 when SSE2 is available, 8 otherwise. */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Store instruction and the matching fence; plain movq needs no fence. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#if HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/*
 * Prefetch and byte-average mnemonics: 3DNow! spellings take priority over
 * the MMX2 ones.  Without either, the prefetches become assembler comments
 * and PAVGB is left undefined.
 */
#if HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif
73
74 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
75 {
76 uint8_t *dest = dst;
77 const uint8_t *s = src;
78 const uint8_t *end;
79 #if HAVE_MMX
80 const uint8_t *mm_end;
81 #endif
82 end = s + src_size;
83 #if HAVE_MMX
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
85 mm_end = end - 23;
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32):"memory");
87 while (s < mm_end)
88 {
89 __asm__ volatile(
90 PREFETCH" 32%1 \n\t"
91 "movd %1, %%mm0 \n\t"
92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t"
95 "movd 12%1, %%mm2 \n\t"
96 "punpckldq 15%1, %%mm2 \n\t"
97 "movd 18%1, %%mm3 \n\t"
98 "punpckldq 21%1, %%mm3 \n\t"
99 "pand %%mm7, %%mm0 \n\t"
100 "pand %%mm7, %%mm1 \n\t"
101 "pand %%mm7, %%mm2 \n\t"
102 "pand %%mm7, %%mm3 \n\t"
103 MOVNTQ" %%mm0, %0 \n\t"
104 MOVNTQ" %%mm1, 8%0 \n\t"
105 MOVNTQ" %%mm2, 16%0 \n\t"
106 MOVNTQ" %%mm3, 24%0"
107 :"=m"(*dest)
108 :"m"(*s)
109 :"memory");
110 dest += 32;
111 s += 24;
112 }
113 __asm__ volatile(SFENCE:::"memory");
114 __asm__ volatile(EMMS:::"memory");
115 #endif
116 while (s < end)
117 {
118 #ifdef WORDS_BIGENDIAN
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
120 *dest++ = 0;
121 *dest++ = s[2];
122 *dest++ = s[1];
123 *dest++ = s[0];
124 s+=3;
125 #else
126 *dest++ = *s++;
127 *dest++ = *s++;
128 *dest++ = *s++;
129 *dest++ = 0;
130 #endif
131 }
132 }
133
134 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
135 {
136 uint8_t *dest = dst;
137 const uint8_t *s = src;
138 const uint8_t *end;
139 #if HAVE_MMX
140 const uint8_t *mm_end;
141 #endif
142 end = s + src_size;
143 #if HAVE_MMX
144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
145 mm_end = end - 31;
146 while (s < mm_end)
147 {
148 __asm__ volatile(
149 PREFETCH" 32%1 \n\t"
150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t"
158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t"
174
175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t"
188
189 MOVNTQ" %%mm0, %0 \n\t"
190 MOVNTQ" %%mm1, 8%0 \n\t"
191 MOVNTQ" %%mm4, 16%0"
192 :"=m"(*dest)
193 :"m"(*s),"m"(mask24l),
194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
195 :"memory");
196 dest += 24;
197 s += 32;
198 }
199 __asm__ volatile(SFENCE:::"memory");
200 __asm__ volatile(EMMS:::"memory");
201 #endif
202 while (s < end)
203 {
204 #ifdef WORDS_BIGENDIAN
205 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
206 s++;
207 dest[2] = *s++;
208 dest[1] = *s++;
209 dest[0] = *s++;
210 dest += 3;
211 #else
212 *dest++ = *s++;
213 *dest++ = *s++;
214 *dest++ = *s++;
215 s++;
216 #endif
217 }
218 }
219
220 /*
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
225 */
226 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
227 {
228 register const uint8_t* s=src;
229 register uint8_t* d=dst;
230 register const uint8_t *end;
231 const uint8_t *mm_end;
232 end = s + src_size;
233 #if HAVE_MMX
234 __asm__ volatile(PREFETCH" %0"::"m"(*s));
235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
236 mm_end = end - 15;
237 while (s<mm_end)
238 {
239 __asm__ volatile(
240 PREFETCH" 32%1 \n\t"
241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ" %%mm0, %0 \n\t"
250 MOVNTQ" %%mm2, 8%0"
251 :"=m"(*d)
252 :"m"(*s)
253 );
254 d+=16;
255 s+=16;
256 }
257 __asm__ volatile(SFENCE:::"memory");
258 __asm__ volatile(EMMS:::"memory");
259 #endif
260 mm_end = end - 3;
261 while (s < mm_end)
262 {
263 register unsigned x= *((const uint32_t *)s);
264 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
265 d+=4;
266 s+=4;
267 }
268 if (s < end)
269 {
270 register unsigned short x= *((const uint16_t *)s);
271 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
272 }
273 }
274
275 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
276 {
277 register const uint8_t* s=src;
278 register uint8_t* d=dst;
279 register const uint8_t *end;
280 const uint8_t *mm_end;
281 end = s + src_size;
282 #if HAVE_MMX
283 __asm__ volatile(PREFETCH" %0"::"m"(*s));
284 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
286 mm_end = end - 15;
287 while (s<mm_end)
288 {
289 __asm__ volatile(
290 PREFETCH" 32%1 \n\t"
291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ" %%mm0, %0 \n\t"
304 MOVNTQ" %%mm2, 8%0"
305 :"=m"(*d)
306 :"m"(*s)
307 );
308 d+=16;
309 s+=16;
310 }
311 __asm__ volatile(SFENCE:::"memory");
312 __asm__ volatile(EMMS:::"memory");
313 #endif
314 mm_end = end - 3;
315 while (s < mm_end)
316 {
317 register uint32_t x= *((const uint32_t*)s);
318 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
319 s+=4;
320 d+=4;
321 }
322 if (s < end)
323 {
324 register uint16_t x= *((const uint16_t*)s);
325 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
326 s+=2;
327 d+=2;
328 }
329 }
330
331 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
332 {
333 const uint8_t *s = src;
334 const uint8_t *end;
335 #if HAVE_MMX
336 const uint8_t *mm_end;
337 #endif
338 uint16_t *d = (uint16_t *)dst;
339 end = s + src_size;
340 #if HAVE_MMX
341 mm_end = end - 15;
342 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
343 __asm__ volatile(
344 "movq %3, %%mm5 \n\t"
345 "movq %4, %%mm6 \n\t"
346 "movq %5, %%mm7 \n\t"
347 "jmp 2f \n\t"
348 ASMALIGN(4)
349 "1: \n\t"
350 PREFETCH" 32(%1) \n\t"
351 "movd (%1), %%mm0 \n\t"
352 "movd 4(%1), %%mm3 \n\t"
353 "punpckldq 8(%1), %%mm0 \n\t"
354 "punpckldq 12(%1), %%mm3 \n\t"
355 "movq %%mm0, %%mm1 \n\t"
356 "movq %%mm3, %%mm4 \n\t"
357 "pand %%mm6, %%mm0 \n\t"
358 "pand %%mm6, %%mm3 \n\t"
359 "pmaddwd %%mm7, %%mm0 \n\t"
360 "pmaddwd %%mm7, %%mm3 \n\t"
361 "pand %%mm5, %%mm1 \n\t"
362 "pand %%mm5, %%mm4 \n\t"
363 "por %%mm1, %%mm0 \n\t"
364 "por %%mm4, %%mm3 \n\t"
365 "psrld $5, %%mm0 \n\t"
366 "pslld $11, %%mm3 \n\t"
367 "por %%mm3, %%mm0 \n\t"
368 MOVNTQ" %%mm0, (%0) \n\t"
369 "add $16, %1 \n\t"
370 "add $8, %0 \n\t"
371 "2: \n\t"
372 "cmp %2, %1 \n\t"
373 " jb 1b \n\t"
374 : "+r" (d), "+r"(s)
375 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
376 );
377 #else
378 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
379 __asm__ volatile(
380 "movq %0, %%mm7 \n\t"
381 "movq %1, %%mm6 \n\t"
382 ::"m"(red_16mask),"m"(green_16mask));
383 while (s < mm_end)
384 {
385 __asm__ volatile(
386 PREFETCH" 32%1 \n\t"
387 "movd %1, %%mm0 \n\t"
388 "movd 4%1, %%mm3 \n\t"
389 "punpckldq 8%1, %%mm0 \n\t"
390 "punpckldq 12%1, %%mm3 \n\t"
391 "movq %%mm0, %%mm1 \n\t"
392 "movq %%mm0, %%mm2 \n\t"
393 "movq %%mm3, %%mm4 \n\t"
394 "movq %%mm3, %%mm5 \n\t"
395 "psrlq $3, %%mm0 \n\t"
396 "psrlq $3, %%mm3 \n\t"
397 "pand %2, %%mm0 \n\t"
398 "pand %2, %%mm3 \n\t"
399 "psrlq $5, %%mm1 \n\t"
400 "psrlq $5, %%mm4 \n\t"
401 "pand %%mm6, %%mm1 \n\t"
402 "pand %%mm6, %%mm4 \n\t"
403 "psrlq $8, %%mm2 \n\t"
404 "psrlq $8, %%mm5 \n\t"
405 "pand %%mm7, %%mm2 \n\t"
406 "pand %%mm7, %%mm5 \n\t"
407 "por %%mm1, %%mm0 \n\t"
408 "por %%mm4, %%mm3 \n\t"
409 "por %%mm2, %%mm0 \n\t"
410 "por %%mm5, %%mm3 \n\t"
411 "psllq $16, %%mm3 \n\t"
412 "por %%mm3, %%mm0 \n\t"
413 MOVNTQ" %%mm0, %0 \n\t"
414 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
415 d += 4;
416 s += 16;
417 }
418 #endif
419 __asm__ volatile(SFENCE:::"memory");
420 __asm__ volatile(EMMS:::"memory");
421 #endif
422 while (s < end)
423 {
424 register int rgb = *(const uint32_t*)s; s += 4;
425 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
426 }
427 }
428
429 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
430 {
431 const uint8_t *s = src;
432 const uint8_t *end;
433 #if HAVE_MMX
434 const uint8_t *mm_end;
435 #endif
436 uint16_t *d = (uint16_t *)dst;
437 end = s + src_size;
438 #if HAVE_MMX
439 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
440 __asm__ volatile(
441 "movq %0, %%mm7 \n\t"
442 "movq %1, %%mm6 \n\t"
443 ::"m"(red_16mask),"m"(green_16mask));
444 mm_end = end - 15;
445 while (s < mm_end)
446 {
447 __asm__ volatile(
448 PREFETCH" 32%1 \n\t"
449 "movd %1, %%mm0 \n\t"
450 "movd 4%1, %%mm3 \n\t"
451 "punpckldq 8%1, %%mm0 \n\t"
452 "punpckldq 12%1, %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
457 "psllq $8, %%mm0 \n\t"
458 "psllq $8, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $5, %%mm1 \n\t"
462 "psrlq $5, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
475 MOVNTQ" %%mm0, %0 \n\t"
476 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
477 d += 4;
478 s += 16;
479 }
480 __asm__ volatile(SFENCE:::"memory");
481 __asm__ volatile(EMMS:::"memory");
482 #endif
483 while (s < end)
484 {
485 register int rgb = *(const uint32_t*)s; s += 4;
486 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
487 }
488 }
489
490 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
491 {
492 const uint8_t *s = src;
493 const uint8_t *end;
494 #if HAVE_MMX
495 const uint8_t *mm_end;
496 #endif
497 uint16_t *d = (uint16_t *)dst;
498 end = s + src_size;
499 #if HAVE_MMX
500 mm_end = end - 15;
501 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
502 __asm__ volatile(
503 "movq %3, %%mm5 \n\t"
504 "movq %4, %%mm6 \n\t"
505 "movq %5, %%mm7 \n\t"
506 "jmp 2f \n\t"
507 ASMALIGN(4)
508 "1: \n\t"
509 PREFETCH" 32(%1) \n\t"
510 "movd (%1), %%mm0 \n\t"
511 "movd 4(%1), %%mm3 \n\t"
512 "punpckldq 8(%1), %%mm0 \n\t"
513 "punpckldq 12(%1), %%mm3 \n\t"
514 "movq %%mm0, %%mm1 \n\t"
515 "movq %%mm3, %%mm4 \n\t"
516 "pand %%mm6, %%mm0 \n\t"
517 "pand %%mm6, %%mm3 \n\t"
518 "pmaddwd %%mm7, %%mm0 \n\t"
519 "pmaddwd %%mm7, %%mm3 \n\t"
520 "pand %%mm5, %%mm1 \n\t"
521 "pand %%mm5, %%mm4 \n\t"
522 "por %%mm1, %%mm0 \n\t"
523 "por %%mm4, %%mm3 \n\t"
524 "psrld $6, %%mm0 \n\t"
525 "pslld $10, %%mm3 \n\t"
526 "por %%mm3, %%mm0 \n\t"
527 MOVNTQ" %%mm0, (%0) \n\t"
528 "add $16, %1 \n\t"
529 "add $8, %0 \n\t"
530 "2: \n\t"
531 "cmp %2, %1 \n\t"
532 " jb 1b \n\t"
533 : "+r" (d), "+r"(s)
534 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
535 );
536 #else
537 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
538 __asm__ volatile(
539 "movq %0, %%mm7 \n\t"
540 "movq %1, %%mm6 \n\t"
541 ::"m"(red_15mask),"m"(green_15mask));
542 while (s < mm_end)
543 {
544 __asm__ volatile(
545 PREFETCH" 32%1 \n\t"
546 "movd %1, %%mm0 \n\t"
547 "movd 4%1, %%mm3 \n\t"
548 "punpckldq 8%1, %%mm0 \n\t"
549 "punpckldq 12%1, %%mm3 \n\t"
550 "movq %%mm0, %%mm1 \n\t"
551 "movq %%mm0, %%mm2 \n\t"
552 "movq %%mm3, %%mm4 \n\t"
553 "movq %%mm3, %%mm5 \n\t"
554 "psrlq $3, %%mm0 \n\t"
555 "psrlq $3, %%mm3 \n\t"
556 "pand %2, %%mm0 \n\t"
557 "pand %2, %%mm3 \n\t"
558 "psrlq $6, %%mm1 \n\t"
559 "psrlq $6, %%mm4 \n\t"
560 "pand %%mm6, %%mm1 \n\t"
561 "pand %%mm6, %%mm4 \n\t"
562 "psrlq $9, %%mm2 \n\t"
563 "psrlq $9, %%mm5 \n\t"
564 "pand %%mm7, %%mm2 \n\t"
565 "pand %%mm7, %%mm5 \n\t"
566 "por %%mm1, %%mm0 \n\t"
567 "por %%mm4, %%mm3 \n\t"
568 "por %%mm2, %%mm0 \n\t"
569 "por %%mm5, %%mm3 \n\t"
570 "psllq $16, %%mm3 \n\t"
571 "por %%mm3, %%mm0 \n\t"
572 MOVNTQ" %%mm0, %0 \n\t"
573 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
574 d += 4;
575 s += 16;
576 }
577 #endif
578 __asm__ volatile(SFENCE:::"memory");
579 __asm__ volatile(EMMS:::"memory");
580 #endif
581 while (s < end)
582 {
583 register int rgb = *(const uint32_t*)s; s += 4;
584 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
585 }
586 }
587
588 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
589 {
590 const uint8_t *s = src;
591 const uint8_t *end;
592 #if HAVE_MMX
593 const uint8_t *mm_end;
594 #endif
595 uint16_t *d = (uint16_t *)dst;
596 end = s + src_size;
597 #if HAVE_MMX
598 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
599 __asm__ volatile(
600 "movq %0, %%mm7 \n\t"
601 "movq %1, %%mm6 \n\t"
602 ::"m"(red_15mask),"m"(green_15mask));
603 mm_end = end - 15;
604 while (s < mm_end)
605 {
606 __asm__ volatile(
607 PREFETCH" 32%1 \n\t"
608 "movd %1, %%mm0 \n\t"
609 "movd 4%1, %%mm3 \n\t"
610 "punpckldq 8%1, %%mm0 \n\t"
611 "punpckldq 12%1, %%mm3 \n\t"
612 "movq %%mm0, %%mm1 \n\t"
613 "movq %%mm0, %%mm2 \n\t"
614 "movq %%mm3, %%mm4 \n\t"
615 "movq %%mm3, %%mm5 \n\t"
616 "psllq $7, %%mm0 \n\t"
617 "psllq $7, %%mm3 \n\t"
618 "pand %%mm7, %%mm0 \n\t"
619 "pand %%mm7, %%mm3 \n\t"
620 "psrlq $6, %%mm1 \n\t"
621 "psrlq $6, %%mm4 \n\t"
622 "pand %%mm6, %%mm1 \n\t"
623 "pand %%mm6, %%mm4 \n\t"
624 "psrlq $19, %%mm2 \n\t"
625 "psrlq $19, %%mm5 \n\t"
626 "pand %2, %%mm2 \n\t"
627 "pand %2, %%mm5 \n\t"
628 "por %%mm1, %%mm0 \n\t"
629 "por %%mm4, %%mm3 \n\t"
630 "por %%mm2, %%mm0 \n\t"
631 "por %%mm5, %%mm3 \n\t"
632 "psllq $16, %%mm3 \n\t"
633 "por %%mm3, %%mm0 \n\t"
634 MOVNTQ" %%mm0, %0 \n\t"
635 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
636 d += 4;
637 s += 16;
638 }
639 __asm__ volatile(SFENCE:::"memory");
640 __asm__ volatile(EMMS:::"memory");
641 #endif
642 while (s < end)
643 {
644 register int rgb = *(const uint32_t*)s; s += 4;
645 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
646 }
647 }
648
649 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
650 {
651 const uint8_t *s = src;
652 const uint8_t *end;
653 #if HAVE_MMX
654 const uint8_t *mm_end;
655 #endif
656 uint16_t *d = (uint16_t *)dst;
657 end = s + src_size;
658 #if HAVE_MMX
659 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
660 __asm__ volatile(
661 "movq %0, %%mm7 \n\t"
662 "movq %1, %%mm6 \n\t"
663 ::"m"(red_16mask),"m"(green_16mask));
664 mm_end = end - 11;
665 while (s < mm_end)
666 {
667 __asm__ volatile(
668 PREFETCH" 32%1 \n\t"
669 "movd %1, %%mm0 \n\t"
670 "movd 3%1, %%mm3 \n\t"
671 "punpckldq 6%1, %%mm0 \n\t"
672 "punpckldq 9%1, %%mm3 \n\t"
673 "movq %%mm0, %%mm1 \n\t"
674 "movq %%mm0, %%mm2 \n\t"
675 "movq %%mm3, %%mm4 \n\t"
676 "movq %%mm3, %%mm5 \n\t"
677 "psrlq $3, %%mm0 \n\t"
678 "psrlq $3, %%mm3 \n\t"
679 "pand %2, %%mm0 \n\t"
680 "pand %2, %%mm3 \n\t"
681 "psrlq $5, %%mm1 \n\t"
682 "psrlq $5, %%mm4 \n\t"
683 "pand %%mm6, %%mm1 \n\t"
684 "pand %%mm6, %%mm4 \n\t"
685 "psrlq $8, %%mm2 \n\t"
686 "psrlq $8, %%mm5 \n\t"
687 "pand %%mm7, %%mm2 \n\t"
688 "pand %%mm7, %%mm5 \n\t"
689 "por %%mm1, %%mm0 \n\t"
690 "por %%mm4, %%mm3 \n\t"
691 "por %%mm2, %%mm0 \n\t"
692 "por %%mm5, %%mm3 \n\t"
693 "psllq $16, %%mm3 \n\t"
694 "por %%mm3, %%mm0 \n\t"
695 MOVNTQ" %%mm0, %0 \n\t"
696 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
697 d += 4;
698 s += 12;
699 }
700 __asm__ volatile(SFENCE:::"memory");
701 __asm__ volatile(EMMS:::"memory");
702 #endif
703 while (s < end)
704 {
705 const int b = *s++;
706 const int g = *s++;
707 const int r = *s++;
708 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
709 }
710 }
711
712 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
713 {
714 const uint8_t *s = src;
715 const uint8_t *end;
716 #if HAVE_MMX
717 const uint8_t *mm_end;
718 #endif
719 uint16_t *d = (uint16_t *)dst;
720 end = s + src_size;
721 #if HAVE_MMX
722 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
723 __asm__ volatile(
724 "movq %0, %%mm7 \n\t"
725 "movq %1, %%mm6 \n\t"
726 ::"m"(red_16mask),"m"(green_16mask));
727 mm_end = end - 15;
728 while (s < mm_end)
729 {
730 __asm__ volatile(
731 PREFETCH" 32%1 \n\t"
732 "movd %1, %%mm0 \n\t"
733 "movd 3%1, %%mm3 \n\t"
734 "punpckldq 6%1, %%mm0 \n\t"
735 "punpckldq 9%1, %%mm3 \n\t"
736 "movq %%mm0, %%mm1 \n\t"
737 "movq %%mm0, %%mm2 \n\t"
738 "movq %%mm3, %%mm4 \n\t"
739 "movq %%mm3, %%mm5 \n\t"
740 "psllq $8, %%mm0 \n\t"
741 "psllq $8, %%mm3 \n\t"
742 "pand %%mm7, %%mm0 \n\t"
743 "pand %%mm7, %%mm3 \n\t"
744 "psrlq $5, %%mm1 \n\t"
745 "psrlq $5, %%mm4 \n\t"
746 "pand %%mm6, %%mm1 \n\t"
747 "pand %%mm6, %%mm4 \n\t"
748 "psrlq $19, %%mm2 \n\t"
749 "psrlq $19, %%mm5 \n\t"
750 "pand %2, %%mm2 \n\t"
751 "pand %2, %%mm5 \n\t"
752 "por %%mm1, %%mm0 \n\t"
753 "por %%mm4, %%mm3 \n\t"
754 "por %%mm2, %%mm0 \n\t"
755 "por %%mm5, %%mm3 \n\t"
756 "psllq $16, %%mm3 \n\t"
757 "por %%mm3, %%mm0 \n\t"
758 MOVNTQ" %%mm0, %0 \n\t"
759 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
760 d += 4;
761 s += 12;
762 }
763 __asm__ volatile(SFENCE:::"memory");
764 __asm__ volatile(EMMS:::"memory");
765 #endif
766 while (s < end)
767 {
768 const int r = *s++;
769 const int g = *s++;
770 const int b = *s++;
771 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
772 }
773 }
774
775 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
776 {
777 const uint8_t *s = src;
778 const uint8_t *end;
779 #if HAVE_MMX
780 const uint8_t *mm_end;
781 #endif
782 uint16_t *d = (uint16_t *)dst;
783 end = s + src_size;
784 #if HAVE_MMX
785 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
786 __asm__ volatile(
787 "movq %0, %%mm7 \n\t"
788 "movq %1, %%mm6 \n\t"
789 ::"m"(red_15mask),"m"(green_15mask));
790 mm_end = end - 11;
791 while (s < mm_end)
792 {
793 __asm__ volatile(
794 PREFETCH" 32%1 \n\t"
795 "movd %1, %%mm0 \n\t"
796 "movd 3%1, %%mm3 \n\t"
797 "punpckldq 6%1, %%mm0 \n\t"
798 "punpckldq 9%1, %%mm3 \n\t"
799 "movq %%mm0, %%mm1 \n\t"
800 "movq %%mm0, %%mm2 \n\t"
801 "movq %%mm3, %%mm4 \n\t"
802 "movq %%mm3, %%mm5 \n\t"
803 "psrlq $3, %%mm0 \n\t"
804 "psrlq $3, %%mm3 \n\t"
805 "pand %2, %%mm0 \n\t"
806 "pand %2, %%mm3 \n\t"
807 "psrlq $6, %%mm1 \n\t"
808 "psrlq $6, %%mm4 \n\t"
809 "pand %%mm6, %%mm1 \n\t"
810 "pand %%mm6, %%mm4 \n\t"
811 "psrlq $9, %%mm2 \n\t"
812 "psrlq $9, %%mm5 \n\t"
813 "pand %%mm7, %%mm2 \n\t"
814 "pand %%mm7, %%mm5 \n\t"
815 "por %%mm1, %%mm0 \n\t"
816 "por %%mm4, %%mm3 \n\t"
817 "por %%mm2, %%mm0 \n\t"
818 "por %%mm5, %%mm3 \n\t"
819 "psllq $16, %%mm3 \n\t"
820 "por %%mm3, %%mm0 \n\t"
821 MOVNTQ" %%mm0, %0 \n\t"
822 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
823 d += 4;
824 s += 12;
825 }
826 __asm__ volatile(SFENCE:::"memory");
827 __asm__ volatile(EMMS:::"memory");
828 #endif
829 while (s < end)
830 {
831 const int b = *s++;
832 const int g = *s++;
833 const int r = *s++;
834 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
835 }
836 }
837
838 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
839 {
840 const uint8_t *s = src;
841 const uint8_t *end;
842 #if HAVE_MMX
843 const uint8_t *mm_end;
844 #endif
845 uint16_t *d = (uint16_t *)dst;
846 end = s + src_size;
847 #if HAVE_MMX
848 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
849 __asm__ volatile(
850 "movq %0, %%mm7 \n\t"
851 "movq %1, %%mm6 \n\t"
852 ::"m"(red_15mask),"m"(green_15mask));
853 mm_end = end - 15;
854 while (s < mm_end)
855 {
856 __asm__ volatile(
857 PREFETCH" 32%1 \n\t"
858 "movd %1, %%mm0 \n\t"
859 "movd 3%1, %%mm3 \n\t"
860 "punpckldq 6%1, %%mm0 \n\t"
861 "punpckldq 9%1, %%mm3 \n\t"
862 "movq %%mm0, %%mm1 \n\t"
863 "movq %%mm0, %%mm2 \n\t"
864 "movq %%mm3, %%mm4 \n\t"
865 "movq %%mm3, %%mm5 \n\t"
866 "psllq $7, %%mm0 \n\t"
867 "psllq $7, %%mm3 \n\t"
868 "pand %%mm7, %%mm0 \n\t"
869 "pand %%mm7, %%mm3 \n\t"
870 "psrlq $6, %%mm1 \n\t"
871 "psrlq $6, %%mm4 \n\t"
872 "pand %%mm6, %%mm1 \n\t"
873 "pand %%mm6, %%mm4 \n\t"
874 "psrlq $19, %%mm2 \n\t"
875 "psrlq $19, %%mm5 \n\t"
876 "pand %2, %%mm2 \n\t"
877 "pand %2, %%mm5 \n\t"
878 "por %%mm1, %%mm0 \n\t"
879 "por %%mm4, %%mm3 \n\t"
880 "por %%mm2, %%mm0 \n\t"
881 "por %%mm5, %%mm3 \n\t"
882 "psllq $16, %%mm3 \n\t"
883 "por %%mm3, %%mm0 \n\t"
884 MOVNTQ" %%mm0, %0 \n\t"
885 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
886 d += 4;
887 s += 12;
888 }
889 __asm__ volatile(SFENCE:::"memory");
890 __asm__ volatile(EMMS:::"memory");
891 #endif
892 while (s < end)
893 {
894 const int r = *s++;
895 const int g = *s++;
896 const int b = *s++;
897 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
898 }
899 }
900
901 /*
902 I use less accurate approximation here by simply left-shifting the input
903 value and filling the low order bits with zeroes. This method improves PNG
904 compression but this scheme cannot reproduce white exactly, since it does
905 not generate an all-ones maximum value; the net effect is to darken the
906 image slightly.
907
908 The better method should be "left bit replication":
909
910 4 3 2 1 0
911 ---------
912 1 1 0 1 1
913
914 7 6 5 4 3 2 1 0
915 ----------------
916 1 1 0 1 1 1 1 0
917 |=======| |===|
918 | leftmost bits repeated to fill open bits
919 |
920 original bits
921 */
922 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
923 {
924 const uint16_t *end;
925 #if HAVE_MMX
926 const uint16_t *mm_end;
927 #endif
928 uint8_t *d = dst;
929 const uint16_t *s = (const uint16_t*)src;
930 end = s + src_size/2;
931 #if HAVE_MMX
932 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
933 mm_end = end - 7;
934 while (s < mm_end)
935 {
936 __asm__ volatile(
937 PREFETCH" 32%1 \n\t"
938 "movq %1, %%mm0 \n\t"
939 "movq %1, %%mm1 \n\t"
940 "movq %1, %%mm2 \n\t"
941 "pand %2, %%mm0 \n\t"
942 "pand %3, %%mm1 \n\t"
943 "pand %4, %%mm2 \n\t"
944 "psllq $3, %%mm0 \n\t"
945 "psrlq $2, %%mm1 \n\t"
946 "psrlq $7, %%mm2 \n\t"
947 "movq %%mm0, %%mm3 \n\t"
948 "movq %%mm1, %%mm4 \n\t"
949 "movq %%mm2, %%mm5 \n\t"
950 "punpcklwd %5, %%mm0 \n\t"
951 "punpcklwd %5, %%mm1 \n\t"
952 "punpcklwd %5, %%mm2 \n\t"
953 "punpckhwd %5, %%mm3 \n\t"
954 "punpckhwd %5, %%mm4 \n\t"
955 "punpckhwd %5, %%mm5 \n\t"
956 "psllq $8, %%mm1 \n\t"
957 "psllq $16, %%mm2 \n\t"
958 "por %%mm1, %%mm0 \n\t"
959 "por %%mm2, %%mm0 \n\t"
960 "psllq $8, %%mm4 \n\t"
961 "psllq $16, %%mm5 \n\t"
962 "por %%mm4, %%mm3 \n\t"
963 "por %%mm5, %%mm3 \n\t"
964
965 "movq %%mm0, %%mm6 \n\t"
966 "movq %%mm3, %%mm7 \n\t"
967
968 "movq 8%1, %%mm0 \n\t"
969 "movq 8%1, %%mm1 \n\t"
970 "movq 8%1, %%mm2 \n\t"
971 "pand %2, %%mm0 \n\t"
972 "pand %3, %%mm1 \n\t"
973 "pand %4, %%mm2 \n\t"
974 "psllq $3, %%mm0 \n\t"
975 "psrlq $2, %%mm1 \n\t"
976 "psrlq $7, %%mm2 \n\t"
977 "movq %%mm0, %%mm3 \n\t"
978 "movq %%mm1, %%mm4 \n\t"
979 "movq %%mm2, %%mm5 \n\t"
980 "punpcklwd %5, %%mm0 \n\t"
981 "punpcklwd %5, %%mm1 \n\t"
982 "punpcklwd %5, %%mm2 \n\t"
983 "punpckhwd %5, %%mm3 \n\t"
984 "punpckhwd %5, %%mm4 \n\t"
985 "punpckhwd %5, %%mm5 \n\t"
986 "psllq $8, %%mm1 \n\t"
987 "psllq $16, %%mm2 \n\t"
988 "por %%mm1, %%mm0 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "psllq $8, %%mm4 \n\t"
991 "psllq $16, %%mm5 \n\t"
992 "por %%mm4, %%mm3 \n\t"
993 "por %%mm5, %%mm3 \n\t"
994
995 :"=m"(*d)
996 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
997 :"memory");
998 /* borrowed 32 to 24 */
999 __asm__ volatile(
1000 "movq %%mm0, %%mm4 \n\t"
1001 "movq %%mm3, %%mm5 \n\t"
1002 "movq %%mm6, %%mm0 \n\t"
1003 "movq %%mm7, %%mm1 \n\t"
1004
1005 "movq %%mm4, %%mm6 \n\t"
1006 "movq %%mm5, %%mm7 \n\t"
1007 "movq %%mm0, %%mm2 \n\t"
1008 "movq %%mm1, %%mm3 \n\t"
1009
1010 "psrlq $8, %%mm2 \n\t"
1011 "psrlq $8, %%mm3 \n\t"
1012 "psrlq $8, %%mm6 \n\t"
1013 "psrlq $8, %%mm7 \n\t"
1014 "pand %2, %%mm0 \n\t"
1015 "pand %2, %%mm1 \n\t"
1016 "pand %2, %%mm4 \n\t"
1017 "pand %2, %%mm5 \n\t"
1018 "pand %3, %%mm2 \n\t"
1019 "pand %3, %%mm3 \n\t"
1020 "pand %3, %%mm6 \n\t"
1021 "pand %3, %%mm7 \n\t"
1022 "por %%mm2, %%mm0 \n\t"
1023 "por %%mm3, %%mm1 \n\t"
1024 "por %%mm6, %%mm4 \n\t"
1025 "por %%mm7, %%mm5 \n\t"
1026
1027 "movq %%mm1, %%mm2 \n\t"
1028 "movq %%mm4, %%mm3 \n\t"
1029 "psllq $48, %%mm2 \n\t"
1030 "psllq $32, %%mm3 \n\t"
1031 "pand %4, %%mm2 \n\t"
1032 "pand %5, %%mm3 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psrlq $16, %%mm1 \n\t"
1035 "psrlq $32, %%mm4 \n\t"
1036 "psllq $16, %%mm5 \n\t"
1037 "por %%mm3, %%mm1 \n\t"
1038 "pand %6, %%mm5 \n\t"
1039 "por %%mm5, %%mm4 \n\t"
1040
1041 MOVNTQ" %%mm0, %0 \n\t"
1042 MOVNTQ" %%mm1, 8%0 \n\t"
1043 MOVNTQ" %%mm4, 16%0"
1044
1045 :"=m"(*d)
1046 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1047 :"memory");
1048 d += 24;
1049 s += 8;
1050 }
1051 __asm__ volatile(SFENCE:::"memory");
1052 __asm__ volatile(EMMS:::"memory");
1053 #endif
1054 while (s < end)
1055 {
1056 register uint16_t bgr;
1057 bgr = *s++;
1058 *d++ = (bgr&0x1F)<<3;
1059 *d++ = (bgr&0x3E0)>>2;
1060 *d++ = (bgr&0x7C00)>>7;
1061 }
1062 }
1063
1064 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1065 {
1066 const uint16_t *end;
1067 #if HAVE_MMX
1068 const uint16_t *mm_end;
1069 #endif
1070 uint8_t *d = (uint8_t *)dst;
1071 const uint16_t *s = (const uint16_t *)src;
1072 end = s + src_size/2;
1073 #if HAVE_MMX
1074 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1075 mm_end = end - 7;
1076 while (s < mm_end)
1077 {
1078 __asm__ volatile(
1079 PREFETCH" 32%1 \n\t"
1080 "movq %1, %%mm0 \n\t"
1081 "movq %1, %%mm1 \n\t"
1082 "movq %1, %%mm2 \n\t"
1083 "pand %2, %%mm0 \n\t"
1084 "pand %3, %%mm1 \n\t"
1085 "pand %4, %%mm2 \n\t"
1086 "psllq $3, %%mm0 \n\t"
1087 "psrlq $3, %%mm1 \n\t"
1088 "psrlq $8, %%mm2 \n\t"
1089 "movq %%mm0, %%mm3 \n\t"
1090 "movq %%mm1, %%mm4 \n\t"
1091 "movq %%mm2, %%mm5 \n\t"
1092 "punpcklwd %5, %%mm0 \n\t"
1093 "punpcklwd %5, %%mm1 \n\t"
1094 "punpcklwd %5, %%mm2 \n\t"
1095 "punpckhwd %5, %%mm3 \n\t"
1096 "punpckhwd %5, %%mm4 \n\t"
1097 "punpckhwd %5, %%mm5 \n\t"
1098 "psllq $8, %%mm1 \n\t"
1099 "psllq $16, %%mm2 \n\t"
1100 "por %%mm1, %%mm0 \n\t"
1101 "por %%mm2, %%mm0 \n\t"
1102 "psllq $8, %%mm4 \n\t"
1103 "psllq $16, %%mm5 \n\t"
1104 "por %%mm4, %%mm3 \n\t"
1105 "por %%mm5, %%mm3 \n\t"
1106
1107 "movq %%mm0, %%mm6 \n\t"
1108 "movq %%mm3, %%mm7 \n\t"
1109
1110 "movq 8%1, %%mm0 \n\t"
1111 "movq 8%1, %%mm1 \n\t"
1112 "movq 8%1, %%mm2 \n\t"
1113 "pand %2, %%mm0 \n\t"
1114 "pand %3, %%mm1 \n\t"
1115 "pand %4, %%mm2 \n\t"
1116 "psllq $3, %%mm0 \n\t"
1117 "psrlq $3, %%mm1 \n\t"
1118 "psrlq $8, %%mm2 \n\t"
1119 "movq %%mm0, %%mm3 \n\t"
1120 "movq %%mm1, %%mm4 \n\t"
1121 "movq %%mm2, %%mm5 \n\t"
1122 "punpcklwd %5, %%mm0 \n\t"
1123 "punpcklwd %5, %%mm1 \n\t"
1124 "punpcklwd %5, %%mm2 \n\t"
1125 "punpckhwd %5, %%mm3 \n\t"
1126 "punpckhwd %5, %%mm4 \n\t"
1127 "punpckhwd %5, %%mm5 \n\t"
1128 "psllq $8, %%mm1 \n\t"
1129 "psllq $16, %%mm2 \n\t"
1130 "por %%mm1, %%mm0 \n\t"
1131 "por %%mm2, %%mm0 \n\t"
1132 "psllq $8, %%mm4 \n\t"
1133 "psllq $16, %%mm5 \n\t"
1134 "por %%mm4, %%mm3 \n\t"
1135 "por %%mm5, %%mm3 \n\t"
1136 :"=m"(*d)
1137 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1138 :"memory");
1139 /* borrowed 32 to 24 */
1140 __asm__ volatile(
1141 "movq %%mm0, %%mm4 \n\t"
1142 "movq %%mm3, %%mm5 \n\t"
1143 "movq %%mm6, %%mm0 \n\t"
1144 "movq %%mm7, %%mm1 \n\t"
1145
1146 "movq %%mm4, %%mm6 \n\t"
1147 "movq %%mm5, %%mm7 \n\t"
1148 "movq %%mm0, %%mm2 \n\t"
1149 "movq %%mm1, %%mm3 \n\t"
1150
1151 "psrlq $8, %%mm2 \n\t"
1152 "psrlq $8, %%mm3 \n\t"
1153 "psrlq $8, %%mm6 \n\t"
1154 "psrlq $8, %%mm7 \n\t"
1155 "pand %2, %%mm0 \n\t"
1156 "pand %2, %%mm1 \n\t"
1157 "pand %2, %%mm4 \n\t"
1158 "pand %2, %%mm5 \n\t"
1159 "pand %3, %%mm2 \n\t"
1160 "pand %3, %%mm3 \n\t"
1161 "pand %3, %%mm6 \n\t"
1162 "pand %3, %%mm7 \n\t"
1163 "por %%mm2, %%mm0 \n\t"
1164 "por %%mm3, %%mm1 \n\t"
1165 "por %%mm6, %%mm4 \n\t"
1166 "por %%mm7, %%mm5 \n\t"
1167
1168 "movq %%mm1, %%mm2 \n\t"
1169 "movq %%mm4, %%mm3 \n\t"
1170 "psllq $48, %%mm2 \n\t"
1171 "psllq $32, %%mm3 \n\t"
1172 "pand %4, %%mm2 \n\t"
1173 "pand %5, %%mm3 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "psrlq $16, %%mm1 \n\t"
1176 "psrlq $32, %%mm4 \n\t"
1177 "psllq $16, %%mm5 \n\t"
1178 "por %%mm3, %%mm1 \n\t"
1179 "pand %6, %%mm5 \n\t"
1180 "por %%mm5, %%mm4 \n\t"
1181
1182 MOVNTQ" %%mm0, %0 \n\t"
1183 MOVNTQ" %%mm1, 8%0 \n\t"
1184 MOVNTQ" %%mm4, 16%0"
1185
1186 :"=m"(*d)
1187 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1188 :"memory");
1189 d += 24;
1190 s += 8;
1191 }
1192 __asm__ volatile(SFENCE:::"memory");
1193 __asm__ volatile(EMMS:::"memory");
1194 #endif
1195 while (s < end)
1196 {
1197 register uint16_t bgr;
1198 bgr = *s++;
1199 *d++ = (bgr&0x1F)<<3;
1200 *d++ = (bgr&0x7E0)>>3;
1201 *d++ = (bgr&0xF800)>>8;
1202 }
1203 }
1204
/**
 * Expand 16-bit pixels holding three 5-bit components (masks 0x001F,
 * 0x03E0 and 0x7C00 in the C path) to 4 bytes per pixel.
 *
 * Every component is shifted so that it occupies the top 5 bits of its own
 * output byte (the low 3 bits stay zero); the remaining byte of each output
 * pixel is written as 0.  Without WORDS_BIGENDIAN the C path stores the
 * bytes in the order (0x001F part, 0x03E0 part, 0x7C00 part, 0); with
 * WORDS_BIGENDIAN the order is reversed.
 *
 * @param src      input pixels, read as uint16_t
 * @param dst      output buffer, 4 bytes are written per input pixel
 * @param src_size input size in bytes; src_size/2 pixels are converted
 */
1205 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1206 {
1207 const uint16_t *end;
1208 #if HAVE_MMX
1209 const uint16_t *mm_end;
1210 #endif
1211 uint8_t *d = dst;
1212 const uint16_t *s = (const uint16_t *)src;
1213 end = s + src_size/2;
1214 #if HAVE_MMX
1215 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0; it is the zero source of the punpck{l,h}wd word->dword expansion below */
1216 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
/* one MMX iteration consumes 4 pixels (s += 4) and stores 16 bytes (d += 16),
 * so the loop only runs while at least 4 pixels are left */
1217 mm_end = end - 3;
1218 while (s < mm_end)
1219 {
1220 __asm__ volatile(
1221 PREFETCH" 32%1 \n\t"
/* load the same 4 pixels three times and isolate one component in each copy
 * (mask15b/mask15g/mask15r are defined outside this block) */
1222 "movq %1, %%mm0 \n\t"
1223 "movq %1, %%mm1 \n\t"
1224 "movq %1, %%mm2 \n\t"
1225 "pand %2, %%mm0 \n\t"
1226 "pand %3, %%mm1 \n\t"
1227 "pand %4, %%mm2 \n\t"
/* same shifts as the C path: <<3, >>2, >>7 */
1228 "psllq $3, %%mm0 \n\t"
1229 "psrlq $2, %%mm1 \n\t"
1230 "psrlq $7, %%mm2 \n\t"
1231 "movq %%mm0, %%mm3 \n\t"
1232 "movq %%mm1, %%mm4 \n\t"
1233 "movq %%mm2, %%mm5 \n\t"
/* widen each 16-bit lane to 32 bits: mm0..mm2 = pixels 0-1, mm3..mm5 = pixels 2-3 */
1234 "punpcklwd %%mm7, %%mm0 \n\t"
1235 "punpcklwd %%mm7, %%mm1 \n\t"
1236 "punpcklwd %%mm7, %%mm2 \n\t"
1237 "punpckhwd %%mm7, %%mm3 \n\t"
1238 "punpckhwd %%mm7, %%mm4 \n\t"
1239 "punpckhwd %%mm7, %%mm5 \n\t"
/* move the second component to byte 1 and the third to byte 2, then merge */
1240 "psllq $8, %%mm1 \n\t"
1241 "psllq $16, %%mm2 \n\t"
1242 "por %%mm1, %%mm0 \n\t"
1243 "por %%mm2, %%mm0 \n\t"
1244 "psllq $8, %%mm4 \n\t"
1245 "psllq $16, %%mm5 \n\t"
1246 "por %%mm4, %%mm3 \n\t"
1247 "por %%mm5, %%mm3 \n\t"
1248 MOVNTQ" %%mm0, %0 \n\t"
1249 MOVNTQ" %%mm3, 8%0 \n\t"
1250 :"=m"(*d)
1251 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1252 :"memory");
1253 d += 16;
1254 s += 4;
1255 }
1256 __asm__ volatile(SFENCE:::"memory");
1257 __asm__ volatile(EMMS:::"memory");
1258 #endif
/* scalar path: the whole input without MMX, otherwise the pixels (at most 3)
 * that the MMX loop left over */
1259 while (s < end)
1260 {
1261 #if 0 //slightly slower on Athlon
1262 int bgr= *s++;
1263 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1264 #else
1265 register uint16_t bgr;
1266 bgr = *s++;
1267 #ifdef WORDS_BIGENDIAN
1268 *d++ = 0;
1269 *d++ = (bgr&0x7C00)>>7;
1270 *d++ = (bgr&0x3E0)>>2;
1271 *d++ = (bgr&0x1F)<<3;
1272 #else
1273 *d++ = (bgr&0x1F)<<3;
1274 *d++ = (bgr&0x3E0)>>2;
1275 *d++ = (bgr&0x7C00)>>7;
1276 *d++ = 0;
1277 #endif
1278
1279 #endif
1280 }
1281 }
1282
/**
 * Expand 16-bit pixels holding a 5-, a 6- and a 5-bit component (masks
 * 0x001F, 0x07E0 and 0xF800 in the C path) to 4 bytes per pixel.
 *
 * Every component is shifted to the top of its own output byte; the
 * remaining byte of each output pixel is written as 0.  Without
 * WORDS_BIGENDIAN the C path stores the bytes in the order
 * (0x001F part, 0x07E0 part, 0xF800 part, 0); with WORDS_BIGENDIAN the
 * order is reversed.
 *
 * @param src      input pixels, read as uint16_t
 * @param dst      output buffer, 4 bytes are written per input pixel
 * @param src_size input size in bytes; src_size/2 pixels are converted
 */
1283 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1284 {
1285 const uint16_t *end;
1286 #if HAVE_MMX
1287 const uint16_t *mm_end;
1288 #endif
1289 uint8_t *d = dst;
1290 const uint16_t *s = (const uint16_t*)src;
1291 end = s + src_size/2;
1292 #if HAVE_MMX
1293 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0; it is the zero source of the punpck{l,h}wd word->dword expansion below */
1294 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
/* one MMX iteration consumes 4 pixels (s += 4) and stores 16 bytes (d += 16),
 * so the loop only runs while at least 4 pixels are left */
1295 mm_end = end - 3;
1296 while (s < mm_end)
1297 {
1298 __asm__ volatile(
1299 PREFETCH" 32%1 \n\t"
/* load the same 4 pixels three times and isolate one component in each copy
 * (mask16b/mask16g/mask16r are defined outside this block) */
1300 "movq %1, %%mm0 \n\t"
1301 "movq %1, %%mm1 \n\t"
1302 "movq %1, %%mm2 \n\t"
1303 "pand %2, %%mm0 \n\t"
1304 "pand %3, %%mm1 \n\t"
1305 "pand %4, %%mm2 \n\t"
/* same shifts as the C path: <<3, >>3, >>8 */
1306 "psllq $3, %%mm0 \n\t"
1307 "psrlq $3, %%mm1 \n\t"
1308 "psrlq $8, %%mm2 \n\t"
1309 "movq %%mm0, %%mm3 \n\t"
1310 "movq %%mm1, %%mm4 \n\t"
1311 "movq %%mm2, %%mm5 \n\t"
/* widen each 16-bit lane to 32 bits: mm0..mm2 = pixels 0-1, mm3..mm5 = pixels 2-3 */
1312 "punpcklwd %%mm7, %%mm0 \n\t"
1313 "punpcklwd %%mm7, %%mm1 \n\t"
1314 "punpcklwd %%mm7, %%mm2 \n\t"
1315 "punpckhwd %%mm7, %%mm3 \n\t"
1316 "punpckhwd %%mm7, %%mm4 \n\t"
1317 "punpckhwd %%mm7, %%mm5 \n\t"
/* move the second component to byte 1 and the third to byte 2, then merge */
1318 "psllq $8, %%mm1 \n\t"
1319 "psllq $16, %%mm2 \n\t"
1320 "por %%mm1, %%mm0 \n\t"
1321 "por %%mm2, %%mm0 \n\t"
1322 "psllq $8, %%mm4 \n\t"
1323 "psllq $16, %%mm5 \n\t"
1324 "por %%mm4, %%mm3 \n\t"
1325 "por %%mm5, %%mm3 \n\t"
1326 MOVNTQ" %%mm0, %0 \n\t"
1327 MOVNTQ" %%mm3, 8%0 \n\t"
1328 :"=m"(*d)
1329 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1330 :"memory");
1331 d += 16;
1332 s += 4;
1333 }
1334 __asm__ volatile(SFENCE:::"memory");
1335 __asm__ volatile(EMMS:::"memory");
1336 #endif
/* scalar path: the whole input without MMX, otherwise the pixels (at most 3)
 * that the MMX loop left over */
1337 while (s < end)
1338 {
1339 register uint16_t bgr;
1340 bgr = *s++;
1341 #ifdef WORDS_BIGENDIAN
1342 *d++ = 0;
1343 *d++ = (bgr&0xF800)>>8;
1344 *d++ = (bgr&0x7E0)>>3;
1345 *d++ = (bgr&0x1F)<<3;
1346 #else
1347 *d++ = (bgr&0x1F)<<3;
1348 *d++ = (bgr&0x7E0)>>3;
1349 *d++ = (bgr&0xF800)>>8;
1350 *d++ = 0;
1351 #endif
1352 }
1353 }
1354
1355 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1356 {
1357 long idx = 15 - src_size;
1358 const uint8_t *s = src-idx;
1359 uint8_t *d = dst-idx;
1360 #if HAVE_MMX
1361 __asm__ volatile(
1362 "test %0, %0 \n\t"
1363 "jns 2f \n\t"
1364 PREFETCH" (%1, %0) \n\t"
1365 "movq %3, %%mm7 \n\t"
1366 "pxor %4, %%mm7 \n\t"
1367 "movq %%mm7, %%mm6 \n\t"
1368 "pxor %5, %%mm7 \n\t"
1369 ASMALIGN(4)
1370 "1: \n\t"
1371 PREFETCH" 32(%1, %0) \n\t"
1372 "movq (%1, %0), %%mm0 \n\t"
1373 "movq 8(%1, %0), %%mm1 \n\t"
1374 # if HAVE_MMX2
1375 "pshufw $177, %%mm0, %%mm3 \n\t"
1376 "pshufw $177, %%mm1, %%mm5 \n\t"
1377 "pand %%mm7, %%mm0 \n\t"
1378 "pand %%mm6, %%mm3 \n\t"
1379 "pand %%mm7, %%mm1 \n\t"
1380 "pand %%mm6, %%mm5 \n\t"
1381 "por %%mm3, %%mm0 \n\t"
1382 "por %%mm5, %%mm1 \n\t"
1383 # else
1384 "movq %%mm0, %%mm2 \n\t"
1385 "movq %%mm1, %%mm4 \n\t"
1386 "pand %%mm7, %%mm0 \n\t"
1387 "pand %%mm6, %%mm2 \n\t"
1388 "pand %%mm7, %%mm1 \n\t"
1389 "pand %%mm6, %%mm4 \n\t"
1390 "movq %%mm2, %%mm3 \n\t"
1391 "movq %%mm4, %%mm5 \n\t"
1392 "pslld $16, %%mm2 \n\t"
1393 "psrld $16, %%mm3 \n\t"
1394 "pslld $16, %%mm4 \n\t"
1395 "psrld $16, %%mm5 \n\t"
1396 "por %%mm2, %%mm0 \n\t"
1397 "por %%mm4, %%mm1 \n\t"
1398 "por %%mm3, %%mm0 \n\t"
1399 "por %%mm5, %%mm1 \n\t"
1400 # endif
1401 MOVNTQ" %%mm0, (%2, %0) \n\t"
1402 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1403 "add $16, %0 \n\t"
1404 "js 1b \n\t"
1405 SFENCE" \n\t"
1406 EMMS" \n\t"
1407 "2: \n\t"
1408 : "+&r"(idx)
1409 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1410 : "memory");
1411 #endif
1412 for (; idx<15; idx+=4) {
1413 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1414 v &= 0xff00ff;
1415 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1416 }
1417 }
1418
1419 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1420 {
1421 unsigned i;
1422 #if HAVE_MMX
1423 long mmx_size= 23 - src_size;
1424 __asm__ volatile (
1425 "test %%"REG_a", %%"REG_a" \n\t"
1426 "jns 2f \n\t"
1427 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1428 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1429 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1430 ASMALIGN(4)
1431 "1: \n\t"
1432 PREFETCH" 32(%1, %%"REG_a") \n\t"
1433 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1434 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1435 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1436 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1437 "pand %%mm5, %%mm0 \n\t"
1438 "pand %%mm6, %%mm1 \n\t"
1439 "pand %%mm7, %%mm2 \n\t"
1440 "por %%mm0, %%mm1 \n\t"
1441 "por %%mm2, %%mm1 \n\t"
1442 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1443 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1444 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1445 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1446 "pand %%mm7, %%mm0 \n\t"
1447 "pand %%mm5, %%mm1 \n\t"
1448 "pand %%mm6, %%mm2 \n\t"
1449 "por %%mm0, %%mm1 \n\t"
1450 "por %%mm2, %%mm1 \n\t"
1451 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1452 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1453 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1454 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1455 "pand %%mm6, %%mm0 \n\t"
1456 "pand %%mm7, %%mm1 \n\t"
1457 "pand %%mm5, %%mm2 \n\t"
1458 "por %%mm0, %%mm1 \n\t"
1459 "por %%mm2, %%mm1 \n\t"
1460 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1461 "add $24, %%"REG_a" \n\t"
1462 " js 1b \n\t"
1463 "2: \n\t"
1464 : "+a" (mmx_size)
1465 : "r" (src-mmx_size), "r"(dst-mmx_size)
1466 );
1467
1468 __asm__ volatile(SFENCE:::"memory");
1469 __asm__ volatile(EMMS:::"memory");
1470
1471 if (mmx_size==23) return; //finished, was multiple of 8
1472
1473 src+= src_size;
1474 dst+= src_size;
1475 src_size= 23-mmx_size;
1476 src-= src_size;
1477 dst-= src_size;
1478 #endif
1479 for (i=0; i<src_size; i+=3)
1480 {
1481 register uint8_t x;
1482 x = src[i + 2];
1483 dst[i + 1] = src[i + 1];
1484 dst[i + 2] = src[i + 0];
1485 dst[i + 0] = x;
1486 }
1487 }
1488
1489 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1490 long width, long height,
1491 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1492 {
1493 long y;
1494 const long chromWidth= width>>1;
1495 for (y=0; y<height; y++)
1496 {
1497 #if HAVE_MMX
1498 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1499 __asm__ volatile(
1500 "xor %%"REG_a", %%"REG_a" \n\t"
1501 ASMALIGN(4)
1502 "1: \n\t"
1503 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1504 PREFETCH" 32(%2, %%"REG_a") \n\t"
1505 PREFETCH" 32(%3, %%"REG_a") \n\t"
1506 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1507 "movq %%mm0, %%mm2 \n\t" // U(0)
1508 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1509 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1510 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1511
1512 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1513 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1514 "movq %%mm3, %%mm4 \n\t" // Y(0)
1515 "movq %%mm5, %%mm6 \n\t" // Y(8)
1516 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1517 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1518 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1519 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1520
1521 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1522 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1523 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1524 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1525
1526 "add $8, %%"REG_a" \n\t"
1527 "cmp %4, %%"REG_a" \n\t"
1528 " jb 1b \n\t"
1529 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1530 : "%"REG_a
1531 );
1532 #else
1533
1534 #if ARCH_ALPHA && HAVE_MVI
1535 #define pl2yuy2(n) \
1536 y1 = yc[n]; \
1537 y2 = yc2[n]; \
1538 u = uc[n]; \
1539 v = vc[n]; \
1540 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1541 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1542 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1543 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1544 yuv1 = (u << 8) + (v << 24); \
1545 yuv2 = yuv1 + y2; \
1546 yuv1 += y1; \
1547 qdst[n] = yuv1; \
1548 qdst2[n] = yuv2;
1549
1550 int i;
1551 uint64_t *qdst = (uint64_t *) dst;
1552 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1553 const uint32_t *yc = (uint32_t *) ysrc;
1554 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1555 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1556 for (i = 0; i < chromWidth; i += 8){
1557 uint64_t y1, y2, yuv1, yuv2;
1558 uint64_t u, v;
1559 /* Prefetch */
1560 __asm__("ldq $31,64(%0)" :: "r"(yc));
1561 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1562 __asm__("ldq $31,64(%0)" :: "r"(uc));
1563 __asm__("ldq $31,64(%0)" :: "r"(vc));
1564
1565 pl2yuy2(0);
1566 pl2yuy2(1);
1567 pl2yuy2(2);
1568 pl2yuy2(3);
1569
1570 yc += 4;
1571 yc2 += 4;
1572 uc += 4;
1573 vc += 4;
1574 qdst += 4;
1575 qdst2 += 4;
1576 }
1577 y++;
1578 ysrc += lumStride;
1579 dst += dstStride;
1580
1581 #elif HAVE_FAST_64BIT
1582 int i;
1583 uint64_t *ldst = (uint64_t *) dst;
1584 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1585 for (i = 0; i < chromWidth; i += 2){
1586 uint64_t k, l;
1587 k = yc[0] + (uc[0] << 8) +
1588 (yc[1] << 16) + (vc[0] << 24);
1589 l = yc[2] + (uc[1] << 8) +
1590 (yc[3] << 16) + (vc[1] << 24);
1591 *ldst++ = k + (l << 32);
1592 yc += 4;
1593 uc += 2;
1594 vc += 2;
1595 }
1596
1597 #else
1598 int i, *idst = (int32_t *) dst;
1599 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1600 for (i = 0; i < chromWidth; i++){
1601 #ifdef WORDS_BIGENDIAN
1602 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1603 (yc[1] << 8) + (vc[0] << 0);
1604 #else
1605 *idst++ = yc[0] + (uc[0] << 8) +
1606 (yc[1] << 16) + (vc[0] << 24);
1607 #endif
1608 yc += 2;
1609 uc++;
1610 vc++;
1611 }
1612 #endif
1613 #endif
1614 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1615 {
1616 usrc += chromStride;
1617 vsrc += chromStride;
1618 }
1619 ysrc += lumStride;
1620 dst += dstStride;
1621 }
1622 #if HAVE_MMX
1623 __asm__( EMMS" \n\t"
1624 SFENCE" \n\t"
1625 :::"memory");
1626 #endif
1627 }
1628
1629 /**
1630 * Height should be a multiple of 2 and width should be a multiple of 16.
1631 * (If this is a problem for anyone then tell me, and I will fix it.)
1632 */
1633 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1634 long width, long height,
1635 long lumStride, long chromStride, long dstStride)
1636 {
1637 //FIXME interpolate chroma
1638 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1639 }
1640
1641 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1642 long width, long height,
1643 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1644 {
1645 long y;
1646 const long chromWidth= width>>1;
1647 for (y=0; y<height; y++)
1648 {
1649 #if HAVE_MMX
1650 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1651 __asm__ volatile(
1652 "xor %%"REG_a", %%"REG_a" \n\t"
1653 ASMALIGN(4)
1654 "1: \n\t"
1655 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1656 PREFETCH" 32(%2, %%"REG_a") \n\t"
1657 PREFETCH" 32(%3, %%"REG_a") \n\t"
1658 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1659 "movq %%mm0, %%mm2 \n\t" // U(0)
1660 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1661 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1662 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1663
1664 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1665 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1666 "movq %%mm0, %%mm4 \n\t" // Y(0)
1667 "movq %%mm2, %%mm6 \n\t" // Y(8)
1668 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1669 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1670 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1671 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1672
1673 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1674 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1675 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1676 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1677
1678 "add $8, %%"REG_a" \n\t"
1679 "cmp %4, %%"REG_a" \n\t"
1680 " jb 1b \n\t"
1681 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1682 : "%"REG_a
1683 );
1684 #else
1685 //FIXME adapt the Alpha ASM code from yv12->yuy2
1686
1687 #if HAVE_FAST_64BIT
1688 int i;
1689 uint64_t *ldst = (uint64_t *) dst;
1690 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1691 for (i = 0; i < chromWidth; i += 2){
1692 uint64_t k, l;
1693 k = uc[0] + (yc[0] << 8) +
1694 (vc[0] << 16) + (yc[1] << 24);
1695 l = uc[1] + (yc[2] << 8) +
1696 (vc[1] << 16) + (yc[3] << 24);
1697 *ldst++ = k + (l << 32);
1698 yc += 4;
1699 uc += 2;
1700 vc += 2;
1701 }
1702
1703 #else
1704 int i, *idst = (int32_t *) dst;
1705 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1706 for (i = 0; i < chromWidth; i++){
1707 #ifdef WORDS_BIGENDIAN
1708 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1709 (vc[0] << 8) + (yc[1] << 0);
1710 #else
1711 *idst++ = uc[0] + (yc[0] << 8) +
1712 (vc[0] << 16) + (yc[1] << 24);
1713 #endif
1714 yc += 2;
1715 uc++;
1716 vc++;
1717 }
1718 #endif
1719 #endif
1720 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1721 {
1722 usrc += chromStride;
1723 vsrc += chromStride;
1724 }
1725 ysrc += lumStride;
1726 dst += dstStride;
1727 }
1728 #if HAVE_MMX
1729 __asm__( EMMS" \n\t"
1730 SFENCE" \n\t"
1731 :::"memory");
1732 #endif
1733 }
1734
1735 /**
1736 * Height should be a multiple of 2 and width should be a multiple of 16
1737 * (If this is a problem for anyone then tell me, and I will fix it.)
1738 */
1739 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1740 long width, long height,
1741 long lumStride, long chromStride, long dstStride)
1742 {
1743 //FIXME interpolate chroma
1744 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1745 }
1746
1747 /**
1748 * Width should be a multiple of 16.
1749 */
1750 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751 long width, long height,
1752 long lumStride, long chromStride, long dstStride)
1753 {
1754 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1755 }
1756
1757 /**
1758 * Width should be a multiple of 16.
1759 */
1760 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1761 long width, long height,
1762 long lumStride, long chromStride, long dstStride)
1763 {
1764 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1765 }
1766
1767 /**
1768 * Height should be a multiple of 2 and width should be a multiple of 16.
1769 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Split packed 4-byte groups (Y0 U Y1 V) into separate Y, U and V planes.
 * Two source lines are handled per outer iteration: the first one
 * contributes luma and chroma, the second one luma only (its chroma bytes
 * are dropped), so one chroma line is written for every two luma lines.
 *
 * The MMX loops handle 8 chroma samples (32 source bytes) per iteration and
 * always run at least once per line.
 *
 * @param src                   packed source
 * @param ydst, udst, vdst      destination planes
 * @param width, height         picture size in luma samples
 * @param lumStride             byte step between lines of ydst
 * @param chromStride           byte step between lines of udst / vdst
 * @param srcStride             byte step between lines of src
1770 */
1771 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1772 long width, long height,
1773 long lumStride, long chromStride, long srcStride)
1774 {
1775 long y;
1776 const long chromWidth= width>>1;
1777 for (y=0; y<height; y+=2)
1778 {
1779 #if HAVE_MMX
/* first line of the pair: extract Y, U and V */
1780 __asm__ volatile(
1781 "xor %%"REG_a", %%"REG_a" \n\t"
1782 "pcmpeqw %%mm7, %%mm7 \n\t"
1783 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1784 ASMALIGN(4)
1785 "1: \n\t"
1786 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1787 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1788 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1789 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1790 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1791 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1792 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1793 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1794 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1795 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1796 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1797
1798 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1799
1800 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1801 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1802 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1803 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1804 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1805 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1806 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1809 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1810
1811 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1812
/* separate the interleaved UV bytes into a U and a V quadword */
1813 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1814 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1815 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1816 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1817 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1818 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1819 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1821
1822 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1823 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1824
1825 "add $8, %%"REG_a" \n\t"
1826 "cmp %4, %%"REG_a" \n\t"
1827 " jb 1b \n\t"
1828 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1829 : "memory", "%"REG_a
1830 );
1831
1832 ydst += lumStride;
1833 src += srcStride;
1834
/* second line of the pair: luma only.  This statement does not set mm7; it
 * uses the FF,00,... mask left there by the asm statement above.  The
 * udst/vdst operands (%2, %3) are passed but not referenced. */
1835 __asm__ volatile(
1836 "xor %%"REG_a", %%"REG_a" \n\t"
1837 ASMALIGN(4)
1838 "1: \n\t"
1839 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1840 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1841 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1842 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1843 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1844 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1845 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1846 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1847 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1848 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1849 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1850
1851 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1852 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1853
1854 "add $8, %%"REG_a" \n\t"
1855 "cmp %4, %%"REG_a" \n\t"
1856 " jb 1b \n\t"
1857
1858 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1859 : "memory", "%"REG_a
1860 );
1861 #else
1862 long i;
/* first line of the pair: bytes 0 and 2 of each group are luma, byte 1 is U, byte 3 is V */
1863 for (i=0; i<chromWidth; i++)
1864 {
1865 ydst[2*i+0] = src[4*i+0];
1866 udst[i] = src[4*i+1];
1867 ydst[2*i+1] = src[4*i+2];
1868 vdst[i] = src[4*i+3];
1869 }
1870 ydst += lumStride;
1871 src += srcStride;
1872
/* second line of the pair: luma only */
1873 for (i=0; i<chromWidth; i++)
1874 {
1875 ydst[2*i+0] = src[4*i+0];
1876 ydst[2*i+1] = src[4*i+2];
1877 }
1878 #endif
/* advance past the second line of the pair and to the next chroma line */
1879 udst += chromStride;
1880 vdst += chromStride;
1881 ydst += lumStride;
1882 src += srcStride;
1883 }
1884 #if HAVE_MMX
1885 __asm__ volatile( EMMS" \n\t"
1886 SFENCE" \n\t"
1887 :::"memory");
1888 #endif
1889 }
1890
1891 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1892 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1893 long width, long height, long lumStride, long chromStride)
1894 {
1895 /* Y Plane */
1896 memcpy(ydst, ysrc, width*height);
1897
1898 /* XXX: implement upscaling for U,V */
1899 }
1900
1901 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1902 {
1903 long x,y;
1904
1905 dst[0]= src[0];
1906
1907 // first line
1908 for (x=0; x<srcWidth-1; x++){
1909 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1910 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1911 }
1912 dst[2*srcWidth-1]= src[srcWidth-1];
1913
1914 dst+= dstStride;
1915
1916 for (y=1; y<srcHeight; y++){
1917 #if HAVE_MMX2 || HAVE_3DNOW
1918 const long mmxSize= srcWidth&~15;
1919 __asm__ volatile(
1920 "mov %4, %%"REG_a" \n\t"
1921 "1: \n\t"
1922 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1923 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1924 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1925 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1926 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1927 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1928 PAVGB" %%mm0, %%mm5 \n\t"
1929 PAVGB" %%mm0, %%mm3 \n\t"
1930 PAVGB" %%mm0, %%mm5 \n\t"
1931 PAVGB" %%mm0, %%mm3 \n\t"
1932 PAVGB" %%mm1, %%mm4 \n\t"
1933 PAVGB" %%mm1, %%mm2 \n\t"
1934 PAVGB" %%mm1, %%mm4 \n\t"
1935 PAVGB" %%mm1, %%mm2 \n\t"
1936 "movq %%mm5, %%mm7 \n\t"
1937 "movq %%mm4, %%mm6 \n\t"
1938 "punpcklbw %%mm3, %%mm5 \n\t"
1939 "punpckhbw %%mm3, %%mm7 \n\t"
1940 "punpcklbw %%mm2, %%mm4 \n\t"
1941 "punpckhbw %%mm2, %%mm6 \n\t"
1942 #if 1
1943 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1944 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1945 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1946 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1947 #else
1948 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1949 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1950 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1951 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1952 #endif
1953 "add $8, %%"REG_a" \n\t"
1954 " js 1b \n\t"
1955 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1956 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1957 "g" (-mmxSize)
1958 : "%"REG_a
1959
1960 );
1961 #else
1962 const long mmxSize=1;
1963 #endif
1964 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1965 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1966
1967 for (x=mmxSize-1; x<srcWidth-1; x++){
1968 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1969 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1970 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1971 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1972 }
1973 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1974 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1975
1976 dst+=dstStride*2;
1977 src+=srcStride;
1978 }
1979
1980 // last line
1981 #if 1
1982 dst[0]= src[0];
1983
1984 for (x=0; x<srcWidth-1; x++){
1985 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1986 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1987 }
1988 dst[2*srcWidth-1]= src[srcWidth-1];
1989 #else
1990 for (x=0; x<srcWidth; x++){
1991 dst[2*x+0]=
1992 dst[2*x+1]= src[x];
1993 }
1994 #endif
1995
1996 #if HAVE_MMX
1997 __asm__ volatile( EMMS" \n\t"
1998 SFENCE" \n\t"
1999 :::"memory");
2000 #endif
2001 }
2002
2003 /**
2004 * Height should be a multiple of 2 and width should be a multiple of 16.
2005 * (If this is a problem for anyone then tell me, and I will fix it.)
2006 * Chrominance data is only taken from every second line, others are ignored.
2007 * FIXME: Write HQ version.
 *
 * Split packed 4-byte groups (U Y0 V Y1) into separate Y, U and V planes.
 * Two source lines are handled per outer iteration: the first one
 * contributes luma and chroma, the second one luma only.
 *
 * The MMX loops handle 8 chroma samples (32 source bytes) per iteration and
 * always run at least once per line.
 *
 * @param src                   packed source
 * @param ydst, udst, vdst      destination planes
 * @param width, height         picture size in luma samples
 * @param lumStride             byte step between lines of ydst
 * @param chromStride           byte step between lines of udst / vdst
 * @param srcStride             byte step between lines of src
2008 */
2009 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010 long width, long height,
2011 long lumStride, long chromStride, long srcStride)
2012 {
2013 long y;
2014 const long chromWidth= width>>1;
2015 for (y=0; y<height; y+=2)
2016 {
2017 #if HAVE_MMX
/* first line of the pair: extract Y, U and V */
2018 __asm__ volatile(
2019 "xor %%"REG_a", %%"REG_a" \n\t"
2020 "pcmpeqw %%mm7, %%mm7 \n\t"
2021 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2022 ASMALIGN(4)
2023 "1: \n\t"
2024 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2025 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2026 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2027 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2028 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2029 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2030 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2031 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2032 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2033 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2034 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2035
2036 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2037
2038 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2039 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2040 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2041 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2042 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2043 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2044 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2047 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2048
2049 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
2050
/* separate the interleaved UV bytes into a U and a V quadword */
2051 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2052 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2053 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2054 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2055 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2056 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2057 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2059
2060 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2061 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2062
2063 "add $8, %%"REG_a" \n\t"
2064 "cmp %4, %%"REG_a" \n\t"
2065 " jb 1b \n\t"
2066 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2067 : "memory", "%"REG_a
2068 );
2069
2070 ydst += lumStride;
2071 src += srcStride;
2072
/* second line of the pair: luma only, taken from the high byte of every
 * 16-bit word.  The udst/vdst operands (%2, %3) are passed but not
 * referenced. */
2073 __asm__ volatile(
2074 "xor %%"REG_a", %%"REG_a" \n\t"
2075 ASMALIGN(4)
2076 "1: \n\t"
2077 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2078 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2079 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2080 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
2081 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
2082 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2083 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2084 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2085 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2086 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2087 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2088
2089 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2090 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2091
2092 "add $8, %%"REG_a" \n\t"
2093 "cmp %4, %%"REG_a" \n\t"
2094 " jb 1b \n\t"
2095
2096 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2097 : "memory", "%"REG_a
2098 );
2099 #else
2100 long i;
/* first line of the pair: byte 0 is U, byte 2 is V, bytes 1 and 3 are luma */
2101 for (i=0; i<chromWidth; i++)
2102 {
2103 udst[i] = src[4*i+0];
2104 ydst[2*i+0] = src[4*i+1];
2105 vdst[i] = src[4*i+2];
2106 ydst[2*i+1] = src[4*i+3];
2107 }
2108 ydst += lumStride;
2109 src += srcStride;
2110
/* second line of the pair: luma only */
2111 for (i=0; i<chromWidth; i++)
2112 {
2113 ydst[2*i+0] = src[4*i+1];
2114 ydst[2*i+1] = src[4*i+3];
2115 }
2116 #endif
/* advance past the second line of the pair and to the next chroma line */
2117 udst += chromStride;
2118 vdst += chromStride;
2119 ydst += lumStride;
2120 src += srcStride;
2121 }
2122 #if HAVE_MMX
2123 __asm__ volatile( EMMS" \n\t"
2124 SFENCE" \n\t"
2125 :::"memory");
2126 #endif
2127 }
2128
2129 /**
2130 * Height should be a multiple of 2 and width should be a multiple of 2.
2131 * (If this is a problem for anyone then tell me, and I will fix it.)
2132 * Chrominance data is only taken from every second line,
2133 * others are ignored in the C version.
2134 * FIXME: Write HQ version.
2135 */
2136 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2137 long width, long height,
2138 long lumStride, long chromStride, long srcStride)
2139 {
2140 long y;
2141 const long chromWidth= width>>1;
2142 #if HAVE_MMX
2143 for (y=0; y<height-2; y+=2)
2144 {
2145 long i;
2146 for (i=0; i<2; i++)
2147 {
2148 __asm__ volatile(
2149 "mov %2, %%"REG_a" \n\t"
2150 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2151 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2152 "pxor %%mm7, %%mm7 \n\t"
2153 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2154 ASMALIGN(4)
2155 "1: \n\t"
2156 PREFETCH" 64(%0, %%"REG_d") \n\t"
2157 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2158 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2159 "punpcklbw %%mm7, %%mm0 \n\t"
2160 "punpcklbw %%mm7, %%mm1 \n\t"
2161 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2162 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
2164 "punpcklbw %%mm7, %%mm3 \n\t"
2165 "pmaddwd %%mm6, %%mm0 \n\t"
2166 "pmaddwd %%mm6, %%mm1 \n\t"
2167 "pmaddwd %%mm6, %%mm2 \n\t"
2168 "pmaddwd %%mm6, %%mm3 \n\t"
2169 #ifndef FAST_BGR2YV12
2170 "psrad $8, %%mm0 \n\t"
2171 "psrad $8, %%mm1 \n\t"
2172 "psrad $8, %%mm2 \n\t"
2173 "psrad $8, %%mm3 \n\t"
2174 #endif
2175 "packssdw %%mm1, %%mm0 \n\t"
2176 "packssdw %%mm3, %%mm2 \n\t"
2177 "pmaddwd %%mm5, %%mm0 \n\t"
2178 "pmaddwd %%mm5, %%mm2 \n\t"
2179 "packssdw %%mm2, %%mm0 \n\t"
2180 "psraw $7, %%mm0 \n\t"
2181
2182 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2183 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm4 \n\t"
2185 "punpcklbw %%mm7, %%mm1 \n\t"
2186 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2187 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2188 "punpcklbw %%mm7, %%mm2 \n\t"
2189 "punpcklbw %%mm7, %%mm3 \n\t"
2190 "pmaddwd %%mm6, %%mm4 \n\t"
2191 "pmaddwd %%mm6, %%mm1 \n\t"
2192 "pmaddwd %%mm6, %%mm2 \n\t"
2193 "pmaddwd %%mm6, %%mm3 \n\t"
2194 #ifndef FAST_BGR2YV12
2195 "psrad $8, %%mm4 \n\t"
2196 "psrad $8, %%mm1 \n\t"
2197 "psrad $8, %%mm2 \n\t"
2198 "psrad $8, %%mm3 \n\t"
2199 #endif
2200 "packssdw %%mm1, %%mm4 \n\t"
2201 "packssdw %%mm3, %%mm2 \n\t"
2202 "pmaddwd %%mm5, %%mm4 \n\t"
2203 "pmaddwd %%mm5, %%mm2 \n\t"
2204 "add $24, %%"REG_d" \n\t"
2205 "packssdw %%mm2, %%mm4 \n\t"
2206 "psraw $7, %%mm4 \n\t"
2207
2208 "packuswb %%mm4, %%mm0 \n\t"
2209 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2210
2211 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2212 "add $8, %%"REG_a" \n\t"
2213 " js 1b \n\t"
2214 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2215 : "%"REG_a, "%"REG_d
2216 );
2217 ydst += lumStride;
2218 src += srcStride;
2219 }
2220 src -= srcStride*2;
2221 __asm__ volatile(
2222 "mov %4, %%"REG_a" \n\t"
2223 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2224 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2225 "pxor %%mm7, %%mm7 \n\t"
2226 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2227 "add %%"REG_d", %%"REG_d" \n\t"
2228 ASMALIGN(4)
2229 "1: \n\t"
2230 PREFETCH" 64(%0, %%"REG_d") \n\t"
2231 PREFETCH" 64(%1, %%"REG_d") \n\t"
2232 #if HAVE_MMX2 || HAVE_3DNOW
2233 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2234 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2235 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2236 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2237 PAVGB" %%mm1, %%mm0 \n\t"
2238 PAVGB" %%mm3, %%mm2 \n\t"
2239 "movq %%mm0, %%mm1 \n\t"
2240 "movq %%mm2, %%mm3 \n\t"
2241 "psrlq $24, %%mm0 \n\t"
2242 "psrlq $24, %%mm2 \n\t"
2243 PAVGB" %%mm1, %%mm0 \n\t"
2244 PAVGB" %%mm3, %%mm2 \n\t"
2245 "punpcklbw %%mm7, %%mm0 \n\t"
2246 "punpcklbw %%mm7, %%mm2 \n\t"
2247 #else
2248 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2249 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2250 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2251 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2252 "punpcklbw %%mm7, %%mm0 \n\t"
2253 "punpcklbw %%mm7, %%mm1 \n\t"
2254 "punpcklbw %%mm7, %%mm2 \n\t"
2255 "punpcklbw %%mm7, %%mm3 \n\t"
2256 "paddw %%mm1, %%mm0 \n\t"
2257 "paddw %%mm3, %%mm2 \n\t"
2258 "paddw %%mm2, %%mm0 \n\t"
2259 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2260 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2261 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2262 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2263 "punpcklbw %%mm7, %%mm4 \n\t"
2264 "punpcklbw %%mm7, %%mm1 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "punpcklbw %%mm7, %%mm3 \n\t"
2267 "paddw %%mm1, %%mm4 \n\t"
2268 "paddw %%mm3, %%mm2 \n\t"
2269 "paddw %%mm4, %%mm2 \n\t"
2270 "psrlw $2, %%mm0 \n\t"
2271 "psrlw $2, %%mm2 \n\t"
2272 #endif
2273 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2274 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2275
2276 "pmaddwd %%mm0, %%mm1 \n\t"
2277 "pmaddwd %%mm2, %%mm3 \n\t"
2278 "pmaddwd %%mm6, %%mm0 \n\t"
2279 "pmaddwd %%mm6, %%mm2 \n\t"
2280 #ifndef FAST_BGR2YV12
2281 "psrad $8, %%mm0 \n\t"
2282 "psrad $8, %%mm1 \n\t"
2283 "psrad $8, %%mm2 \n\t"
2284 "psrad $8, %%mm3 \n\t"
2285 #endif
2286 "packssdw %%mm2, %%mm0 \n\t"
2287 "packssdw %%mm3, %%mm1 \n\t"
2288 "pmaddwd %%mm5, %%mm0 \n\t"
2289 "pmaddwd %%mm5, %%mm1 \n\t"
2290 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2291 "psraw $7, %%mm0 \n\t"
2292
2293 #if HAVE_MMX2 || HAVE_3DNOW
2294 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2295 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2296 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2297 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2298 PAVGB" %%mm1, %%mm4 \n\t"
2299 PAVGB" %%mm3, %%mm2 \n\t"
2300 "movq %%mm4, %%mm1 \n\t"
2301 "movq %%mm2, %%mm3 \n\t"
2302 "psrlq $24, %%mm4 \n\t"
2303 "psrlq $24, %%mm2 \n\t"
2304 PAVGB" %%mm1, %%mm4 \n\t"
2305 PAVGB" %%mm3, %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm4 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2308 #else
2309 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2310 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2311 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2312 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2313 "punpcklbw %%mm7, %%mm4 \n\t"
2314 "punpcklbw %%mm7, %%mm1 \n\t"
2315 "punpcklbw %%mm7, %%mm2 \n\t"
2316 "punpcklbw %%mm7, %%mm3 \n\t"
2317 "paddw %%mm1, %%mm4 \n\t"
2318 "paddw %%mm3, %%mm2 \n\t"
2319 "paddw %%mm2, %%mm4 \n\t"
2320 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2321 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2322 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2323 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2324 "punpcklbw %%mm7, %%mm5 \n\t"
2325 "punpcklbw %%mm7, %%mm1 \n\t"
2326 "punpcklbw %%mm7, %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm3 \n\t"
2328 "paddw %%mm1, %%mm5 \n\t"
2329 "paddw %%mm3, %%mm2 \n\t"
2330 "paddw %%mm5, %%mm2 \n\t"
2331 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2332 "psrlw $2, %%mm4 \n\t"
2333 "psrlw $2, %%mm2 \n\t"
2334 #endif
2335 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2336 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2337
2338 "pmaddwd %%mm4, %%mm1 \n\t"
2339 "pmaddwd %%mm2, %%mm3 \n\t"
2340 "pmaddwd %%mm6, %%mm4 \n\t"
2341 "pmaddwd %%mm6, %%mm2 \n\t"
2342 #ifndef FAST_BGR2YV12
2343 "psrad $8, %%mm4 \n\t"
2344 "psrad $8, %%mm1 \n\t"
2345 "psrad $8, %%mm2 \n\t"
2346 "psrad $8, %%mm3 \n\t"
2347 #endif
2348 "packssdw %%mm2, %%mm4 \n\t"
2349 "packssdw %%mm3, %%mm1 \n\t"
2350 "pmaddwd %%mm5, %%mm4 \n\t"
2351 "pmaddwd %%mm5, %%mm1 \n\t"
2352 "add $24, %%"REG_d" \n\t"
2353 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2354 "psraw $7, %%mm4 \n\t"
2355
2356 "movq %%mm0, %%mm1 \n\t"
2357 "punpckldq %%mm4, %%mm0 \n\t"
2358 "punpckhdq %%mm4, %%mm1 \n\t"
2359 "packsswb %%mm1, %%mm0 \n\t"
2360 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2361 "movd %%mm0, (%2, %%"REG_a") \n\t"
2362 "punpckhdq %%mm0, %%mm0 \n\t"
2363 "movd %%mm0, (%3, %%"REG_a") \n\t"
2364 "add $4, %%"REG_a" \n\t"
2365 " js 1b \n\t"
2366 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2367 : "%"REG_a, "%"REG_d
2368 );
2369
2370 udst += chromStride;
2371 vdst += chromStride;
2372 src += srcStride*2;
2373 }
2374
2375 __asm__ volatile( EMMS" \n\t"
2376 SFENCE" \n\t"
2377 :::"memory");
2378 #else
2379 y=0;
2380 #endif
2381 for (; y<height; y+=2)
2382 {
2383 long i;
2384 for (i=0; i<chromWidth; i++)
2385 {
2386 unsigned int b = src[6*i+0];
2387 unsigned int g = src[6*i+1];
2388 unsigned int r = src[6*i+2];
2389
2390 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2392 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2393
2394 udst[i] = U;
2395 vdst[i] = V;
2396 ydst[2*i] = Y;
2397
2398 b = src[6*i+3];
2399 g = src[6*i+4];
2400 r = src[6*i+5];
2401
2402 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2403 ydst[2*i+1] = Y;
2404 }
2405 ydst += lumStride;
2406 src += srcStride;
2407
2408 for (i=0; i<chromWidth; i++)
2409 {
2410 unsigned int b = src[6*i+0];
2411 unsigned int g = src[6*i+1];
2412 unsigned int r = src[6*i+2];
2413
2414 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2415
2416 ydst[2*i] = Y;
2417
2418 b = src[6*i+3];
2419 g = src[6*i+4];
2420 r = src[6*i+5];
2421
2422 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2423 ydst[2*i+1] = Y;
2424 }
2425 udst += chromStride;
2426 vdst += chromStride;
2427 ydst += lumStride;
2428 src += srcStride;
2429 }
2430 }
2431
2432 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2433 long width, long height, long src1Stride,
2434 long src2Stride, long dstStride){
2435 long h;
2436
2437 for (h=0; h < height; h++)
2438 {
2439 long w;
2440
2441 #if HAVE_MMX
2442 #if HAVE_SSE2
2443 __asm__(
2444 "xor %%"REG_a", %%"REG_a" \n\t"
2445 "1: \n\t"
2446 PREFETCH" 64(%1, %%"REG_a") \n\t"
2447 PREFETCH" 64(%2, %%"REG_a") \n\t"
2448 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2449 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2450 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2451 "punpcklbw %%xmm2, %%xmm0 \n\t"
2452 "punpckhbw %%xmm2, %%xmm1 \n\t"
2453 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2454 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2455 "add $16, %%"REG_a" \n\t"
2456 "cmp %3, %%"REG_a" \n\t"
2457 " jb 1b \n\t"
2458 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2459 : "memory", "%"REG_a""
2460 );
2461 #else
2462 __asm__(
2463 "xor %%"REG_a", %%"REG_a" \n\t"
2464 "1: \n\t"
2465 PREFETCH" 64(%1, %%"REG_a") \n\t"
2466 PREFETCH" 64(%2, %%"REG_a") \n\t"
2467 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2468 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2469 "movq %%mm0, %%mm1 \n\t"
2470 "movq %%mm2, %%mm3 \n\t"
2471 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2472 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2473 "punpcklbw %%mm4, %%mm0 \n\t"
2474 "punpckhbw %%mm4, %%mm1 \n\t"
2475 "punpcklbw %%mm5, %%mm2 \n\t"
2476 "punpckhbw %%mm5, %%mm3 \n\t"
2477 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2478 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2479 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2480 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2481 "add $16, %%"REG_a" \n\t"
2482 "cmp %3, %%"REG_a" \n\t"
2483 " jb 1b \n\t"
2484 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2485 : "memory", "%"REG_a
2486 );
2487 #endif
2488 for (w= (width&(~15)); w < width; w++)
2489 {
2490 dest[2*w+0] = src1[w];
2491 dest[2*w+1] = src2[w];
2492 }
2493 #else
2494 for (w=0; w < width; w++)
2495 {
2496 dest[2*w+0] = src1[w];
2497 dest[2*w+1] = src2[w];
2498 }
2499 #endif
2500 dest += dstStride;
2501 src1 += src1Stride;
2502 src2 += src2Stride;
2503 }
2504 #if HAVE_MMX
2505 __asm__(
2506 EMMS" \n\t"
2507 SFENCE" \n\t"
2508 ::: "memory"
2509 );
2510 #endif
2511 }
2512
/**
 * Enlarge two byte planes by a factor of two horizontally and vertically
 * using plain sample replication (no filtering).
 *
 * For each plane, destination row y is built from source row y>>1, and every
 * source byte s[x] is stored to both d[2*x] and d[2*x+1]. The loops run over
 * h = height/2 destination rows and w = width/2 source bytes per row, so
 * 2*w bytes are written per destination row.
 *
 * @param src1,src2              source planes (only read)
 * @param dst1,dst2              destination planes; dst1 is filled from src1,
 *                               dst2 from src2
 * @param width,height           halved internally to obtain the loop bounds
 *                               described above
 * @param srcStride1,srcStride2  byte distance between rows of src1 / src2
 * @param dstStride1,dstStride2  byte distance between rows of dst1 / dst2
 */
2513 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2514 uint8_t *dst1, uint8_t *dst2,
2515 long width, long height,
2516 long srcStride1, long srcStride2,
2517 long dstStride1, long dstStride2)
2518 {
2519 long y,x,w,h;
2520 w=width/2; h=height/2;
2521 #if HAVE_MMX
/* Issue PREFETCH for the address one stride into each source plane. */
2522 __asm__ volatile(
2523 PREFETCH" %0 \n\t"
2524 PREFETCH" %1 \n\t"
2525 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2526 #endif
/* First plane: src1 -> dst1. */
2527 for (y=0;y<h;y++){
/* y>>1: each source row feeds two consecutive destination rows. */
2528 const uint8_t* s1=src1+srcStride1*(y>>1);
2529 uint8_t* d=dst1+dstStride1*y;
2530 x=0;
2531 #if HAVE_MMX
/* 32 source bytes -> 64 destination bytes per iteration. */
2532 for (;x<w-31;x+=32)
2533 {
2534 __asm__ volatile(
2535 PREFETCH" 32%1 \n\t"
/* Load 4 x 8 source bytes and keep a copy of each quadword. */
2536 "movq %1, %%mm0 \n\t"
2537 "movq 8%1, %%mm2 \n\t"
2538 "movq 16%1, %%mm4 \n\t"
2539 "movq 24%1, %%mm6 \n\t"
2540 "movq %%mm0, %%mm1 \n\t"
2541 "movq %%mm2, %%mm3 \n\t"
2542 "movq %%mm4, %%mm5 \n\t"
2543 "movq %%mm6, %%mm7 \n\t"
/* Unpacking a register with itself duplicates each byte: the low half
 * of the quadword ends up in the even register, the high half in the odd one. */
2544 "punpcklbw %%mm0, %%mm0 \n\t"
2545 "punpckhbw %%mm1, %%mm1 \n\t"
2546 "punpcklbw %%mm2, %%mm2 \n\t"
2547 "punpckhbw %%mm3, %%mm3 \n\t"
2548 "punpcklbw %%mm4, %%mm4 \n\t"
2549 "punpckhbw %%mm5, %%mm5 \n\t"
2550 "punpcklbw %%mm6, %%mm6 \n\t"
2551 "punpckhbw %%mm7, %%mm7 \n\t"
/* Store the 8 doubled quadwords contiguously starting at d[2*x]. */
2552 MOVNTQ" %%mm0, %0 \n\t"
2553 MOVNTQ" %%mm1, 8%0 \n\t"
2554 MOVNTQ" %%mm2, 16%0 \n\t"
2555 MOVNTQ" %%mm3, 24%0 \n\t"
2556 MOVNTQ" %%mm4, 32%0 \n\t"
2557 MOVNTQ" %%mm5, 40%0 \n\t"
2558 MOVNTQ" %%mm6, 48%0 \n\t"
2559 MOVNTQ" %%mm7, 56%0"
2560 :"=m"(d[2*x])
2561 :"m"(s1[x])
2562 :"memory");
2563 }
2564 #endif
/* Plain C: remaining columns, or the whole row without MMX. */
2565 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2566 }
/* Second plane: src2 -> dst2, same procedure as above. */
2567 for (y=0;y<h;y++){
2568 const uint8_t* s2=src2+srcStride2*(y>>1);
2569 uint8_t* d=dst2+dstStride2*y;
2570 x=0;
2571 #if HAVE_MMX
2572 for (;x<w-31;x+=32)
2573 {
2574 __asm__ volatile(
2575 PREFETCH" 32%1 \n\t"
2576 "movq %1, %%mm0 \n\t"
2577 "movq 8%1, %%mm2 \n\t"
2578 "movq 16%1, %%mm4 \n\t"
2579 "movq 24%1, %%mm6 \n\t"
2580 "movq %%mm0, %%mm1 \n\t"
2581 "movq %%mm2, %%mm3 \n\t"
2582 "movq %%mm4, %%mm5 \n\t"
2583 "movq %%mm6, %%mm7 \n\t"
2584 "punpcklbw %%mm0, %%mm0 \n\t"
2585 "punpckhbw %%mm1, %%mm1 \n\t"
2586 "punpcklbw %%mm2, %%mm2 \n\t"
2587 "punpckhbw %%mm3, %%mm3 \n\t"
2588 "punpcklbw %%mm4, %%mm4 \n\t"
2589 "punpckhbw %%mm5, %%mm5 \n\t"
2590 "punpcklbw %%mm6, %%mm6 \n\t"
2591 "punpckhbw %%mm7, %%mm7 \n\t"
2592 MOVNTQ" %%mm0, %0 \n\t"
2593 MOVNTQ" %%mm1, 8%0 \n\t"
2594 MOVNTQ" %%mm2, 16%0 \n\t"
2595 MOVNTQ" %%mm3, 24%0 \n\t"
2596 MOVNTQ" %%mm4, 32%0 \n\t"
2597 MOVNTQ" %%mm5, 40%0 \n\t"
2598 MOVNTQ" %%mm6, 48%0 \n\t"
2599 MOVNTQ" %%mm7, 56%0"
2600 :"=m"(d[2*x])
2601 :"m"(s2[x])
2602 :"memory");
2603 }
2604 #endif
2605 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2606 }
2607 #if HAVE_MMX
/* EMMS and SFENCE are macros set up near the top of the file; the SFENCE
 * definition is not visible in this chunk - presumably it orders the
 * MOVNTQ stores issued above (TODO confirm). */
2608 __asm__(
2609 EMMS" \n\t"
2610 SFENCE" \n\t"
2611 ::: "memory"
2612 );
2613 #endif
2614 }
2615
/**
 * Pack three byte planes into one interleaved Y U Y V byte stream.
 *
 * For every row y and every index x in [0, w), with w = width/2, eight
 * destination bytes are produced from four bytes of src1 and one byte each
 * of src2 and src3:
 *     d[8x..8x+7] = Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * Row y of src1 is combined with row y>>2 of src2 and src3, so each row of
 * src2/src3 is reused for four consecutive output rows.
 *
 * NOTE(review): with w = width/2 the src1 index reaches 4*(w-1)+3, i.e.
 * about 2*width bytes per row - confirm against the callers what unit
 * `width` is expected to be in.
 *
 * @param src1        plane supplying the Y bytes
 * @param src2        plane supplying the U bytes
 * @param src3        plane supplying the V bytes
 * @param dst         destination buffer, 8*w bytes are written per row
 * @param width       halved to obtain the per-row loop bound w
 * @param height      number of rows processed
 * @param srcStride1,srcStride2,srcStride3  byte distance between rows of
 *                    src1 / src2 / src3
 * @param dstStride   byte distance between rows of dst
 */
2616 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2617 uint8_t *dst,
2618 long width, long height,
2619 long srcStride1, long srcStride2,
2620 long srcStride3, long dstStride)
2621 {
2622 long y,x,w,h;
2623 w=width/2; h=height;
2624 for (y=0;y<h;y++){
2625 const uint8_t* yp=src1+srcStride1*y;
2626 const uint8_t* up=src2+srcStride2*(y>>2);
2627 const uint8_t* vp=src3+srcStride3*(y>>2);
2628 uint8_t* d=dst+dstStride*y;
2629 x=0;
2630 #if HAVE_MMX
/* 8 values of x per iteration: 32 Y + 8 U + 8 V bytes in, 64 bytes out.
 * The asm only reads x (%0); the increment is done by the for statement. */
2631 for (;x<w-7;x+=8)
2632 {
2633 __asm__ volatile(
2634 PREFETCH" 32(%1, %0) \n\t"
2635 PREFETCH" 32(%2, %0) \n\t"
2636 PREFETCH" 32(%3, %0) \n\t"
2637 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2638 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2639 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2640 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2641 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2642 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2643 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2644 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2645 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2646 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2647
/* Output bytes 0..15: first 8 Y bytes with U0/V0 and U1/V1. */
2648 "movq %%mm1, %%mm6 \n\t"
2649 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2650 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2651 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2652 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2653 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2654
/* Output bytes 16..31: next 8 Y bytes with U2/V2 and U3/V3. */
2655 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2656 "movq 8(%1, %0, 4), %%mm0 \n\t"
2657 "movq %%mm0, %%mm3 \n\t"
2658 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2659 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2660 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2661 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2662
/* Output bytes 32..47: next 8 Y bytes with U4/V4 and U5/V5. */
2663 "movq %%mm4, %%mm6 \n\t"
2664 "movq 16(%1, %0, 4), %%mm0 \n\t"
2665 "movq %%mm0, %%mm3 \n\t"
2666 "punpcklbw %%mm5, %%mm4 \n\t"
2667 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2668 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2669 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2670 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2671
/* Output bytes 48..63: last 8 Y bytes with U6/V6 and U7/V7. */
2672 "punpckhbw %%mm5, %%mm6 \n\t"
2673 "movq 24(%1, %0, 4), %%mm0 \n\t"
2674 "movq %%mm0, %%mm3 \n\t"
2675 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2676 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2677 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2678 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2679
2680 : "+r" (x)
2681 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2682 :"memory");
2683 }
2684 #endif
/* Plain C: remaining columns, or the whole row without MMX. */
2685 for (; x<w; x++)
2686 {
/* Index of the first of the four Y bytes belonging to x. */
2687 const long x2 = x<<2;
2688 d[8*x+0] = yp[x2];
2689 d[8*x+1] = up[x];
2690 d[8*x+2] = yp[x2+1];
2691 d[8*x+3] = vp[x];
2692 d[8*x+4] = yp[x2+2];
2693 d[8*x+5] = up[x];
2694 d[8*x+6] = yp[x2+3];
2695 d[8*x+7] = vp[x];
2696 }
2697 }
2698 #if HAVE_MMX
/* EMMS and SFENCE are macros set up near the top of the file; the SFENCE
 * definition is not visible in this chunk - presumably it orders the
 * MOVNTQ stores issued above (TODO confirm). */
2699 __asm__(
2700 EMMS" \n\t"
2701 SFENCE" \n\t"
2702 ::: "memory"
2703 );
2704 #endif
2705 }
2706
2707 static inline void RENAME(rgb2rgb_init)(void){
2708 rgb15to16 = RENAME(rgb15to16);
2709 rgb15tobgr24 = RENAME(rgb15tobgr24);
2710 rgb15to32 = RENAME(rgb15to32);
2711 rgb16tobgr24 = RENAME(rgb16tobgr24);
2712 rgb16to32 = RENAME(rgb16to32);
2713 rgb16to15 = RENAME(rgb16to15);
2714 rgb24tobgr16 = RENAME(rgb24tobgr16);
2715 rgb24tobgr15 = RENAME(rgb24tobgr15);
2716 rgb24tobgr32 = RENAME(rgb24tobgr32);
2717 rgb32to16 = RENAME(rgb32to16);
2718 rgb32to15 = RENAME(rgb32to15);
2719 rgb32tobgr24 = RENAME(rgb32tobgr24);
2720 rgb24to15 = RENAME(rgb24to15);
2721 rgb24to16 = RENAME(rgb24to16);
2722 rgb24tobgr24 = RENAME(rgb24tobgr24);
2723 rgb32tobgr32 = RENAME(rgb32tobgr32);
2724 rgb32tobgr16 = RENAME(rgb32tobgr16);
2725 rgb32tobgr15 = RENAME(rgb32tobgr15);
2726 yv12toyuy2 = RENAME(yv12toyuy2);
2727 yv12touyvy = RENAME(yv12touyvy);
2728 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2729 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2730 yuy2toyv12 = RENAME(yuy2toyv12);
2731 // uyvytoyv12 = RENAME(uyvytoyv12);
2732 // yvu9toyv12 = RENAME(yvu9toyv12);
2733 planar2x = RENAME(planar2x);
2734 rgb24toyv12 = RENAME(rgb24toyv12);
2735 interleaveBytes = RENAME(interleaveBytes);
2736 vu9_to_vu12 = RENAME(vu9_to_vu12);
2737 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2738 }