/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
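/*
   Worked example of the and&add trick credited above (an illustrative
   sketch added for clarity; the helper below is hypothetical and not used
   by this file). Masking out bit 15 and the blue field, then adding the
   result back to the pixel, doubles the packed R/G fields, i.e. shifts
   them left by one, while blue stays in place:

       x          = 0RRRRRGGGGGBBBBB   (RGB555)
       x & 0x7FE0 = 0RRRRRGGGGG00000
       sum        = RRRRRGGGGG0BBBBB   (RGB565, green LSB = 0)
*/
static inline uint16_t rgb15to16_pixel_sketch(uint16_t x)
{
    return (x & 0x7FFF) + (x & 0x7FE0); /* same arithmetic as the scalar tail of rgb15to16 */
}
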
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use a less accurate approximation here, simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves PNG
  compression but cannot reproduce white exactly, since it never generates
  an all-ones maximum value; the net effect is to darken the
  image slightly.

  A better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
  |=========| |===|
       |      leftmost bits repeated to fill open bits
       |
  original bits
*/
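/*
   Sketch of the two expansion schemes for one 5-bit channel (hypothetical
   helpers added for illustration, not used by this file). Plain shifting,
   as used below, maps 0x1F to 0xF8; left bit replication feeds the top
   bits back into the freed low bits and maps 0x1F to 0xFF:
*/
static inline uint8_t expand5_shift_sketch(uint8_t x)
{
    return x << 3;              /* 0x1F -> 0xF8: white comes out slightly dark */
}

static inline uint8_t expand5_replicate_sketch(uint8_t x)
{
    return (x << 3) | (x >> 2); /* 0x1F -> 0xFF: exact white */
}
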
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \

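
/*
   Scalar sketch of what PACK_RGB32 stores (added for illustration; the
   helper is hypothetical and unused by this file). On little-endian
   targets the two MOVNTQs above emit four pixels in memory order
   B,G,R,0xFF, matching the C fallback loops below:
*/
static inline void pack_rgb32_sketch(uint8_t *d, const uint8_t *b,
                                     const uint8_t *g, const uint8_t *r)
{
    int i;
    for (i = 0; i < 4; i++) {
        *d++ = b[i];   /* blue  */
        *d++ = g[i];   /* green */
        *d++ = r[i];   /* red   */
        *d++ = 255;    /* opaque alpha */
    }
}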
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
        //FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a

        );
#else
        const x86_reg mmxSize=1;
#endif
        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

1980 /**
1981 * Height should be a multiple of 2 and width should be a multiple of 16.
1982 * (If this is a problem for anyone then tell me, and I will fix it.)
1983 * Chrominance data is only taken from every second line, others are ignored.
1984 * FIXME: Write HQ version.
1985 */
1986 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1987 long width, long height,
1988 long lumStride, long chromStride, long srcStride)
1989 {
1990 long y;
1991 const x86_reg chromWidth= width>>1;
1992 for (y=0; y<height; y+=2)
1993 {
1994 #if HAVE_MMX
1995 __asm__ volatile(
1996 "xor %%"REG_a", %%"REG_a" \n\t"
1997 "pcmpeqw %%mm7, %%mm7 \n\t"
1998 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1999 ASMALIGN(4)
2000 "1: \n\t"
2001 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2002 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2003 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2004 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2005 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2006 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2007 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2008 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2009 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2010 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2011 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2012
2013 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2014
2015 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2016 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2017 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2018 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2019 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2020 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2021 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2022 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2023 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2024 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2025
2026 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
2027
2028 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2029 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2030 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2031 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2032 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2033 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2034 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2035 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2036
2037 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2038 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2039
2040 "add $8, %%"REG_a" \n\t"
2041 "cmp %4, %%"REG_a" \n\t"
2042 " jb 1b \n\t"
2043 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2044 : "memory", "%"REG_a
2045 );
2046
2047 ydst += lumStride;
2048 src += srcStride;
2049
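// second line of the pair: extract only the luma; this line's chroma
// samples are dropped (see the comment above the function)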
2050 __asm__ volatile(
2051 "xor %%"REG_a", %%"REG_a" \n\t"
2052 ASMALIGN(4)
2053 "1: \n\t"
2054 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2055 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2056 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2057 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2058 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2059 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2060 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2061 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2062 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2063 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2064 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2065
2066 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2067 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2068
2069 "add $8, %%"REG_a" \n\t"
2070 "cmp %4, %%"REG_a" \n\t"
2071 " jb 1b \n\t"
2072
2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2074 : "memory", "%"REG_a
2075 );
2076 #else
2077 long i;
2078 for (i=0; i<chromWidth; i++)
2079 {
2080 udst[i] = src[4*i+0];
2081 ydst[2*i+0] = src[4*i+1];
2082 vdst[i] = src[4*i+2];
2083 ydst[2*i+1] = src[4*i+3];
2084 }
2085 ydst += lumStride;
2086 src += srcStride;
2087
2088 for (i=0; i<chromWidth; i++)
2089 {
2090 ydst[2*i+0] = src[4*i+1];
2091 ydst[2*i+1] = src[4*i+3];
2092 }
2093 #endif
2094 udst += chromStride;
2095 vdst += chromStride;
2096 ydst += lumStride;
2097 src += srcStride;
2098 }
2099 #if HAVE_MMX
2100 __asm__ volatile( EMMS" \n\t"
2101 SFENCE" \n\t"
2102 :::"memory");
2103 #endif
2104 }
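
/* A hedged allocation sketch (not part of the original file): sizing a
 * contiguous YV12 image for this converter. alloc_yv12 is a hypothetical
 * helper; YV12 stores the planes as Y, then V, then U. Kept under #if 0. */
#if 0
#include <stdint.h>
#include <stdlib.h>

static uint8_t *alloc_yv12(long width, long height,
                           uint8_t **y, uint8_t **u, uint8_t **v)
{
    long lumSize   = width*height;              /* 1 byte per luma sample  */
    long chromSize = (width/2)*(height/2);      /* 4:2:0 subsampled planes */
    uint8_t *buf   = malloc(lumSize + 2*chromSize);
    if (!buf)
        return NULL;
    *y = buf;
    *v = buf + lumSize;                         /* V precedes U in YV12 */
    *u = buf + lumSize + chromSize;
    return buf;
}
#endif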
2105
2106 /**
2107 * Height should be a multiple of 2 and width should be a multiple of 2.
2108 * (If this is a problem for anyone then tell me, and I will fix it.)
2109 * Chrominance data is only taken from every second line;
2110 * the others are ignored in the C version.
2111 * FIXME: Write HQ version.
2112 */
2113 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2114 long width, long height,
2115 long lumStride, long chromStride, long srcStride)
2116 {
2117 long y;
2118 const x86_reg chromWidth= width>>1;
2119 #if HAVE_MMX
2120 for (y=0; y<height-2; y+=2)
2121 {
2122 long i;
2123 for (i=0; i<2; i++)
2124 {
2125 __asm__ volatile(
2126 "mov %2, %%"REG_a" \n\t"
2127 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2128 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2129 "pxor %%mm7, %%mm7 \n\t"
2130 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2131 ASMALIGN(4)
2132 "1: \n\t"
2133 PREFETCH" 64(%0, %%"REG_d") \n\t"
2134 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2135 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2136 "punpcklbw %%mm7, %%mm0 \n\t"
2137 "punpcklbw %%mm7, %%mm1 \n\t"
2138 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2139 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2140 "punpcklbw %%mm7, %%mm2 \n\t"
2141 "punpcklbw %%mm7, %%mm3 \n\t"
2142 "pmaddwd %%mm6, %%mm0 \n\t"
2143 "pmaddwd %%mm6, %%mm1 \n\t"
2144 "pmaddwd %%mm6, %%mm2 \n\t"
2145 "pmaddwd %%mm6, %%mm3 \n\t"
2146 #ifndef FAST_BGR2YV12
2147 "psrad $8, %%mm0 \n\t"
2148 "psrad $8, %%mm1 \n\t"
2149 "psrad $8, %%mm2 \n\t"
2150 "psrad $8, %%mm3 \n\t"
2151 #endif
2152 "packssdw %%mm1, %%mm0 \n\t"
2153 "packssdw %%mm3, %%mm2 \n\t"
2154 "pmaddwd %%mm5, %%mm0 \n\t"
2155 "pmaddwd %%mm5, %%mm2 \n\t"
2156 "packssdw %%mm2, %%mm0 \n\t"
2157 "psraw $7, %%mm0 \n\t"
2158
2159 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2160 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2161 "punpcklbw %%mm7, %%mm4 \n\t"
2162 "punpcklbw %%mm7, %%mm1 \n\t"
2163 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2164 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2165 "punpcklbw %%mm7, %%mm2 \n\t"
2166 "punpcklbw %%mm7, %%mm3 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "pmaddwd %%mm6, %%mm1 \n\t"
2169 "pmaddwd %%mm6, %%mm2 \n\t"
2170 "pmaddwd %%mm6, %%mm3 \n\t"
2171 #ifndef FAST_BGR2YV12
2172 "psrad $8, %%mm4 \n\t"
2173 "psrad $8, %%mm1 \n\t"
2174 "psrad $8, %%mm2 \n\t"
2175 "psrad $8, %%mm3 \n\t"
2176 #endif
2177 "packssdw %%mm1, %%mm4 \n\t"
2178 "packssdw %%mm3, %%mm2 \n\t"
2179 "pmaddwd %%mm5, %%mm4 \n\t"
2180 "pmaddwd %%mm5, %%mm2 \n\t"
2181 "add $24, %%"REG_d" \n\t"
2182 "packssdw %%mm2, %%mm4 \n\t"
2183 "psraw $7, %%mm4 \n\t"
2184
2185 "packuswb %%mm4, %%mm0 \n\t"
2186 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2187
2188 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2189 "add $8, %%"REG_a" \n\t"
2190 " js 1b \n\t"
2191 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2192 : "%"REG_a, "%"REG_d
2193 );
2194 ydst += lumStride;
2195 src += srcStride;
2196 }
2197 src -= srcStride*2;
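// chroma pass: step back to the two lines just converted and average them
// vertically (and pairwise horizontally) before the U/V dot products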
2198 __asm__ volatile(
2199 "mov %4, %%"REG_a" \n\t"
2200 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2201 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2202 "pxor %%mm7, %%mm7 \n\t"
2203 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2204 "add %%"REG_d", %%"REG_d" \n\t"
2205 ASMALIGN(4)
2206 "1: \n\t"
2207 PREFETCH" 64(%0, %%"REG_d") \n\t"
2208 PREFETCH" 64(%1, %%"REG_d") \n\t"
2209 #if HAVE_MMX2 || HAVE_AMD3DNOW
2210 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2211 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2212 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2213 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2214 PAVGB" %%mm1, %%mm0 \n\t"
2215 PAVGB" %%mm3, %%mm2 \n\t"
2216 "movq %%mm0, %%mm1 \n\t"
2217 "movq %%mm2, %%mm3 \n\t"
2218 "psrlq $24, %%mm0 \n\t"
2219 "psrlq $24, %%mm2 \n\t"
2220 PAVGB" %%mm1, %%mm0 \n\t"
2221 PAVGB" %%mm3, %%mm2 \n\t"
2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm2 \n\t"
2224 #else
2225 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2226 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2227 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2228 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2229 "punpcklbw %%mm7, %%mm0 \n\t"
2230 "punpcklbw %%mm7, %%mm1 \n\t"
2231 "punpcklbw %%mm7, %%mm2 \n\t"
2232 "punpcklbw %%mm7, %%mm3 \n\t"
2233 "paddw %%mm1, %%mm0 \n\t"
2234 "paddw %%mm3, %%mm2 \n\t"
2235 "paddw %%mm2, %%mm0 \n\t"
2236 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2237 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2238 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2239 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2240 "punpcklbw %%mm7, %%mm4 \n\t"
2241 "punpcklbw %%mm7, %%mm1 \n\t"
2242 "punpcklbw %%mm7, %%mm2 \n\t"
2243 "punpcklbw %%mm7, %%mm3 \n\t"
2244 "paddw %%mm1, %%mm4 \n\t"
2245 "paddw %%mm3, %%mm2 \n\t"
2246 "paddw %%mm4, %%mm2 \n\t"
2247 "psrlw $2, %%mm0 \n\t"
2248 "psrlw $2, %%mm2 \n\t"
2249 #endif
2250 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2251 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2252
2253 "pmaddwd %%mm0, %%mm1 \n\t"
2254 "pmaddwd %%mm2, %%mm3 \n\t"
2255 "pmaddwd %%mm6, %%mm0 \n\t"
2256 "pmaddwd %%mm6, %%mm2 \n\t"
2257 #ifndef FAST_BGR2YV12
2258 "psrad $8, %%mm0 \n\t"
2259 "psrad $8, %%mm1 \n\t"
2260 "psrad $8, %%mm2 \n\t"
2261 "psrad $8, %%mm3 \n\t"
2262 #endif
2263 "packssdw %%mm2, %%mm0 \n\t"
2264 "packssdw %%mm3, %%mm1 \n\t"
2265 "pmaddwd %%mm5, %%mm0 \n\t"
2266 "pmaddwd %%mm5, %%mm1 \n\t"
2267 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2268 "psraw $7, %%mm0 \n\t"
2269
2270 #if HAVE_MMX2 || HAVE_AMD3DNOW
2271 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2272 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2273 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2274 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2275 PAVGB" %%mm1, %%mm4 \n\t"
2276 PAVGB" %%mm3, %%mm2 \n\t"
2277 "movq %%mm4, %%mm1 \n\t"
2278 "movq %%mm2, %%mm3 \n\t"
2279 "psrlq $24, %%mm4 \n\t"
2280 "psrlq $24, %%mm2 \n\t"
2281 PAVGB" %%mm1, %%mm4 \n\t"
2282 PAVGB" %%mm3, %%mm2 \n\t"
2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm2 \n\t"
2285 #else
2286 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2287 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2288 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2289 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2290 "punpcklbw %%mm7, %%mm4 \n\t"
2291 "punpcklbw %%mm7, %%mm1 \n\t"
2292 "punpcklbw %%mm7, %%mm2 \n\t"
2293 "punpcklbw %%mm7, %%mm3 \n\t"
2294 "paddw %%mm1, %%mm4 \n\t"
2295 "paddw %%mm3, %%mm2 \n\t"
2296 "paddw %%mm2, %%mm4 \n\t"
2297 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2298 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2299 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2300 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2301 "punpcklbw %%mm7, %%mm5 \n\t"
2302 "punpcklbw %%mm7, %%mm1 \n\t"
2303 "punpcklbw %%mm7, %%mm2 \n\t"
2304 "punpcklbw %%mm7, %%mm3 \n\t"
2305 "paddw %%mm1, %%mm5 \n\t"
2306 "paddw %%mm3, %%mm2 \n\t"
2307 "paddw %%mm5, %%mm2 \n\t"
2308 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2309 "psrlw $2, %%mm4 \n\t"
2310 "psrlw $2, %%mm2 \n\t"
2311 #endif
2312 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2313 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2314
2315 "pmaddwd %%mm4, %%mm1 \n\t"
2316 "pmaddwd %%mm2, %%mm3 \n\t"
2317 "pmaddwd %%mm6, %%mm4 \n\t"
2318 "pmaddwd %%mm6, %%mm2 \n\t"
2319 #ifndef FAST_BGR2YV12
2320 "psrad $8, %%mm4 \n\t"
2321 "psrad $8, %%mm1 \n\t"
2322 "psrad $8, %%mm2 \n\t"
2323 "psrad $8, %%mm3 \n\t"
2324 #endif
2325 "packssdw %%mm2, %%mm4 \n\t"
2326 "packssdw %%mm3, %%mm1 \n\t"
2327 "pmaddwd %%mm5, %%mm4 \n\t"
2328 "pmaddwd %%mm5, %%mm1 \n\t"
2329 "add $24, %%"REG_d" \n\t"
2330 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2331 "psraw $7, %%mm4 \n\t"
2332
2333 "movq %%mm0, %%mm1 \n\t"
2334 "punpckldq %%mm4, %%mm0 \n\t"
2335 "punpckhdq %%mm4, %%mm1 \n\t"
2336 "packsswb %%mm1, %%mm0 \n\t"
2337 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2338 "movd %%mm0, (%2, %%"REG_a") \n\t"
2339 "punpckhdq %%mm0, %%mm0 \n\t"
2340 "movd %%mm0, (%3, %%"REG_a") \n\t"
2341 "add $4, %%"REG_a" \n\t"
2342 " js 1b \n\t"
2343 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2344 : "%"REG_a, "%"REG_d
2345 );
2346
2347 udst += chromStride;
2348 vdst += chromStride;
2349 src += srcStride*2;
2350 }
2351
2352 __asm__ volatile( EMMS" \n\t"
2353 SFENCE" \n\t"
2354 :::"memory");
2355 #else
2356 y=0;
2357 #endif
2358 for (; y<height; y+=2)
2359 {
2360 long i;
2361 for (i=0; i<chromWidth; i++)
2362 {
2363 unsigned int b = src[6*i+0];
2364 unsigned int g = src[6*i+1];
2365 unsigned int r = src[6*i+2];
2366
2367 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2368 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2369 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2370
2371 udst[i] = U;
2372 vdst[i] = V;
2373 ydst[2*i] = Y;
2374
2375 b = src[6*i+3];
2376 g = src[6*i+4];
2377 r = src[6*i+5];
2378
2379 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2380 ydst[2*i+1] = Y;
2381 }
2382 ydst += lumStride;
2383 src += srcStride;
2384
2385 for (i=0; i<chromWidth; i++)
2386 {
2387 unsigned int b = src[6*i+0];
2388 unsigned int g = src[6*i+1];
2389 unsigned int r = src[6*i+2];
2390
2391 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2392
2393 ydst[2*i] = Y;
2394
2395 b = src[6*i+3];
2396 g = src[6*i+4];
2397 r = src[6*i+5];
2398
2399 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2400 ydst[2*i+1] = Y;
2401 }
2402 udst += chromStride;
2403 vdst += chromStride;
2404 ydst += lumStride;
2405 src += srcStride;
2406 }
2407 }
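
/* A hedged usage sketch (not part of the original file): packed 24-bit input
 * to planar YV12 through the public rgb24toyv12 pointer from rgb2rgb.h (set
 * below in rgb2rgb_init, valid after sws_rgb2rgb_init()). Note that the
 * ff_bgr2* coefficients above and the C fallback read the bytes as B,G,R.
 * The wrapper name is an assumption; kept under #if 0. */
#if 0
#include <stdint.h>
#include "rgb2rgb.h"

static void example_bgr24_to_yv12(const uint8_t *bgr, uint8_t *y,
                                  uint8_t *u, uint8_t *v, long w, long h)
{
    rgb24toyv12(bgr, y, u, v, w, h,
                /*lumStride*/ w, /*chromStride*/ w/2, /*srcStride*/ 3*w);
}
#endif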
2408
2409 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2410 long width, long height, long src1Stride,
2411 long src2Stride, long dstStride){
2412 long h;
2413
2414 for (h=0; h < height; h++)
2415 {
2416 long w;
2417
2418 #if HAVE_MMX
2419 #if HAVE_SSE2
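// NB: movdqa and movntdq fault on unaligned addresses, so this path assumes
// src1, src2, dest and all three strides are 16-byte aligned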
2420 __asm__(
2421 "xor %%"REG_a", %%"REG_a" \n\t"
2422 "1: \n\t"
2423 PREFETCH" 64(%1, %%"REG_a") \n\t"
2424 PREFETCH" 64(%2, %%"REG_a") \n\t"
2425 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2426 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2427 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2428 "punpcklbw %%xmm2, %%xmm0 \n\t"
2429 "punpckhbw %%xmm2, %%xmm1 \n\t"
2430 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2431 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2432 "add $16, %%"REG_a" \n\t"
2433 "cmp %3, %%"REG_a" \n\t"
2434 " jb 1b \n\t"
2435 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2436 : "memory", "%"REG_a""
2437 );
2438 #else
2439 __asm__(
2440 "xor %%"REG_a", %%"REG_a" \n\t"
2441 "1: \n\t"
2442 PREFETCH" 64(%1, %%"REG_a") \n\t"
2443 PREFETCH" 64(%2, %%"REG_a") \n\t"
2444 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2445 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2446 "movq %%mm0, %%mm1 \n\t"
2447 "movq %%mm2, %%mm3 \n\t"
2448 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2449 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2450 "punpcklbw %%mm4, %%mm0 \n\t"
2451 "punpckhbw %%mm4, %%mm1 \n\t"
2452 "punpcklbw %%mm5, %%mm2 \n\t"
2453 "punpckhbw %%mm5, %%mm3 \n\t"
2454 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2455 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2456 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2457 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2458 "add $16, %%"REG_a" \n\t"
2459 "cmp %3, %%"REG_a" \n\t"
2460 " jb 1b \n\t"
2461 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2462 : "memory", "%"REG_a
2463 );
2464 #endif
2465 for (w= (width&(~15)); w < width; w++)
2466 {
2467 dest[2*w+0] = src1[w];
2468 dest[2*w+1] = src2[w];
2469 }
2470 #else
2471 for (w=0; w < width; w++)
2472 {
2473 dest[2*w+0] = src1[w];
2474 dest[2*w+1] = src2[w];
2475 }
2476 #endif
2477 dest += dstStride;
2478 src1 += src1Stride;
2479 src2 += src2Stride;
2480 }
2481 #if HAVE_MMX
2482 __asm__(
2483 EMMS" \n\t"
2484 SFENCE" \n\t"
2485 ::: "memory"
2486 );
2487 #endif
2488 }
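
/* A hedged usage sketch (not part of the original file): merging separate
 * U and V planes into one interleaved UVUV (NV12-style) buffer via the
 * public interleaveBytes pointer; names and layout are illustrative.
 * Kept under #if 0. */
#if 0
#include <stdint.h>
#include "rgb2rgb.h"

static void example_merge_uv(uint8_t *u, uint8_t *v, uint8_t *uv,
                             long cw, long ch)
{
    interleaveBytes(u, v, uv, cw, ch,
                    /*src1Stride*/ cw, /*src2Stride*/ cw, /*dstStride*/ 2*cw);
}
#endif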
2489
2490 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2491 uint8_t *dst1, uint8_t *dst2,
2492 long width, long height,
2493 long srcStride1, long srcStride2,
2494 long dstStride1, long dstStride2)
2495 {
2496 x86_reg y;
2497 long x,w,h;
2498 w=width/2; h=height/2;
2499 #if HAVE_MMX
2500 __asm__ volatile(
2501 PREFETCH" %0 \n\t"
2502 PREFETCH" %1 \n\t"
2503 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2504 #endif
2505 for (y=0;y<h;y++){
2506 const uint8_t* s1=src1+srcStride1*(y>>1);
2507 uint8_t* d=dst1+dstStride1*y;
2508 x=0;
2509 #if HAVE_MMX
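// punpck(l/h)bw of a register with itself duplicates every byte, giving the
// 2x horizontal upsample; the (y>>1) row index above repeats each source row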
2510 for (;x<w-31;x+=32)
2511 {
2512 __asm__ volatile(
2513 PREFETCH" 32%1 \n\t"
2514 "movq %1, %%mm0 \n\t"
2515 "movq 8%1, %%mm2 \n\t"
2516 "movq 16%1, %%mm4 \n\t"
2517 "movq 24%1, %%mm6 \n\t"
2518 "movq %%mm0, %%mm1 \n\t"
2519 "movq %%mm2, %%mm3 \n\t"
2520 "movq %%mm4, %%mm5 \n\t"
2521 "movq %%mm6, %%mm7 \n\t"
2522 "punpcklbw %%mm0, %%mm0 \n\t"
2523 "punpckhbw %%mm1, %%mm1 \n\t"
2524 "punpcklbw %%mm2, %%mm2 \n\t"
2525 "punpckhbw %%mm3, %%mm3 \n\t"
2526 "punpcklbw %%mm4, %%mm4 \n\t"
2527 "punpckhbw %%mm5, %%mm5 \n\t"
2528 "punpcklbw %%mm6, %%mm6 \n\t"
2529 "punpckhbw %%mm7, %%mm7 \n\t"
2530 MOVNTQ" %%mm0, %0 \n\t"
2531 MOVNTQ" %%mm1, 8%0 \n\t"
2532 MOVNTQ" %%mm2, 16%0 \n\t"
2533 MOVNTQ" %%mm3, 24%0 \n\t"
2534 MOVNTQ" %%mm4, 32%0 \n\t"
2535 MOVNTQ" %%mm5, 40%0 \n\t"
2536 MOVNTQ" %%mm6, 48%0 \n\t"
2537 MOVNTQ" %%mm7, 56%0"
2538 :"=m"(d[2*x])
2539 :"m"(s1[x])
2540 :"memory");
2541 }
2542 #endif
2543 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2544 }
2545 for (y=0;y<h;y++){
2546 const uint8_t* s2=src2+srcStride2*(y>>1);
2547 uint8_t* d=dst2+dstStride2*y;
2548 x=0;
2549 #if HAVE_MMX
2550 for (;x<w-31;x+=32)
2551 {
2552 __asm__ volatile(
2553 PREFETCH" 32%1 \n\t"
2554 "movq %1, %%mm0 \n\t"
2555 "movq 8%1, %%mm2 \n\t"
2556 "movq 16%1, %%mm4 \n\t"
2557 "movq 24%1, %%mm6 \n\t"
2558 "movq %%mm0, %%mm1 \n\t"
2559 "movq %%mm2, %%mm3 \n\t"
2560 "movq %%mm4, %%mm5 \n\t"
2561 "movq %%mm6, %%mm7 \n\t"
2562 "punpcklbw %%mm0, %%mm0 \n\t"
2563 "punpckhbw %%mm1, %%mm1 \n\t"
2564 "punpcklbw %%mm2, %%mm2 \n\t"
2565 "punpckhbw %%mm3, %%mm3 \n\t"
2566 "punpcklbw %%mm4, %%mm4 \n\t"
2567 "punpckhbw %%mm5, %%mm5 \n\t"
2568 "punpcklbw %%mm6, %%mm6 \n\t"
2569 "punpckhbw %%mm7, %%mm7 \n\t"
2570 MOVNTQ" %%mm0, %0 \n\t"
2571 MOVNTQ" %%mm1, 8%0 \n\t"
2572 MOVNTQ" %%mm2, 16%0 \n\t"
2573 MOVNTQ" %%mm3, 24%0 \n\t"
2574 MOVNTQ" %%mm4, 32%0 \n\t"
2575 MOVNTQ" %%mm5, 40%0 \n\t"
2576 MOVNTQ" %%mm6, 48%0 \n\t"
2577 MOVNTQ" %%mm7, 56%0"
2578 :"=m"(d[2*x])
2579 :"m"(s2[x])
2580 :"memory");
2581 }
2582 #endif
2583 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2584 }
2585 #if HAVE_MMX
2586 __asm__(
2587 EMMS" \n\t"
2588 SFENCE" \n\t"
2589 ::: "memory"
2590 );
2591 #endif
2592 }
2593
2594 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2595 uint8_t *dst,
2596 long width, long height,
2597 long srcStride1, long srcStride2,
2598 long srcStride3, long dstStride)
2599 {
2600 x86_reg x;
2601 long y,w,h;
2602 w=width/2; h=height;
2603 for (y=0;y<h;y++){
2604 const uint8_t* yp=src1+srcStride1*y;
2605 const uint8_t* up=src2+srcStride2*(y>>2);
2606 const uint8_t* vp=src3+srcStride3*(y>>2);
2607 uint8_t* d=dst+dstStride*y;
2608 x=0;
2609 #if HAVE_MMX
2610 for (;x<w-7;x+=8)
2611 {
2612 __asm__ volatile(
2613 PREFETCH" 32(%1, %0) \n\t"
2614 PREFETCH" 32(%2, %0) \n\t"
2615 PREFETCH" 32(%3, %0) \n\t"
2616 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2617 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2618 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2619 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2620 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2621 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2622 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2623 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2624 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2625 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2626
2627 "movq %%mm1, %%mm6 \n\t"
2628 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2629 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2630 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2631 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2632 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2633
2634 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2635 "movq 8(%1, %0, 4), %%mm0 \n\t"
2636 "movq %%mm0, %%mm3 \n\t"
2637 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2638 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2639 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2640 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2641
2642 "movq %%mm4, %%mm6 \n\t"
2643 "movq 16(%1, %0, 4), %%mm0 \n\t"
2644 "movq %%mm0, %%mm3 \n\t"
2645 "punpcklbw %%mm5, %%mm4 \n\t"
2646 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2647 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2648 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2649 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2650
2651 "punpckhbw %%mm5, %%mm6 \n\t"
2652 "movq 24(%1, %0, 4), %%mm0 \n\t"
2653 "movq %%mm0, %%mm3 \n\t"
2654 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2655 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2656 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2657 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2658
2659 : "+r" (x)
2660 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2661 :"memory");
2662 }
2663 #endif
2664 for (; x<w; x++)
2665 {
2666 const long x2 = x<<2;
2667 d[8*x+0] = yp[x2];
2668 d[8*x+1] = up[x];
2669 d[8*x+2] = yp[x2+1];
2670 d[8*x+3] = vp[x];
2671 d[8*x+4] = yp[x2+2];
2672 d[8*x+5] = up[x];
2673 d[8*x+6] = yp[x2+3];
2674 d[8*x+7] = vp[x];
2675 }
2676 }
2677 #if HAVE_MMX
2678 __asm__(
2679 EMMS" \n\t"
2680 SFENCE" \n\t"
2681 ::: "memory"
2682 );
2683 #endif
2684 }
2685
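/* copy every second byte (the even positions) of src to dst; the packed-YUV
   splitters below pass src+1 when they need the odd bytes instead */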
2686 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2687 {
2688 dst += count;
2689 src += 2*count;
2690 count= - count;
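// the pointers were advanced past the end, so one negative counter serves as
// both array index and loop condition, reaching zero exactly at the end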
2691
2692 #if HAVE_MMX
2693 if(count <= -16){
2694 count += 15;
2695 __asm__ volatile(
2696 "pcmpeqw %%mm7, %%mm7 \n\t"
2697 "psrlw $8, %%mm7 \n\t"
2698 "1: \n\t"
2699 "movq -30(%1, %0, 2), %%mm0 \n\t"
2700 "movq -22(%1, %0, 2), %%mm1 \n\t"
2701 "movq -14(%1, %0, 2), %%mm2 \n\t"
2702 "movq -6(%1, %0, 2), %%mm3 \n\t"
2703 "pand %%mm7, %%mm0 \n\t"
2704 "pand %%mm7, %%mm1 \n\t"
2705 "pand %%mm7, %%mm2 \n\t"
2706 "pand %%mm7, %%mm3 \n\t"
2707 "packuswb %%mm1, %%mm0 \n\t"
2708 "packuswb %%mm3, %%mm2 \n\t"
2709 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2710 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2711 "add $16, %0 \n\t"
2712 " js 1b \n\t"
2713 : "+r"(count)
2714 : "r"(src), "r"(dst)
2715 );
2716 count -= 15;
2717 }
2718 #endif
2719 while(count<0){
2720 dst[count]= src[2*count];
2721 count++;
2722 }
2723 }
2724
2725 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2726 {
2727 dst0+= count;
2728 dst1+= count;
2729 src += 4*count;
2730 count= - count;
2731 #if HAVE_MMX
2732 if(count <= -8){
2733 count += 7;
2734 __asm__ volatile(
2735 "pcmpeqw %%mm7, %%mm7 \n\t"
2736 "psrlw $8, %%mm7 \n\t"
2737 "1: \n\t"
2738 "movq -28(%1, %0, 4), %%mm0 \n\t"
2739 "movq -20(%1, %0, 4), %%mm1 \n\t"
2740 "movq -12(%1, %0, 4), %%mm2 \n\t"
2741 "movq -4(%1, %0, 4), %%mm3 \n\t"
2742 "pand %%mm7, %%mm0 \n\t"
2743 "pand %%mm7, %%mm1 \n\t"
2744 "pand %%mm7, %%mm2 \n\t"
2745 "pand %%mm7, %%mm3 \n\t"
2746 "packuswb %%mm1, %%mm0 \n\t"
2747 "packuswb %%mm3, %%mm2 \n\t"
2748 "movq %%mm0, %%mm1 \n\t"
2749 "movq %%mm2, %%mm3 \n\t"
2750 "psrlw $8, %%mm0 \n\t"
2751 "psrlw $8, %%mm2 \n\t"
2752 "pand %%mm7, %%mm1 \n\t"
2753 "pand %%mm7, %%mm3 \n\t"
2754 "packuswb %%mm2, %%mm0 \n\t"
2755 "packuswb %%mm3, %%mm1 \n\t"
2756 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2757 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2758 "add $8, %0 \n\t"
2759 " js 1b \n\t"
2760 : "+r"(count)
2761 : "r"(src), "r"(dst0), "r"(dst1)
2762 );
2763 count -= 7;
2764 }
2765 #endif
2766 while(count<0){
2767 dst0[count]= src[4*count+0];
2768 dst1[count]= src[4*count+2];
2769 count++;
2770 }
2771 }
2772
2773 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2774 {
2775 dst0 += count;
2776 dst1 += count;
2777 src0 += 4*count;
2778 src1 += 4*count;
2779 count= - count;
2780 #ifdef PAVGB
2781 if(count <= -8){
2782 count += 7;
2783 __asm__ volatile(
2784 "pcmpeqw %%mm7, %%mm7 \n\t"
2785 "psrlw $8, %%mm7 \n\t"
2786 "1: \n\t"
2787 "movq -28(%1, %0, 4), %%mm0 \n\t"
2788 "movq -20(%1, %0, 4), %%mm1 \n\t"
2789 "movq -12(%1, %0, 4), %%mm2 \n\t"
2790 "movq -4(%1, %0, 4), %%mm3 \n\t"
2791 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2792 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2793 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2794 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2795 "pand %%mm7, %%mm0 \n\t"
2796 "pand %%mm7, %%mm1 \n\t"
2797 "pand %%mm7, %%mm2 \n\t"
2798 "pand %%mm7, %%mm3 \n\t"
2799 "packuswb %%mm1, %%mm0 \n\t"
2800 "packuswb %%mm3, %%mm2 \n\t"
2801 "movq %%mm0, %%mm1 \n\t"
2802 "movq %%mm2, %%mm3 \n\t"
2803 "psrlw $8, %%mm0 \n\t"
2804 "psrlw $8, %%mm2 \n\t"
2805 "pand %%mm7, %%mm1 \n\t"
2806 "pand %%mm7, %%mm3 \n\t"
2807 "packuswb %%mm2, %%mm0 \n\t"
2808 "packuswb %%mm3, %%mm1 \n\t"
2809 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2810 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2811 "add $8, %0 \n\t"
2812 " js 1b \n\t"
2813 : "+r"(count)
2814 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2815 );
2816 count -= 7;
2817 }
2818 #endif
2819 while(count<0){
2820 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2821 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2822 count++;
2823 }
2824 }
2825
2826 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2827 {
2828 dst0+= count;
2829 dst1+= count;
2830 src += 4*count;
2831 count= - count;
2832 #if HAVE_MMX
2833 if(count <= -8){
2834 count += 7;
2835 __asm__ volatile(
2836 "pcmpeqw %%mm7, %%mm7 \n\t"
2837 "psrlw $8, %%mm7 \n\t"
2838 "1: \n\t"
2839 "movq -28(%1, %0, 4), %%mm0 \n\t"
2840 "movq -20(%1, %0, 4), %%mm1 \n\t"
2841 "movq -12(%1, %0, 4), %%mm2 \n\t"
2842 "movq -4(%1, %0, 4), %%mm3 \n\t"
2843 "psrlw $8, %%mm0 \n\t"
2844 "psrlw $8, %%mm1 \n\t"
2845 "psrlw $8, %%mm2 \n\t"
2846 "psrlw $8, %%mm3 \n\t"
2847 "packuswb %%mm1, %%mm0 \n\t"
2848 "packuswb %%mm3, %%mm2 \n\t"
2849 "movq %%mm0, %%mm1 \n\t"
2850 "movq %%mm2, %%mm3 \n\t"
2851 "psrlw $8, %%mm0 \n\t"
2852 "psrlw $8, %%mm2 \n\t"
2853 "pand %%mm7, %%mm1 \n\t"
2854 "pand %%mm7, %%mm3 \n\t"
2855 "packuswb %%mm2, %%mm0 \n\t"
2856 "packuswb %%mm3, %%mm1 \n\t"
2857 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2858 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2859 "add $8, %0 \n\t"
2860 " js 1b \n\t"
2861 : "+r"(count)
2862 : "r"(src), "r"(dst0), "r"(dst1)
2863 );
2864 count -= 7;
2865 }
2866 #endif
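// the scalar tail wants the odd bytes, so advance the base by one and keep
// the even offsets used below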
2867 src++;
2868 while(count<0){
2869 dst0[count]= src[4*count+0];
2870 dst1[count]= src[4*count+2];
2871 count++;
2872 }
2873 }
2874
2875 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2876 {
2877 dst0 += count;
2878 dst1 += count;
2879 src0 += 4*count;
2880 src1 += 4*count;
2881 count= - count;
2882 #ifdef PAVGB
2883 if(count <= -8){
2884 count += 7;
2885 __asm__ volatile(
2886 "pcmpeqw %%mm7, %%mm7 \n\t"
2887 "psrlw $8, %%mm7 \n\t"
2888 "1: \n\t"
2889 "movq -28(%1, %0, 4), %%mm0 \n\t"
2890 "movq -20(%1, %0, 4), %%mm1 \n\t"
2891 "movq -12(%1, %0, 4), %%mm2 \n\t"
2892 "movq -4(%1, %0, 4), %%mm3 \n\t"
2893 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2894 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2895 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2896 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2897 "psrlw $8, %%mm0 \n\t"
2898 "psrlw $8, %%mm1 \n\t"
2899 "psrlw $8, %%mm2 \n\t"
2900 "psrlw $8, %%mm3 \n\t"
2901 "packuswb %%mm1, %%mm0 \n\t"
2902 "packuswb %%mm3, %%mm2 \n\t"
2903 "movq %%mm0, %%mm1 \n\t"
2904 "movq %%mm2, %%mm3 \n\t"
2905 "psrlw $8, %%mm0 \n\t"
2906 "psrlw $8, %%mm2 \n\t"
2907 "pand %%mm7, %%mm1 \n\t"
2908 "pand %%mm7, %%mm3 \n\t"
2909 "packuswb %%mm2, %%mm0 \n\t"
2910 "packuswb %%mm3, %%mm1 \n\t"
2911 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2912 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2913 "add $8, %0 \n\t"
2914 " js 1b \n\t"
2915 : "+r"(count)
2916 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2917 );
2918 count -= 7;
2919 }
2920 #endif
2921 src0++;
2922 src1++;
2923 while(count<0){
2924 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2925 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2926 count++;
2927 }
2928 }
2929
2930 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2931 long width, long height,
2932 long lumStride, long chromStride, long srcStride)
2933 {
2934 long y;
2935 const long chromWidth= -((-width)>>1);
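// -((-width)>>1) is width/2 rounded up, so an odd width still covers its last chroma sample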
2936
2937 for (y=0; y<height; y++){
2938 RENAME(extract_even)(src, ydst, width);
2939 if(y&1){
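// odd line: average the chroma of this line pair (previous + current line)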
2940 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2941 udst+= chromStride;
2942 vdst+= chromStride;
2943 }
2944
2945 src += srcStride;
2946 ydst+= lumStride;
2947 }
2948 #if HAVE_MMX
2949 __asm__(
2950 EMMS" \n\t"
2951 SFENCE" \n\t"
2952 ::: "memory"
2953 );
2954 #endif
2955 }
2956
2957 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2958 long width, long height,
2959 long lumStride, long chromStride, long srcStride)
2960 {
2961 long y;
2962 const long chromWidth= -((-width)>>1);
2963
2964 for (y=0; y<height; y++){
2965 RENAME(extract_even)(src, ydst, width);
2966 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2967
2968 src += srcStride;
2969 ydst+= lumStride;
2970 udst+= chromStride;
2971 vdst+= chromStride;
2972 }
2973 #if HAVE_MMX
2974 __asm__(
2975 EMMS" \n\t"
2976 SFENCE" \n\t"
2977 ::: "memory"
2978 );
2979 #endif
2980 }
2981
2982 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2983 long width, long height,
2984 long lumStride, long chromStride, long srcStride)
2985 {
2986 long y;
2987 const long chromWidth= -((-width)>>1);
2988
2989 for (y=0; y<height; y++){
2990 RENAME(extract_even)(src+1, ydst, width);
2991 if(y&1){
2992 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2993 udst+= chromStride;
2994 vdst+= chromStride;
2995 }
2996
2997 src += srcStride;
2998 ydst+= lumStride;
2999 }
3000 #if HAVE_MMX
3001 __asm__(
3002 EMMS" \n\t"
3003 SFENCE" \n\t"
3004 ::: "memory"
3005 );
3006 #endif
3007 }
3008
3009 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
3010 long width, long height,
3011 long lumStride, long chromStride, long srcStride)
3012 {
3013 long y;
3014 const long chromWidth= -((-width)>>1);
3015
3016 for (y=0; y<height; y++){
3017 RENAME(extract_even)(src+1, ydst, width);
3018 RENAME(extract_even2)(src, udst, vdst, chromWidth);
3019
3020 src += srcStride;
3021 ydst+= lumStride;
3022 udst+= chromStride;
3023 vdst+= chromStride;
3024 }
3025 #if HAVE_MMX
3026 __asm__(
3027 EMMS" \n\t"
3028 SFENCE" \n\t"
3029 ::: "memory"
3030 );
3031 #endif
3032 }
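
/* A hedged usage sketch (not part of the original file): UYVY input to
 * planar 4:2:0 through the public uyvytoyuv420 pointer initialized below.
 * Note the destination planes come first in the argument order. The wrapper
 * name is an assumption; kept under #if 0. */
#if 0
#include <stdint.h>
#include "rgb2rgb.h"

static void example_uyvy_to_420(const uint8_t *uyvy, uint8_t *y,
                                uint8_t *u, uint8_t *v, long w, long h)
{
    uyvytoyuv420(y, u, v, uyvy, w, h,
                 /*lumStride*/ w, /*chromStride*/ (w+1)/2, /*srcStride*/ 2*w);
}
#endif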
3033
3034 static inline void RENAME(rgb2rgb_init)(void){
3035 rgb15to16 = RENAME(rgb15to16);
3036 rgb15tobgr24 = RENAME(rgb15tobgr24);
3037 rgb15to32 = RENAME(rgb15to32);
3038 rgb16tobgr24 = RENAME(rgb16tobgr24);
3039 rgb16to32 = RENAME(rgb16to32);
3040 rgb16to15 = RENAME(rgb16to15);
3041 rgb24tobgr16 = RENAME(rgb24tobgr16);
3042 rgb24tobgr15 = RENAME(rgb24tobgr15);
3043 rgb24tobgr32 = RENAME(rgb24tobgr32);
3044 rgb32to16 = RENAME(rgb32to16);
3045 rgb32to15 = RENAME(rgb32to15);
3046 rgb32tobgr24 = RENAME(rgb32tobgr24);
3047 rgb24to15 = RENAME(rgb24to15);
3048 rgb24to16 = RENAME(rgb24to16);
3049 rgb24tobgr24 = RENAME(rgb24tobgr24);
3050 rgb32tobgr32 = RENAME(rgb32tobgr32);
3051 rgb32tobgr16 = RENAME(rgb32tobgr16);
3052 rgb32tobgr15 = RENAME(rgb32tobgr15);
3053 yv12toyuy2 = RENAME(yv12toyuy2);
3054 yv12touyvy = RENAME(yv12touyvy);
3055 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
3056 yuv422ptouyvy = RENAME(yuv422ptouyvy);
3057 yuy2toyv12 = RENAME(yuy2toyv12);
3058 // yvu9toyv12 = RENAME(yvu9toyv12);
3059 planar2x = RENAME(planar2x);
3060 rgb24toyv12 = RENAME(rgb24toyv12);
3061 interleaveBytes = RENAME(interleaveBytes);
3062 vu9_to_vu12 = RENAME(vu9_to_vu12);
3063 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
3064
3065 uyvytoyuv420 = RENAME(uyvytoyuv420);
3066 uyvytoyuv422 = RENAME(uyvytoyuv422);
3067 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
3068 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
3069 }