swscale: move away x86 specific code from rgb2rgb
[libav.git] / libswscale / x86 / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27 #include <stddef.h>
28
/*
 * Per-CPU-variant instruction selection.  This template is included
 * several times with different COMPILE_TEMPLATE_* flags, so all macros
 * are reset before being redefined for the current variant.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (XMM), 8 for MMX/3DNow (MM). */
#if COMPILE_TEMPLATE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and byte-average instructions differ between 3DNow and MMX2;
 * plain MMX has neither, so PREFETCH degrades to a commented-out nop. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* MMX2 has non-temporal stores (bypassing the cache) and needs an sfence
 * before the results may be read; plain MMX falls back to ordinary movq. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
66
/*
 * Expand packed 24-bit pixels to 32-bit by inserting an opaque alpha
 * byte (255) after every 3-byte triplet.  src_size is the source size
 * in bytes.  The MMX path converts 8 pixels (24 src / 32 dst bytes) per
 * iteration; the scalar loop handles the tail (or everything when MMX
 * is not compiled in).
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* stop while at least 24 unread source bytes remain */
    mm_end = end - 23;
    /* mm7 = constant with 0xFF in every alpha byte position */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
124
/*
 * Repack four MMX registers holding eight 32-bit pixels
 * (mm0/mm2, mm1/mm3, mm4/mm6, mm5/mm7 pairs, each pair a copy)
 * into 24 bytes of packed 24-bit pixels and store them at %0, 8%0
 * and 16%0.  The alpha bytes are squeezed out by masking each qword
 * into low/high halves and shifting the pieces together.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm1, 8%0 \n\t" \
    MOVNTQ" %%mm4, 16%0"
160
161
/*
 * Drop the alpha byte from 32-bit pixels, producing packed 24-bit
 * pixels.  src_size is the source size in bytes.  The MMX path handles
 * 8 pixels (32 src / 24 dst bytes) per iteration via STORE_BGR24_MMX;
 * the scalar loop converts the remainder.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* stop while at least 32 unread source bytes remain */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
211
212 /*
213 original by Strepto/Astral
214 ported to gcc & bugfixed: A'rpi
215 MMX2, 3DNOW optimization by Nick Kurshev
216 32-bit C version, and and&add trick by Michael Niedermayer
217 */
/*
 * Convert RGB555 to RGB565: keep the low 10 bits (blue + green) in
 * place and shift red up by one, widening green to 6 bits with a zero
 * LSB.  Done branch-free per word as (x & 0x7FFF) + (x & 0x7FE0):
 * adding the green+red field to itself shifts it left by one.
 * Processes 16 bytes per MMX iteration, then 4 bytes per 32-bit C
 * iteration, then one final 16-bit word if src_size is odd in words.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm4 = mask15s: the g+r field selected for the doubling add */
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* mm_end is reused here as the bound for the 32-bit C loop */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* possible last lone 16-bit pixel */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
263
/*
 * Convert RGB565 to RGB555: blue stays in place, green and red are
 * shifted right by one (green loses its LSB).  Per 32-bit word:
 * ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F).  Processes 16 bytes per
 * MMX iteration, then 4-byte C words, then one final 16-bit word.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm7 = mask15rg (r+g after the shift), mm6 = mask15b (blue) */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* mm_end is reused here as the bound for the 32-bit C loop */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* possible last lone 16-bit pixel */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
314
/*
 * Convert 32-bit pixels to RGB565 (4 pixels / 8 output bytes per MMX
 * iteration).  Two MMX variants exist: the enabled one uses pmaddwd
 * with a multiplier constant to combine the shifted red and blue
 * fields in one instruction; the disabled one uses explicit
 * shift+mask+or per channel.  Scalar tail:
 * b>>3 | g bits <<3 position | r bits, packed into one uint16.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* single asm loop with its own label/branch; d and s advance via
     * the "+r" read-write register constraints */
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    /* shift/mask variant: isolate b, g, r per pixel and OR together */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
410
/*
 * Convert 32-bit pixels to RGB565 with red and blue swapped relative
 * to rgb32to16: the low source byte lands in the high 5 bits of the
 * output and the third byte in the low 5 bits.  4 pixels per MMX
 * iteration, scalar tail for the rest.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
469
/*
 * Convert 32-bit pixels to RGB555 (5 bits per channel).  Structure
 * mirrors rgb32to16: an enabled pmaddwd-based MMX loop, a disabled
 * shift/mask alternative, and a scalar tail.  Shift amounts differ
 * from the 565 version because green is only 5 bits here.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* pmaddwd variant; d and s advance via "+r" constraints */
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    /* shift/mask variant */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
565
/*
 * Convert 32-bit pixels to RGB555 with red and blue swapped relative
 * to rgb32to15 (low source byte to the high 5-bit field, third byte to
 * the low field).  4 pixels per MMX iteration, scalar tail.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
624
/*
 * Convert packed 24-bit pixels (first byte -> low 5-bit field) to
 * RGB565.  The MMX loop loads 4 three-byte pixels with unaligned
 * movd/punpckldq at byte offsets 0/3/6/9, so it consumes 12 source
 * bytes per iteration and stops while 12 bytes remain (mm_end =
 * end - 11).  Scalar tail handles the rest.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
685
/*
 * Convert packed 24-bit pixels (first byte -> high 5-bit field) to
 * RGB565.  Same 12-bytes-per-iteration load pattern as rgb24tobgr16
 * but with the first/third channel shifts swapped.  NOTE(review): the
 * MMX bound here is end - 15 even though each iteration reads only
 * 12 bytes — presumably to keep the trailing movd reads in bounds;
 * the scalar tail picks up the remainder either way.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
746
/*
 * Convert packed 24-bit pixels (first byte -> low 5-bit field) to
 * RGB555.  Same load pattern as rgb24tobgr16 (12 source bytes per MMX
 * iteration, bound end - 11); shift amounts differ because green is
 * only 5 bits.  Scalar tail handles the rest.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
807
/*
 * Convert packed 24-bit pixels (first byte -> high 5-bit field) to
 * RGB555.  Mirror of rgb24tobgr15 with the first/third channel shifts
 * swapped; MMX bound is end - 15 (see the note in rgb24to16 — the
 * scalar tail covers the remainder).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
868
869 /*
870 I use less accurate approximation here by simply left-shifting the input
871 value and filling the low order bits with zeroes. This method improves PNG
872 compression but this scheme cannot reproduce white exactly, since it does
873 not generate an all-ones maximum value; the net effect is to darken the
874 image slightly.
875
876 The better method should be "left bit replication":
877
878 4 3 2 1 0
879 ---------
880 1 1 0 1 1
881
882 7 6 5 4 3 2 1 0
883 ----------------
884 1 1 0 1 1 1 1 0
885 |=======| |===|
886 | leftmost bits repeated to fill open bits
887 |
888 original bits
889 */
/*
 * Expand RGB555 to packed 24-bit pixels (zero-filled low bits — see
 * the accuracy note above this function).  The MMX path unpacks 8
 * pixels into eight 32-bit intermediates across two asm statements and
 * then repacks them to 24 bytes with STORE_BGR24_MMX.
 *
 * NOTE(review): the first asm block deliberately leaves its results in
 * mm0/mm3/mm6/mm7 for the second asm block to consume — this relies on
 * the compiler not touching the MMX state between the two statements.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback: expand 5-bit fields to bytes by <<3 */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
996
/*
 * Expand RGB565 to packed 24-bit pixels (zero-filled low bits).
 * Structure is identical to rgb15tobgr24 but with 565 masks and shift
 * amounts (green has 6 bits, red starts one bit higher).
 *
 * NOTE(review): as in rgb15tobgr24, intermediate results are passed
 * between the two asm statements in MMX registers.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback: expand 5/6/5-bit fields to bytes */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1102
/*
 * Interleave four planar channel words into four 32-bit pixels and
 * store 16 bytes at %0 / 8%0.  Register contract on entry:
 *
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \

/*
 * Expand RGB555 to 32-bit pixels with an opaque alpha byte (255).
 * The MMX loop isolates the three 5-bit fields, scales each to its
 * byte position, and merges them with PACK_RGB32 (4 pixels per
 * iteration).  Scalar tail handles the remainder, honoring byte order.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = zeros, mm6 = all-ones alpha, as required by PACK_RGB32 */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail / fallback */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
1174
/*
 * Convert packed 16-bit RGB565 pixels (RRRRRGGGGGGBBBBB in 16-bit words)
 * to 32-bit pixels with an opaque alpha byte of 255.
 * Identical in structure to rgb15to32 above, but green is 6 bits wide,
 * hence the different masks and shift amounts.
 * src_size is in BYTES (2 bytes per input pixel); low output component
 * bits are zero-filled by the shifts.
 */
1175 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1176 {
1177     const uint16_t *end;
1178 #if COMPILE_TEMPLATE_MMX
1179     const uint16_t *mm_end;
1180 #endif
1181     uint8_t *d = dst;
1182     const uint16_t *s = (const uint16_t*)src;
1183     end = s + src_size/2;
1184 #if COMPILE_TEMPLATE_MMX
1185     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
     /* mm7 = 0 and mm6 = all-ones, as required by PACK_RGB32 */
1186     __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1187     __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
     /* stop the 4-pixels-per-iteration MMX loop with >= 4 pixels left */
1188     mm_end = end - 3;
1189     while (s < mm_end) {
1190         __asm__ volatile(
1191             PREFETCH" 32%1 \n\t"
1192             "movq %1, %%mm0 \n\t"
1193             "movq %1, %%mm1 \n\t"
1194             "movq %1, %%mm2 \n\t"
1195             "pand %2, %%mm0 \n\t"
1196             "pand %3, %%mm1 \n\t"
1197             "pand %4, %%mm2 \n\t"
1198             "psllq $3, %%mm0 \n\t"
1199             "psrlq $3, %%mm1 \n\t"
1200             "psrlq $8, %%mm2 \n\t"
1201             PACK_RGB32
1202             :"=m"(*d)
1203             :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1204             :"memory");
1205         d += 16;
1206         s += 4;
1207     }
1208     __asm__ volatile(SFENCE:::"memory");
1209     __asm__ volatile(EMMS:::"memory");
1210 #endif
     /* scalar tail (and the whole conversion when MMX is not compiled in) */
1211     while (s < end) {
1212         register uint16_t bgr;
1213         bgr = *s++;
1214 #if HAVE_BIGENDIAN
1215         *d++ = 255;
1216         *d++ = (bgr&0xF800)>>8;
1217         *d++ = (bgr&0x7E0)>>3;
1218         *d++ = (bgr&0x1F)<<3;
1219 #else
1220         *d++ = (bgr&0x1F)<<3;
1221         *d++ = (bgr&0x7E0)>>3;
1222         *d++ = (bgr&0xF800)>>8;
1223         *d++ = 255;
1224 #endif
1225     }
1226 }
1227
/*
 * Reorder the bytes of every 32-bit word as 2,1,0,3: bytes 0 and 2 are
 * swapped, bytes 1 and 3 stay in place (i.e. exchanges R and B in packed
 * 32-bit RGBA/BGRA pixels).
 * src_size is in bytes. idx starts at 15 - src_size, i.e. negative while
 * more than 15 bytes remain: the MMX loop runs on negative idx, 16 bytes
 * per iteration, and the scalar loop below finishes the remaining dwords
 * (it also does the whole job when MMX is not compiled in).
 */
1228 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1229 {
1230     x86_reg idx = 15 - src_size;
1231     const uint8_t *s = src-idx;
1232     uint8_t *d = dst-idx;
1233 #if COMPILE_TEMPLATE_MMX
     /* mm7/mm6 are built from mask32b/mask32r/mmx_one so that one mask
      * selects the kept byte lanes and the other the swapped ones */
1234     __asm__ volatile(
1235         "test %0, %0 \n\t"
1236         "jns 2f \n\t"
1237         PREFETCH" (%1, %0) \n\t"
1238         "movq %3, %%mm7 \n\t"
1239         "pxor %4, %%mm7 \n\t"
1240         "movq %%mm7, %%mm6 \n\t"
1241         "pxor %5, %%mm7 \n\t"
1242         ".p2align 4 \n\t"
1243         "1: \n\t"
1244         PREFETCH" 32(%1, %0) \n\t"
1245         "movq (%1, %0), %%mm0 \n\t"
1246         "movq 8(%1, %0), %%mm1 \n\t"
1247 # if COMPILE_TEMPLATE_MMX2
         /* pshufw $177 = 0b10110001 swaps the two 16-bit words inside each
          * dword; combined with the byte masks this swaps bytes 0 and 2 */
1248         "pshufw $177, %%mm0, %%mm3 \n\t"
1249         "pshufw $177, %%mm1, %%mm5 \n\t"
1250         "pand %%mm7, %%mm0 \n\t"
1251         "pand %%mm6, %%mm3 \n\t"
1252         "pand %%mm7, %%mm1 \n\t"
1253         "pand %%mm6, %%mm5 \n\t"
1254         "por %%mm3, %%mm0 \n\t"
1255         "por %%mm5, %%mm1 \n\t"
1256 # else
         /* plain-MMX fallback: move the selected bytes into place with
          * explicit 16-bit shifts instead of pshufw */
1257         "movq %%mm0, %%mm2 \n\t"
1258         "movq %%mm1, %%mm4 \n\t"
1259         "pand %%mm7, %%mm0 \n\t"
1260         "pand %%mm6, %%mm2 \n\t"
1261         "pand %%mm7, %%mm1 \n\t"
1262         "pand %%mm6, %%mm4 \n\t"
1263         "movq %%mm2, %%mm3 \n\t"
1264         "movq %%mm4, %%mm5 \n\t"
1265         "pslld $16, %%mm2 \n\t"
1266         "psrld $16, %%mm3 \n\t"
1267         "pslld $16, %%mm4 \n\t"
1268         "psrld $16, %%mm5 \n\t"
1269         "por %%mm2, %%mm0 \n\t"
1270         "por %%mm4, %%mm1 \n\t"
1271         "por %%mm3, %%mm0 \n\t"
1272         "por %%mm5, %%mm1 \n\t"
1273 # endif
1274         MOVNTQ" %%mm0, (%2, %0) \n\t"
1275         MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1276         "add $16, %0 \n\t"
1277         "js 1b \n\t"
1278         SFENCE" \n\t"
1279         EMMS" \n\t"
1280         "2: \n\t"
1281         : "+&r"(idx)
1282         : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1283         : "memory");
1284 #endif
     /* scalar tail: g keeps bytes 1 and 3; the two 16-bit rotates of v
      * (bytes 0 and 2) swap them into each other's positions */
1285     for (; idx<15; idx+=4) {
1286         register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1287         v &= 0xff00ff;
1288         *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1289     }
1290 }
1291
/*
 * Swap R and B in packed 24-bit pixels (RGB24 <-> BGR24; the transform is
 * its own inverse).
 * src_size is in bytes. The MMX path converts 24 bytes (8 pixels) per
 * iteration using the mask24r/g/b constants; the scalar loop converts
 * whatever the MMX loop left over (or everything without MMX).
 */
1292 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1293 {
1294     unsigned i;
1295 #if COMPILE_TEMPLATE_MMX
     /* negative loop counter in REG_a: the asm loop adds 24 per iteration
      * and runs while it stays negative; pointers are pre-biased below */
1296     x86_reg mmx_size= 23 - src_size;
1297     __asm__ volatile (
1298         "test %%"REG_a", %%"REG_a" \n\t"
1299         "jns 2f \n\t"
1300         "movq "MANGLE(mask24r)", %%mm5 \n\t"
1301         "movq "MANGLE(mask24g)", %%mm6 \n\t"
1302         "movq "MANGLE(mask24b)", %%mm7 \n\t"
1303         ".p2align 4 \n\t"
1304         "1: \n\t"
1305         PREFETCH" 32(%1, %%"REG_a") \n\t"
1306         "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1307         "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1308         "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1309         "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1310         "pand %%mm5, %%mm0 \n\t"
1311         "pand %%mm6, %%mm1 \n\t"
1312         "pand %%mm7, %%mm2 \n\t"
1313         "por %%mm0, %%mm1 \n\t"
1314         "por %%mm2, %%mm1 \n\t"
1315         "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1316         MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1317         "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1318         "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1319         "pand %%mm7, %%mm0 \n\t"
1320         "pand %%mm5, %%mm1 \n\t"
1321         "pand %%mm6, %%mm2 \n\t"
1322         "por %%mm0, %%mm1 \n\t"
1323         "por %%mm2, %%mm1 \n\t"
1324         "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1325         MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1326         "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1327         "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1328         "pand %%mm6, %%mm0 \n\t"
1329         "pand %%mm7, %%mm1 \n\t"
1330         "pand %%mm5, %%mm2 \n\t"
1331         "por %%mm0, %%mm1 \n\t"
1332         "por %%mm2, %%mm1 \n\t"
1333         MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1334         "add $24, %%"REG_a" \n\t"
1335         " js 1b \n\t"
1336         "2: \n\t"
1337         : "+a" (mmx_size)
1338         : "r" (src-mmx_size), "r"(dst-mmx_size)
1339     );
1340 
1341     __asm__ volatile(SFENCE:::"memory");
1342     __asm__ volatile(EMMS:::"memory");
1343 
1344     if (mmx_size==23) return; //finished, was multiple of 8
1345 
     /* re-point src/dst at the first unconverted pixel and shrink
      * src_size to the remaining byte count for the scalar loop */
1346     src+= src_size;
1347     dst+= src_size;
1348     src_size= 23-mmx_size;
1349     src-= src_size;
1350     dst-= src_size;
1351 #endif
     /* scalar R<->B swap, 3 bytes per pixel */
1352     for (i=0; i<src_size; i+=3) {
1353         register uint8_t x;
1354         x = src[i + 2];
1355         dst[i + 1] = src[i + 1];
1356         dst[i + 2] = src[i + 0];
1357         dst[i + 0] = x;
1358     }
1359 }
1360
/*
 * Interleave planar Y, U, V into packed YUY2 (byte order Y0 U0 Y1 V0).
 * width, height: luma dimensions (chromWidth = width/2 samples per line)
 * lumStride/chromStride/dstStride: line strides in bytes
 * vertLumPerChroma: number of luma lines sharing one chroma line
 *     (2 for YV12 callers, 1 for YUV422P callers); the advance test below
 *     masks with vertLumPerChroma-1, so it must be a power of two.
 */
1361 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362                                            long width, long height,
1363                                            long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1364 {
1365     long y;
1366     const x86_reg chromWidth= width>>1;
1367     for (y=0; y<height; y++) {
1368 #if COMPILE_TEMPLATE_MMX
1369         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1370         __asm__ volatile(
1371             "xor %%"REG_a", %%"REG_a" \n\t"
1372             ".p2align 4 \n\t"
1373             "1: \n\t"
1374             PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1375             PREFETCH" 32(%2, %%"REG_a") \n\t"
1376             PREFETCH" 32(%3, %%"REG_a") \n\t"
1377             "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1378             "movq %%mm0, %%mm2 \n\t" // U(0)
1379             "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1380             "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1381             "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1382 
1383             "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1384             "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1385             "movq %%mm3, %%mm4 \n\t" // Y(0)
1386             "movq %%mm5, %%mm6 \n\t" // Y(8)
1387             "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1388             "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1389             "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1390             "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1391 
1392             MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1393             MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1394             MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1395             MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1396 
1397             "add $8, %%"REG_a" \n\t"
1398             "cmp %4, %%"REG_a" \n\t"
1399             " jb 1b \n\t"
1400             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1401             : "%"REG_a
1402         );
1403 #else
1404 
1405 #if ARCH_ALPHA && HAVE_MVI
/* Alpha MVI path: pl2yuy2 interleaves one quad of samples for two output
 * lines at once using the unpkbw/unpkbl byte-expansion instructions. */
1406 #define pl2yuy2(n) \
1407     y1 = yc[n]; \
1408     y2 = yc2[n]; \
1409     u = uc[n]; \
1410     v = vc[n]; \
1411     __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1412     __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1413     __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1414     __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1415     yuv1 = (u << 8) + (v << 24); \
1416     yuv2 = yuv1 + y2; \
1417     yuv1 += y1; \
1418     qdst[n] = yuv1; \
1419     qdst2[n] = yuv2;
1420 
1421         int i;
1422         uint64_t *qdst = (uint64_t *) dst;
1423         uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424         const uint32_t *yc = (uint32_t *) ysrc;
1425         const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426         const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427         for (i = 0; i < chromWidth; i += 8) {
1428             uint64_t y1, y2, yuv1, yuv2;
1429             uint64_t u, v;
1430             /* Prefetch */
1431             __asm__("ldq $31,64(%0)" :: "r"(yc));
1432             __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433             __asm__("ldq $31,64(%0)" :: "r"(uc));
1434             __asm__("ldq $31,64(%0)" :: "r"(vc));
1435 
1436             pl2yuy2(0);
1437             pl2yuy2(1);
1438             pl2yuy2(2);
1439             pl2yuy2(3);
1440 
1441             yc += 4;
1442             yc2 += 4;
1443             uc += 4;
1444             vc += 4;
1445             qdst += 4;
1446             qdst2 += 4;
1447         }
        /* this path emitted two lines per iteration, so advance one extra
         * luma/dst line here on top of the loop's own advance below */
1448         y++;
1449         ysrc += lumStride;
1450         dst += dstStride;
1451 
1452 #elif HAVE_FAST_64BIT
        /* portable 64-bit path: assemble two YUYV pixel pairs per store */
1453         int i;
1454         uint64_t *ldst = (uint64_t *) dst;
1455         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456         for (i = 0; i < chromWidth; i += 2) {
1457             uint64_t k, l;
1458             k = yc[0] + (uc[0] << 8) +
1459                 (yc[1] << 16) + (vc[0] << 24);
1460             l = yc[2] + (uc[1] << 8) +
1461                 (yc[3] << 16) + (vc[1] << 24);
1462             *ldst++ = k + (l << 32);
1463             yc += 4;
1464             uc += 2;
1465             vc += 2;
1466         }
1467 
1468 #else
        /* 32-bit fallback: one YUYV pixel pair per store, byte order fixed
         * up for big-endian hosts */
1469         int i, *idst = (int32_t *) dst;
1470         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471         for (i = 0; i < chromWidth; i++) {
1472 #if HAVE_BIGENDIAN
1473             *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474                 (yc[1] << 8) + (vc[0] << 0);
1475 #else
1476             *idst++ = yc[0] + (uc[0] << 8) +
1477                 (yc[1] << 16) + (vc[0] << 24);
1478 #endif
1479             yc += 2;
1480             uc++;
1481             vc++;
1482         }
1483 #endif
1484 #endif
        /* advance the chroma planes only every vertLumPerChroma-th line */
1485         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486             usrc += chromStride;
1487             vsrc += chromStride;
1488         }
1489         ysrc += lumStride;
1490         dst += dstStride;
1491     }
1492 #if COMPILE_TEMPLATE_MMX
1493     __asm__(EMMS" \n\t"
1494             SFENCE" \n\t"
1495             :::"memory");
1496 #endif
1497 }
1498
1499 /**
1500  * Height should be a multiple of 2 and width should be a multiple of 16.
1501  * (If this is a problem for anyone then tell me, and I will fix it.)
1502  */
/* YV12 (4:2:0 planar) -> packed YUY2: each chroma line is reused for two
 * luma lines (vertLumPerChroma = 2); no vertical chroma interpolation. */
1503 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504                                       long width, long height,
1505                                       long lumStride, long chromStride, long dstStride)
1506 {
1507     //FIXME interpolate chroma
1508     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1509 }
1510
/*
 * Interleave planar Y, U, V into packed UYVY (byte order U0 Y0 V0 Y1).
 * Same structure and parameter meaning as yuvPlanartoyuy2, only the byte
 * order of the output differs. vertLumPerChroma must be a power of two.
 * Note: unlike yuvPlanartoyuy2 there is no Alpha MVI path here (see the
 * FIXME below).
 */
1511 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512                                            long width, long height,
1513                                            long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1514 {
1515     long y;
1516     const x86_reg chromWidth= width>>1;
1517     for (y=0; y<height; y++) {
1518 #if COMPILE_TEMPLATE_MMX
1519         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1520         __asm__ volatile(
1521             "xor %%"REG_a", %%"REG_a" \n\t"
1522             ".p2align 4 \n\t"
1523             "1: \n\t"
1524             PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1525             PREFETCH" 32(%2, %%"REG_a") \n\t"
1526             PREFETCH" 32(%3, %%"REG_a") \n\t"
1527             "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1528             "movq %%mm0, %%mm2 \n\t" // U(0)
1529             "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1530             "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1531             "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1532 
1533             "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1534             "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1535             "movq %%mm0, %%mm4 \n\t" // Y(0)
1536             "movq %%mm2, %%mm6 \n\t" // Y(8)
1537             "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1538             "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1539             "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1540             "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1541 
1542             MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1543             MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1544             MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1545             MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1546 
1547             "add $8, %%"REG_a" \n\t"
1548             "cmp %4, %%"REG_a" \n\t"
1549             " jb 1b \n\t"
1550             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1551             : "%"REG_a
1552         );
1553 #else
1554         //FIXME adapt the Alpha ASM code from yv12->yuy2
1555 
1556 #if HAVE_FAST_64BIT
        /* portable 64-bit path: two UYVY pixel pairs per store */
1557         int i;
1558         uint64_t *ldst = (uint64_t *) dst;
1559         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560         for (i = 0; i < chromWidth; i += 2) {
1561             uint64_t k, l;
1562             k = uc[0] + (yc[0] << 8) +
1563                 (vc[0] << 16) + (yc[1] << 24);
1564             l = uc[1] + (yc[2] << 8) +
1565                 (vc[1] << 16) + (yc[3] << 24);
1566             *ldst++ = k + (l << 32);
1567             yc += 4;
1568             uc += 2;
1569             vc += 2;
1570         }
1571 
1572 #else
        /* 32-bit fallback: one UYVY pixel pair per store */
1573         int i, *idst = (int32_t *) dst;
1574         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575         for (i = 0; i < chromWidth; i++) {
1576 #if HAVE_BIGENDIAN
1577             *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578                 (vc[0] << 8) + (yc[1] << 0);
1579 #else
1580             *idst++ = uc[0] + (yc[0] << 8) +
1581                 (vc[0] << 16) + (yc[1] << 24);
1582 #endif
1583             yc += 2;
1584             uc++;
1585             vc++;
1586         }
1587 #endif
1588 #endif
        /* advance the chroma planes only every vertLumPerChroma-th line */
1589         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590             usrc += chromStride;
1591             vsrc += chromStride;
1592         }
1593         ysrc += lumStride;
1594         dst += dstStride;
1595     }
1596 #if COMPILE_TEMPLATE_MMX
1597     __asm__(EMMS" \n\t"
1598             SFENCE" \n\t"
1599             :::"memory");
1600 #endif
1601 }
1602
1603 /**
1604  * Height should be a multiple of 2 and width should be a multiple of 16
1605  * (If this is a problem for anyone then tell me, and I will fix it.)
1606  */
/* YV12 (4:2:0 planar) -> packed UYVY: each chroma line serves two luma
 * lines (vertLumPerChroma = 2); no vertical chroma interpolation. */
1607 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608                                       long width, long height,
1609                                       long lumStride, long chromStride, long dstStride)
1610 {
1611     //FIXME interpolate chroma
1612     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1613 }
1614
1615 /**
1616  * Width should be a multiple of 16.
1617  */
/* YUV422P (4:2:2 planar) -> packed UYVY: one chroma line per luma line
 * (vertLumPerChroma = 1). */
1618 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619                                          long width, long height,
1620                                          long lumStride, long chromStride, long dstStride)
1621 {
1622     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1623 }
1624
1625 /**
1626  * Width should be a multiple of 16.
1627  */
/* YUV422P (4:2:2 planar) -> packed YUY2: one chroma line per luma line
 * (vertLumPerChroma = 1). */
1628 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629                                          long width, long height,
1630                                          long lumStride, long chromStride, long dstStride)
1631 {
1632     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1633 }
1634
1635 /**
1636  * Height should be a multiple of 2 and width should be a multiple of 16.
1637  * (If this is a problem for anyone then tell me, and I will fix it.)
1638  */
/* Packed YUY2 (Y0 U0 Y1 V0) -> planar YV12. Processes two source lines
 * per loop iteration: the first (even) line supplies both luma and the
 * single chroma line, the second (odd) line supplies luma only, so its
 * chroma samples are dropped. */
1639 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640                                       long width, long height,
1641                                       long lumStride, long chromStride, long srcStride)
1642 {
1643     long y;
1644     const x86_reg chromWidth= width>>1;
1645     for (y=0; y<height; y+=2) {
1646 #if COMPILE_TEMPLATE_MMX
1647         __asm__ volatile(
1648             "xor %%"REG_a", %%"REG_a" \n\t"
1649             "pcmpeqw %%mm7, %%mm7 \n\t"
1650             "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1651             ".p2align 4 \n\t"
1652             "1: \n\t"
1653             PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1654             "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1655             "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1656             "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1657             "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1658             "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1659             "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1660             "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1661             "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1662             "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1663             "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1664 
1665             MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1666 
1667             "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1668             "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1669             "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1670             "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1671             "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1672             "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1673             "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1674             "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1675             "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1676             "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1677 
1678             MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1679 
1680             "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1681             "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1682             "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1683             "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1684             "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1685             "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1686             "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1687             "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1688 
1689             MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1690             MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1691 
1692             "add $8, %%"REG_a" \n\t"
1693             "cmp %4, %%"REG_a" \n\t"
1694             " jb 1b \n\t"
1695             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696             : "memory", "%"REG_a
1697         );
1698 
1699         ydst += lumStride;
1700         src += srcStride;
1701 
        /* odd line: extract luma only.
         * NOTE(review): this asm block uses mm7 without setting it — it
         * relies on the 0x00FF word mask surviving from the block above. */
1702         __asm__ volatile(
1703             "xor %%"REG_a", %%"REG_a" \n\t"
1704             ".p2align 4 \n\t"
1705             "1: \n\t"
1706             PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1707             "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1708             "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1709             "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1710             "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1711             "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1712             "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1713             "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1714             "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1715             "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1716             "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1717 
1718             MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1719             MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1720 
1721             "add $8, %%"REG_a" \n\t"
1722             "cmp %4, %%"REG_a" \n\t"
1723             " jb 1b \n\t"
1724 
1725             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726             : "memory", "%"REG_a
1727         );
1728 #else
        /* scalar fallback: even line gives Y+U+V, odd line gives Y only */
1729         long i;
1730         for (i=0; i<chromWidth; i++) {
1731             ydst[2*i+0] = src[4*i+0];
1732             udst[i] = src[4*i+1];
1733             ydst[2*i+1] = src[4*i+2];
1734             vdst[i] = src[4*i+3];
1735         }
1736         ydst += lumStride;
1737         src += srcStride;
1738 
1739         for (i=0; i<chromWidth; i++) {
1740             ydst[2*i+0] = src[4*i+0];
1741             ydst[2*i+1] = src[4*i+2];
1742         }
1743 #endif
1744         udst += chromStride;
1745         vdst += chromStride;
1746         ydst += lumStride;
1747         src += srcStride;
1748     }
1749 #if COMPILE_TEMPLATE_MMX
1750     __asm__ volatile(EMMS" \n\t"
1751                      SFENCE" \n\t"
1752                      :::"memory");
1753 #endif
1754 }
1755
/*
 * Upscale one 8-bit plane to twice the width and twice the height.
 * Interior output pixels are built from 2x2 neighborhoods with 3:1
 * weighting ((3*a+b)>>2 in the scalar code); the first and last output
 * lines are interpolated horizontally only.
 * src/dst strides are in bytes; dst must hold 2*srcWidth x 2*srcHeight.
 */
1756 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1757 {
1758     long x,y;
1759 
1760     dst[0]= src[0];
1761 
1762     // first line
1763     for (x=0; x<srcWidth-1; x++) {
1764         dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1765         dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1766     }
1767     dst[2*srcWidth-1]= src[srcWidth-1];
1768 
1769     dst+= dstStride;
1770 
1771     for (y=1; y<srcHeight; y++) {
1772 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
        /* mmxSize = width rounded down to a multiple of 16; the asm walks
         * a negative index up to 0, emitting 16 output pixels per source
         * 8-pixel chunk for two output lines at once.
         * NOTE(review): the repeated PAVGB sequences appear to synthesize
         * the 3:1 weighted averages via two rounds of pavgb rounding —
         * confirm the exact rounding against the scalar (3*a+b)>>2. */
1773         const x86_reg mmxSize= srcWidth&~15;
1774         __asm__ volatile(
1775             "mov %4, %%"REG_a" \n\t"
1776             "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1777             "movq (%0, %%"REG_a"), %%mm4 \n\t"
1778             "movq %%mm4, %%mm2 \n\t"
1779             "psllq $8, %%mm4 \n\t"
1780             "pand %%mm0, %%mm2 \n\t"
1781             "por %%mm2, %%mm4 \n\t"
1782             "movq (%1, %%"REG_a"), %%mm5 \n\t"
1783             "movq %%mm5, %%mm3 \n\t"
1784             "psllq $8, %%mm5 \n\t"
1785             "pand %%mm0, %%mm3 \n\t"
1786             "por %%mm3, %%mm5 \n\t"
1787             "1: \n\t"
1788             "movq (%0, %%"REG_a"), %%mm0 \n\t"
1789             "movq (%1, %%"REG_a"), %%mm1 \n\t"
1790             "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1791             "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1792             PAVGB" %%mm0, %%mm5 \n\t"
1793             PAVGB" %%mm0, %%mm3 \n\t"
1794             PAVGB" %%mm0, %%mm5 \n\t"
1795             PAVGB" %%mm0, %%mm3 \n\t"
1796             PAVGB" %%mm1, %%mm4 \n\t"
1797             PAVGB" %%mm1, %%mm2 \n\t"
1798             PAVGB" %%mm1, %%mm4 \n\t"
1799             PAVGB" %%mm1, %%mm2 \n\t"
1800             "movq %%mm5, %%mm7 \n\t"
1801             "movq %%mm4, %%mm6 \n\t"
1802             "punpcklbw %%mm3, %%mm5 \n\t"
1803             "punpckhbw %%mm3, %%mm7 \n\t"
1804             "punpcklbw %%mm2, %%mm4 \n\t"
1805             "punpckhbw %%mm2, %%mm6 \n\t"
1806 #if 1
1807             MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1808             MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1809             MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1810             MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1811 #else
1812             "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1813             "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1814             "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1815             "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1816 #endif
1817             "add $8, %%"REG_a" \n\t"
1818             "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1819             "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1820             " js 1b \n\t"
1821             :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1822                "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1823                "g" (-mmxSize)
1824             : "%"REG_a
1825         );
1826 #else
        /* no MMX2/3DNow: handle column 0 here, scalar loop does the rest */
1827         const x86_reg mmxSize=1;
1828 
1829         dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1830         dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1831 #endif
1832 
        /* scalar tail: 2x2 output block per source pixel, 3:1 weights */
1833         for (x=mmxSize-1; x<srcWidth-1; x++) {
1834             dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1835             dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1836             dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1837             dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1838         }
1839         dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1840         dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1841 
1842         dst+=dstStride*2;
1843         src+=srcStride;
1844     }
1845 
1846     // last line
1847 #if 1
1848     dst[0]= src[0];
1849 
1850     for (x=0; x<srcWidth-1; x++) {
1851         dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1852         dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1853     }
1854     dst[2*srcWidth-1]= src[srcWidth-1];
1855 #else
1856     for (x=0; x<srcWidth; x++) {
1857         dst[2*x+0]=
1858         dst[2*x+1]= src[x];
1859     }
1860 #endif
1861 
1862 #if COMPILE_TEMPLATE_MMX
1863     __asm__ volatile(EMMS" \n\t"
1864                      SFENCE" \n\t"
1865                      :::"memory");
1866 #endif
1867 }
1868
1869 /**
1870  * Height should be a multiple of 2 and width should be a multiple of 16.
1871  * (If this is a problem for anyone then tell me, and I will fix it.)
1872  * Chrominance data is only taken from every second line, others are ignored.
1873  * FIXME: Write HQ version.
1874  */
/* Packed UYVY (U0 Y0 V0 Y1) -> planar YV12. Mirror of yuy2toyv12 above
 * with luma in the high byte of each 16-bit word (pand keeps chroma,
 * psrlw $8 extracts luma). */
1875 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1876                                       long width, long height,
1877                                       long lumStride, long chromStride, long srcStride)
1878 {
1879     long y;
1880     const x86_reg chromWidth= width>>1;
1881     for (y=0; y<height; y+=2) {
1882 #if COMPILE_TEMPLATE_MMX
1883         __asm__ volatile(
1884             "xor %%"REG_a", %%"REG_a" \n\t"
1885             "pcmpeqw %%mm7, %%mm7 \n\t"
1886             "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1887             ".p2align 4 \n\t"
1888             "1: \n\t"
1889             PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1890             "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1891             "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1892             "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1893             "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1894             "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1895             "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1896             "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1897             "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1898             "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1899             "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1900 
1901             MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1902 
1903             "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1904             "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1905             "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1906             "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1907             "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1908             "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1909             "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1910             "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1911             "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1912             "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1913 
1914             MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1915 
1916             "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1917             "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1918             "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1919             "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1920             "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1921             "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1922             "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1923             "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1924 
1925             MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1926             MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1927 
1928             "add $8, %%"REG_a" \n\t"
1929             "cmp %4, %%"REG_a" \n\t"
1930             " jb 1b \n\t"
1931             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1932             : "memory", "%"REG_a
1933         );
1934 
1935         ydst += lumStride;
1936         src += srcStride;
1937 
        /* odd line: luma only. psrlw $8 keeps the high byte of each word,
         * which is Y for UYVY input — the "YUYV" register comments below
         * appear to be copied from yuy2toyv12 and look stale. */
1938         __asm__ volatile(
1939             "xor %%"REG_a", %%"REG_a" \n\t"
1940             ".p2align 4 \n\t"
1941             "1: \n\t"
1942             PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1943             "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1944             "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1945             "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1946             "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1947             "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1948             "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1949             "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1950             "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1951             "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1952             "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1953 
1954             MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1955             MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1956 
1957             "add $8, %%"REG_a" \n\t"
1958             "cmp %4, %%"REG_a" \n\t"
1959             " jb 1b \n\t"
1960 
1961             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1962             : "memory", "%"REG_a
1963         );
1964 #else
        /* scalar fallback: even line gives U/Y/V, odd line gives Y only
         * (Y sits at odd byte offsets in UYVY) */
1965         long i;
1966         for (i=0; i<chromWidth; i++) {
1967             udst[i] = src[4*i+0];
1968             ydst[2*i+0] = src[4*i+1];
1969             vdst[i] = src[4*i+2];
1970             ydst[2*i+1] = src[4*i+3];
1971         }
1972         ydst += lumStride;
1973         src += srcStride;
1974 
1975         for (i=0; i<chromWidth; i++) {
1976             ydst[2*i+0] = src[4*i+1];
1977             ydst[2*i+1] = src[4*i+3];
1978         }
1979 #endif
1980         udst += chromStride;
1981         vdst += chromStride;
1982         ydst += lumStride;
1983         src += srcStride;
1984     }
1985 #if COMPILE_TEMPLATE_MMX
1986     __asm__ volatile(EMMS" \n\t"
1987                      SFENCE" \n\t"
1988                      :::"memory");
1989 #endif
1990 }
1991
1992 /**
1993 * Height should be a multiple of 2 and width should be a multiple of 2.
1994 * (If this is a problem for anyone then tell me, and I will fix it.)
1995 * Chrominance data is only taken from every second line,
1996 * others are ignored in the C version.
1997 * FIXME: Write HQ version.
1998 */
1999 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2000 long width, long height,
2001 long lumStride, long chromStride, long srcStride)
2002 {
2003 long y;
2004 const x86_reg chromWidth= width>>1;
2005 #if COMPILE_TEMPLATE_MMX
2006 for (y=0; y<height-2; y+=2) {
2007 long i;
2008 for (i=0; i<2; i++) {
2009 __asm__ volatile(
2010 "mov %2, %%"REG_a" \n\t"
2011 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2012 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2013 "pxor %%mm7, %%mm7 \n\t"
2014 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2015 ".p2align 4 \n\t"
2016 "1: \n\t"
2017 PREFETCH" 64(%0, %%"REG_d") \n\t"
2018 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2019 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2020 "punpcklbw %%mm7, %%mm0 \n\t"
2021 "punpcklbw %%mm7, %%mm1 \n\t"
2022 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2023 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2024 "punpcklbw %%mm7, %%mm2 \n\t"
2025 "punpcklbw %%mm7, %%mm3 \n\t"
2026 "pmaddwd %%mm6, %%mm0 \n\t"
2027 "pmaddwd %%mm6, %%mm1 \n\t"
2028 "pmaddwd %%mm6, %%mm2 \n\t"
2029 "pmaddwd %%mm6, %%mm3 \n\t"
2030 #ifndef FAST_BGR2YV12
2031 "psrad $8, %%mm0 \n\t"
2032 "psrad $8, %%mm1 \n\t"
2033 "psrad $8, %%mm2 \n\t"
2034 "psrad $8, %%mm3 \n\t"
2035 #endif
2036 "packssdw %%mm1, %%mm0 \n\t"
2037 "packssdw %%mm3, %%mm2 \n\t"
2038 "pmaddwd %%mm5, %%mm0 \n\t"
2039 "pmaddwd %%mm5, %%mm2 \n\t"
2040 "packssdw %%mm2, %%mm0 \n\t"
2041 "psraw $7, %%mm0 \n\t"
2042
2043 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2044 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2045 "punpcklbw %%mm7, %%mm4 \n\t"
2046 "punpcklbw %%mm7, %%mm1 \n\t"
2047 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2048 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2049 "punpcklbw %%mm7, %%mm2 \n\t"
2050 "punpcklbw %%mm7, %%mm3 \n\t"
2051 "pmaddwd %%mm6, %%mm4 \n\t"
2052 "pmaddwd %%mm6, %%mm1 \n\t"
2053 "pmaddwd %%mm6, %%mm2 \n\t"
2054 "pmaddwd %%mm6, %%mm3 \n\t"
2055 #ifndef FAST_BGR2YV12
2056 "psrad $8, %%mm4 \n\t"
2057 "psrad $8, %%mm1 \n\t"
2058 "psrad $8, %%mm2 \n\t"
2059 "psrad $8, %%mm3 \n\t"
2060 #endif
2061 "packssdw %%mm1, %%mm4 \n\t"
2062 "packssdw %%mm3, %%mm2 \n\t"
2063 "pmaddwd %%mm5, %%mm4 \n\t"
2064 "pmaddwd %%mm5, %%mm2 \n\t"
2065 "add $24, %%"REG_d" \n\t"
2066 "packssdw %%mm2, %%mm4 \n\t"
2067 "psraw $7, %%mm4 \n\t"
2068
2069 "packuswb %%mm4, %%mm0 \n\t"
2070 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2071
2072 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2073 "add $8, %%"REG_a" \n\t"
2074 " js 1b \n\t"
2075 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2076 : "%"REG_a, "%"REG_d
2077 );
2078 ydst += lumStride;
2079 src += srcStride;
2080 }
2081 src -= srcStride*2;
2082 __asm__ volatile(
2083 "mov %4, %%"REG_a" \n\t"
2084 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2085 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2086 "pxor %%mm7, %%mm7 \n\t"
2087 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2088 "add %%"REG_d", %%"REG_d" \n\t"
2089 ".p2align 4 \n\t"
2090 "1: \n\t"
2091 PREFETCH" 64(%0, %%"REG_d") \n\t"
2092 PREFETCH" 64(%1, %%"REG_d") \n\t"
2093 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2094 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2095 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2096 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2097 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2098 PAVGB" %%mm1, %%mm0 \n\t"
2099 PAVGB" %%mm3, %%mm2 \n\t"
2100 "movq %%mm0, %%mm1 \n\t"
2101 "movq %%mm2, %%mm3 \n\t"
2102 "psrlq $24, %%mm0 \n\t"
2103 "psrlq $24, %%mm2 \n\t"
2104 PAVGB" %%mm1, %%mm0 \n\t"
2105 PAVGB" %%mm3, %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm0 \n\t"
2107 "punpcklbw %%mm7, %%mm2 \n\t"
2108 #else
2109 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2110 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2111 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2112 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2113 "punpcklbw %%mm7, %%mm0 \n\t"
2114 "punpcklbw %%mm7, %%mm1 \n\t"
2115 "punpcklbw %%mm7, %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm3 \n\t"
2117 "paddw %%mm1, %%mm0 \n\t"
2118 "paddw %%mm3, %%mm2 \n\t"
2119 "paddw %%mm2, %%mm0 \n\t"
2120 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2121 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2122 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2123 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm4 \n\t"
2125 "punpcklbw %%mm7, %%mm1 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "punpcklbw %%mm7, %%mm3 \n\t"
2128 "paddw %%mm1, %%mm4 \n\t"
2129 "paddw %%mm3, %%mm2 \n\t"
2130 "paddw %%mm4, %%mm2 \n\t"
2131 "psrlw $2, %%mm0 \n\t"
2132 "psrlw $2, %%mm2 \n\t"
2133 #endif
2134 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2135 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2136
2137 "pmaddwd %%mm0, %%mm1 \n\t"
2138 "pmaddwd %%mm2, %%mm3 \n\t"
2139 "pmaddwd %%mm6, %%mm0 \n\t"
2140 "pmaddwd %%mm6, %%mm2 \n\t"
2141 #ifndef FAST_BGR2YV12
2142 "psrad $8, %%mm0 \n\t"
2143 "psrad $8, %%mm1 \n\t"
2144 "psrad $8, %%mm2 \n\t"
2145 "psrad $8, %%mm3 \n\t"
2146 #endif
2147 "packssdw %%mm2, %%mm0 \n\t"
2148 "packssdw %%mm3, %%mm1 \n\t"
2149 "pmaddwd %%mm5, %%mm0 \n\t"
2150 "pmaddwd %%mm5, %%mm1 \n\t"
2151 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2152 "psraw $7, %%mm0 \n\t"
2153
2154 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2155 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2156 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2157 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2158 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2159 PAVGB" %%mm1, %%mm4 \n\t"
2160 PAVGB" %%mm3, %%mm2 \n\t"
2161 "movq %%mm4, %%mm1 \n\t"
2162 "movq %%mm2, %%mm3 \n\t"
2163 "psrlq $24, %%mm4 \n\t"
2164 "psrlq $24, %%mm2 \n\t"
2165 PAVGB" %%mm1, %%mm4 \n\t"
2166 PAVGB" %%mm3, %%mm2 \n\t"
2167 "punpcklbw %%mm7, %%mm4 \n\t"
2168 "punpcklbw %%mm7, %%mm2 \n\t"
2169 #else
2170 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2171 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2172 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2173 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2174 "punpcklbw %%mm7, %%mm4 \n\t"
2175 "punpcklbw %%mm7, %%mm1 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 "punpcklbw %%mm7, %%mm3 \n\t"
2178 "paddw %%mm1, %%mm4 \n\t"
2179 "paddw %%mm3, %%mm2 \n\t"
2180 "paddw %%mm2, %%mm4 \n\t"
2181 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2182 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2183 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2184 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2185 "punpcklbw %%mm7, %%mm5 \n\t"
2186 "punpcklbw %%mm7, %%mm1 \n\t"
2187 "punpcklbw %%mm7, %%mm2 \n\t"
2188 "punpcklbw %%mm7, %%mm3 \n\t"
2189 "paddw %%mm1, %%mm5 \n\t"
2190 "paddw %%mm3, %%mm2 \n\t"
2191 "paddw %%mm5, %%mm2 \n\t"
2192 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2193 "psrlw $2, %%mm4 \n\t"
2194 "psrlw $2, %%mm2 \n\t"
2195 #endif
2196 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2197 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2198
2199 "pmaddwd %%mm4, %%mm1 \n\t"
2200 "pmaddwd %%mm2, %%mm3 \n\t"
2201 "pmaddwd %%mm6, %%mm4 \n\t"
2202 "pmaddwd %%mm6, %%mm2 \n\t"
2203 #ifndef FAST_BGR2YV12
2204 "psrad $8, %%mm4 \n\t"
2205 "psrad $8, %%mm1 \n\t"
2206 "psrad $8, %%mm2 \n\t"
2207 "psrad $8, %%mm3 \n\t"
2208 #endif
2209 "packssdw %%mm2, %%mm4 \n\t"
2210 "packssdw %%mm3, %%mm1 \n\t"
2211 "pmaddwd %%mm5, %%mm4 \n\t"
2212 "pmaddwd %%mm5, %%mm1 \n\t"
2213 "add $24, %%"REG_d" \n\t"
2214 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2215 "psraw $7, %%mm4 \n\t"
2216
2217 "movq %%mm0, %%mm1 \n\t"
2218 "punpckldq %%mm4, %%mm0 \n\t"
2219 "punpckhdq %%mm4, %%mm1 \n\t"
2220 "packsswb %%mm1, %%mm0 \n\t"
2221 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2222 "movd %%mm0, (%2, %%"REG_a") \n\t"
2223 "punpckhdq %%mm0, %%mm0 \n\t"
2224 "movd %%mm0, (%3, %%"REG_a") \n\t"
2225 "add $4, %%"REG_a" \n\t"
2226 " js 1b \n\t"
2227 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2228 : "%"REG_a, "%"REG_d
2229 );
2230
2231 udst += chromStride;
2232 vdst += chromStride;
2233 src += srcStride*2;
2234 }
2235
2236 __asm__ volatile(EMMS" \n\t"
2237 SFENCE" \n\t"
2238 :::"memory");
2239 #else
2240 y=0;
2241 #endif
2242 for (; y<height; y+=2) {
2243 long i;
2244 for (i=0; i<chromWidth; i++) {
2245 unsigned int b = src[6*i+0];
2246 unsigned int g = src[6*i+1];
2247 unsigned int r = src[6*i+2];
2248
2249 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2250 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2251 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2252
2253 udst[i] = U;
2254 vdst[i] = V;
2255 ydst[2*i] = Y;
2256
2257 b = src[6*i+3];
2258 g = src[6*i+4];
2259 r = src[6*i+5];
2260
2261 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2262 ydst[2*i+1] = Y;
2263 }
2264 ydst += lumStride;
2265 src += srcStride;
2266
2267 for (i=0; i<chromWidth; i++) {
2268 unsigned int b = src[6*i+0];
2269 unsigned int g = src[6*i+1];
2270 unsigned int r = src[6*i+2];
2271
2272 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2273
2274 ydst[2*i] = Y;
2275
2276 b = src[6*i+3];
2277 g = src[6*i+4];
2278 r = src[6*i+5];
2279
2280 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2281 ydst[2*i+1] = Y;
2282 }
2283 udst += chromStride;
2284 vdst += chromStride;
2285 ydst += lumStride;
2286 src += srcStride;
2287 }
2288 }
2289
/*
 * Interleave the bytes of two planes into one:
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w] for every row
 * (contract defined by the scalar loops below).
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if COMPILE_TEMPLATE_MMX
#if COMPILE_TEMPLATE_SSE2
        /* SSE2: 16 bytes from each source per iteration; punpckl/hbw
         * pairs them, movntdq stores 32 interleaved bytes non-temporally.
         * NOTE(review): the first iteration executes before the bound
         * check (add/cmp/jb at the loop bottom), so width is assumed to
         * be >= 16; movdqa and movntdq also require 16-byte alignment of
         * src1, src2 and dest -- confirm callers guarantee both. */
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX: 16 bytes from each source per iteration via four
         * punpckl/hbw unpacks; same do-while loop shape (and the same
         * width >= 16 assumption) as the SSE2 path above. */
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movq (%1, %%"REG_a"), %%mm0 \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%"REG_a"), %%mm4 \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* Scalar tail: the asm above has covered the first width & ~15
         * bytes of the row. */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        /* Pure C fallback: whole row. */
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if COMPILE_TEMPLATE_MMX
    /* Clear MMX state and order the non-temporal stores. */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
2368
/*
 * Double two planes horizontally: each of the h = height/2 output lines
 * duplicates every byte of source line y>>1 (d[2*x] = d[2*x+1] = s[x]),
 * so each source line is consumed for two consecutive output lines.
 * Contract defined by the scalar loops below.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if COMPILE_TEMPLATE_MMX
    /* Warm the cache with the second line of each source plane. */
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        /* 32 source bytes -> 64 output bytes per iteration; unpacking a
         * register with itself (punpcklbw mm0,mm0 etc.) duplicates each
         * byte, MOVNTQ streams the result out. */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        /* Scalar tail (or whole line without MMX): duplicate each byte. */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        /* Identical to the loop above, applied to the second plane. */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if COMPILE_TEMPLATE_MMX
    /* Clear MMX state and fence the non-temporal stores. */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
2470
/*
 * Interleave one luma plane (src1) and two chroma planes (src2 = U,
 * src3 = V) into a packed Y U Y V stream. Chroma lines advance once
 * every four luma lines (y>>2) and each chroma sample is reused for
 * four consecutive luma samples horizontally: per pair index x,
 * d[8x..8x+7] = yp[4x],up[x],yp[4x+1],vp[x],yp[4x+2],up[x],yp[4x+3],vp[x]
 * (contract defined by the scalar loop below). w = width/2, so x
 * counts groups of four luma samples.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        /* 32 luma + 8 U + 8 V source bytes -> 64 packed output bytes
         * per iteration. The U/V bytes are first duplicated
         * (punpcklbw with self), then paired U,V, then merged with
         * the luma bytes and streamed out with MOVNTQ. */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* Scalar tail (or whole line without MMX). */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if COMPILE_TEMPLATE_MMX
    /* Clear MMX state and fence the non-temporal stores. */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
2560
/*
 * Copy every second byte: dst[i] = src[2*i] for 0 <= i < count
 * (e.g. the luma bytes of a packed YUYV line).
 * The pointers are biased past the end and the index runs from
 * -count up to 0, so the asm loop condition is a simple "js".
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

#if COMPILE_TEMPLATE_MMX
    /* 16 output bytes per iteration: mm7 = 0x00ff word mask keeps the
     * low (even-position) byte of each word, packuswb merges them.
     * Biasing count by +15/-15 leaves at most 15 elements for the
     * scalar tail below. */
    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    /* Scalar tail (or everything without MMX). */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2599
/*
 * Deinterleave the even bytes of 4-byte groups:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] for 0 <= i < count
 * (e.g. U and V from a packed UYVY line). Pointers are biased so the
 * negative index counts up towards zero.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
#if COMPILE_TEMPLATE_MMX
    /* 8 output bytes per destination per iteration.
     * Pass 1 (pand 0x00ff + packuswb): keep the even source bytes,
     * leaving 4i+0 and 4i+2 bytes alternating in mm0/mm2.
     * Pass 2: psrlw $8 isolates the 4i+2 stream (-> mm0, stored to
     * dst1), pand isolates the 4i+0 stream (-> mm1, stored to dst0). */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar tail (or everything without MMX). */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2647
/*
 * Like extract_even2, but averaging two source lines first:
 * dst0[i] = avg(src0[4*i+0], src1[4*i+0]),
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2]).
 * NOTE(review): the PAVGB path rounds up ((a+b+1)>>1, per the
 * instruction's definition) while the scalar fallback truncates
 * ((a+b)>>1), so the two paths can differ by one LSB.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    /* Only built when a byte-average instruction exists (MMX2/3DNow).
     * Average the two lines with PAVGB, then split the even bytes into
     * the 4i+0 and 4i+2 streams exactly as in extract_even2. */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar tail (or everything without PAVGB); truncating average. */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2700
/*
 * Deinterleave the odd bytes of 4-byte groups:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for 0 <= i < count
 * (e.g. U and V from a packed YUYV line). The scalar tail gets the
 * +1 offset from the src++ below; the asm gets it via psrlw $8.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
#if COMPILE_TEMPLATE_MMX
    /* Pass 1 (psrlw $8 + packuswb): keep the odd source bytes, leaving
     * the 4i+1 and 4i+3 bytes alternating in mm0/mm2.
     * Pass 2: psrlw isolates the 4i+3 stream (-> dst1), pand the 4i+1
     * stream (-> dst0), as in extract_even2. */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;
    /* Scalar tail (or everything without MMX); src is now offset by 1,
     * so src[4*count+0]/[+2] are the original odd bytes. */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2749
/*
 * Like extract_odd2, but averaging two source lines first:
 * dst0[i] = avg(src0[4*i+1], src1[4*i+1]),
 * dst1[i] = avg(src0[4*i+3], src1[4*i+3]).
 * NOTE(review): as in extract_even2avg, the PAVGB path rounds up
 * while the scalar fallback truncates -- one LSB of possible
 * difference between the two paths.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    /* Only built when a byte-average instruction exists (MMX2/3DNow).
     * Average the two lines, then split the odd bytes into the 4i+1
     * and 4i+3 streams exactly as in extract_odd2. */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    /* Scalar tail; the pointers are now offset by 1, so the even
     * indices below address the original odd bytes. */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2804
2805 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2806 long width, long height,
2807 long lumStride, long chromStride, long srcStride)
2808 {
2809 long y;
2810 const long chromWidth= -((-width)>>1);
2811
2812 for (y=0; y<height; y++) {
2813 RENAME(extract_even)(src, ydst, width);
2814 if(y&1) {
2815 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2816 udst+= chromStride;
2817 vdst+= chromStride;
2818 }
2819
2820 src += srcStride;
2821 ydst+= lumStride;
2822 }
2823 #if COMPILE_TEMPLATE_MMX
2824 __asm__(
2825 EMMS" \n\t"
2826 SFENCE" \n\t"
2827 ::: "memory"
2828 );
2829 #endif
2830 }
2831
2832 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2833 long width, long height,
2834 long lumStride, long chromStride, long srcStride)
2835 {
2836 long y;
2837 const long chromWidth= -((-width)>>1);
2838
2839 for (y=0; y<height; y++) {
2840 RENAME(extract_even)(src, ydst, width);
2841 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2842
2843 src += srcStride;
2844 ydst+= lumStride;
2845 udst+= chromStride;
2846 vdst+= chromStride;
2847 }
2848 #if COMPILE_TEMPLATE_MMX
2849 __asm__(
2850 EMMS" \n\t"
2851 SFENCE" \n\t"
2852 ::: "memory"
2853 );
2854 #endif
2855 }
2856
2857 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2858 long width, long height,
2859 long lumStride, long chromStride, long srcStride)
2860 {
2861 long y;
2862 const long chromWidth= -((-width)>>1);
2863
2864 for (y=0; y<height; y++) {
2865 RENAME(extract_even)(src+1, ydst, width);
2866 if(y&1) {
2867 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2868 udst+= chromStride;
2869 vdst+= chromStride;
2870 }
2871
2872 src += srcStride;
2873 ydst+= lumStride;
2874 }
2875 #if COMPILE_TEMPLATE_MMX
2876 __asm__(
2877 EMMS" \n\t"
2878 SFENCE" \n\t"
2879 ::: "memory"
2880 );
2881 #endif
2882 }
2883
2884 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2885 long width, long height,
2886 long lumStride, long chromStride, long srcStride)
2887 {
2888 long y;
2889 const long chromWidth= -((-width)>>1);
2890
2891 for (y=0; y<height; y++) {
2892 RENAME(extract_even)(src+1, ydst, width);
2893 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2894
2895 src += srcStride;
2896 ydst+= lumStride;
2897 udst+= chromStride;
2898 vdst+= chromStride;
2899 }
2900 #if COMPILE_TEMPLATE_MMX
2901 __asm__(
2902 EMMS" \n\t"
2903 SFENCE" \n\t"
2904 ::: "memory"
2905 );
2906 #endif
2907 }
2908
2909 static inline void RENAME(rgb2rgb_init)(void)
2910 {
2911 rgb15to16 = RENAME(rgb15to16);
2912 rgb15tobgr24 = RENAME(rgb15tobgr24);
2913 rgb15to32 = RENAME(rgb15to32);
2914 rgb16tobgr24 = RENAME(rgb16tobgr24);
2915 rgb16to32 = RENAME(rgb16to32);
2916 rgb16to15 = RENAME(rgb16to15);
2917 rgb24tobgr16 = RENAME(rgb24tobgr16);
2918 rgb24tobgr15 = RENAME(rgb24tobgr15);
2919 rgb24tobgr32 = RENAME(rgb24tobgr32);
2920 rgb32to16 = RENAME(rgb32to16);
2921 rgb32to15 = RENAME(rgb32to15);
2922 rgb32tobgr24 = RENAME(rgb32tobgr24);
2923 rgb24to15 = RENAME(rgb24to15);
2924 rgb24to16 = RENAME(rgb24to16);
2925 rgb24tobgr24 = RENAME(rgb24tobgr24);
2926 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2927 rgb32tobgr16 = RENAME(rgb32tobgr16);
2928 rgb32tobgr15 = RENAME(rgb32tobgr15);
2929 yv12toyuy2 = RENAME(yv12toyuy2);
2930 yv12touyvy = RENAME(yv12touyvy);
2931 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2932 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2933 yuy2toyv12 = RENAME(yuy2toyv12);
2934 planar2x = RENAME(planar2x);
2935 rgb24toyv12 = RENAME(rgb24toyv12);
2936 interleaveBytes = RENAME(interleaveBytes);
2937 vu9_to_vu12 = RENAME(vu9_to_vu12);
2938 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2939
2940 uyvytoyuv420 = RENAME(uyvytoyuv420);
2941 uyvytoyuv422 = RENAME(uyvytoyuv422);
2942 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2943 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2944 }