/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#include "libavutil/attributes.h"

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
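
/*
 * Note (a summary of the defines above, not upstream documentation): with
 * MMXEXT, MOVNTQ is a non-temporal store that bypasses the cache, so every
 * loop using it is followed by SFENCE to make the streamed stores globally
 * visible; on plain MMX both degrade to an ordinary movq and a nop.
 */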

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"


static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 * original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMXEXT, 3DNOW optimization by Nick Kurshev
 * 32-bit C version and the and&add trick by Michael Niedermayer
 */
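
/*
 * The and&add trick, sketched from the C fallback below: RGB15 is
 * 0rrrrrgggggbbbbb and RGB16 is rrrrrggggggbbbbb, so adding x & 0x7FE0 to x
 * doubles, i.e. left-shifts by one, the red and green fields in place while
 * blue stays put; the freed green LSB is left zero rather than replicated.
 */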
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use a less accurate approximation here, by simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves PNG
  compression but cannot reproduce white exactly, since it does not generate
  an all-ones maximum value; the net effect is to darken the image slightly.

  The better method would be "left bit replication":

    4 3 2 1 0
    ---------
    1 1 0 1 1

    7 6 5 4 3  2 1 0
    ----------------
    1 1 0 1 1  1 1 0
    |=======|  |===|
        |      leftmost bits repeated to fill open bits
        |
    original bits
*/
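
/*
 * A minimal C sketch of left bit replication for one 5-bit channel
 * (illustrative only -- a hypothetical helper, not used by this file):
 */
#if 0
static inline uint8_t replicate5to8(uint8_t v5)
{
    /* 000RRRRR -> RRRRRrrr, with the top 3 input bits repeated at the bottom */
    return (uint8_t)((v5 << 3) | (v5 >> 2));
}
#endif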
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}

static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
#if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
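    /* Scalar tail: within each 32-bit pixel, swap bytes 0 and 2 (B and R)
     * while keeping bytes 1 and 3 (G and A) in place. */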
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; // finished: src_size was a multiple of 8 pixels (24 bytes)

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
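
/*
 * For reference, a scalar sketch of what yuvPlanartoyuy2 emits per output
 * line (a hypothetical helper, not part of the build): two luma samples are
 * interleaved with one U and one V sample as Y0 U0 Y1 V0.
 */
#if 0
static void yuy2_pack_line_c(const uint8_t *y, const uint8_t *u,
                             const uint8_t *v, uint8_t *dst, int width)
{
    int i;
    for (i = 0; i < width / 2; i++) {
        dst[4 * i + 0] = y[2 * i + 0];
        dst[4 * i + 1] = u[i];
        dst[4 * i + 2] = y[2 * i + 1];
        dst[4 * i + 3] = v[i];
    }
}
#endif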

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UV(0)
            "movq %%mm2, %%mm6 \n\t" // UV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
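/**
 * Upscale a plane to twice the width and twice the height, blending the two
 * nearest source samples with (3,1)/4 weights in each direction (as in the
 * scalar border code below; the MMX loop approximates this with iterated
 * PAVGB, so rounding may differ slightly).
 */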
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line; the other lines
 * are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line; the other lines
 * are ignored in the C version.
 * FIXME: Write HQ version.
 */
1630 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1631 int width, int height,
1632 int lumStride, int chromStride, int srcStride)
1633 {
1634 int y;
1635 const x86_reg chromWidth= width>>1;
1636 for (y=0; y<height-2; y+=2) {
1637 int i;
1638 for (i=0; i<2; i++) {
1639 __asm__ volatile(
1640 "mov %2, %%"REG_a" \n\t"
1641 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1642 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
1644 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1645 ".p2align 4 \n\t"
1646 "1: \n\t"
1647 PREFETCH" 64(%0, %%"REG_d") \n\t"
1648 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1649 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1653 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660 #ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1665 #endif
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
1672
1673 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1674 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1678 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685 #ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1690 #endif
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "add $24, %%"REG_d" \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
1698
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1701
1702 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1703 "add $8, %%"REG_a" \n\t"
1704 " js 1b \n\t"
1705 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1706 : "%"REG_a, "%"REG_d
1707 );
1708 ydst += lumStride;
1709 src += srcStride;
1710 }
1711 src -= srcStride*2;
1712 __asm__ volatile(
1713 "mov %4, %%"REG_a" \n\t"
1714 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1715 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1716 "pxor %%mm7, %%mm7 \n\t"
1717 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1718 "add %%"REG_d", %%"REG_d" \n\t"
1719 ".p2align 4 \n\t"
1720 "1: \n\t"
1721 PREFETCH" 64(%0, %%"REG_d") \n\t"
1722 PREFETCH" 64(%1, %%"REG_d") \n\t"
1723 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1724 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1725 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1726 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1727 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1728 PAVGB" %%mm1, %%mm0 \n\t"
1729 PAVGB" %%mm3, %%mm2 \n\t"
1730 "movq %%mm0, %%mm1 \n\t"
1731 "movq %%mm2, %%mm3 \n\t"
1732 "psrlq $24, %%mm0 \n\t"
1733 "psrlq $24, %%mm2 \n\t"
1734 PAVGB" %%mm1, %%mm0 \n\t"
1735 PAVGB" %%mm3, %%mm2 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm2 \n\t"
1738 #else
1739 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1740 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1741 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1742 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1743 "punpcklbw %%mm7, %%mm0 \n\t"
1744 "punpcklbw %%mm7, %%mm1 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1746 "punpcklbw %%mm7, %%mm3 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm3, %%mm2 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1750 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1751 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1752 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1753 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1754 "punpcklbw %%mm7, %%mm4 \n\t"
1755 "punpcklbw %%mm7, %%mm1 \n\t"
1756 "punpcklbw %%mm7, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm3 \n\t"
1758 "paddw %%mm1, %%mm4 \n\t"
1759 "paddw %%mm3, %%mm2 \n\t"
1760 "paddw %%mm4, %%mm2 \n\t"
1761 "psrlw $2, %%mm0 \n\t"
1762 "psrlw $2, %%mm2 \n\t"
1763 #endif
1764 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1765 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1766
1767 "pmaddwd %%mm0, %%mm1 \n\t"
1768 "pmaddwd %%mm2, %%mm3 \n\t"
1769 "pmaddwd %%mm6, %%mm0 \n\t"
1770 "pmaddwd %%mm6, %%mm2 \n\t"
1771 #ifndef FAST_BGR2YV12
1772 "psrad $8, %%mm0 \n\t"
1773 "psrad $8, %%mm1 \n\t"
1774 "psrad $8, %%mm2 \n\t"
1775 "psrad $8, %%mm3 \n\t"
1776 #endif
1777 "packssdw %%mm2, %%mm0 \n\t"
1778 "packssdw %%mm3, %%mm1 \n\t"
1779 "pmaddwd %%mm5, %%mm0 \n\t"
1780 "pmaddwd %%mm5, %%mm1 \n\t"
1781 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1782 "psraw $7, %%mm0 \n\t"
1783
1784 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1785 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1786 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1787 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1788 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1789 PAVGB" %%mm1, %%mm4 \n\t"
1790 PAVGB" %%mm3, %%mm2 \n\t"
1791 "movq %%mm4, %%mm1 \n\t"
1792 "movq %%mm2, %%mm3 \n\t"
1793 "psrlq $24, %%mm4 \n\t"
1794 "psrlq $24, %%mm2 \n\t"
1795 PAVGB" %%mm1, %%mm4 \n\t"
1796 PAVGB" %%mm3, %%mm2 \n\t"
1797 "punpcklbw %%mm7, %%mm4 \n\t"
1798 "punpcklbw %%mm7, %%mm2 \n\t"
1799 #else
1800 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1801 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1802 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1803 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1804 "punpcklbw %%mm7, %%mm4 \n\t"
1805 "punpcklbw %%mm7, %%mm1 \n\t"
1806 "punpcklbw %%mm7, %%mm2 \n\t"
1807 "punpcklbw %%mm7, %%mm3 \n\t"
1808 "paddw %%mm1, %%mm4 \n\t"
1809 "paddw %%mm3, %%mm2 \n\t"
1810 "paddw %%mm2, %%mm4 \n\t"
1811 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1812 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1813 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1814 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1815 "punpcklbw %%mm7, %%mm5 \n\t"
1816 "punpcklbw %%mm7, %%mm1 \n\t"
1817 "punpcklbw %%mm7, %%mm2 \n\t"
1818 "punpcklbw %%mm7, %%mm3 \n\t"
1819 "paddw %%mm1, %%mm5 \n\t"
1820 "paddw %%mm3, %%mm2 \n\t"
1821 "paddw %%mm5, %%mm2 \n\t"
1822 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1823 "psrlw $2, %%mm4 \n\t"
1824 "psrlw $2, %%mm2 \n\t"
1825 #endif
1826 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1827 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1828
1829 "pmaddwd %%mm4, %%mm1 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "pmaddwd %%mm6, %%mm4 \n\t"
1832 "pmaddwd %%mm6, %%mm2 \n\t"
1833 #ifndef FAST_BGR2YV12
1834 "psrad $8, %%mm4 \n\t"
1835 "psrad $8, %%mm1 \n\t"
1836 "psrad $8, %%mm2 \n\t"
1837 "psrad $8, %%mm3 \n\t"
1838 #endif
1839 "packssdw %%mm2, %%mm4 \n\t"
1840 "packssdw %%mm3, %%mm1 \n\t"
1841 "pmaddwd %%mm5, %%mm4 \n\t"
1842 "pmaddwd %%mm5, %%mm1 \n\t"
1843 "add $24, %%"REG_d" \n\t"
1844 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1845 "psraw $7, %%mm4 \n\t"
1846
1847 "movq %%mm0, %%mm1 \n\t"
1848 "punpckldq %%mm4, %%mm0 \n\t"
1849 "punpckhdq %%mm4, %%mm1 \n\t"
1850 "packsswb %%mm1, %%mm0 \n\t"
1851 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1852 "movd %%mm0, (%2, %%"REG_a") \n\t"
1853 "punpckhdq %%mm0, %%mm0 \n\t"
1854 "movd %%mm0, (%3, %%"REG_a") \n\t"
1855 "add $4, %%"REG_a" \n\t"
1856 " js 1b \n\t"
1857 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1858 : "%"REG_a, "%"REG_d
1859 );
1860
1861 udst += chromStride;
1862 vdst += chromStride;
1863 src += srcStride*2;
1864 }
1865
1866 __asm__ volatile(EMMS" \n\t"
1867 SFENCE" \n\t"
1868 :::"memory");
1869
1870 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1871 }
1872 #endif /* !COMPILE_TEMPLATE_SSE2 */
1873
1874 #if !COMPILE_TEMPLATE_AMD3DNOW
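/* Interleave two byte planes: dest[2*i + 0] = src1[i], dest[2*i + 1] = src2[i],
 * one row at a time (e.g. merging separate U and V planes into the interleaved
 * chroma plane of NV12). The SSE2 path uses movdqa/movntdq, so it effectively
 * assumes src1, src2 and dest (and their strides) keep 16-byte alignment; the
 * scalar loop after the asm mops up the trailing width % 16 bytes of each row. */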
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%1, %%"REG_a")         \n\t"
            PREFETCH" 64(%2, %%"REG_a")         \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1     \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw %%xmm2, %%xmm0           \n\t"
            "punpckhbw %%xmm2, %%xmm1           \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"                 \n\t"
            "cmp %3, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%1, %%"REG_a")         \n\t"
            PREFETCH" 64(%2, %%"REG_a")         \n\t"
            "movq (%1, %%"REG_a"), %%mm0        \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2       \n\t"
            "movq %%mm0, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "movq (%2, %%"REG_a"), %%mm4        \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5       \n\t"
            "punpcklbw %%mm4, %%mm0             \n\t"
            "punpckhbw %%mm4, %%mm1             \n\t"
            "punpcklbw %%mm5, %%mm2             \n\t"
            "punpckhbw %%mm5, %%mm3             \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)  \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"                 \n\t"
            "cmp %3, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
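/* Upscale two chroma planes by 2x in both directions (YVU9-style chroma to
 * YV12-style chroma): punpcklbw/punpckhbw of a register with itself doubles
 * each byte horizontally, and indexing the source line with y >> 1 reuses it
 * for two output lines. A rough scalar sketch of what one plane computes:
 *
 *     for (y = 0; y < h; y++)
 *         for (x = 0; x < w; x++)
 *             d[2*x] = d[2*x + 1] = s1[x];  // s1 = source line y >> 1
 */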
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2)         \n\t"
                "movq (%1,%2), %%mm0        \n\t"
                "movq 8(%1,%2), %%mm2       \n\t"
                "movq 16(%1,%2), %%mm4      \n\t"
                "movq 24(%1,%2), %%mm6      \n\t"
                "movq %%mm0, %%mm1          \n\t"
                "movq %%mm2, %%mm3          \n\t"
                "movq %%mm4, %%mm5          \n\t"
                "movq %%mm6, %%mm7          \n\t"
                "punpcklbw %%mm0, %%mm0     \n\t"
                "punpckhbw %%mm1, %%mm1     \n\t"
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                MOVNTQ" %%mm0, (%0,%2,2)    \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2)   \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2)         \n\t"
                "movq (%1,%2), %%mm0        \n\t"
                "movq 8(%1,%2), %%mm2       \n\t"
                "movq 16(%1,%2), %%mm4      \n\t"
                "movq 24(%1,%2), %%mm6      \n\t"
                "movq %%mm0, %%mm1          \n\t"
                "movq %%mm2, %%mm3          \n\t"
                "movq %%mm4, %%mm5          \n\t"
                "movq %%mm6, %%mm7          \n\t"
                "punpcklbw %%mm0, %%mm0     \n\t"
                "punpckhbw %%mm1, %%mm1     \n\t"
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                MOVNTQ" %%mm0, (%0,%2,2)    \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2)   \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}

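/* Pack planar YVU9 (4:1:0) input into interleaved YUY2. One chroma sample
 * spans four luma samples horizontally and four source lines vertically
 * (hence the y >> 2 chroma line index), so each U/V byte is replicated
 * across the corresponding Y U Y V output pairs, as the inline comments
 * in the asm below trace register by register. */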
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq (%1, %0, 4), %%mm0        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1           \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2           \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3              \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4              \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5              \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1         \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2         \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4         \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5         \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6              \n\t"
                "punpcklbw %%mm2, %%mm1         \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0         \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3         \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8)      \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)     \n\t"

                "punpckhbw %%mm2, %%mm6         \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0       \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)    \n\t"

                "movq %%mm4, %%mm6              \n\t"
                "movq 16(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm5, %%mm4         \n\t"
                "punpcklbw %%mm4, %%mm0         \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3         \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)    \n\t"

                "punpckhbw %%mm5, %%mm6         \n\t"
                "movq 24(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)    \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

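/* dst[i] = src[2*i]: keep every byte at an even offset. The index runs from
 * -count up to 0 so a single "add / js 1b" pair both advances and tests it;
 * mm7 holds a 0x00ff word mask that isolates the even bytes before
 * packuswb compresses them down to 8 bytes per pair of loads. */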
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7       \n\t"
            "psrlw $8, %%mm7            \n\t"
            "1:                         \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq  -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0          \n\t"
            "pand %%mm7, %%mm1          \n\t"
            "pand %%mm7, %%mm2          \n\t"
            "pand %%mm7, %%mm3          \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            MOVNTQ" %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0)   \n\t"
            "add $16, %0                \n\t"
            " js 1b                     \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
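/* Split off bytes 0 and 2 of every 4-byte group:
 * dst0[i] = src[4*i + 0], dst1[i] = src[4*i + 2]
 * (for UYVY input these are the U and V samples). */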
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7       \n\t"
            "psrlw $8, %%mm7            \n\t"
            "1:                         \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq  -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0          \n\t"
            "pand %%mm7, %%mm1          \n\t"
            "pand %%mm7, %%mm2          \n\t"
            "pand %%mm7, %%mm3          \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            "movq %%mm0, %%mm1          \n\t"
            "movq %%mm2, %%mm3          \n\t"
            "psrlw $8, %%mm0            \n\t"
            "psrlw $8, %%mm2            \n\t"
            "pand %%mm7, %%mm1          \n\t"
            "pand %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0      \n\t"
            "packuswb %%mm3, %%mm1      \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0)   \n\t"
            "add $8, %0                 \n\t"
            " js 1b                     \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

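/* As extract_even2, but each output byte is the average of the corresponding
 * bytes of two source lines (src0, src1), used to vertically downsample
 * chroma for 4:2:0 output. Note that PAVGB rounds up ((a + b + 1) >> 1)
 * while the scalar fallback truncates, so results may differ by one LSB. */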
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7        \n\t"
            "psrlw $8, %%mm7             \n\t"
            "1:                          \n\t"
            "movq -28(%1, %0, 4), %%mm0  \n\t"
            "movq -20(%1, %0, 4), %%mm1  \n\t"
            "movq -12(%1, %0, 4), %%mm2  \n\t"
            "movq  -4(%1, %0, 4), %%mm3  \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0           \n\t"
            "pand %%mm7, %%mm1           \n\t"
            "pand %%mm7, %%mm2           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "packuswb %%mm1, %%mm0       \n\t"
            "packuswb %%mm3, %%mm2       \n\t"
            "movq %%mm0, %%mm1           \n\t"
            "movq %%mm2, %%mm3           \n\t"
            "psrlw $8, %%mm0             \n\t"
            "psrlw $8, %%mm2             \n\t"
            "pand %%mm7, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "packuswb %%mm2, %%mm0       \n\t"
            "packuswb %%mm3, %%mm1       \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0)    \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0)    \n\t"
            "add $8, %0                  \n\t"
            " js 1b                      \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
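/* Split off bytes 1 and 3 of every 4-byte group:
 * dst0[i] = src[4*i + 1], dst1[i] = src[4*i + 3]
 * (for YUYV input these are the U and V samples). The initial psrlw $8
 * selects the odd bytes; the scalar tail gets the same effect via src++. */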
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7       \n\t"
            "psrlw $8, %%mm7            \n\t"
            "1:                         \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq  -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0            \n\t"
            "psrlw $8, %%mm1            \n\t"
            "psrlw $8, %%mm2            \n\t"
            "psrlw $8, %%mm3            \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            "movq %%mm0, %%mm1          \n\t"
            "movq %%mm2, %%mm3          \n\t"
            "psrlw $8, %%mm0            \n\t"
            "psrlw $8, %%mm2            \n\t"
            "pand %%mm7, %%mm1          \n\t"
            "pand %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0      \n\t"
            "packuswb %%mm3, %%mm1      \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0)   \n\t"
            "add $8, %0                 \n\t"
            " js 1b                     \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

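/* As extract_odd2, but averaging two source lines, again for vertical
 * chroma downsampling; the same PAVGB-rounds-up / scalar-truncates caveat
 * as extract_even2avg applies. */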
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7        \n\t"
            "psrlw $8, %%mm7             \n\t"
            "1:                          \n\t"
            "movq -28(%1, %0, 4), %%mm0  \n\t"
            "movq -20(%1, %0, 4), %%mm1  \n\t"
            "movq -12(%1, %0, 4), %%mm2  \n\t"
            "movq  -4(%1, %0, 4), %%mm3  \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0             \n\t"
            "psrlw $8, %%mm1             \n\t"
            "psrlw $8, %%mm2             \n\t"
            "psrlw $8, %%mm3             \n\t"
            "packuswb %%mm1, %%mm0       \n\t"
            "packuswb %%mm3, %%mm2       \n\t"
            "movq %%mm0, %%mm1           \n\t"
            "movq %%mm2, %%mm3           \n\t"
            "psrlw $8, %%mm0             \n\t"
            "psrlw $8, %%mm2             \n\t"
            "pand %%mm7, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "packuswb %%mm2, %%mm0       \n\t"
            "packuswb %%mm3, %%mm1       \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0)    \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0)    \n\t"
            "add $8, %0                  \n\t"
            " js 1b                      \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

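/* YUYV (YUY2) -> planar YUV 4:2:0: luma is extracted from every input line,
 * while chroma is emitted only on odd lines, averaged with the line above
 * via extract_odd2avg. chromWidth = -((-width) >> 1) is ceil(width / 2). */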
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
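/* YUYV (YUY2) -> planar YUV 4:2:2: same luma extraction, but chroma is kept
 * on every line via extract_odd2, so no vertical averaging is needed. */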
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

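/* UYVY -> planar YUV 4:2:0: in UYVY the luma sits at odd byte offsets
 * (hence extract_even on src + 1) and chroma at even offsets, averaged
 * across line pairs by extract_even2avg. */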
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1) {
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
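/* UYVY -> planar YUV 4:2:2: luma from the odd offsets (src + 1), chroma
 * from the even offsets on every line via extract_even2. */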
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */

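/* Install the function pointers for this template instantiation. This file
 * is compiled once per SIMD flavor (MMX, MMXEXT, 3DNow!, SSE2) with a
 * different RENAME() suffix; the dispatcher in rgb2rgb.c decides at runtime,
 * from the detected CPU flags, which flavor's init to call, so only the
 * functions each flavor actually provides are registered below. */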
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
    uyvytoyuv422       = RENAME(uyvytoyuv422);
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
    rgb24toyv12 = RENAME(rgb24toyv12);

    yuyvtoyuv420 = RENAME(yuyvtoyuv420);
    uyvytoyuv420 = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW
    interleaveBytes = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
}