rgb2rgb_template: add MMX/SSE2/AVX-optimized deinterleaveBytes
libswscale/x86/rgb2rgb_template.c
/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#include "libavutil/attributes.h"

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0 \n\t" \
    "pand "MANGLE(mask24l)", %%mm1 \n\t" \
    "pand "MANGLE(mask24l)", %%mm4 \n\t" \
    "pand "MANGLE(mask24l)", %%mm5 \n\t" \
    "pand "MANGLE(mask24h)", %%mm2 \n\t" \
    "pand "MANGLE(mask24h)", %%mm3 \n\t" \
    "pand "MANGLE(mask24h)", %%mm6 \n\t" \
    "pand "MANGLE(mask24h)", %%mm7 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2 \n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"


static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
  original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMXEXT, 3DNOW optimization by Nick Kurshev
  32-bit C version and the and&add trick by Michael Niedermayer
*/
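
/*
 * Editor's note on the and&add trick used in the C fallback below: adding a
 * value to itself shifts it left by one bit, so masking before the add makes
 * the shift apply only to the red/green fields while blue stays in place:
 *
 *     x                           = 0RRRRRGGGGGBBBBB   (RGB555)
 *     (x & 0x7FFF) + (x & 0x7FE0) = RRRRRGGGGG0BBBBB   (RGB565)
 *
 * The 32-bit variant applies the same masks to two pixels at once.
 */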
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
   I use a less accurate approximation here, simply left-shifting the input
   value and filling the low-order bits with zeroes. This method improves PNG
   compression, but the scheme cannot reproduce white exactly, since it never
   generates an all-ones maximum value; the net effect is to darken the
   image slightly.

   The better method is "left bit replication":

      4 3 2 1 0
      ---------
      1 1 0 1 1

      7 6 5 4 3  2 1 0
      ----------------
      1 1 0 1 1  1 1 0
      |=======|  |===|
          |      leftmost bits repeated to fill open bits
          |
          original bits
*/
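
/*
 * Editor's sketch: exact 5-bit -> 8-bit expansion via the left bit
 * replication described above, for comparison with the plain shift used by
 * the converters below. This helper is hypothetical (not part of the
 * original file); note that 0x1F maps to 0xFF, so full white is preserved.
 */
static av_unused uint8_t RENAME(expand5to8)(uint8_t v)
{
    return (v << 3) | (v >> 2); /* abcde -> abcdeabc */
}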
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
840
841 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
842 {
843 const uint16_t *end;
844 const uint16_t *mm_end;
845 uint8_t *d = (uint8_t *)dst;
846 const uint16_t *s = (const uint16_t *)src;
847 end = s + src_size/2;
848 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
849 mm_end = end - 7;
850 while (s < mm_end) {
851 __asm__ volatile(
852 PREFETCH" 32(%1) \n\t"
853 "movq (%1), %%mm0 \n\t"
854 "movq (%1), %%mm1 \n\t"
855 "movq (%1), %%mm2 \n\t"
856 "pand %2, %%mm0 \n\t"
857 "pand %3, %%mm1 \n\t"
858 "pand %4, %%mm2 \n\t"
859 "psllq $3, %%mm0 \n\t"
860 "psrlq $3, %%mm1 \n\t"
861 "psrlq $8, %%mm2 \n\t"
862 "movq %%mm0, %%mm3 \n\t"
863 "movq %%mm1, %%mm4 \n\t"
864 "movq %%mm2, %%mm5 \n\t"
865 "punpcklwd %5, %%mm0 \n\t"
866 "punpcklwd %5, %%mm1 \n\t"
867 "punpcklwd %5, %%mm2 \n\t"
868 "punpckhwd %5, %%mm3 \n\t"
869 "punpckhwd %5, %%mm4 \n\t"
870 "punpckhwd %5, %%mm5 \n\t"
871 "psllq $8, %%mm1 \n\t"
872 "psllq $16, %%mm2 \n\t"
873 "por %%mm1, %%mm0 \n\t"
874 "por %%mm2, %%mm0 \n\t"
875 "psllq $8, %%mm4 \n\t"
876 "psllq $16, %%mm5 \n\t"
877 "por %%mm4, %%mm3 \n\t"
878 "por %%mm5, %%mm3 \n\t"
879
880 "movq %%mm0, %%mm6 \n\t"
881 "movq %%mm3, %%mm7 \n\t"
882
883 "movq 8(%1), %%mm0 \n\t"
884 "movq 8(%1), %%mm1 \n\t"
885 "movq 8(%1), %%mm2 \n\t"
886 "pand %2, %%mm0 \n\t"
887 "pand %3, %%mm1 \n\t"
888 "pand %4, %%mm2 \n\t"
889 "psllq $3, %%mm0 \n\t"
890 "psrlq $3, %%mm1 \n\t"
891 "psrlq $8, %%mm2 \n\t"
892 "movq %%mm0, %%mm3 \n\t"
893 "movq %%mm1, %%mm4 \n\t"
894 "movq %%mm2, %%mm5 \n\t"
895 "punpcklwd %5, %%mm0 \n\t"
896 "punpcklwd %5, %%mm1 \n\t"
897 "punpcklwd %5, %%mm2 \n\t"
898 "punpckhwd %5, %%mm3 \n\t"
899 "punpckhwd %5, %%mm4 \n\t"
900 "punpckhwd %5, %%mm5 \n\t"
901 "psllq $8, %%mm1 \n\t"
902 "psllq $16, %%mm2 \n\t"
903 "por %%mm1, %%mm0 \n\t"
904 "por %%mm2, %%mm0 \n\t"
905 "psllq $8, %%mm4 \n\t"
906 "psllq $16, %%mm5 \n\t"
907 "por %%mm4, %%mm3 \n\t"
908 "por %%mm5, %%mm3 \n\t"
909 :"=m"(*d)
910 :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
911 :"memory");
912 /* borrowed 32 to 24 */
913 __asm__ volatile(
914 "movq %%mm0, %%mm4 \n\t"
915 "movq %%mm3, %%mm5 \n\t"
916 "movq %%mm6, %%mm0 \n\t"
917 "movq %%mm7, %%mm1 \n\t"
918
919 "movq %%mm4, %%mm6 \n\t"
920 "movq %%mm5, %%mm7 \n\t"
921 "movq %%mm0, %%mm2 \n\t"
922 "movq %%mm1, %%mm3 \n\t"
923
924 STORE_BGR24_MMX
925
926 :: "r"(d), "m"(*s)
927 :"memory");
928 d += 24;
929 s += 8;
930 }
931 __asm__ volatile(SFENCE:::"memory");
932 __asm__ volatile(EMMS:::"memory");
933 while (s < end) {
934 register uint16_t bgr;
935 bgr = *s++;
936 *d++ = (bgr&0x1F)<<3;
937 *d++ = (bgr&0x7E0)>>3;
938 *d++ = (bgr&0xF800)>>8;
939 }
940 }
941
/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t"
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}

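/*
 * Swap bytes 0 and 2 of every 32-bit pixel while leaving bytes 1 and 3 in
 * place, i.e. exchange the R and B channels of RGBA/BGRA data.
 */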
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
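/*
 * Upscale one plane to twice its width and height. Interior output samples
 * are weighted averages of the nearest input samples (3:1 weights in the C
 * fallback below); the border rows and columns are handled separately in C.
 */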
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1622
1623 /**
1624 * Height should be a multiple of 2 and width should be a multiple of 2.
1625 * (If this is a problem for anyone then tell me, and I will fix it.)
1626 * Chrominance data is only taken from every second line,
1627 * others are ignored in the C version.
1628 * FIXME: Write HQ version.
1629 */
1630 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1631 int width, int height,
1632 int lumStride, int chromStride, int srcStride)
1633 {
1634 int y;
1635 const x86_reg chromWidth= width>>1;
1636 for (y=0; y<height-2; y+=2) {
1637 int i;
1638 for (i=0; i<2; i++) {
1639 __asm__ volatile(
1640 "mov %2, %%"REG_a" \n\t"
1641 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1642 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
1644 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1645 ".p2align 4 \n\t"
1646 "1: \n\t"
1647 PREFETCH" 64(%0, %%"REG_d") \n\t"
1648 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1649 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1653 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660 #ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1665 #endif
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
1672
1673 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1674 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1678 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685 #ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1690 #endif
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "add $24, %%"REG_d" \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
1698
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1701
1702 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1703 "add $8, %%"REG_a" \n\t"
1704 " js 1b \n\t"
1705 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1706 : "%"REG_a, "%"REG_d
1707 );
1708 ydst += lumStride;
1709 src += srcStride;
1710 }
1711 src -= srcStride*2;
1712 __asm__ volatile(
1713 "mov %4, %%"REG_a" \n\t"
1714 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1715 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1716 "pxor %%mm7, %%mm7 \n\t"
1717 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1718 "add %%"REG_d", %%"REG_d" \n\t"
1719 ".p2align 4 \n\t"
1720 "1: \n\t"
1721 PREFETCH" 64(%0, %%"REG_d") \n\t"
1722 PREFETCH" 64(%1, %%"REG_d") \n\t"
1723 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1724 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1725 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1726 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1727 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1728 PAVGB" %%mm1, %%mm0 \n\t"
1729 PAVGB" %%mm3, %%mm2 \n\t"
1730 "movq %%mm0, %%mm1 \n\t"
1731 "movq %%mm2, %%mm3 \n\t"
1732 "psrlq $24, %%mm0 \n\t"
1733 "psrlq $24, %%mm2 \n\t"
1734 PAVGB" %%mm1, %%mm0 \n\t"
1735 PAVGB" %%mm3, %%mm2 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm2 \n\t"
1738 #else
1739 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1740 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1741 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1742 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1743 "punpcklbw %%mm7, %%mm0 \n\t"
1744 "punpcklbw %%mm7, %%mm1 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1746 "punpcklbw %%mm7, %%mm3 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm3, %%mm2 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1750 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1751 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1752 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1753 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1754 "punpcklbw %%mm7, %%mm4 \n\t"
1755 "punpcklbw %%mm7, %%mm1 \n\t"
1756 "punpcklbw %%mm7, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm3 \n\t"
1758 "paddw %%mm1, %%mm4 \n\t"
1759 "paddw %%mm3, %%mm2 \n\t"
1760 "paddw %%mm4, %%mm2 \n\t"
1761 "psrlw $2, %%mm0 \n\t"
1762 "psrlw $2, %%mm2 \n\t"
1763 #endif
1764 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1765 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1766
1767 "pmaddwd %%mm0, %%mm1 \n\t"
1768 "pmaddwd %%mm2, %%mm3 \n\t"
1769 "pmaddwd %%mm6, %%mm0 \n\t"
1770 "pmaddwd %%mm6, %%mm2 \n\t"
1771 #ifndef FAST_BGR2YV12
1772 "psrad $8, %%mm0 \n\t"
1773 "psrad $8, %%mm1 \n\t"
1774 "psrad $8, %%mm2 \n\t"
1775 "psrad $8, %%mm3 \n\t"
1776 #endif
1777 "packssdw %%mm2, %%mm0 \n\t"
1778 "packssdw %%mm3, %%mm1 \n\t"
1779 "pmaddwd %%mm5, %%mm0 \n\t"
1780 "pmaddwd %%mm5, %%mm1 \n\t"
1781 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1782 "psraw $7, %%mm0 \n\t"
1783
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"REG_d" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"REG_a") \n\t"
            "add $4, %%"REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

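    /* let the C version convert whatever lines (e.g. an odd last row)
     * the MMX loop above did not cover */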
    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
}
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
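/* Interleave the bytes of src1 and src2 into dest (dest[2*i] = src1[i],
 * dest[2*i+1] = src2[i]), e.g. to merge separate U and V planes into the
 * interleaved UV plane of NV12. Both SIMD loops below consume 16 input
 * bytes per iteration and use non-temporal stores (hence the closing
 * SFENCE); the scalar tail only handles the final width & 15 bytes, so
 * the loops assume width >= 16. Note that the SSE2 path uses aligned
 * movdqa loads and movntdq stores, so it also assumes 16-byte-aligned
 * buffers and strides. */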
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1901 : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movq (%1, %%"REG_a"), %%mm0 \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%"REG_a"), %%mm4 \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */

#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                         const uint8_t *src, const uint8_t *unused, int w,
                         uint32_t *unused2);
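/* De-interleave src into two planes (dst1[i] = src[2*i],
 * dst2[i] = src[2*i+1]), the inverse of interleaveBytes above: each row
 * is handed to the yasm-optimized NV12-to-UV routine, which performs
 * exactly this byte split. */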
static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
                                      int width, int height, int srcStride,
                                      int dst1Stride, int dst2Stride)
{
    int h;

    for (h = 0; h < height; h++) {
        RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
        src  += srcStride;
        dst1 += dst1Stride;
        dst2 += dst2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */

#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
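/* Upscale the two chroma planes of YVU9 (4x4-subsampled chroma) to the
 * 2x2 subsampling of YV12: each source byte is doubled horizontally by
 * unpacking it against itself, and each source line feeds two
 * destination lines (y>>1). */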
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

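/* Pack planar YVU9 into YUY2: each chroma line is reused for four luma
 * lines (y>>2) and, within a line, each U/V sample pair is combined with
 * four consecutive luma samples. The inner loop consumes 8 chroma
 * samples per iteration and writes 64 output bytes with non-temporal
 * stores. */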
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

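/* Copy every second byte (even indices) of src to dst, e.g. the luma
 * bytes of a YUYV line. The pointers are biased by count and the index
 * runs from -count up to 0 so the loop can branch on the sign flag
 * ("js 1b"); the MMX loop masks with 0x00ff words and packs, producing
 * 16 output bytes per iteration. */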
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
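/* Split bytes 0 and 2 of every 4-byte group into two planes: for UYVY
 * input this yields the U plane in dst0 and the V plane in dst1. Same
 * negative-index loop structure as extract_even. */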
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

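/* Like extract_even2, but averaging two input lines (src0/src1) first;
 * used to merge the chroma of two source rows when downsampling to
 * 4:2:0. Note that the SIMD path uses PAVGB, which rounds up
 * ((a+b+1)>>1), while the scalar tail truncates ((a+b)>>1), so the last
 * few pixels of a row may differ by one LSB. */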
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
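/* Split bytes 1 and 3 of every 4-byte group into two planes: for YUYV
 * input this yields U in dst0 and V in dst1. src is incremented after
 * the SIMD loop so the scalar tail reaches the odd bytes through
 * src[4*count+0] and src[4*count+2]. */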
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

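/* Combination of extract_odd2 and extract_even2avg: average two lines,
 * then split the odd bytes (YUYV chroma) into two planes. The same
 * PAVGB-rounds-up / scalar-truncates mismatch noted above applies. */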
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

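/* Packed YUYV -> planar YUV 4:2:0: luma is extracted on every line;
 * chroma is taken on odd lines only, averaged with the line above.
 * chromWidth = -((-width)>>1) is width/2 rounded up (arithmetic shift
 * rounds toward -infinity), so odd widths still cover the last pixel. */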
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
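/* Packed YUYV -> planar YUV 4:2:2: luma and chroma are extracted from
 * every line, so no vertical averaging is needed. */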
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

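/* Packed UYVY -> planar YUV 4:2:0: same scheme as yuyvtoyuv420, but
 * luma sits at the odd bytes (hence src+1) and chroma at the even
 * bytes. */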
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1) {
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
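/* Packed UYVY -> planar YUV 4:2:2, the UYVY counterpart of
 * yuyvtoyuv422. */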
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */

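/* Install the function pointers for this template's instruction set.
 * The guards mirror the ones around each implementation above: the
 * MMX/MMXEXT/3DNow! converters are skipped in the SSE2/AVX
 * instantiations, interleaveBytes has no AVX version, and
 * deinterleaveBytes is only set where the yasm NV12-to-UV routine is
 * available. */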
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
    uyvytoyuv422       = RENAME(uyvytoyuv422);
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
    rgb24toyv12 = RENAME(rgb24toyv12);

    yuyvtoyuv420 = RENAME(yuyvtoyuv420);
    uyvytoyuv420 = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
    interleaveBytes = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
    deinterleaveBytes = RENAME(deinterleaveBytes);
#endif
}