/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
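
/* Illustration only, not compiled: the loop above widens packed 24-bit
 * pixels to 32 bits two at a time. Each movd/punpckldq pair loads two
 * dwords that start 3 bytes apart, so each half of the mm register holds
 * one pixel's three components plus one garbage byte, and the por with
 * mask32a (assumed to be an all-ones alpha byte replicated per pixel)
 * overwrites that byte with an opaque 255, matching the scalar tail loop.
 */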

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
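
/* Illustration only, not compiled: STORE_BGR24_MMX packs eight 4-byte
 * pixels (two per register in mm0/mm1/mm4/mm5, with copies in
 * mm2/mm3/mm6/mm7) into 24 bytes of tightly packed 3-byte pixels.
 * Conceptually, each qword is first collapsed to 6 low bytes: the low
 * pixel is kept in place by mask24l while the copy is shifted right by 8
 * so the high pixel's three bytes land just above it (mask24h). The
 * remaining shifts and the mask24hh/mask24hhh/mask24hhhh pands then
 * splice the four 6-byte groups across three output qwords, written with
 * MOVNTQ.
 */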

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
  original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMXEXT, 3DNOW optimization by Nick Kurshev
  32-bit C version, and the and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
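
/* Illustration only, not compiled: the "and&add" trick used above. To turn
 * RGB555 (0RRRRRGGGGGBBBBB) into RGB565, the red and green fields must move
 * up by one bit while blue stays put. Adding a value to itself shifts it
 * left by one, so masking out everything that must not move and adding
 * handles both fields at once, two pixels per 32-bit word:
 *
 *     uint32_t rgb15to16_pair(uint32_t x)
 *     {
 *         return (x & 0x7FFF7FFF)   // both 15-bit pixels
 *              + (x & 0x7FE07FE0);  // red+green fields; added = shifted left 1
 *     }
 *
 * mask15s holds the same red+green mask replicated to 64 bits for the MMX
 * loop.
 */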

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
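
/* Illustration only, not compiled: the pmaddwd trick used by rgb32to16.
 * Per pixel, RGB565 packing is normally three shift+mask operations:
 *
 *     uint16_t pack565(uint32_t rgb)       // rgb = 00RRGGBB
 *     {
 *         return ((rgb & 0x0000F8) >> 3)   // blue  -> bits  4..0
 *              | ((rgb & 0x00FC00) >> 5)   // green -> bits 10..5
 *              | ((rgb & 0xF80000) >> 8);  // red   -> bits 15..11
 *     }
 *
 * The MMX version instead masks blue+red into one register (mask3216br)
 * and green into another (mask3216g), then a single pmaddwd with suitable
 * per-word multipliers (mul3216) shifts both fields of each word into
 * place at once, leaving just a final shift and por per four pixels.
 */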

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
   I use less accurate approximation here by simply left-shifting the input
   value and filling the low order bits with zeroes. This method improves
   PNG compression but this scheme cannot reproduce white exactly, since it
   does not generate an all-ones maximum value; the net effect is to darken
   the image slightly.

   The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
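/* Illustration only, not compiled: left bit replication for 5 -> 8 bits
 * would be
 *
 *     uint8_t expand5(uint8_t v)       // v in 0..31
 *     {
 *         return (v << 3) | (v >> 2);  // top bits reused as low bits
 *     }
 *
 * so 31 maps to 255 exactly, while the plain (v << 3) used below maps 31
 * to 248 and thus darkens the image slightly.
 */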
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}

static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
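
/* Illustration only, not compiled: "2103" is the output byte order relative
 * to the input, i.e. bytes 0 and 2 of each 32-bit pixel are swapped while
 * bytes 1 and 3 stay put (BGRA <-> RGBA). The scalar tail above does it
 * with masks; one pixel at a time it is simply:
 *
 *     uint32_t shuffle_2103(uint32_t v)
 *     {
 *         uint32_t keep = v & 0xFF00FF00;          // bytes 1 and 3 kept
 *         uint32_t swap = v & 0x00FF00FF;          // bytes 0 and 2 swapped
 *         return keep | (swap >> 16) | (swap << 16);
 *     }
 *
 * In the MMXEXT path, pshufw $177 (0b10110001) swaps the two 16-bit words
 * of each dword, which combined with the pand/por byte masking yields the
 * same swap.
 */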

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; // finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
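
/* Illustration only, not compiled: a scalar sketch of the YUY2 packing
 * the loop above performs. For every two luma samples, one U and one V
 * sample are interleaved:
 *
 *     for (int x = 0; x < chromWidth; x++) {
 *         dst[4*x+0] = ysrc[2*x];    // Y0
 *         dst[4*x+1] = usrc[x];      // U
 *         dst[4*x+2] = ysrc[2*x+1];  // Y1
 *         dst[4*x+3] = vsrc[x];      // V
 *     }
 *
 * The punpcklbw/punpckhbw pairs build exactly this byte order 8 pixels at
 * a time, and vertLumPerChroma controls how many luma lines share one
 * chroma line (2 for YV12, 1 for YUV422P).
 */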

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
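
/* Illustration only, not compiled: the scalar operation the MMX code above
 * performs for one pair of lines. Chroma is taken from the even lines only:
 *
 *     for (int x = 0; x < chromWidth; x++) {
 *         ydst[2*x]   = src[4*x+0];
 *         udst[x]     = src[4*x+1];
 *         ydst[2*x+1] = src[4*x+2];
 *         vdst[x]     = src[4*x+3];
 *     }
 *
 * The psrlw/pand pairs split each YUYV qword into its UV and Y halves and
 * packuswb re-packs them into separate planes.
 */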
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
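
/* Illustration only, not compiled: the chained pavgb above implements the
 * 3:1 weighting of the bilinear 2x upscale. With a = the nearer source
 * sample and b = the farther one,
 *
 *     pavgb(a, pavgb(a, b)) = (a + (a + b + 1)/2 + 1) / 2 ~= (3*a + b) / 4
 *
 * which matches the scalar fallback (3*src[x] + src[x+1]) >> 2, up to
 * pavgb's round-to-nearest bias.
 */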
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       int width, int height,
                                       int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height-2; y+=2) {
        int i;
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"REG_d") \n\t"
                "movd (%0, %%"REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"REG_d" \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
                "add $8, %%"REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_d") \n\t"
            PREFETCH" 64(%1, %%"REG_d") \n\t"
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq (%0, %%"REG_d"), %%mm0 \n\t"
            "movq (%1, %%"REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd (%0, %%"REG_d"), %%mm0 \n\t"
            "movd (%1, %%"REG_d"), %%mm1 \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"REG_d" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"REG_a") \n\t"
            "add $4, %%"REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
}
1870 #endif /* !COMPILE_TEMPLATE_SSE2 */
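
/* NOTE: the chroma pass above approximates a 2x2 box filter over the input
 * before applying the U/V matrix rows: the paddw/"psrlw $2" sequence is a
 * truncating four-sample average, while the PAVGB chain used on
 * MMXEXT/3DNow! is a cruder, rounding approximation of the same thing.
 * A plain-C sketch of the intended averaging (illustrative only; the name
 * avg2x2_rgb24 is not a real entry point):
 */
#if 0
static void avg2x2_rgb24(const uint8_t *row0, const uint8_t *row1,
                         int pairs, uint8_t *avg)
{
    int i, c;
    for (i = 0; i < pairs; i++)
        for (c = 0; c < 3; c++) /* B, G, R */
            avg[3*i + c] = (row0[6*i + c] + row0[6*i + 3 + c] +
                            row1[6*i + c] + row1[6*i + 3 + c]) >> 2;
}
#endif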

#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movq (%1, %%"REG_a"), %%mm0 \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%"REG_a"), %%mm4 \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
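
/* A plain-C sketch of what interleaveBytes computes per row (illustrative
 * only; interleave_bytes_ref is not a real entry point).  Note that the
 * SSE2 branch above uses movdqa/movntdq, which assume 16-byte alignment of
 * the source and destination rows; the trailing scalar loop only handles
 * the last width & 15 bytes. */
#if 0
static void interleave_bytes_ref(const uint8_t *src1, const uint8_t *src2,
                                 uint8_t *dest, int width)
{
    int w;
    for (w = 0; w < width; w++) {
        dest[2*w + 0] = src1[w]; /* even output bytes from the first plane */
        dest[2*w + 1] = src2[w]; /* odd output bytes from the second plane */
    }
}
#endif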

#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
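
/* vu9_to_vu12 upscales each chroma plane by 2x in both directions: the
 * punpcklbw/punpckhbw-against-itself pairs duplicate every byte
 * horizontally, and srcStride*(y>>1) reads each source row twice.  A
 * plain-C sketch for one plane (upsample2x2_ref is an illustrative name):
 */
#if 0
static void upsample2x2_ref(const uint8_t *src, uint8_t *dst,
                            int w, int h, int srcStride, int dstStride)
{
    int x, y;
    for (y = 0; y < h; y++) {                /* h: output height */
        const uint8_t *s = src + srcStride*(y >> 1);
        uint8_t *d = dst + dstStride*y;
        for (x = 0; x < w; x++)              /* w: input width */
            d[2*x] = d[2*x + 1] = s[x];
    }
}
#endif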

static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
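
/* yvu9_to_yuy2 packs planar 4:1:0 input (one U and one V sample per 4x4
 * luma block, hence the srcStride*(y>>2) row indexing) into interleaved
 * YUY2, so every pair of output pixels is Y0 U Y1 V with the same chroma
 * sample reused across four lines and four columns.  The scalar tail loop
 * above is the per-pixel definition of the packing. */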
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
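
/* All of the extract_* helpers use the same loop idiom: advance the
 * pointers past the end of the data, negate the element count, and let the
 * index run from -count up to zero so that a single add/js pair both steps
 * and terminates the SIMD loop.  Functionally, extract_even is simply
 * (sketch only; extract_even_ref is an illustrative name):
 */
#if 0
static void extract_even_ref(const uint8_t *src, uint8_t *dst, int count)
{
    int i;
    for (i = 0; i < count; i++)
        dst[i] = src[2*i]; /* keep the bytes at even offsets */
}
#endif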

#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
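
/* extract_even2 splits bytes 0 and 2 of every 4-byte group into two
 * separate planes; for UYVY input these are exactly the U and V streams.
 * Scalar equivalent (sketch; extract_even2_ref is an illustrative name):
 */
#if 0
static void extract_even2_ref(const uint8_t *src,
                              uint8_t *dst0, uint8_t *dst1, int count)
{
    int i;
    for (i = 0; i < count; i++) {
        dst0[i] = src[4*i + 0]; /* e.g. U from UYVY */
        dst1[i] = src[4*i + 2]; /* e.g. V from UYVY */
    }
}
#endif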

static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
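
/* extract_even2avg is extract_even2 applied to two adjacent lines whose
 * samples are averaged first: PAVGB (which rounds) in the SIMD path, a
 * truncating (a+b)>>1 in the scalar tail.  Sketch of the tail's behaviour
 * (extract_even2avg_ref is an illustrative name):
 */
#if 0
static void extract_even2avg_ref(const uint8_t *src0, const uint8_t *src1,
                                 uint8_t *dst0, uint8_t *dst1, int count)
{
    int i;
    for (i = 0; i < count; i++) {
        dst0[i] = (src0[4*i + 0] + src1[4*i + 0]) >> 1;
        dst1[i] = (src0[4*i + 2] + src1[4*i + 2]) >> 1;
    }
}
#endif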

#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
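
/* extract_odd2 mirrors extract_even2 for bytes 1 and 3 of each group (the
 * chroma positions in YUYV): the SIMD loop reaches the odd bytes with
 * "psrlw $8" instead of a mask, and the scalar tail simply advances src by
 * one byte before reusing the even-offset indexing. */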

static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
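
/* extract_odd2avg combines both variations: average two lines (rounding
 * PAVGB in SIMD, truncating shift in the tail), then take the odd-offset
 * byte pair of each 4-byte group. */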

static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
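
/* yuyvtoyuv420 above and the three wrappers below differ only in which byte
 * of each pair carries luma -- extract_even(src, ...) for YUYV input,
 * extract_even(src+1, ...) for UYVY -- and in whether chroma is taken from
 * every line (4:2:2 output) or averaged over line pairs on the odd rows
 * (4:2:0 output).  chromWidth is computed as -((-width)>>1), i.e.
 * ceil(width/2), so an odd width still gets its final chroma sample. */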

#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1) {
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */

static inline void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
    uyvytoyuv422       = RENAME(uyvytoyuv422);
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x           = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
    rgb24toyv12        = RENAME(rgb24toyv12);

    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
    uyvytoyuv420       = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW
    interleaveBytes    = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
}
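
/* rgb2rgb_init() repoints the global conversion function pointers at this
 * template's RENAME()d implementations.  The template is compiled once per
 * instruction set (MMX, MMXEXT, 3DNow!, SSE2) with different
 * COMPILE_TEMPLATE_* settings, and the including file decides at runtime,
 * based on the detected CPU flags, which of the rgb2rgb_init variants to
 * call, so the fastest available implementation wins. */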