rgb2rgb: remove duplicate mmx/mmx2/3dnow/sse2 functions.
[libav.git] / libswscale / x86 / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27 #include <stddef.h>
28
29 #undef PREFETCH
30 #undef MOVNTQ
31 #undef EMMS
32 #undef SFENCE
33 #undef PAVGB
34
35 #if COMPILE_TEMPLATE_AMD3DNOW
36 #define PREFETCH "prefetch"
37 #define PAVGB "pavgusb"
38 #elif COMPILE_TEMPLATE_MMX2
39 #define PREFETCH "prefetchnta"
40 #define PAVGB "pavgb"
41 #else
42 #define PREFETCH " # nop"
43 #endif
44
45 #if COMPILE_TEMPLATE_AMD3DNOW
46 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
47 #define EMMS "femms"
48 #else
49 #define EMMS "emms"
50 #endif
51
52 #if COMPILE_TEMPLATE_MMX2
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
55 #else
56 #define MOVNTQ "movq"
57 #define SFENCE " # nop"
58 #endif
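
/*
 * These macros are spliced into the inline assembly below through C string
 * pasting. For example, with COMPILE_TEMPLATE_MMX2 set,
 *     PAVGB" %%mm1, %%mm0 \n\t"
 * becomes the instruction string "pavgb %%mm1, %%mm0 \n\t", while a 3DNow!
 * build gets "pavgusb" instead. MOVNTQ likewise selects between the
 * non-temporal movntq store (which is why the loops end with SFENCE) and a
 * plain movq on CPUs without it.
 */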
59
60 #if !COMPILE_TEMPLATE_SSE2
61
62 #if !COMPILE_TEMPLATE_AMD3DNOW
63
64 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
65 {
66 uint8_t *dest = dst;
67 const uint8_t *s = src;
68 const uint8_t *end;
69 const uint8_t *mm_end;
70 end = s + src_size;
71 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
72 mm_end = end - 23;
73 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
74 while (s < mm_end) {
75 __asm__ volatile(
76 PREFETCH" 32%1 \n\t"
77 "movd %1, %%mm0 \n\t"
78 "punpckldq 3%1, %%mm0 \n\t"
79 "movd 6%1, %%mm1 \n\t"
80 "punpckldq 9%1, %%mm1 \n\t"
81 "movd 12%1, %%mm2 \n\t"
82 "punpckldq 15%1, %%mm2 \n\t"
83 "movd 18%1, %%mm3 \n\t"
84 "punpckldq 21%1, %%mm3 \n\t"
85 "por %%mm7, %%mm0 \n\t"
86 "por %%mm7, %%mm1 \n\t"
87 "por %%mm7, %%mm2 \n\t"
88 "por %%mm7, %%mm3 \n\t"
89 MOVNTQ" %%mm0, %0 \n\t"
90 MOVNTQ" %%mm1, 8%0 \n\t"
91 MOVNTQ" %%mm2, 16%0 \n\t"
92 MOVNTQ" %%mm3, 24%0"
93 :"=m"(*dest)
94 :"m"(*s)
95 :"memory");
96 dest += 32;
97 s += 24;
98 }
99 __asm__ volatile(SFENCE:::"memory");
100 __asm__ volatile(EMMS:::"memory");
101 while (s < end) {
102 *dest++ = *s++;
103 *dest++ = *s++;
104 *dest++ = *s++;
105 *dest++ = 255;
106 }
107 }
108
109 #define STORE_BGR24_MMX \
110 "psrlq $8, %%mm2 \n\t" \
111 "psrlq $8, %%mm3 \n\t" \
112 "psrlq $8, %%mm6 \n\t" \
113 "psrlq $8, %%mm7 \n\t" \
114 "pand "MANGLE(mask24l)", %%mm0\n\t" \
115 "pand "MANGLE(mask24l)", %%mm1\n\t" \
116 "pand "MANGLE(mask24l)", %%mm4\n\t" \
117 "pand "MANGLE(mask24l)", %%mm5\n\t" \
118 "pand "MANGLE(mask24h)", %%mm2\n\t" \
119 "pand "MANGLE(mask24h)", %%mm3\n\t" \
120 "pand "MANGLE(mask24h)", %%mm6\n\t" \
121 "pand "MANGLE(mask24h)", %%mm7\n\t" \
122 "por %%mm2, %%mm0 \n\t" \
123 "por %%mm3, %%mm1 \n\t" \
124 "por %%mm6, %%mm4 \n\t" \
125 "por %%mm7, %%mm5 \n\t" \
126 \
127 "movq %%mm1, %%mm2 \n\t" \
128 "movq %%mm4, %%mm3 \n\t" \
129 "psllq $48, %%mm2 \n\t" \
130 "psllq $32, %%mm3 \n\t" \
131 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
132 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
133 "por %%mm2, %%mm0 \n\t" \
134 "psrlq $16, %%mm1 \n\t" \
135 "psrlq $32, %%mm4 \n\t" \
136 "psllq $16, %%mm5 \n\t" \
137 "por %%mm3, %%mm1 \n\t" \
138 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
139 "por %%mm5, %%mm4 \n\t" \
140 \
141 MOVNTQ" %%mm0, %0 \n\t" \
142 MOVNTQ" %%mm1, 8%0 \n\t" \
143 MOVNTQ" %%mm4, 16%0"
144
145
146 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
147 {
148 uint8_t *dest = dst;
149 const uint8_t *s = src;
150 const uint8_t *end;
151 const uint8_t *mm_end;
152 end = s + src_size;
153 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
154 mm_end = end - 31;
155 while (s < mm_end) {
156 __asm__ volatile(
157 PREFETCH" 32%1 \n\t"
158 "movq %1, %%mm0 \n\t"
159 "movq 8%1, %%mm1 \n\t"
160 "movq 16%1, %%mm4 \n\t"
161 "movq 24%1, %%mm5 \n\t"
162 "movq %%mm0, %%mm2 \n\t"
163 "movq %%mm1, %%mm3 \n\t"
164 "movq %%mm4, %%mm6 \n\t"
165 "movq %%mm5, %%mm7 \n\t"
166 STORE_BGR24_MMX
167 :"=m"(*dest)
168 :"m"(*s)
169 :"memory");
170 dest += 24;
171 s += 32;
172 }
173 __asm__ volatile(SFENCE:::"memory");
174 __asm__ volatile(EMMS:::"memory");
175 while (s < end) {
176 *dest++ = *s++;
177 *dest++ = *s++;
178 *dest++ = *s++;
179 s++;
180 }
181 }
182
183 /*
184 original by Strepto/Astral
185 ported to gcc & bugfixed by A'rpi
186 MMX2, 3DNOW optimization by Nick Kurshev
187 32-bit C version and the and&add trick by Michael Niedermayer
188 */
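/*
 * The and&add trick, as a worked example: RGB15 stores a pixel as
 * 0RRRRRGGGGGBBBBB and RGB16 as RRRRRGGGGGGBBBBB, so the conversion shifts
 * the R and G fields left by one bit (leaving the new low G bit zero).
 * Since bit 15 of the input is zero, adding (x & 0x7FE0) to x doubles
 * exactly those fields without disturbing B:
 *
 *     x + (x & 0x7FE0) == (x & 0x001F) | ((x & 0x7FE0) << 1)
 *
 * and two pixels fit in a 32-bit word, which gives the
 * (x&0x7FFF7FFF) + (x&0x7FE07FE0) expression in the scalar tail below.
 */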
189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
190 {
191 register const uint8_t* s=src;
192 register uint8_t* d=dst;
193 register const uint8_t *end;
194 const uint8_t *mm_end;
195 end = s + src_size;
196 __asm__ volatile(PREFETCH" %0"::"m"(*s));
197 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
198 mm_end = end - 15;
199 while (s<mm_end) {
200 __asm__ volatile(
201 PREFETCH" 32%1 \n\t"
202 "movq %1, %%mm0 \n\t"
203 "movq 8%1, %%mm2 \n\t"
204 "movq %%mm0, %%mm1 \n\t"
205 "movq %%mm2, %%mm3 \n\t"
206 "pand %%mm4, %%mm0 \n\t"
207 "pand %%mm4, %%mm2 \n\t"
208 "paddw %%mm1, %%mm0 \n\t"
209 "paddw %%mm3, %%mm2 \n\t"
210 MOVNTQ" %%mm0, %0 \n\t"
211 MOVNTQ" %%mm2, 8%0"
212 :"=m"(*d)
213 :"m"(*s)
214 );
215 d+=16;
216 s+=16;
217 }
218 __asm__ volatile(SFENCE:::"memory");
219 __asm__ volatile(EMMS:::"memory");
220 mm_end = end - 3;
221 while (s < mm_end) {
222 register unsigned x= *((const uint32_t *)s);
223 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
224 d+=4;
225 s+=4;
226 }
227 if (s < end) {
228 register unsigned short x= *((const uint16_t *)s);
229 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
230 }
231 }
232
233 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
234 {
235 register const uint8_t* s=src;
236 register uint8_t* d=dst;
237 register const uint8_t *end;
238 const uint8_t *mm_end;
239 end = s + src_size;
240 __asm__ volatile(PREFETCH" %0"::"m"(*s));
241 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
242 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
243 mm_end = end - 15;
244 while (s<mm_end) {
245 __asm__ volatile(
246 PREFETCH" 32%1 \n\t"
247 "movq %1, %%mm0 \n\t"
248 "movq 8%1, %%mm2 \n\t"
249 "movq %%mm0, %%mm1 \n\t"
250 "movq %%mm2, %%mm3 \n\t"
251 "psrlq $1, %%mm0 \n\t"
252 "psrlq $1, %%mm2 \n\t"
253 "pand %%mm7, %%mm0 \n\t"
254 "pand %%mm7, %%mm2 \n\t"
255 "pand %%mm6, %%mm1 \n\t"
256 "pand %%mm6, %%mm3 \n\t"
257 "por %%mm1, %%mm0 \n\t"
258 "por %%mm3, %%mm2 \n\t"
259 MOVNTQ" %%mm0, %0 \n\t"
260 MOVNTQ" %%mm2, 8%0"
261 :"=m"(*d)
262 :"m"(*s)
263 );
264 d+=16;
265 s+=16;
266 }
267 __asm__ volatile(SFENCE:::"memory");
268 __asm__ volatile(EMMS:::"memory");
269 mm_end = end - 3;
270 while (s < mm_end) {
271 register uint32_t x= *((const uint32_t*)s);
272 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
273 s+=4;
274 d+=4;
275 }
276 if (s < end) {
277 register uint16_t x= *((const uint16_t*)s);
278 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
279 }
280 }
281
282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
283 {
284 const uint8_t *s = src;
285 const uint8_t *end;
286 const uint8_t *mm_end;
287 uint16_t *d = (uint16_t *)dst;
288 end = s + src_size;
289 mm_end = end - 15;
290 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
291 __asm__ volatile(
292 "movq %3, %%mm5 \n\t"
293 "movq %4, %%mm6 \n\t"
294 "movq %5, %%mm7 \n\t"
295 "jmp 2f \n\t"
296 ".p2align 4 \n\t"
297 "1: \n\t"
298 PREFETCH" 32(%1) \n\t"
299 "movd (%1), %%mm0 \n\t"
300 "movd 4(%1), %%mm3 \n\t"
301 "punpckldq 8(%1), %%mm0 \n\t"
302 "punpckldq 12(%1), %%mm3 \n\t"
303 "movq %%mm0, %%mm1 \n\t"
304 "movq %%mm3, %%mm4 \n\t"
305 "pand %%mm6, %%mm0 \n\t"
306 "pand %%mm6, %%mm3 \n\t"
307 "pmaddwd %%mm7, %%mm0 \n\t"
308 "pmaddwd %%mm7, %%mm3 \n\t"
309 "pand %%mm5, %%mm1 \n\t"
310 "pand %%mm5, %%mm4 \n\t"
311 "por %%mm1, %%mm0 \n\t"
312 "por %%mm4, %%mm3 \n\t"
313 "psrld $5, %%mm0 \n\t"
314 "pslld $11, %%mm3 \n\t"
315 "por %%mm3, %%mm0 \n\t"
316 MOVNTQ" %%mm0, (%0) \n\t"
317 "add $16, %1 \n\t"
318 "add $8, %0 \n\t"
319 "2: \n\t"
320 "cmp %2, %1 \n\t"
321 " jb 1b \n\t"
322 : "+r" (d), "+r"(s)
323 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
324 );
325 #else
326 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
327 __asm__ volatile(
328 "movq %0, %%mm7 \n\t"
329 "movq %1, %%mm6 \n\t"
330 ::"m"(red_16mask),"m"(green_16mask));
331 while (s < mm_end) {
332 __asm__ volatile(
333 PREFETCH" 32%1 \n\t"
334 "movd %1, %%mm0 \n\t"
335 "movd 4%1, %%mm3 \n\t"
336 "punpckldq 8%1, %%mm0 \n\t"
337 "punpckldq 12%1, %%mm3 \n\t"
338 "movq %%mm0, %%mm1 \n\t"
339 "movq %%mm0, %%mm2 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "movq %%mm3, %%mm5 \n\t"
342 "psrlq $3, %%mm0 \n\t"
343 "psrlq $3, %%mm3 \n\t"
344 "pand %2, %%mm0 \n\t"
345 "pand %2, %%mm3 \n\t"
346 "psrlq $5, %%mm1 \n\t"
347 "psrlq $5, %%mm4 \n\t"
348 "pand %%mm6, %%mm1 \n\t"
349 "pand %%mm6, %%mm4 \n\t"
350 "psrlq $8, %%mm2 \n\t"
351 "psrlq $8, %%mm5 \n\t"
352 "pand %%mm7, %%mm2 \n\t"
353 "pand %%mm7, %%mm5 \n\t"
354 "por %%mm1, %%mm0 \n\t"
355 "por %%mm4, %%mm3 \n\t"
356 "por %%mm2, %%mm0 \n\t"
357 "por %%mm5, %%mm3 \n\t"
358 "psllq $16, %%mm3 \n\t"
359 "por %%mm3, %%mm0 \n\t"
360 MOVNTQ" %%mm0, %0 \n\t"
361 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
362 d += 4;
363 s += 16;
364 }
365 #endif
366 __asm__ volatile(SFENCE:::"memory");
367 __asm__ volatile(EMMS:::"memory");
368 while (s < end) {
369 register int rgb = *(const uint32_t*)s; s += 4;
370 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
371 }
372 }
373
374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
375 {
376 const uint8_t *s = src;
377 const uint8_t *end;
378 const uint8_t *mm_end;
379 uint16_t *d = (uint16_t *)dst;
380 end = s + src_size;
381 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
382 __asm__ volatile(
383 "movq %0, %%mm7 \n\t"
384 "movq %1, %%mm6 \n\t"
385 ::"m"(red_16mask),"m"(green_16mask));
386 mm_end = end - 15;
387 while (s < mm_end) {
388 __asm__ volatile(
389 PREFETCH" 32%1 \n\t"
390 "movd %1, %%mm0 \n\t"
391 "movd 4%1, %%mm3 \n\t"
392 "punpckldq 8%1, %%mm0 \n\t"
393 "punpckldq 12%1, %%mm3 \n\t"
394 "movq %%mm0, %%mm1 \n\t"
395 "movq %%mm0, %%mm2 \n\t"
396 "movq %%mm3, %%mm4 \n\t"
397 "movq %%mm3, %%mm5 \n\t"
398 "psllq $8, %%mm0 \n\t"
399 "psllq $8, %%mm3 \n\t"
400 "pand %%mm7, %%mm0 \n\t"
401 "pand %%mm7, %%mm3 \n\t"
402 "psrlq $5, %%mm1 \n\t"
403 "psrlq $5, %%mm4 \n\t"
404 "pand %%mm6, %%mm1 \n\t"
405 "pand %%mm6, %%mm4 \n\t"
406 "psrlq $19, %%mm2 \n\t"
407 "psrlq $19, %%mm5 \n\t"
408 "pand %2, %%mm2 \n\t"
409 "pand %2, %%mm5 \n\t"
410 "por %%mm1, %%mm0 \n\t"
411 "por %%mm4, %%mm3 \n\t"
412 "por %%mm2, %%mm0 \n\t"
413 "por %%mm5, %%mm3 \n\t"
414 "psllq $16, %%mm3 \n\t"
415 "por %%mm3, %%mm0 \n\t"
416 MOVNTQ" %%mm0, %0 \n\t"
417 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
418 d += 4;
419 s += 16;
420 }
421 __asm__ volatile(SFENCE:::"memory");
422 __asm__ volatile(EMMS:::"memory");
423 while (s < end) {
424 register int rgb = *(const uint32_t*)s; s += 4;
425 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
426 }
427 }
428
429 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
430 {
431 const uint8_t *s = src;
432 const uint8_t *end;
433 const uint8_t *mm_end;
434 uint16_t *d = (uint16_t *)dst;
435 end = s + src_size;
436 mm_end = end - 15;
437 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
438 __asm__ volatile(
439 "movq %3, %%mm5 \n\t"
440 "movq %4, %%mm6 \n\t"
441 "movq %5, %%mm7 \n\t"
442 "jmp 2f \n\t"
443 ".p2align 4 \n\t"
444 "1: \n\t"
445 PREFETCH" 32(%1) \n\t"
446 "movd (%1), %%mm0 \n\t"
447 "movd 4(%1), %%mm3 \n\t"
448 "punpckldq 8(%1), %%mm0 \n\t"
449 "punpckldq 12(%1), %%mm3 \n\t"
450 "movq %%mm0, %%mm1 \n\t"
451 "movq %%mm3, %%mm4 \n\t"
452 "pand %%mm6, %%mm0 \n\t"
453 "pand %%mm6, %%mm3 \n\t"
454 "pmaddwd %%mm7, %%mm0 \n\t"
455 "pmaddwd %%mm7, %%mm3 \n\t"
456 "pand %%mm5, %%mm1 \n\t"
457 "pand %%mm5, %%mm4 \n\t"
458 "por %%mm1, %%mm0 \n\t"
459 "por %%mm4, %%mm3 \n\t"
460 "psrld $6, %%mm0 \n\t"
461 "pslld $10, %%mm3 \n\t"
462 "por %%mm3, %%mm0 \n\t"
463 MOVNTQ" %%mm0, (%0) \n\t"
464 "add $16, %1 \n\t"
465 "add $8, %0 \n\t"
466 "2: \n\t"
467 "cmp %2, %1 \n\t"
468 " jb 1b \n\t"
469 : "+r" (d), "+r"(s)
470 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
471 );
472 #else
473 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
474 __asm__ volatile(
475 "movq %0, %%mm7 \n\t"
476 "movq %1, %%mm6 \n\t"
477 ::"m"(red_15mask),"m"(green_15mask));
478 while (s < mm_end) {
479 __asm__ volatile(
480 PREFETCH" 32%1 \n\t"
481 "movd %1, %%mm0 \n\t"
482 "movd 4%1, %%mm3 \n\t"
483 "punpckldq 8%1, %%mm0 \n\t"
484 "punpckldq 12%1, %%mm3 \n\t"
485 "movq %%mm0, %%mm1 \n\t"
486 "movq %%mm0, %%mm2 \n\t"
487 "movq %%mm3, %%mm4 \n\t"
488 "movq %%mm3, %%mm5 \n\t"
489 "psrlq $3, %%mm0 \n\t"
490 "psrlq $3, %%mm3 \n\t"
491 "pand %2, %%mm0 \n\t"
492 "pand %2, %%mm3 \n\t"
493 "psrlq $6, %%mm1 \n\t"
494 "psrlq $6, %%mm4 \n\t"
495 "pand %%mm6, %%mm1 \n\t"
496 "pand %%mm6, %%mm4 \n\t"
497 "psrlq $9, %%mm2 \n\t"
498 "psrlq $9, %%mm5 \n\t"
499 "pand %%mm7, %%mm2 \n\t"
500 "pand %%mm7, %%mm5 \n\t"
501 "por %%mm1, %%mm0 \n\t"
502 "por %%mm4, %%mm3 \n\t"
503 "por %%mm2, %%mm0 \n\t"
504 "por %%mm5, %%mm3 \n\t"
505 "psllq $16, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ" %%mm0, %0 \n\t"
508 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
509 d += 4;
510 s += 16;
511 }
512 #endif
513 __asm__ volatile(SFENCE:::"memory");
514 __asm__ volatile(EMMS:::"memory");
515 while (s < end) {
516 register int rgb = *(const uint32_t*)s; s += 4;
517 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
518 }
519 }
520
521 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
522 {
523 const uint8_t *s = src;
524 const uint8_t *end;
525 const uint8_t *mm_end;
526 uint16_t *d = (uint16_t *)dst;
527 end = s + src_size;
528 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
529 __asm__ volatile(
530 "movq %0, %%mm7 \n\t"
531 "movq %1, %%mm6 \n\t"
532 ::"m"(red_15mask),"m"(green_15mask));
533 mm_end = end - 15;
534 while (s < mm_end) {
535 __asm__ volatile(
536 PREFETCH" 32%1 \n\t"
537 "movd %1, %%mm0 \n\t"
538 "movd 4%1, %%mm3 \n\t"
539 "punpckldq 8%1, %%mm0 \n\t"
540 "punpckldq 12%1, %%mm3 \n\t"
541 "movq %%mm0, %%mm1 \n\t"
542 "movq %%mm0, %%mm2 \n\t"
543 "movq %%mm3, %%mm4 \n\t"
544 "movq %%mm3, %%mm5 \n\t"
545 "psllq $7, %%mm0 \n\t"
546 "psllq $7, %%mm3 \n\t"
547 "pand %%mm7, %%mm0 \n\t"
548 "pand %%mm7, %%mm3 \n\t"
549 "psrlq $6, %%mm1 \n\t"
550 "psrlq $6, %%mm4 \n\t"
551 "pand %%mm6, %%mm1 \n\t"
552 "pand %%mm6, %%mm4 \n\t"
553 "psrlq $19, %%mm2 \n\t"
554 "psrlq $19, %%mm5 \n\t"
555 "pand %2, %%mm2 \n\t"
556 "pand %2, %%mm5 \n\t"
557 "por %%mm1, %%mm0 \n\t"
558 "por %%mm4, %%mm3 \n\t"
559 "por %%mm2, %%mm0 \n\t"
560 "por %%mm5, %%mm3 \n\t"
561 "psllq $16, %%mm3 \n\t"
562 "por %%mm3, %%mm0 \n\t"
563 MOVNTQ" %%mm0, %0 \n\t"
564 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
565 d += 4;
566 s += 16;
567 }
568 __asm__ volatile(SFENCE:::"memory");
569 __asm__ volatile(EMMS:::"memory");
570 while (s < end) {
571 register int rgb = *(const uint32_t*)s; s += 4;
572 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
573 }
574 }
575
576 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
577 {
578 const uint8_t *s = src;
579 const uint8_t *end;
580 const uint8_t *mm_end;
581 uint16_t *d = (uint16_t *)dst;
582 end = s + src_size;
583 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
584 __asm__ volatile(
585 "movq %0, %%mm7 \n\t"
586 "movq %1, %%mm6 \n\t"
587 ::"m"(red_16mask),"m"(green_16mask));
588 mm_end = end - 11;
589 while (s < mm_end) {
590 __asm__ volatile(
591 PREFETCH" 32%1 \n\t"
592 "movd %1, %%mm0 \n\t"
593 "movd 3%1, %%mm3 \n\t"
594 "punpckldq 6%1, %%mm0 \n\t"
595 "punpckldq 9%1, %%mm3 \n\t"
596 "movq %%mm0, %%mm1 \n\t"
597 "movq %%mm0, %%mm2 \n\t"
598 "movq %%mm3, %%mm4 \n\t"
599 "movq %%mm3, %%mm5 \n\t"
600 "psrlq $3, %%mm0 \n\t"
601 "psrlq $3, %%mm3 \n\t"
602 "pand %2, %%mm0 \n\t"
603 "pand %2, %%mm3 \n\t"
604 "psrlq $5, %%mm1 \n\t"
605 "psrlq $5, %%mm4 \n\t"
606 "pand %%mm6, %%mm1 \n\t"
607 "pand %%mm6, %%mm4 \n\t"
608 "psrlq $8, %%mm2 \n\t"
609 "psrlq $8, %%mm5 \n\t"
610 "pand %%mm7, %%mm2 \n\t"
611 "pand %%mm7, %%mm5 \n\t"
612 "por %%mm1, %%mm0 \n\t"
613 "por %%mm4, %%mm3 \n\t"
614 "por %%mm2, %%mm0 \n\t"
615 "por %%mm5, %%mm3 \n\t"
616 "psllq $16, %%mm3 \n\t"
617 "por %%mm3, %%mm0 \n\t"
618 MOVNTQ" %%mm0, %0 \n\t"
619 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
620 d += 4;
621 s += 12;
622 }
623 __asm__ volatile(SFENCE:::"memory");
624 __asm__ volatile(EMMS:::"memory");
625 while (s < end) {
626 const int b = *s++;
627 const int g = *s++;
628 const int r = *s++;
629 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
630 }
631 }
632
633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
634 {
635 const uint8_t *s = src;
636 const uint8_t *end;
637 const uint8_t *mm_end;
638 uint16_t *d = (uint16_t *)dst;
639 end = s + src_size;
640 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
641 __asm__ volatile(
642 "movq %0, %%mm7 \n\t"
643 "movq %1, %%mm6 \n\t"
644 ::"m"(red_16mask),"m"(green_16mask));
645 mm_end = end - 15;
646 while (s < mm_end) {
647 __asm__ volatile(
648 PREFETCH" 32%1 \n\t"
649 "movd %1, %%mm0 \n\t"
650 "movd 3%1, %%mm3 \n\t"
651 "punpckldq 6%1, %%mm0 \n\t"
652 "punpckldq 9%1, %%mm3 \n\t"
653 "movq %%mm0, %%mm1 \n\t"
654 "movq %%mm0, %%mm2 \n\t"
655 "movq %%mm3, %%mm4 \n\t"
656 "movq %%mm3, %%mm5 \n\t"
657 "psllq $8, %%mm0 \n\t"
658 "psllq $8, %%mm3 \n\t"
659 "pand %%mm7, %%mm0 \n\t"
660 "pand %%mm7, %%mm3 \n\t"
661 "psrlq $5, %%mm1 \n\t"
662 "psrlq $5, %%mm4 \n\t"
663 "pand %%mm6, %%mm1 \n\t"
664 "pand %%mm6, %%mm4 \n\t"
665 "psrlq $19, %%mm2 \n\t"
666 "psrlq $19, %%mm5 \n\t"
667 "pand %2, %%mm2 \n\t"
668 "pand %2, %%mm5 \n\t"
669 "por %%mm1, %%mm0 \n\t"
670 "por %%mm4, %%mm3 \n\t"
671 "por %%mm2, %%mm0 \n\t"
672 "por %%mm5, %%mm3 \n\t"
673 "psllq $16, %%mm3 \n\t"
674 "por %%mm3, %%mm0 \n\t"
675 MOVNTQ" %%mm0, %0 \n\t"
676 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
677 d += 4;
678 s += 12;
679 }
680 __asm__ volatile(SFENCE:::"memory");
681 __asm__ volatile(EMMS:::"memory");
682 while (s < end) {
683 const int r = *s++;
684 const int g = *s++;
685 const int b = *s++;
686 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
687 }
688 }
689
690 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
691 {
692 const uint8_t *s = src;
693 const uint8_t *end;
694 const uint8_t *mm_end;
695 uint16_t *d = (uint16_t *)dst;
696 end = s + src_size;
697 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
698 __asm__ volatile(
699 "movq %0, %%mm7 \n\t"
700 "movq %1, %%mm6 \n\t"
701 ::"m"(red_15mask),"m"(green_15mask));
702 mm_end = end - 11;
703 while (s < mm_end) {
704 __asm__ volatile(
705 PREFETCH" 32%1 \n\t"
706 "movd %1, %%mm0 \n\t"
707 "movd 3%1, %%mm3 \n\t"
708 "punpckldq 6%1, %%mm0 \n\t"
709 "punpckldq 9%1, %%mm3 \n\t"
710 "movq %%mm0, %%mm1 \n\t"
711 "movq %%mm0, %%mm2 \n\t"
712 "movq %%mm3, %%mm4 \n\t"
713 "movq %%mm3, %%mm5 \n\t"
714 "psrlq $3, %%mm0 \n\t"
715 "psrlq $3, %%mm3 \n\t"
716 "pand %2, %%mm0 \n\t"
717 "pand %2, %%mm3 \n\t"
718 "psrlq $6, %%mm1 \n\t"
719 "psrlq $6, %%mm4 \n\t"
720 "pand %%mm6, %%mm1 \n\t"
721 "pand %%mm6, %%mm4 \n\t"
722 "psrlq $9, %%mm2 \n\t"
723 "psrlq $9, %%mm5 \n\t"
724 "pand %%mm7, %%mm2 \n\t"
725 "pand %%mm7, %%mm5 \n\t"
726 "por %%mm1, %%mm0 \n\t"
727 "por %%mm4, %%mm3 \n\t"
728 "por %%mm2, %%mm0 \n\t"
729 "por %%mm5, %%mm3 \n\t"
730 "psllq $16, %%mm3 \n\t"
731 "por %%mm3, %%mm0 \n\t"
732 MOVNTQ" %%mm0, %0 \n\t"
733 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
734 d += 4;
735 s += 12;
736 }
737 __asm__ volatile(SFENCE:::"memory");
738 __asm__ volatile(EMMS:::"memory");
739 while (s < end) {
740 const int b = *s++;
741 const int g = *s++;
742 const int r = *s++;
743 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
744 }
745 }
746
747 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
748 {
749 const uint8_t *s = src;
750 const uint8_t *end;
751 const uint8_t *mm_end;
752 uint16_t *d = (uint16_t *)dst;
753 end = s + src_size;
754 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
755 __asm__ volatile(
756 "movq %0, %%mm7 \n\t"
757 "movq %1, %%mm6 \n\t"
758 ::"m"(red_15mask),"m"(green_15mask));
759 mm_end = end - 15;
760 while (s < mm_end) {
761 __asm__ volatile(
762 PREFETCH" 32%1 \n\t"
763 "movd %1, %%mm0 \n\t"
764 "movd 3%1, %%mm3 \n\t"
765 "punpckldq 6%1, %%mm0 \n\t"
766 "punpckldq 9%1, %%mm3 \n\t"
767 "movq %%mm0, %%mm1 \n\t"
768 "movq %%mm0, %%mm2 \n\t"
769 "movq %%mm3, %%mm4 \n\t"
770 "movq %%mm3, %%mm5 \n\t"
771 "psllq $7, %%mm0 \n\t"
772 "psllq $7, %%mm3 \n\t"
773 "pand %%mm7, %%mm0 \n\t"
774 "pand %%mm7, %%mm3 \n\t"
775 "psrlq $6, %%mm1 \n\t"
776 "psrlq $6, %%mm4 \n\t"
777 "pand %%mm6, %%mm1 \n\t"
778 "pand %%mm6, %%mm4 \n\t"
779 "psrlq $19, %%mm2 \n\t"
780 "psrlq $19, %%mm5 \n\t"
781 "pand %2, %%mm2 \n\t"
782 "pand %2, %%mm5 \n\t"
783 "por %%mm1, %%mm0 \n\t"
784 "por %%mm4, %%mm3 \n\t"
785 "por %%mm2, %%mm0 \n\t"
786 "por %%mm5, %%mm3 \n\t"
787 "psllq $16, %%mm3 \n\t"
788 "por %%mm3, %%mm0 \n\t"
789 MOVNTQ" %%mm0, %0 \n\t"
790 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
791 d += 4;
792 s += 12;
793 }
794 __asm__ volatile(SFENCE:::"memory");
795 __asm__ volatile(EMMS:::"memory");
796 while (s < end) {
797 const int r = *s++;
798 const int g = *s++;
799 const int b = *s++;
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801 }
802 }
803
804 /*
805 I use a less accurate approximation here, simply left-shifting the input
806 value and filling the low-order bits with zeroes. This method improves PNG
807 compression, but it cannot reproduce white exactly, since it never
808 generates an all-ones maximum value; the net effect is to darken the
809 image slightly.
810
811 A better method would be "left bit replication":
812
813 4 3 2 1 0
814 ---------
815 1 1 0 1 1
816
817 7 6 5 4 3 2 1 0
818 ----------------
819 1 1 0 1 1 1 1 0
820 |=======| |===|
821 | leftmost bits repeated to fill open bits
822 |
823 original bits
824 */
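/* An illustrative sketch of that better method (a hypothetical helper, not
 * used by this file) for one 5-bit component: */
static inline uint8_t RENAME(expand5to8_sketch)(uint8_t v)
{
    /* v is a 5-bit value (0..31); replicating its top three bits into the
     * low bits maps 0x1F to 0xFF, so white is reproduced exactly */
    return (v << 3) | (v >> 2);
}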
825 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
826 {
827 const uint16_t *end;
828 const uint16_t *mm_end;
829 uint8_t *d = dst;
830 const uint16_t *s = (const uint16_t*)src;
831 end = s + src_size/2;
832 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
833 mm_end = end - 7;
834 while (s < mm_end) {
835 __asm__ volatile(
836 PREFETCH" 32%1 \n\t"
837 "movq %1, %%mm0 \n\t"
838 "movq %1, %%mm1 \n\t"
839 "movq %1, %%mm2 \n\t"
840 "pand %2, %%mm0 \n\t"
841 "pand %3, %%mm1 \n\t"
842 "pand %4, %%mm2 \n\t"
843 "psllq $3, %%mm0 \n\t"
844 "psrlq $2, %%mm1 \n\t"
845 "psrlq $7, %%mm2 \n\t"
846 "movq %%mm0, %%mm3 \n\t"
847 "movq %%mm1, %%mm4 \n\t"
848 "movq %%mm2, %%mm5 \n\t"
849 "punpcklwd %5, %%mm0 \n\t"
850 "punpcklwd %5, %%mm1 \n\t"
851 "punpcklwd %5, %%mm2 \n\t"
852 "punpckhwd %5, %%mm3 \n\t"
853 "punpckhwd %5, %%mm4 \n\t"
854 "punpckhwd %5, %%mm5 \n\t"
855 "psllq $8, %%mm1 \n\t"
856 "psllq $16, %%mm2 \n\t"
857 "por %%mm1, %%mm0 \n\t"
858 "por %%mm2, %%mm0 \n\t"
859 "psllq $8, %%mm4 \n\t"
860 "psllq $16, %%mm5 \n\t"
861 "por %%mm4, %%mm3 \n\t"
862 "por %%mm5, %%mm3 \n\t"
863
864 "movq %%mm0, %%mm6 \n\t"
865 "movq %%mm3, %%mm7 \n\t"
866
867 "movq 8%1, %%mm0 \n\t"
868 "movq 8%1, %%mm1 \n\t"
869 "movq 8%1, %%mm2 \n\t"
870 "pand %2, %%mm0 \n\t"
871 "pand %3, %%mm1 \n\t"
872 "pand %4, %%mm2 \n\t"
873 "psllq $3, %%mm0 \n\t"
874 "psrlq $2, %%mm1 \n\t"
875 "psrlq $7, %%mm2 \n\t"
876 "movq %%mm0, %%mm3 \n\t"
877 "movq %%mm1, %%mm4 \n\t"
878 "movq %%mm2, %%mm5 \n\t"
879 "punpcklwd %5, %%mm0 \n\t"
880 "punpcklwd %5, %%mm1 \n\t"
881 "punpcklwd %5, %%mm2 \n\t"
882 "punpckhwd %5, %%mm3 \n\t"
883 "punpckhwd %5, %%mm4 \n\t"
884 "punpckhwd %5, %%mm5 \n\t"
885 "psllq $8, %%mm1 \n\t"
886 "psllq $16, %%mm2 \n\t"
887 "por %%mm1, %%mm0 \n\t"
888 "por %%mm2, %%mm0 \n\t"
889 "psllq $8, %%mm4 \n\t"
890 "psllq $16, %%mm5 \n\t"
891 "por %%mm4, %%mm3 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893
894 :"=m"(*d)
895 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
896 :"memory");
897 /* borrowed from the 32-to-24 conversion */
898 __asm__ volatile(
899 "movq %%mm0, %%mm4 \n\t"
900 "movq %%mm3, %%mm5 \n\t"
901 "movq %%mm6, %%mm0 \n\t"
902 "movq %%mm7, %%mm1 \n\t"
903
904 "movq %%mm4, %%mm6 \n\t"
905 "movq %%mm5, %%mm7 \n\t"
906 "movq %%mm0, %%mm2 \n\t"
907 "movq %%mm1, %%mm3 \n\t"
908
909 STORE_BGR24_MMX
910
911 :"=m"(*d)
912 :"m"(*s)
913 :"memory");
914 d += 24;
915 s += 8;
916 }
917 __asm__ volatile(SFENCE:::"memory");
918 __asm__ volatile(EMMS:::"memory");
919 while (s < end) {
920 register uint16_t bgr;
921 bgr = *s++;
922 *d++ = (bgr&0x1F)<<3;
923 *d++ = (bgr&0x3E0)>>2;
924 *d++ = (bgr&0x7C00)>>7;
925 }
926 }
927
928 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
929 {
930 const uint16_t *end;
931 const uint16_t *mm_end;
932 uint8_t *d = (uint8_t *)dst;
933 const uint16_t *s = (const uint16_t *)src;
934 end = s + src_size/2;
935 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
936 mm_end = end - 7;
937 while (s < mm_end) {
938 __asm__ volatile(
939 PREFETCH" 32%1 \n\t"
940 "movq %1, %%mm0 \n\t"
941 "movq %1, %%mm1 \n\t"
942 "movq %1, %%mm2 \n\t"
943 "pand %2, %%mm0 \n\t"
944 "pand %3, %%mm1 \n\t"
945 "pand %4, %%mm2 \n\t"
946 "psllq $3, %%mm0 \n\t"
947 "psrlq $3, %%mm1 \n\t"
948 "psrlq $8, %%mm2 \n\t"
949 "movq %%mm0, %%mm3 \n\t"
950 "movq %%mm1, %%mm4 \n\t"
951 "movq %%mm2, %%mm5 \n\t"
952 "punpcklwd %5, %%mm0 \n\t"
953 "punpcklwd %5, %%mm1 \n\t"
954 "punpcklwd %5, %%mm2 \n\t"
955 "punpckhwd %5, %%mm3 \n\t"
956 "punpckhwd %5, %%mm4 \n\t"
957 "punpckhwd %5, %%mm5 \n\t"
958 "psllq $8, %%mm1 \n\t"
959 "psllq $16, %%mm2 \n\t"
960 "por %%mm1, %%mm0 \n\t"
961 "por %%mm2, %%mm0 \n\t"
962 "psllq $8, %%mm4 \n\t"
963 "psllq $16, %%mm5 \n\t"
964 "por %%mm4, %%mm3 \n\t"
965 "por %%mm5, %%mm3 \n\t"
966
967 "movq %%mm0, %%mm6 \n\t"
968 "movq %%mm3, %%mm7 \n\t"
969
970 "movq 8%1, %%mm0 \n\t"
971 "movq 8%1, %%mm1 \n\t"
972 "movq 8%1, %%mm2 \n\t"
973 "pand %2, %%mm0 \n\t"
974 "pand %3, %%mm1 \n\t"
975 "pand %4, %%mm2 \n\t"
976 "psllq $3, %%mm0 \n\t"
977 "psrlq $3, %%mm1 \n\t"
978 "psrlq $8, %%mm2 \n\t"
979 "movq %%mm0, %%mm3 \n\t"
980 "movq %%mm1, %%mm4 \n\t"
981 "movq %%mm2, %%mm5 \n\t"
982 "punpcklwd %5, %%mm0 \n\t"
983 "punpcklwd %5, %%mm1 \n\t"
984 "punpcklwd %5, %%mm2 \n\t"
985 "punpckhwd %5, %%mm3 \n\t"
986 "punpckhwd %5, %%mm4 \n\t"
987 "punpckhwd %5, %%mm5 \n\t"
988 "psllq $8, %%mm1 \n\t"
989 "psllq $16, %%mm2 \n\t"
990 "por %%mm1, %%mm0 \n\t"
991 "por %%mm2, %%mm0 \n\t"
992 "psllq $8, %%mm4 \n\t"
993 "psllq $16, %%mm5 \n\t"
994 "por %%mm4, %%mm3 \n\t"
995 "por %%mm5, %%mm3 \n\t"
996 :"=m"(*d)
997 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
998 :"memory");
999 /* borrowed from the 32-to-24 conversion */
1000 __asm__ volatile(
1001 "movq %%mm0, %%mm4 \n\t"
1002 "movq %%mm3, %%mm5 \n\t"
1003 "movq %%mm6, %%mm0 \n\t"
1004 "movq %%mm7, %%mm1 \n\t"
1005
1006 "movq %%mm4, %%mm6 \n\t"
1007 "movq %%mm5, %%mm7 \n\t"
1008 "movq %%mm0, %%mm2 \n\t"
1009 "movq %%mm1, %%mm3 \n\t"
1010
1011 STORE_BGR24_MMX
1012
1013 :"=m"(*d)
1014 :"m"(*s)
1015 :"memory");
1016 d += 24;
1017 s += 8;
1018 }
1019 __asm__ volatile(SFENCE:::"memory");
1020 __asm__ volatile(EMMS:::"memory");
1021 while (s < end) {
1022 register uint16_t bgr;
1023 bgr = *s++;
1024 *d++ = (bgr&0x1F)<<3;
1025 *d++ = (bgr&0x7E0)>>3;
1026 *d++ = (bgr&0xF800)>>8;
1027 }
1028 }
1029
1030 /*
1031 * mm0 = 00 B3 00 B2 00 B1 00 B0
1032 * mm1 = 00 G3 00 G2 00 G1 00 G0
1033 * mm2 = 00 R3 00 R2 00 R1 00 R0
1034 * mm6 = FF FF FF FF FF FF FF FF
1035 * mm7 = 00 00 00 00 00 00 00 00
1036 */
1037 #define PACK_RGB32 \
1038 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1039 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1040 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1041 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1042 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1043 "movq %%mm0, %%mm3 \n\t" \
1044 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1045 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1046 MOVNTQ" %%mm0, %0 \n\t" \
1047 MOVNTQ" %%mm3, 8%0 \n\t" \
1048
1049 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1050 {
1051 const uint16_t *end;
1052 const uint16_t *mm_end;
1053 uint8_t *d = dst;
1054 const uint16_t *s = (const uint16_t *)src;
1055 end = s + src_size/2;
1056 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1057 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1058 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1059 mm_end = end - 3;
1060 while (s < mm_end) {
1061 __asm__ volatile(
1062 PREFETCH" 32%1 \n\t"
1063 "movq %1, %%mm0 \n\t"
1064 "movq %1, %%mm1 \n\t"
1065 "movq %1, %%mm2 \n\t"
1066 "pand %2, %%mm0 \n\t"
1067 "pand %3, %%mm1 \n\t"
1068 "pand %4, %%mm2 \n\t"
1069 "psllq $3, %%mm0 \n\t"
1070 "psrlq $2, %%mm1 \n\t"
1071 "psrlq $7, %%mm2 \n\t"
1072 PACK_RGB32
1073 :"=m"(*d)
1074 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1075 :"memory");
1076 d += 16;
1077 s += 4;
1078 }
1079 __asm__ volatile(SFENCE:::"memory");
1080 __asm__ volatile(EMMS:::"memory");
1081 while (s < end) {
1082 register uint16_t bgr;
1083 bgr = *s++;
1084 *d++ = (bgr&0x1F)<<3;
1085 *d++ = (bgr&0x3E0)>>2;
1086 *d++ = (bgr&0x7C00)>>7;
1087 *d++ = 255;
1088 }
1089 }
1090
1091 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1092 {
1093 const uint16_t *end;
1094 const uint16_t *mm_end;
1095 uint8_t *d = dst;
1096 const uint16_t *s = (const uint16_t*)src;
1097 end = s + src_size/2;
1098 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1099 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1100 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1101 mm_end = end - 3;
1102 while (s < mm_end) {
1103 __asm__ volatile(
1104 PREFETCH" 32%1 \n\t"
1105 "movq %1, %%mm0 \n\t"
1106 "movq %1, %%mm1 \n\t"
1107 "movq %1, %%mm2 \n\t"
1108 "pand %2, %%mm0 \n\t"
1109 "pand %3, %%mm1 \n\t"
1110 "pand %4, %%mm2 \n\t"
1111 "psllq $3, %%mm0 \n\t"
1112 "psrlq $3, %%mm1 \n\t"
1113 "psrlq $8, %%mm2 \n\t"
1114 PACK_RGB32
1115 :"=m"(*d)
1116 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1117 :"memory");
1118 d += 16;
1119 s += 4;
1120 }
1121 __asm__ volatile(SFENCE:::"memory");
1122 __asm__ volatile(EMMS:::"memory");
1123 while (s < end) {
1124 register uint16_t bgr;
1125 bgr = *s++;
1126 *d++ = (bgr&0x1F)<<3;
1127 *d++ = (bgr&0x7E0)>>3;
1128 *d++ = (bgr&0xF800)>>8;
1129 *d++ = 255;
1130 }
1131 }
1132
1133 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1134 {
1135 x86_reg idx = 15 - src_size;
1136 const uint8_t *s = src-idx;
1137 uint8_t *d = dst-idx;
1138 __asm__ volatile(
1139 "test %0, %0 \n\t"
1140 "jns 2f \n\t"
1141 PREFETCH" (%1, %0) \n\t"
1142 "movq %3, %%mm7 \n\t"
1143 "pxor %4, %%mm7 \n\t"
1144 "movq %%mm7, %%mm6 \n\t"
1145 "pxor %5, %%mm7 \n\t"
1146 ".p2align 4 \n\t"
1147 "1: \n\t"
1148 PREFETCH" 32(%1, %0) \n\t"
1149 "movq (%1, %0), %%mm0 \n\t"
1150 "movq 8(%1, %0), %%mm1 \n\t"
1151 # if COMPILE_TEMPLATE_MMX2
1152 "pshufw $177, %%mm0, %%mm3 \n\t"
1153 "pshufw $177, %%mm1, %%mm5 \n\t"
1154 "pand %%mm7, %%mm0 \n\t"
1155 "pand %%mm6, %%mm3 \n\t"
1156 "pand %%mm7, %%mm1 \n\t"
1157 "pand %%mm6, %%mm5 \n\t"
1158 "por %%mm3, %%mm0 \n\t"
1159 "por %%mm5, %%mm1 \n\t"
1160 # else
1161 "movq %%mm0, %%mm2 \n\t"
1162 "movq %%mm1, %%mm4 \n\t"
1163 "pand %%mm7, %%mm0 \n\t"
1164 "pand %%mm6, %%mm2 \n\t"
1165 "pand %%mm7, %%mm1 \n\t"
1166 "pand %%mm6, %%mm4 \n\t"
1167 "movq %%mm2, %%mm3 \n\t"
1168 "movq %%mm4, %%mm5 \n\t"
1169 "pslld $16, %%mm2 \n\t"
1170 "psrld $16, %%mm3 \n\t"
1171 "pslld $16, %%mm4 \n\t"
1172 "psrld $16, %%mm5 \n\t"
1173 "por %%mm2, %%mm0 \n\t"
1174 "por %%mm4, %%mm1 \n\t"
1175 "por %%mm3, %%mm0 \n\t"
1176 "por %%mm5, %%mm1 \n\t"
1177 # endif
1178 MOVNTQ" %%mm0, (%2, %0) \n\t"
1179 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1180 "add $16, %0 \n\t"
1181 "js 1b \n\t"
1182 SFENCE" \n\t"
1183 EMMS" \n\t"
1184 "2: \n\t"
1185 : "+&r"(idx)
1186 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1187 : "memory");
1188 for (; idx<15; idx+=4) {
1189 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1190 v &= 0xff00ff;
1191 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1192 }
1193 }
1194
1195 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1196 {
1197 unsigned i;
1198 x86_reg mmx_size= 23 - src_size;
1199 __asm__ volatile (
1200 "test %%"REG_a", %%"REG_a" \n\t"
1201 "jns 2f \n\t"
1202 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1203 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1204 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1205 ".p2align 4 \n\t"
1206 "1: \n\t"
1207 PREFETCH" 32(%1, %%"REG_a") \n\t"
1208 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1209 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1210 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1211 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1212 "pand %%mm5, %%mm0 \n\t"
1213 "pand %%mm6, %%mm1 \n\t"
1214 "pand %%mm7, %%mm2 \n\t"
1215 "por %%mm0, %%mm1 \n\t"
1216 "por %%mm2, %%mm1 \n\t"
1217 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1218 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1219 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1220 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1221 "pand %%mm7, %%mm0 \n\t"
1222 "pand %%mm5, %%mm1 \n\t"
1223 "pand %%mm6, %%mm2 \n\t"
1224 "por %%mm0, %%mm1 \n\t"
1225 "por %%mm2, %%mm1 \n\t"
1226 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1227 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1228 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1229 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1230 "pand %%mm6, %%mm0 \n\t"
1231 "pand %%mm7, %%mm1 \n\t"
1232 "pand %%mm5, %%mm2 \n\t"
1233 "por %%mm0, %%mm1 \n\t"
1234 "por %%mm2, %%mm1 \n\t"
1235 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1236 "add $24, %%"REG_a" \n\t"
1237 " js 1b \n\t"
1238 "2: \n\t"
1239 : "+a" (mmx_size)
1240 : "r" (src-mmx_size), "r"(dst-mmx_size)
1241 );
1242
1243 __asm__ volatile(SFENCE:::"memory");
1244 __asm__ volatile(EMMS:::"memory");
1245
1246 if (mmx_size==23) return; // finished: src_size was a multiple of 8 pixels (24 bytes)
1247
1248 src+= src_size;
1249 dst+= src_size;
1250 src_size= 23-mmx_size;
1251 src-= src_size;
1252 dst-= src_size;
1253 for (i=0; i<src_size; i+=3) {
1254 register uint8_t x;
1255 x = src[i + 2];
1256 dst[i + 1] = src[i + 1];
1257 dst[i + 2] = src[i + 0];
1258 dst[i + 0] = x;
1259 }
1260 }
1261
1262 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1263 long width, long height,
1264 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1265 {
1266 long y;
1267 const x86_reg chromWidth= width>>1;
1268 for (y=0; y<height; y++) {
1269 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1270 __asm__ volatile(
1271 "xor %%"REG_a", %%"REG_a" \n\t"
1272 ".p2align 4 \n\t"
1273 "1: \n\t"
1274 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1275 PREFETCH" 32(%2, %%"REG_a") \n\t"
1276 PREFETCH" 32(%3, %%"REG_a") \n\t"
1277 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1278 "movq %%mm0, %%mm2 \n\t" // U(0)
1279 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1280 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1281 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1282
1283 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1284 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1285 "movq %%mm3, %%mm4 \n\t" // Y(0)
1286 "movq %%mm5, %%mm6 \n\t" // Y(8)
1287 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1288 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1289 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1290 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1291
1292 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1293 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1294 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1295 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1296
1297 "add $8, %%"REG_a" \n\t"
1298 "cmp %4, %%"REG_a" \n\t"
1299 " jb 1b \n\t"
1300 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1301 : "%"REG_a
1302 );
1303 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1304 usrc += chromStride;
1305 vsrc += chromStride;
1306 }
1307 ysrc += lumStride;
1308 dst += dstStride;
1309 }
1310 __asm__(EMMS" \n\t"
1311 SFENCE" \n\t"
1312 :::"memory");
1313 }
1314
1315 /**
1316 * Height should be a multiple of 2 and width should be a multiple of 16.
1317 * (If this is a problem for anyone then tell me, and I will fix it.)
1318 */
1319 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1320 long width, long height,
1321 long lumStride, long chromStride, long dstStride)
1322 {
1323 //FIXME interpolate chroma
1324 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1325 }
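
/* A scalar sketch (hypothetical helper, not used here) of the interleave
 * that yuvPlanartoyuy2() performs with punpcklbw/punpckhbw: every two luma
 * samples are packed with one U and one V sample in Y0 U Y1 V order. */
static inline void RENAME(yuy2_pack_line_sketch)(const uint8_t *ysrc, const uint8_t *usrc,
                                                 const uint8_t *vsrc, uint8_t *dst,
                                                 long width)
{
    long i;
    for (i = 0; i < width/2; i++) {
        dst[4*i + 0] = ysrc[2*i];     /* Y0 */
        dst[4*i + 1] = usrc[i];       /* U  */
        dst[4*i + 2] = ysrc[2*i + 1]; /* Y1 */
        dst[4*i + 3] = vsrc[i];       /* V  */
    }
}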
1326
1327 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1328 long width, long height,
1329 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1330 {
1331 long y;
1332 const x86_reg chromWidth= width>>1;
1333 for (y=0; y<height; y++) {
1334 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1335 __asm__ volatile(
1336 "xor %%"REG_a", %%"REG_a" \n\t"
1337 ".p2align 4 \n\t"
1338 "1: \n\t"
1339 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1340 PREFETCH" 32(%2, %%"REG_a") \n\t"
1341 PREFETCH" 32(%3, %%"REG_a") \n\t"
1342 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1343 "movq %%mm0, %%mm2 \n\t" // U(0)
1344 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1345 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1346 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1347
1348 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1349 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1350 "movq %%mm0, %%mm4 \n\t" // Y(0)
1351 "movq %%mm2, %%mm6 \n\t" // Y(8)
1352 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1353 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1354 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1355 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1356
1357 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1358 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1359 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1360 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1361
1362 "add $8, %%"REG_a" \n\t"
1363 "cmp %4, %%"REG_a" \n\t"
1364 " jb 1b \n\t"
1365 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1366 : "%"REG_a
1367 );
1368 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1369 usrc += chromStride;
1370 vsrc += chromStride;
1371 }
1372 ysrc += lumStride;
1373 dst += dstStride;
1374 }
1375 __asm__(EMMS" \n\t"
1376 SFENCE" \n\t"
1377 :::"memory");
1378 }
1379
1380 /**
1381 * Height should be a multiple of 2 and width should be a multiple of 16.
1382 * (If this is a problem for anyone then tell me, and I will fix it.)
1383 */
1384 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1385 long width, long height,
1386 long lumStride, long chromStride, long dstStride)
1387 {
1388 //FIXME interpolate chroma
1389 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1390 }
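
/*
 * The UYVY variant differs from the YUY2 one only in the interleave order:
 * yuvPlanartouyvy() unpacks luma into the chroma registers instead of the
 * other way around, so each output pixel pair is written as
 *     dst[0] = U, dst[1] = Y0, dst[2] = V, dst[3] = Y1
 */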
1391
1392 /**
1393 * Width should be a multiple of 16.
1394 */
1395 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1396 long width, long height,
1397 long lumStride, long chromStride, long dstStride)
1398 {
1399 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1400 }
1401
1402 /**
1403 * Width should be a multiple of 16.
1404 */
1405 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1406 long width, long height,
1407 long lumStride, long chromStride, long dstStride)
1408 {
1409 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1410 }
1411
1412 /**
1413 * Height should be a multiple of 2 and width should be a multiple of 16.
1414 * (If this is a problem for anyone then tell me, and I will fix it.)
1415 */
1416 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1417 long width, long height,
1418 long lumStride, long chromStride, long srcStride)
1419 {
1420 long y;
1421 const x86_reg chromWidth= width>>1;
1422 for (y=0; y<height; y+=2) {
1423 __asm__ volatile(
1424 "xor %%"REG_a", %%"REG_a" \n\t"
1425 "pcmpeqw %%mm7, %%mm7 \n\t"
1426 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1427 ".p2align 4 \n\t"
1428 "1: \n\t"
1429 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1430 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1431 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1432 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1433 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1434 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1435 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1436 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1437 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1438 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1439 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1440
1441 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1442
1443 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1444 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1445 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1446 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1447 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1448 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1449 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1450 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1451 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1452 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1453
1454 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1455
1456 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1457 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1458 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1459 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1460 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1461 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1462 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1463 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1464
1465 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1466 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1467
1468 "add $8, %%"REG_a" \n\t"
1469 "cmp %4, %%"REG_a" \n\t"
1470 " jb 1b \n\t"
1471 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1472 : "memory", "%"REG_a
1473 );
1474
1475 ydst += lumStride;
1476 src += srcStride;
1477
1478 __asm__ volatile(
1479 "xor %%"REG_a", %%"REG_a" \n\t"
1480 ".p2align 4 \n\t"
1481 "1: \n\t"
1482 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1483 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1484 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1485 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1486 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1487 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1488 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1489 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1490 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1491 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1492 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1493
1494 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1495 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1496
1497 "add $8, %%"REG_a" \n\t"
1498 "cmp %4, %%"REG_a" \n\t"
1499 " jb 1b \n\t"
1500
1501 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1502 : "memory", "%"REG_a
1503 );
1504 udst += chromStride;
1505 vdst += chromStride;
1506 ydst += lumStride;
1507 src += srcStride;
1508 }
1509 __asm__ volatile(EMMS" \n\t"
1510 SFENCE" \n\t"
1511 :::"memory");
1512 }
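
/* A scalar sketch (hypothetical helper, not used here) of the even-line
 * unpacking above: Y0 U Y1 V pairs yield both luma and chroma, while odd
 * lines keep only the luma, since YV12 chroma is subsampled 2x vertically. */
static inline void RENAME(yuy2_unpack_line_sketch)(const uint8_t *src, uint8_t *ydst,
                                                   uint8_t *udst, uint8_t *vdst,
                                                   long width)
{
    long i;
    for (i = 0; i < width/2; i++) {
        ydst[2*i]     = src[4*i + 0];
        udst[i]       = src[4*i + 1];
        ydst[2*i + 1] = src[4*i + 2];
        vdst[i]       = src[4*i + 3];
    }
}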
1513 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1514
1515 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1516 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1517 {
1518 long x,y;
1519
1520 dst[0]= src[0];
1521
1522 // first line
1523 for (x=0; x<srcWidth-1; x++) {
1524 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1525 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1526 }
1527 dst[2*srcWidth-1]= src[srcWidth-1];
1528
1529 dst+= dstStride;
1530
1531 for (y=1; y<srcHeight; y++) {
1532 const x86_reg mmxSize= srcWidth&~15;
1533 __asm__ volatile(
1534 "mov %4, %%"REG_a" \n\t"
1535 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1536 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1537 "movq %%mm4, %%mm2 \n\t"
1538 "psllq $8, %%mm4 \n\t"
1539 "pand %%mm0, %%mm2 \n\t"
1540 "por %%mm2, %%mm4 \n\t"
1541 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1542 "movq %%mm5, %%mm3 \n\t"
1543 "psllq $8, %%mm5 \n\t"
1544 "pand %%mm0, %%mm3 \n\t"
1545 "por %%mm3, %%mm5 \n\t"
1546 "1: \n\t"
1547 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1548 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1549 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1550 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1551 PAVGB" %%mm0, %%mm5 \n\t"
1552 PAVGB" %%mm0, %%mm3 \n\t"
1553 PAVGB" %%mm0, %%mm5 \n\t"
1554 PAVGB" %%mm0, %%mm3 \n\t"
1555 PAVGB" %%mm1, %%mm4 \n\t"
1556 PAVGB" %%mm1, %%mm2 \n\t"
1557 PAVGB" %%mm1, %%mm4 \n\t"
1558 PAVGB" %%mm1, %%mm2 \n\t"
1559 "movq %%mm5, %%mm7 \n\t"
1560 "movq %%mm4, %%mm6 \n\t"
1561 "punpcklbw %%mm3, %%mm5 \n\t"
1562 "punpckhbw %%mm3, %%mm7 \n\t"
1563 "punpcklbw %%mm2, %%mm4 \n\t"
1564 "punpckhbw %%mm2, %%mm6 \n\t"
1565 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1566 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1567 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1568 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1569 "add $8, %%"REG_a" \n\t"
1570 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1571 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1572 " js 1b \n\t"
1573 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1574 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1575 "g" (-mmxSize)
1576 : "%"REG_a
1577 );
1578
1579 for (x=mmxSize-1; x<srcWidth-1; x++) {
1580 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1581 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1582 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1583 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1584 }
1585 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1586 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1587
1588 dst+=dstStride*2;
1589 src+=srcStride;
1590 }
1591
1592 // last line
1593 dst[0]= src[0];
1594
1595 for (x=0; x<srcWidth-1; x++) {
1596 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1597 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1598 }
1599 dst[2*srcWidth-1]= src[srcWidth-1];
1600
1601 __asm__ volatile(EMMS" \n\t"
1602 SFENCE" \n\t"
1603 :::"memory");
1604 }
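
/*
 * Note on the PAVGB chains above: pavgb/pavgusb compute (a + b + 1) >> 1,
 * so applying the instruction twice, avg(a, avg(a, b)), approximates the
 * (3*a + b) >> 2 kernel of the C edge loops, with slightly different
 * rounding.
 */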
1605 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
1606
1607 #if !COMPILE_TEMPLATE_AMD3DNOW
1608 /**
1609 * Height should be a multiple of 2 and width should be a multiple of 16.
1610 * (If this is a problem for anyone then tell me, and I will fix it.)
1611 * Chrominance data is only taken from every second line; the others are ignored.
1612 * FIXME: Write HQ version.
1613 */
1614 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1615 long width, long height,
1616 long lumStride, long chromStride, long srcStride)
1617 {
1618 long y;
1619 const x86_reg chromWidth= width>>1;
1620 for (y=0; y<height; y+=2) {
1621 __asm__ volatile(
1622 "xor %%"REG_a", %%"REG_a" \n\t"
1623 "pcmpeqw %%mm7, %%mm7 \n\t"
1624 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1625 ".p2align 4 \n\t"
1626 "1: \n\t"
1627 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1628 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1629 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1630 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1631 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1632 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1633 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1634 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1635 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1636 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1637 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1638
1639 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1640
1641 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1642 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1643 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1644 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1645 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1646 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1647 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1648 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1649 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1650 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1651
1652 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1653
1654 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1655 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1656 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1657 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1658 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1659 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1660 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1661 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1662
1663 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1664 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1665
1666 "add $8, %%"REG_a" \n\t"
1667 "cmp %4, %%"REG_a" \n\t"
1668 " jb 1b \n\t"
1669 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1670 : "memory", "%"REG_a
1671 );
1672
1673 ydst += lumStride;
1674 src += srcStride;
1675
1676 __asm__ volatile(
1677 "xor %%"REG_a", %%"REG_a" \n\t"
1678 ".p2align 4 \n\t"
1679 "1: \n\t"
1680 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1681 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1682 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1683 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1684 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1685 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1686 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1687 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1688 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1689 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1690 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1691
1692 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1693 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1694
1695 "add $8, %%"REG_a" \n\t"
1696 "cmp %4, %%"REG_a" \n\t"
1697 " jb 1b \n\t"
1698
1699 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1700 : "memory", "%"REG_a
1701 );
1702 udst += chromStride;
1703 vdst += chromStride;
1704 ydst += lumStride;
1705 src += srcStride;
1706 }
1707 __asm__ volatile(EMMS" \n\t"
1708 SFENCE" \n\t"
1709 :::"memory");
1710 }
1711 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1712
1713 /**
1714 * Height should be a multiple of 2 and width should be a multiple of 2.
1715 * (If this is a problem for anyone then tell me, and I will fix it.)
1716 * Chrominance data is only taken from every second line;
1717 * the others are ignored in the C version.
1718 * FIXME: Write HQ version.
1719 */
1720 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1721 long width, long height,
1722 long lumStride, long chromStride, long srcStride)
1723 {
1724 long y;
1725 const x86_reg chromWidth= width>>1;
1726 for (y=0; y<height-2; y+=2) {
1727 long i;
1728 for (i=0; i<2; i++) {
1729 __asm__ volatile(
1730 "mov %2, %%"REG_a" \n\t"
1731 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1732 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1733 "pxor %%mm7, %%mm7 \n\t"
1734 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1735 ".p2align 4 \n\t"
1736 "1: \n\t"
1737 PREFETCH" 64(%0, %%"REG_d") \n\t"
1738 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1739 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1740 "punpcklbw %%mm7, %%mm0 \n\t"
1741 "punpcklbw %%mm7, %%mm1 \n\t"
1742 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1743 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1744 "punpcklbw %%mm7, %%mm2 \n\t"
1745 "punpcklbw %%mm7, %%mm3 \n\t"
1746 "pmaddwd %%mm6, %%mm0 \n\t"
1747 "pmaddwd %%mm6, %%mm1 \n\t"
1748 "pmaddwd %%mm6, %%mm2 \n\t"
1749 "pmaddwd %%mm6, %%mm3 \n\t"
1750 #ifndef FAST_BGR2YV12
1751 "psrad $8, %%mm0 \n\t"
1752 "psrad $8, %%mm1 \n\t"
1753 "psrad $8, %%mm2 \n\t"
1754 "psrad $8, %%mm3 \n\t"
1755 #endif
1756 "packssdw %%mm1, %%mm0 \n\t"
1757 "packssdw %%mm3, %%mm2 \n\t"
1758 "pmaddwd %%mm5, %%mm0 \n\t"
1759 "pmaddwd %%mm5, %%mm2 \n\t"
1760 "packssdw %%mm2, %%mm0 \n\t"
1761 "psraw $7, %%mm0 \n\t"
1762
1763 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1764 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1765 "punpcklbw %%mm7, %%mm4 \n\t"
1766 "punpcklbw %%mm7, %%mm1 \n\t"
1767 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1768 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
1770 "punpcklbw %%mm7, %%mm3 \n\t"
1771 "pmaddwd %%mm6, %%mm4 \n\t"
1772 "pmaddwd %%mm6, %%mm1 \n\t"
1773 "pmaddwd %%mm6, %%mm2 \n\t"
1774 "pmaddwd %%mm6, %%mm3 \n\t"
1775 #ifndef FAST_BGR2YV12
1776 "psrad $8, %%mm4 \n\t"
1777 "psrad $8, %%mm1 \n\t"
1778 "psrad $8, %%mm2 \n\t"
1779 "psrad $8, %%mm3 \n\t"
1780 #endif
1781 "packssdw %%mm1, %%mm4 \n\t"
1782 "packssdw %%mm3, %%mm2 \n\t"
1783 "pmaddwd %%mm5, %%mm4 \n\t"
1784 "pmaddwd %%mm5, %%mm2 \n\t"
1785 "add $24, %%"REG_d" \n\t"
1786 "packssdw %%mm2, %%mm4 \n\t"
1787 "psraw $7, %%mm4 \n\t"
1788
1789 "packuswb %%mm4, %%mm0 \n\t"
1790 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1791
1792 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1793 "add $8, %%"REG_a" \n\t"
1794 " js 1b \n\t"
1795 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1796 : "%"REG_a, "%"REG_d
1797 );
1798 ydst += lumStride;
1799 src += srcStride;
1800 }
1801 src -= srcStride*2;
1802 __asm__ volatile(
1803 "mov %4, %%"REG_a" \n\t"
1804 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1805 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1806 "pxor %%mm7, %%mm7 \n\t"
1807 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1808 "add %%"REG_d", %%"REG_d" \n\t"
1809 ".p2align 4 \n\t"
1810 "1: \n\t"
1811 PREFETCH" 64(%0, %%"REG_d") \n\t"
1812 PREFETCH" 64(%1, %%"REG_d") \n\t"
1813 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1814 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1815 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1816 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1817 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1818 PAVGB" %%mm1, %%mm0 \n\t"
1819 PAVGB" %%mm3, %%mm2 \n\t"
1820 "movq %%mm0, %%mm1 \n\t"
1821 "movq %%mm2, %%mm3 \n\t"
1822 "psrlq $24, %%mm0 \n\t"
1823 "psrlq $24, %%mm2 \n\t"
1824 PAVGB" %%mm1, %%mm0 \n\t"
1825 PAVGB" %%mm3, %%mm2 \n\t"
1826 "punpcklbw %%mm7, %%mm0 \n\t"
1827 "punpcklbw %%mm7, %%mm2 \n\t"
1828 #else
1829 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1830 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1831 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1832 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1833 "punpcklbw %%mm7, %%mm0 \n\t"
1834 "punpcklbw %%mm7, %%mm1 \n\t"
1835 "punpcklbw %%mm7, %%mm2 \n\t"
1836 "punpcklbw %%mm7, %%mm3 \n\t"
1837 "paddw %%mm1, %%mm0 \n\t"
1838 "paddw %%mm3, %%mm2 \n\t"
1839 "paddw %%mm2, %%mm0 \n\t"
1840 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1841 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1842 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1843 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1844 "punpcklbw %%mm7, %%mm4 \n\t"
1845 "punpcklbw %%mm7, %%mm1 \n\t"
1846 "punpcklbw %%mm7, %%mm2 \n\t"
1847 "punpcklbw %%mm7, %%mm3 \n\t"
1848 "paddw %%mm1, %%mm4 \n\t"
1849 "paddw %%mm3, %%mm2 \n\t"
1850 "paddw %%mm4, %%mm2 \n\t"
1851 "psrlw $2, %%mm0 \n\t"
1852 "psrlw $2, %%mm2 \n\t"
1853 #endif
1854 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1855 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1856
1857 "pmaddwd %%mm0, %%mm1 \n\t"
1858 "pmaddwd %%mm2, %%mm3 \n\t"
1859 "pmaddwd %%mm6, %%mm0 \n\t"
1860 "pmaddwd %%mm6, %%mm2 \n\t"
1861 #ifndef FAST_BGR2YV12
1862 "psrad $8, %%mm0 \n\t"
1863 "psrad $8, %%mm1 \n\t"
1864 "psrad $8, %%mm2 \n\t"
1865 "psrad $8, %%mm3 \n\t"
1866 #endif
1867 "packssdw %%mm2, %%mm0 \n\t"
1868 "packssdw %%mm3, %%mm1 \n\t"
1869 "pmaddwd %%mm5, %%mm0 \n\t"
1870 "pmaddwd %%mm5, %%mm1 \n\t"
1871 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1872 "psraw $7, %%mm0 \n\t"
1873
1874 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1875 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1876 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1877 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1878 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1879 PAVGB" %%mm1, %%mm4 \n\t"
1880 PAVGB" %%mm3, %%mm2 \n\t"
1881 "movq %%mm4, %%mm1 \n\t"
1882 "movq %%mm2, %%mm3 \n\t"
1883 "psrlq $24, %%mm4 \n\t"
1884 "psrlq $24, %%mm2 \n\t"
1885 PAVGB" %%mm1, %%mm4 \n\t"
1886 PAVGB" %%mm3, %%mm2 \n\t"
1887 "punpcklbw %%mm7, %%mm4 \n\t"
1888 "punpcklbw %%mm7, %%mm2 \n\t"
1889 #else
1890 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1891 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1892 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1893 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1894 "punpcklbw %%mm7, %%mm4 \n\t"
1895 "punpcklbw %%mm7, %%mm1 \n\t"
1896 "punpcklbw %%mm7, %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "paddw %%mm1, %%mm4 \n\t"
1899 "paddw %%mm3, %%mm2 \n\t"
1900 "paddw %%mm2, %%mm4 \n\t"
1901 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1902 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1903 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1904 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1905 "punpcklbw %%mm7, %%mm5 \n\t"
1906 "punpcklbw %%mm7, %%mm1 \n\t"
1907 "punpcklbw %%mm7, %%mm2 \n\t"
1908 "punpcklbw %%mm7, %%mm3 \n\t"
1909 "paddw %%mm1, %%mm5 \n\t"
1910 "paddw %%mm3, %%mm2 \n\t"
1911 "paddw %%mm5, %%mm2 \n\t"
1912 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1913 "psrlw $2, %%mm4 \n\t"
1914 "psrlw $2, %%mm2 \n\t"
1915 #endif
1916 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1917 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1918
1919 "pmaddwd %%mm4, %%mm1 \n\t"
1920 "pmaddwd %%mm2, %%mm3 \n\t"
1921 "pmaddwd %%mm6, %%mm4 \n\t"
1922 "pmaddwd %%mm6, %%mm2 \n\t"
1923 #ifndef FAST_BGR2YV12
1924 "psrad $8, %%mm4 \n\t"
1925 "psrad $8, %%mm1 \n\t"
1926 "psrad $8, %%mm2 \n\t"
1927 "psrad $8, %%mm3 \n\t"
1928 #endif
1929 "packssdw %%mm2, %%mm4 \n\t"
1930 "packssdw %%mm3, %%mm1 \n\t"
1931 "pmaddwd %%mm5, %%mm4 \n\t"
1932 "pmaddwd %%mm5, %%mm1 \n\t"
1933 "add $24, %%"REG_d" \n\t"
1934 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1935 "psraw $7, %%mm4 \n\t"
1936
1937 "movq %%mm0, %%mm1 \n\t"
1938 "punpckldq %%mm4, %%mm0 \n\t"
1939 "punpckhdq %%mm4, %%mm1 \n\t"
1940 "packsswb %%mm1, %%mm0 \n\t"
1941 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1942 "movd %%mm0, (%2, %%"REG_a") \n\t"
1943 "punpckhdq %%mm0, %%mm0 \n\t"
1944 "movd %%mm0, (%3, %%"REG_a") \n\t"
1945 "add $4, %%"REG_a" \n\t"
1946 " js 1b \n\t"
1947 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
        : "%"REG_a, "%"REG_d, "memory"
1949 );
1950
1951 udst += chromStride;
1952 vdst += chromStride;
1953 src += srcStride*2;
1954 }
1955
1956 __asm__ volatile(EMMS" \n\t"
1957 SFENCE" \n\t"
1958 :::"memory");
1959
1960 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1961 }
1962 #endif /* !COMPILE_TEMPLATE_SSE2 */
1963
1964 #if !COMPILE_TEMPLATE_AMD3DNOW
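/*
 * Interleave two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i].
 * The scalar tail inside the row loop is the reference; roughly equivalent
 * scalar code for a whole frame, for illustration only:
 *
 *     for (long h = 0; h < height; h++) {
 *         for (long w = 0; w < width; w++) {
 *             dest[2*w + 0] = src1[w];
 *             dest[2*w + 1] = src2[w];
 *         }
 *         dest += dstStride;
 *         src1 += src1Stride;
 *         src2 += src2Stride;
 *     }
 *
 * The SIMD loops appear to assume width >= 16: with a smaller width the
 * (width - 15) bound goes negative and the unsigned "jb" comparison would
 * let the loop run far past the row.
 */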
1965 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1966 long width, long height, long src1Stride,
1967 long src2Stride, long dstStride)
1968 {
1969 long h;
1970
1971 for (h=0; h < height; h++) {
1972 long w;
1973
1974 #if COMPILE_TEMPLATE_SSE2
1975 __asm__(
1976 "xor %%"REG_a", %%"REG_a" \n\t"
1977 "1: \n\t"
1978 PREFETCH" 64(%1, %%"REG_a") \n\t"
1979 PREFETCH" 64(%2, %%"REG_a") \n\t"
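        /*
         * movdqa and movntdq fault on unaligned addresses; since REG_a
         * advances in 16-byte steps, this path effectively assumes src1,
         * src2 and dest are all 16-byte aligned.
         */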
        "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
        "movdqa %%xmm0, %%xmm1          \n\t" /* copy instead of reloading the same memory */
        "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1983 "punpcklbw %%xmm2, %%xmm0 \n\t"
1984 "punpckhbw %%xmm2, %%xmm1 \n\t"
1985 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1986 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1987 "add $16, %%"REG_a" \n\t"
1988 "cmp %3, %%"REG_a" \n\t"
1989 " jb 1b \n\t"
1990 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a
1992 );
1993 #else
1994 __asm__(
1995 "xor %%"REG_a", %%"REG_a" \n\t"
1996 "1: \n\t"
1997 PREFETCH" 64(%1, %%"REG_a") \n\t"
1998 PREFETCH" 64(%2, %%"REG_a") \n\t"
1999 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2000 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2001 "movq %%mm0, %%mm1 \n\t"
2002 "movq %%mm2, %%mm3 \n\t"
2003 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2004 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2005 "punpcklbw %%mm4, %%mm0 \n\t"
2006 "punpckhbw %%mm4, %%mm1 \n\t"
2007 "punpcklbw %%mm5, %%mm2 \n\t"
2008 "punpckhbw %%mm5, %%mm3 \n\t"
2009 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2010 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2011 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2012 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2013 "add $16, %%"REG_a" \n\t"
2014 "cmp %3, %%"REG_a" \n\t"
2015 " jb 1b \n\t"
2016 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2017 : "memory", "%"REG_a
2018 );
2019 #endif
        for (w = width & ~15; w < width; w++) {
2021 dest[2*w+0] = src1[w];
2022 dest[2*w+1] = src2[w];
2023 }
2024 dest += dstStride;
2025 src1 += src1Stride;
2026 src2 += src2Stride;
2027 }
2028 __asm__(
2029 EMMS" \n\t"
2030 SFENCE" \n\t"
2031 ::: "memory"
2032 );
2033 }
2034 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2035
2036 #if !COMPILE_TEMPLATE_SSE2
2037 #if !COMPILE_TEMPLATE_AMD3DNOW
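/*
 * Double two chroma planes in both directions (YVU9-sized planes to
 * YV12-sized ones): each source byte fills a 2x2 destination block.
 * Columns are doubled with the punpck self-unpack trick below, rows by
 * reading the source line at y>>1 twice.
 */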
2038 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2039 uint8_t *dst1, uint8_t *dst2,
2040 long width, long height,
2041 long srcStride1, long srcStride2,
2042 long dstStride1, long dstStride2)
2043 {
2044 x86_reg y;
2045 long x,w,h;
2046 w=width/2; h=height/2;
2047 __asm__ volatile(
2048 PREFETCH" %0 \n\t"
2049 PREFETCH" %1 \n\t"
2050 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2051 for (y=0;y<h;y++) {
2052 const uint8_t* s1=src1+srcStride1*(y>>1);
2053 uint8_t* d=dst1+dstStride1*y;
2054 x=0;
2055 for (;x<w-31;x+=32) {
2056 __asm__ volatile(
2057 PREFETCH" 32%1 \n\t"
2058 "movq %1, %%mm0 \n\t"
2059 "movq 8%1, %%mm2 \n\t"
2060 "movq 16%1, %%mm4 \n\t"
2061 "movq 24%1, %%mm6 \n\t"
2062 "movq %%mm0, %%mm1 \n\t"
2063 "movq %%mm2, %%mm3 \n\t"
2064 "movq %%mm4, %%mm5 \n\t"
2065 "movq %%mm6, %%mm7 \n\t"
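            /*
             * Unpacking a register with itself duplicates every byte
             * (punpcklbw doubles the low four, punpckhbw the high four),
             * which is the 2x horizontal upsample.  The second loop below
             * uses the same trick.
             */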
2066 "punpcklbw %%mm0, %%mm0 \n\t"
2067 "punpckhbw %%mm1, %%mm1 \n\t"
2068 "punpcklbw %%mm2, %%mm2 \n\t"
2069 "punpckhbw %%mm3, %%mm3 \n\t"
2070 "punpcklbw %%mm4, %%mm4 \n\t"
2071 "punpckhbw %%mm5, %%mm5 \n\t"
2072 "punpcklbw %%mm6, %%mm6 \n\t"
2073 "punpckhbw %%mm7, %%mm7 \n\t"
2074 MOVNTQ" %%mm0, %0 \n\t"
2075 MOVNTQ" %%mm1, 8%0 \n\t"
2076 MOVNTQ" %%mm2, 16%0 \n\t"
2077 MOVNTQ" %%mm3, 24%0 \n\t"
2078 MOVNTQ" %%mm4, 32%0 \n\t"
2079 MOVNTQ" %%mm5, 40%0 \n\t"
2080 MOVNTQ" %%mm6, 48%0 \n\t"
2081 MOVNTQ" %%mm7, 56%0"
2082 :"=m"(d[2*x])
2083 :"m"(s1[x])
2084 :"memory");
2085 }
2086 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2087 }
2088 for (y=0;y<h;y++) {
2089 const uint8_t* s2=src2+srcStride2*(y>>1);
2090 uint8_t* d=dst2+dstStride2*y;
2091 x=0;
2092 for (;x<w-31;x+=32) {
2093 __asm__ volatile(
2094 PREFETCH" 32%1 \n\t"
2095 "movq %1, %%mm0 \n\t"
2096 "movq 8%1, %%mm2 \n\t"
2097 "movq 16%1, %%mm4 \n\t"
2098 "movq 24%1, %%mm6 \n\t"
2099 "movq %%mm0, %%mm1 \n\t"
2100 "movq %%mm2, %%mm3 \n\t"
2101 "movq %%mm4, %%mm5 \n\t"
2102 "movq %%mm6, %%mm7 \n\t"
2103 "punpcklbw %%mm0, %%mm0 \n\t"
2104 "punpckhbw %%mm1, %%mm1 \n\t"
2105 "punpcklbw %%mm2, %%mm2 \n\t"
2106 "punpckhbw %%mm3, %%mm3 \n\t"
2107 "punpcklbw %%mm4, %%mm4 \n\t"
2108 "punpckhbw %%mm5, %%mm5 \n\t"
2109 "punpcklbw %%mm6, %%mm6 \n\t"
2110 "punpckhbw %%mm7, %%mm7 \n\t"
2111 MOVNTQ" %%mm0, %0 \n\t"
2112 MOVNTQ" %%mm1, 8%0 \n\t"
2113 MOVNTQ" %%mm2, 16%0 \n\t"
2114 MOVNTQ" %%mm3, 24%0 \n\t"
2115 MOVNTQ" %%mm4, 32%0 \n\t"
2116 MOVNTQ" %%mm5, 40%0 \n\t"
2117 MOVNTQ" %%mm6, 48%0 \n\t"
2118 MOVNTQ" %%mm7, 56%0"
2119 :"=m"(d[2*x])
2120 :"m"(s2[x])
2121 :"memory");
2122 }
2123 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2124 }
2125 __asm__(
2126 EMMS" \n\t"
2127 SFENCE" \n\t"
2128 ::: "memory"
2129 );
2130 }
2131
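/*
 * Pack planar YVU9 into YUY2: chroma is subsampled 4x4, so each U/V
 * sample serves four luma pixels per row (see the scalar tail) and four
 * source lines via the y>>2 row index.
 */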
2132 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2133 uint8_t *dst,
2134 long width, long height,
2135 long srcStride1, long srcStride2,
2136 long srcStride3, long dstStride)
2137 {
2138 x86_reg x;
2139 long y,w,h;
2140 w=width/2; h=height;
2141 for (y=0;y<h;y++) {
2142 const uint8_t* yp=src1+srcStride1*y;
2143 const uint8_t* up=src2+srcStride2*(y>>2);
2144 const uint8_t* vp=src3+srcStride3*(y>>2);
2145 uint8_t* d=dst+dstStride*y;
2146 x=0;
2147 for (;x<w-7;x+=8) {
2148 __asm__ volatile(
2149 PREFETCH" 32(%1, %0) \n\t"
2150 PREFETCH" 32(%2, %0) \n\t"
2151 PREFETCH" 32(%3, %0) \n\t"
2152 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2153 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2154 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2155 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2156 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2157 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2158 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2159 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2160 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2161 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2162
2163 "movq %%mm1, %%mm6 \n\t"
2164 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2165 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2166 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2167 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2168 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2169
2170 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2171 "movq 8(%1, %0, 4), %%mm0 \n\t"
2172 "movq %%mm0, %%mm3 \n\t"
2173 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2174 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2175 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2176 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2177
2178 "movq %%mm4, %%mm6 \n\t"
2179 "movq 16(%1, %0, 4), %%mm0 \n\t"
2180 "movq %%mm0, %%mm3 \n\t"
2181 "punpcklbw %%mm5, %%mm4 \n\t"
2182 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2183 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2184 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2185 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2186
2187 "punpckhbw %%mm5, %%mm6 \n\t"
2188 "movq 24(%1, %0, 4), %%mm0 \n\t"
2189 "movq %%mm0, %%mm3 \n\t"
2190 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2191 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2192 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2193 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2194
2195 : "+r" (x)
2196 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2197 :"memory");
2198 }
2199 for (; x<w; x++) {
2200 const long x2 = x<<2;
2201 d[8*x+0] = yp[x2];
2202 d[8*x+1] = up[x];
2203 d[8*x+2] = yp[x2+1];
2204 d[8*x+3] = vp[x];
2205 d[8*x+4] = yp[x2+2];
2206 d[8*x+5] = up[x];
2207 d[8*x+6] = yp[x2+3];
2208 d[8*x+7] = vp[x];
2209 }
2210 }
2211 __asm__(
2212 EMMS" \n\t"
2213 SFENCE" \n\t"
2214 ::: "memory"
2215 );
2216 }
2217 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2218
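/*
 * extract_even: dst[i] = src[2*i], i.e. copy the even-positioned bytes
 * (the luma of YUYV, or of UYVY when called with src+1).  Like the other
 * extract_* helpers below, it biases the pointers past the end of the
 * buffers and counts a negative index up toward zero, so the loop can
 * branch on the sign flag ("js") without a separate compare.
 */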
2219 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2220 {
2221 dst += count;
2222 src += 2*count;
    count = -count;
2224
2225 if(count <= -16) {
2226 count += 15;
2227 __asm__ volatile(
2228 "pcmpeqw %%mm7, %%mm7 \n\t"
2229 "psrlw $8, %%mm7 \n\t"
2230 "1: \n\t"
2231 "movq -30(%1, %0, 2), %%mm0 \n\t"
2232 "movq -22(%1, %0, 2), %%mm1 \n\t"
2233 "movq -14(%1, %0, 2), %%mm2 \n\t"
2234 "movq -6(%1, %0, 2), %%mm3 \n\t"
2235 "pand %%mm7, %%mm0 \n\t"
2236 "pand %%mm7, %%mm1 \n\t"
2237 "pand %%mm7, %%mm2 \n\t"
2238 "pand %%mm7, %%mm3 \n\t"
2239 "packuswb %%mm1, %%mm0 \n\t"
2240 "packuswb %%mm3, %%mm2 \n\t"
2241 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2242 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2243 "add $16, %0 \n\t"
2244 " js 1b \n\t"
2245 : "+r"(count)
        : "r"(src), "r"(dst)
        : "memory"
    );
2248 count -= 15;
2249 }
2250 while(count<0) {
2251 dst[count]= src[2*count];
2252 count++;
2253 }
2254 }
2255
2256 #if !COMPILE_TEMPLATE_AMD3DNOW
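/*
 * extract_even2: deinterleave the even bytes into two planes,
 * dst0[i] = src[4*i] and dst1[i] = src[4*i + 2] (e.g. U and V from UYVY).
 */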
2257 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2258 {
2259 dst0+= count;
2260 dst1+= count;
2261 src += 4*count;
    count = -count;
2263 if(count <= -8) {
2264 count += 7;
2265 __asm__ volatile(
2266 "pcmpeqw %%mm7, %%mm7 \n\t"
2267 "psrlw $8, %%mm7 \n\t"
2268 "1: \n\t"
2269 "movq -28(%1, %0, 4), %%mm0 \n\t"
2270 "movq -20(%1, %0, 4), %%mm1 \n\t"
2271 "movq -12(%1, %0, 4), %%mm2 \n\t"
2272 "movq -4(%1, %0, 4), %%mm3 \n\t"
2273 "pand %%mm7, %%mm0 \n\t"
2274 "pand %%mm7, %%mm1 \n\t"
2275 "pand %%mm7, %%mm2 \n\t"
2276 "pand %%mm7, %%mm3 \n\t"
2277 "packuswb %%mm1, %%mm0 \n\t"
2278 "packuswb %%mm3, %%mm2 \n\t"
2279 "movq %%mm0, %%mm1 \n\t"
2280 "movq %%mm2, %%mm3 \n\t"
2281 "psrlw $8, %%mm0 \n\t"
2282 "psrlw $8, %%mm2 \n\t"
2283 "pand %%mm7, %%mm1 \n\t"
2284 "pand %%mm7, %%mm3 \n\t"
2285 "packuswb %%mm2, %%mm0 \n\t"
2286 "packuswb %%mm3, %%mm1 \n\t"
2287 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2288 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2289 "add $8, %0 \n\t"
2290 " js 1b \n\t"
2291 : "+r"(count)
        : "r"(src), "r"(dst0), "r"(dst1)
        : "memory"
    );
2294 count -= 7;
2295 }
2296 while(count<0) {
2297 dst0[count]= src[4*count+0];
2298 dst1[count]= src[4*count+2];
2299 count++;
2300 }
2301 }
2302 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2303
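/*
 * extract_even2 with two source rows averaged, as needed for 4:2:2 to
 * 4:2:0 chroma.  Note that PAVGB rounds up ((a + b + 1) >> 1) while the
 * scalar tail truncates, so the final samples of a row can differ by one;
 * without PAVGB (plain MMX) the scalar loop handles everything.
 */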
2304 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2305 {
2306 dst0 += count;
2307 dst1 += count;
2308 src0 += 4*count;
2309 src1 += 4*count;
    count = -count;
2311 #ifdef PAVGB
2312 if(count <= -8) {
2313 count += 7;
2314 __asm__ volatile(
2315 "pcmpeqw %%mm7, %%mm7 \n\t"
2316 "psrlw $8, %%mm7 \n\t"
2317 "1: \n\t"
2318 "movq -28(%1, %0, 4), %%mm0 \n\t"
2319 "movq -20(%1, %0, 4), %%mm1 \n\t"
2320 "movq -12(%1, %0, 4), %%mm2 \n\t"
2321 "movq -4(%1, %0, 4), %%mm3 \n\t"
2322 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2323 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2324 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2325 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2326 "pand %%mm7, %%mm0 \n\t"
2327 "pand %%mm7, %%mm1 \n\t"
2328 "pand %%mm7, %%mm2 \n\t"
2329 "pand %%mm7, %%mm3 \n\t"
2330 "packuswb %%mm1, %%mm0 \n\t"
2331 "packuswb %%mm3, %%mm2 \n\t"
2332 "movq %%mm0, %%mm1 \n\t"
2333 "movq %%mm2, %%mm3 \n\t"
2334 "psrlw $8, %%mm0 \n\t"
2335 "psrlw $8, %%mm2 \n\t"
2336 "pand %%mm7, %%mm1 \n\t"
2337 "pand %%mm7, %%mm3 \n\t"
2338 "packuswb %%mm2, %%mm0 \n\t"
2339 "packuswb %%mm3, %%mm1 \n\t"
2340 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2341 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2342 "add $8, %0 \n\t"
2343 " js 1b \n\t"
2344 : "+r"(count)
        : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        : "memory"
    );
2347 count -= 7;
2348 }
2349 #endif
2350 while(count<0) {
2351 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2352 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2353 count++;
2354 }
2355 }
2356
2357 #if !COMPILE_TEMPLATE_AMD3DNOW
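/*
 * extract_odd2: dst0[i] = src[4*i + 1], dst1[i] = src[4*i + 3] (e.g. U and
 * V from YUYV); the leading psrlw $8 selects the odd bytes.
 */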
2358 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2359 {
2360 dst0+= count;
2361 dst1+= count;
2362 src += 4*count;
    count = -count;
2364 if(count <= -8) {
2365 count += 7;
2366 __asm__ volatile(
2367 "pcmpeqw %%mm7, %%mm7 \n\t"
2368 "psrlw $8, %%mm7 \n\t"
2369 "1: \n\t"
2370 "movq -28(%1, %0, 4), %%mm0 \n\t"
2371 "movq -20(%1, %0, 4), %%mm1 \n\t"
2372 "movq -12(%1, %0, 4), %%mm2 \n\t"
2373 "movq -4(%1, %0, 4), %%mm3 \n\t"
2374 "psrlw $8, %%mm0 \n\t"
2375 "psrlw $8, %%mm1 \n\t"
2376 "psrlw $8, %%mm2 \n\t"
2377 "psrlw $8, %%mm3 \n\t"
2378 "packuswb %%mm1, %%mm0 \n\t"
2379 "packuswb %%mm3, %%mm2 \n\t"
2380 "movq %%mm0, %%mm1 \n\t"
2381 "movq %%mm2, %%mm3 \n\t"
2382 "psrlw $8, %%mm0 \n\t"
2383 "psrlw $8, %%mm2 \n\t"
2384 "pand %%mm7, %%mm1 \n\t"
2385 "pand %%mm7, %%mm3 \n\t"
2386 "packuswb %%mm2, %%mm0 \n\t"
2387 "packuswb %%mm3, %%mm1 \n\t"
2388 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2389 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2390 "add $8, %0 \n\t"
2391 " js 1b \n\t"
2392 : "+r"(count)
        : "r"(src), "r"(dst0), "r"(dst1)
        : "memory"
    );
2395 count -= 7;
2396 }
2397 src++;
2398 while(count<0) {
2399 dst0[count]= src[4*count+0];
2400 dst1[count]= src[4*count+2];
2401 count++;
2402 }
2403 }
2404 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2405
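/*
 * extract_odd2 with two-row averaging; the PAVGB-rounds-up versus
 * scalar-truncates caveat of extract_even2avg applies here too.
 */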
2406 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2407 {
2408 dst0 += count;
2409 dst1 += count;
2410 src0 += 4*count;
2411 src1 += 4*count;
    count = -count;
2413 #ifdef PAVGB
2414 if(count <= -8) {
2415 count += 7;
2416 __asm__ volatile(
2417 "pcmpeqw %%mm7, %%mm7 \n\t"
2418 "psrlw $8, %%mm7 \n\t"
2419 "1: \n\t"
2420 "movq -28(%1, %0, 4), %%mm0 \n\t"
2421 "movq -20(%1, %0, 4), %%mm1 \n\t"
2422 "movq -12(%1, %0, 4), %%mm2 \n\t"
2423 "movq -4(%1, %0, 4), %%mm3 \n\t"
2424 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2425 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2426 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2427 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2428 "psrlw $8, %%mm0 \n\t"
2429 "psrlw $8, %%mm1 \n\t"
2430 "psrlw $8, %%mm2 \n\t"
2431 "psrlw $8, %%mm3 \n\t"
2432 "packuswb %%mm1, %%mm0 \n\t"
2433 "packuswb %%mm3, %%mm2 \n\t"
2434 "movq %%mm0, %%mm1 \n\t"
2435 "movq %%mm2, %%mm3 \n\t"
2436 "psrlw $8, %%mm0 \n\t"
2437 "psrlw $8, %%mm2 \n\t"
2438 "pand %%mm7, %%mm1 \n\t"
2439 "pand %%mm7, %%mm3 \n\t"
2440 "packuswb %%mm2, %%mm0 \n\t"
2441 "packuswb %%mm3, %%mm1 \n\t"
2442 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2443 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2444 "add $8, %0 \n\t"
2445 " js 1b \n\t"
2446 : "+r"(count)
        : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        : "memory"
    );
2449 count -= 7;
2450 }
2451 #endif
2452 src0++;
2453 src1++;
2454 while(count<0) {
2455 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2456 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2457 count++;
2458 }
2459 }
2460
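/*
 * The wrappers below unpack YUYV/UYVY into planar 4:2:0 or 4:2:2 with the
 * extract_* helpers above; chromWidth = -((-width) >> 1) is simply
 * ceil(width / 2).  Illustrative scalar equivalent of yuyvtoyuv420 for one
 * chroma row pair ("prev", naming the line above src, is ours):
 *
 *     for (long i = 0; i < width; i++)
 *         ydst[i] = src[2*i];
 *     for (long i = 0; i < chromWidth; i++) {
 *         udst[i] = (prev[4*i + 1] + src[4*i + 1]) >> 1;
 *         vdst[i] = (prev[4*i + 3] + src[4*i + 3]) >> 1;
 *     }
 *
 * (the SIMD path rounds these averages up rather than truncating).
 */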
2461 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2462 long width, long height,
2463 long lumStride, long chromStride, long srcStride)
2464 {
2465 long y;
2466 const long chromWidth= -((-width)>>1);
2467
2468 for (y=0; y<height; y++) {
2469 RENAME(extract_even)(src, ydst, width);
2470 if(y&1) {
2471 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2472 udst+= chromStride;
2473 vdst+= chromStride;
2474 }
2475
2476 src += srcStride;
2477 ydst+= lumStride;
2478 }
2479 __asm__(
2480 EMMS" \n\t"
2481 SFENCE" \n\t"
2482 ::: "memory"
2483 );
2484 }
2485
2486 #if !COMPILE_TEMPLATE_AMD3DNOW
2487 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2488 long width, long height,
2489 long lumStride, long chromStride, long srcStride)
2490 {
2491 long y;
2492 const long chromWidth= -((-width)>>1);
2493
2494 for (y=0; y<height; y++) {
2495 RENAME(extract_even)(src, ydst, width);
2496 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2497
2498 src += srcStride;
2499 ydst+= lumStride;
2500 udst+= chromStride;
2501 vdst+= chromStride;
2502 }
2503 __asm__(
2504 EMMS" \n\t"
2505 SFENCE" \n\t"
2506 ::: "memory"
2507 );
2508 }
2509 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2510
2511 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2512 long width, long height,
2513 long lumStride, long chromStride, long srcStride)
2514 {
2515 long y;
2516 const long chromWidth= -((-width)>>1);
2517
2518 for (y=0; y<height; y++) {
2519 RENAME(extract_even)(src+1, ydst, width);
2520 if(y&1) {
2521 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2522 udst+= chromStride;
2523 vdst+= chromStride;
2524 }
2525
2526 src += srcStride;
2527 ydst+= lumStride;
2528 }
2529 __asm__(
2530 EMMS" \n\t"
2531 SFENCE" \n\t"
2532 ::: "memory"
2533 );
2534 }
2535
2536 #if !COMPILE_TEMPLATE_AMD3DNOW
2537 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2538 long width, long height,
2539 long lumStride, long chromStride, long srcStride)
2540 {
2541 long y;
2542 const long chromWidth= -((-width)>>1);
2543
2544 for (y=0; y<height; y++) {
2545 RENAME(extract_even)(src+1, ydst, width);
2546 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2547
2548 src += srcStride;
2549 ydst+= lumStride;
2550 udst+= chromStride;
2551 vdst+= chromStride;
2552 }
2553 __asm__(
2554 EMMS" \n\t"
2555 SFENCE" \n\t"
2556 ::: "memory"
2557 );
2558 }
2559 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2560 #endif /* !COMPILE_TEMPLATE_SSE2 */
2561
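/*
 * Install this template's RENAME()d implementations into the shared
 * rgb2rgb function pointers; which instantiation gets installed is decided
 * at runtime from the detected CPU flags by the file that includes this
 * template.
 */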
2562 static inline void RENAME(rgb2rgb_init)(void)
2563 {
2564 #if !COMPILE_TEMPLATE_SSE2
2565 #if !COMPILE_TEMPLATE_AMD3DNOW
2566 rgb15to16 = RENAME(rgb15to16);
2567 rgb15tobgr24 = RENAME(rgb15tobgr24);
2568 rgb15to32 = RENAME(rgb15to32);
2569 rgb16tobgr24 = RENAME(rgb16tobgr24);
2570 rgb16to32 = RENAME(rgb16to32);
2571 rgb16to15 = RENAME(rgb16to15);
2572 rgb24tobgr16 = RENAME(rgb24tobgr16);
2573 rgb24tobgr15 = RENAME(rgb24tobgr15);
2574 rgb24tobgr32 = RENAME(rgb24tobgr32);
2575 rgb32to16 = RENAME(rgb32to16);
2576 rgb32to15 = RENAME(rgb32to15);
2577 rgb32tobgr24 = RENAME(rgb32tobgr24);
2578 rgb24to15 = RENAME(rgb24to15);
2579 rgb24to16 = RENAME(rgb24to16);
2580 rgb24tobgr24 = RENAME(rgb24tobgr24);
2581 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2582 rgb32tobgr16 = RENAME(rgb32tobgr16);
2583 rgb32tobgr15 = RENAME(rgb32tobgr15);
2584 yv12toyuy2 = RENAME(yv12toyuy2);
2585 yv12touyvy = RENAME(yv12touyvy);
2586 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2587 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2588 yuy2toyv12 = RENAME(yuy2toyv12);
2589 vu9_to_vu12 = RENAME(vu9_to_vu12);
2590 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2591 uyvytoyuv422 = RENAME(uyvytoyuv422);
2592 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2594
2595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2596 planar2x = RENAME(planar2x);
2597 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
2598 rgb24toyv12 = RENAME(rgb24toyv12);
2599
2600 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2601 uyvytoyuv420 = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */
2603
2604 #if !COMPILE_TEMPLATE_AMD3DNOW
2605 interleaveBytes = RENAME(interleaveBytes);
2606 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2607 }