swscale: simplify rgb2rgb templating
[libav.git] / libswscale / x86 / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27 #include <stddef.h>
28
/*
 * Per-template instruction selection.  This file is #included several times
 * with different COMPILE_TEMPLATE_* macros set; the string macros below pick
 * the instruction variant (plain MMX, MMX2, 3DNow!, SSE2) used by the inline
 * assembly in the functions that follow.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (xmm), 8 otherwise (mm). */
#if COMPILE_TEMPLATE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
/* Plain MMX has no prefetch instruction; expand to an assembler comment. */
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if COMPILE_TEMPLATE_MMX2
/* Non-temporal store (bypasses the cache); must be followed by SFENCE. */
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
/* Plain store; SFENCE degenerates to an assembler comment. */
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
66
/*
 * Expand 24 bpp to 32 bpp by inserting an opaque alpha byte (255) after every
 * 3-byte pixel.  The MMX loop consumes 24 source bytes / produces 32
 * destination bytes per iteration, OR-ing in the alpha bits from mask32a;
 * the scalar loop handles the remaining tail bytes.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* last position at which a full 24-byte chunk still fits */
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: copy the three color bytes, then append alpha = 255 */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
111
/*
 * Pack eight 32-bit pixels into 24 bytes of 24-bit pixels and store them.
 * Input: mm0/mm1/mm4/mm5 hold the four source quadwords, with copies in
 * mm2/mm3/mm6/mm7 (set up by the caller).  The high byte of each pixel is
 * discarded and the three remaining bytes are shifted together; the result
 * is written with MOVNTQ to %0, 8%0 and 16%0.  Clobbers mm0-mm7.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm1, 8%0 \n\t" \
    MOVNTQ" %%mm4, 16%0"
147
148
/*
 * Drop the high (alpha/padding) byte of every 32-bit pixel, converting
 * 32 bpp to 24 bpp.  The MMX loop processes 32 source bytes / 24
 * destination bytes per iteration via STORE_BGR24_MMX; the scalar tail
 * copies three bytes and skips the fourth.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* last position at which a full 32-byte chunk still fits */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;          /* discard the 4th byte of each pixel */
    }
}
185
186 /*
187 original by Strepto/Astral
188 ported to gcc & bugfixed: A'rpi
189 MMX2, 3DNOW optimization by Nick Kurshev
190 32-bit C version, and and&add trick by Michael Niedermayer
191 */
/*
 * Convert 15-bit (1:5:5:5) pixels to 16-bit (5:6:5) in place semantics:
 * the red and green fields are shifted up one bit using the "and & add"
 * trick (x + (x & mask) doubles the masked bits), duplicating the top
 * green bit into the new low green bit position.  MMX path handles 16
 * bytes per iteration, then a 32-bit C loop, then at most one final pixel.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C fallback: two pixels per iteration, same and&add trick */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
235
/*
 * Convert 16-bit (5:6:5) pixels to 15-bit (1:5:5:5): red and green are
 * shifted down one bit (dropping the low green bit) while blue is kept,
 * using the mask15rg / mask15b constants.  MMX path processes 16 bytes
 * per iteration, then a 32-bit C loop, then at most one final pixel.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C fallback: two pixels at a time */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
284
/*
 * Convert 32 bpp pixels to 16-bit 5:6:5.  The active (#if 1) MMX path uses
 * a pmaddwd multiply trick to combine the shifted blue/red fields in one
 * instruction (constants mask3216g/mask3216br/mul3216); the disabled #else
 * path does the same with explicit shift+mask per channel.  A scalar loop
 * converts the remaining pixels.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: pack one 32-bit pixel into 5:6:5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
376
/*
 * Convert 32 bpp pixels to 16-bit 5:6:5 with the R and B channels swapped
 * relative to rgb32to16 (compare the shift directions in the scalar tail).
 * MMX path converts four pixels per iteration; scalar loop handles the rest.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
431
/*
 * Convert 32 bpp pixels to 15-bit 1:5:5:5.  Same structure as rgb32to16:
 * the active (#if 1) path uses the pmaddwd multiply trick (constants
 * mask3215g/mask3216br/mul3215, final shifts 6/10 instead of 5/11); the
 * disabled #else path uses explicit shift+mask per channel.  A scalar
 * loop converts the remaining pixels.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: pack one 32-bit pixel into 1:5:5:5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
523
/*
 * Convert 32 bpp pixels to 15-bit 1:5:5:5 with the R and B channels
 * swapped relative to rgb32to15 (compare the shift directions in the
 * scalar tail).  Four pixels per MMX iteration; scalar loop for the rest.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
578
/*
 * Convert 24 bpp pixels to 16-bit 5:6:5, swapping R and B.  The MMX loop
 * loads four 3-byte pixels with overlapping movd/punpckldq (offsets 0/3/6/9,
 * hence 12 source bytes per iteration) and shift+masks each channel into
 * place; the scalar tail packs byte triplets directly.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
635
/*
 * Convert 24 bpp pixels to 16-bit 5:6:5.  Same structure as rgb24tobgr16
 * but with opposite channel order (see the scalar tail, which reads the
 * bytes as r,g,b instead of b,g,r).  12 source bytes / 4 output pixels
 * per MMX iteration.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
692
/*
 * Convert 24 bpp pixels to 15-bit 1:5:5:5, swapping R and B.  Mirrors
 * rgb24tobgr16 but uses the 15-bit masks and shift amounts (3/6/9).
 * 12 source bytes / 4 output pixels per MMX iteration.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
749
/*
 * Convert 24 bpp pixels to 15-bit 1:5:5:5.  Same structure as rgb24tobgr15
 * but with opposite channel order (see the scalar tail, which reads the
 * bytes as r,g,b instead of b,g,r).  12 source bytes / 4 output pixels
 * per MMX iteration.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
806
807 /*
808 I use less accurate approximation here by simply left-shifting the input
809 value and filling the low order bits with zeroes. This method improves PNG
810 compression but this scheme cannot reproduce white exactly, since it does
811 not generate an all-ones maximum value; the net effect is to darken the
812 image slightly.
813
814 The better method should be "left bit replication":
815
816 4 3 2 1 0
817 ---------
818 1 1 0 1 1
819
820 7 6 5 4 3 2 1 0
821 ----------------
822 1 1 0 1 1 1 1 0
823 |=======| |===|
824 | leftmost bits repeated to fill open bits
825 |
826 original bits
827 */
/*
 * Expand 15-bit 1:5:5:5 pixels to 24 bpp.  Each 5-bit field is left-shifted
 * to fill an 8-bit channel (low bits zero-filled -- see the accuracy note
 * above).  The MMX loop unpacks 8 pixels into 32-bit form across two asm
 * statements (register state mm0-mm7 is carried between them), then reuses
 * the 32->24 packing code (STORE_BGR24_MMX).  A scalar loop finishes the
 * remaining pixels.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* stash first four expanded pixels while the next four are built */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 -- relies on mm0-mm7 surviving between the
           two asm statements (no register clobbers are declared) */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: shift each 5-bit field up to 8 bits (low bits zero) */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
930
/*
 * Expand 16-bit 5:6:5 pixels to 24 bpp.  Identical structure to
 * rgb15tobgr24 but with 565 masks/shifts: blue <<3, green >>3, red >>8.
 * The MMX loop unpacks 8 pixels across two asm statements (mm0-mm7 state
 * carried between them) and reuses STORE_BGR24_MMX for the 32->24 pack;
 * a scalar loop finishes the remaining pixels.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* stash first four expanded pixels while the next four are built */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 -- relies on mm0-mm7 surviving between the
           two asm statements (no register clobbers are declared) */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: expand 5:6:5 fields to full bytes (low bits zero) */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1032
/*
 * Interleave separated channel words into four 32-bit pixels and store them
 * (16 bytes) at %0 / 8%0 via MOVNTQ.  Register contract:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF   (becomes the alpha bytes)
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * Expand 15-bit 1:5:5:5 pixels to 32 bpp with an opaque alpha byte (255).
 * Each 5-bit field is left-shifted to fill its 8-bit channel (low bits
 * zero-filled); four pixels per MMX iteration via PACK_RGB32, scalar tail
 * for the remainder.  mm7 = 0 and mm6 = all-ones (alpha) as PACK_RGB32
 * requires.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;   /* opaque alpha */
    }
}
1093
/*
 * Expand 16-bit 5:6:5 pixels to 32 bpp with an opaque alpha byte (255).
 * Identical structure to rgb15to32 but with 565 masks/shifts (blue <<3,
 * green >>3, red >>8).  Four pixels per MMX iteration via PACK_RGB32,
 * scalar tail for the remainder.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;   /* opaque alpha */
    }
}
1135
/*
 * Swap bytes 0 and 2 of every 32-bit pixel (the "2103" byte shuffle, e.g.
 * BGRA <-> RGBA); bytes 1 and 3 are kept.  The loop runs on a negative
 * index that counts up to zero, so the pointers are pre-biased by -idx.
 * MMX2 uses pshufw; plain MMX does the swap with shifts and masks.  The
 * C tail handles the last (src_size mod 16) bytes, 4 at a time.
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
{
    /* idx starts negative and reaches >= 0 when fewer than 16 bytes remain */
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMX2
        /* pshufw $177 = 10110001b: swap words within each dword */
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        /* no pshufw: isolate bytes 0/2, move them with 16-bit shifts */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    /* C tail: swap the two masked bytes of each remaining 32-bit pixel */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1197
/**
 * Swap R and B in packed 24 bpp data (RGB24 <-> BGR24).
 * The asm loop consumes 24 input bytes (8 pixels) per iteration using three
 * precomputed byte masks (mask24r/g/b) to reassemble the shuffled triplets;
 * a byte-wise C loop handles the remainder.  The loop index lives in REG_a
 * as a negative count ("+a" constraint), so the sign test doubles as the
 * loop condition.
 * NOTE(review): the asm writes *dst via MOVNTQ but declares no "memory"
 * clobber; this matches the historical upstream code — confirm before
 * relying on it under aggressive optimization.
 * @param src      input 24 bpp pixels
 * @param dst      output buffer of at least src_size bytes
 * @param src_size size in bytes (a multiple of 3)
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size; /* negative while >= 24 bytes remain */
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; // finished: src_size was a multiple of 24 bytes (8 pixels)

    /* Rewind to the first unprocessed pixel and finish byte-wise. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size; /* bytes left over for the scalar loop */
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1264
/**
 * Interleave planar Y/U/V into packed YUY2 (Y0 U0 Y1 V0 ...).
 * Each asm iteration reads 16 luma and 8 chroma bytes and writes 32 output
 * bytes.  Chroma lines are advanced only every vertLumPerChroma luma lines
 * (2 for 4:2:0 input, 1 for 4:2:2), which is why vertLumPerChroma must be a
 * power of two for the mask test below.
 * @param ysrc/usrc/vsrc        source planes
 * @param dst                   packed output
 * @param width                 luma width in pixels (multiple of 16 expected)
 * @param height                number of luma lines
 * @param lumStride/chromStride/dstStride  strides in bytes
 * @param vertLumPerChroma      luma lines per chroma line (power of two)
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* Advance chroma only once per vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1317
/**
 * Convert planar YV12 (4:2:0) to packed YUY2.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Thin wrapper: vertLumPerChroma = 2 selects 4:2:0 chroma stepping.
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1329
/**
 * Interleave planar Y/U/V into packed UYVY (U0 Y0 V0 Y1 ...).
 * Mirror of yuvPlanartoyuy2 with the punpck operand roles reversed so the
 * chroma bytes land first.  Chroma lines are advanced every
 * vertLumPerChroma luma lines (must be a power of two).
 * @param ysrc/usrc/vsrc        source planes
 * @param dst                   packed output
 * @param width                 luma width in pixels (multiple of 16 expected)
 * @param height                number of luma lines
 * @param lumStride/chromStride/dstStride  strides in bytes
 * @param vertLumPerChroma      luma lines per chroma line (power of two)
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* Advance chroma only once per vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1382
/**
 * Convert planar YV12 (4:2:0) to packed UYVY.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Thin wrapper: vertLumPerChroma = 2 selects 4:2:0 chroma stepping.
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1394
/**
 * Convert planar 4:2:2 YUV to packed UYVY.
 * Width should be a multiple of 16.
 * Thin wrapper: vertLumPerChroma = 1 means every luma line has chroma.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1404
/**
 * Convert planar 4:2:2 YUV to packed YUY2.
 * Width should be a multiple of 16.
 * Thin wrapper: vertLumPerChroma = 1 means every luma line has chroma.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1414
/**
 * De-interleave packed YUY2 into planar YV12 (4:2:0).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Lines are processed in pairs: the first asm loop extracts Y, U and V from
 * the even line, the second extracts only Y from the odd line (its chroma is
 * simply dropped — no vertical averaging).
 * NOTE(review): the second asm loop reuses the FF00 mask left in mm7 by the
 * first loop without re-initializing it; this depends on no intervening code
 * touching the MMX state — confirm before reordering anything here.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        /* Even line: split 32 input bytes into 16 Y, 8 U, 8 V per iteration. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            /* Split the interleaved UV vectors into separate U and V runs. */
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

        /* Odd line: extract Y only (mm7 mask carried over from above). */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1516
/**
 * Upscale one plane by 2x in both directions using 3:1 weighted bilinear
 * interpolation (each output sample is (3*near + far) >> 2).
 * The first and last source rows are handled separately with horizontal-only
 * interpolation; interior rows produce two output rows each.  On MMX2/3DNow
 * the inner loop approximates the 3:1 blend with repeated PAVGB (two
 * averages toward the same operand), processing 8 source pixels per step.
 * NOTE(review): the C interior loop pairs src[x] with the diagonal
 * src[x+srcStride+1] — this matches the historical upstream code, so it is
 * documented, not "fixed", here.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line: horizontal interpolation only
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15; /* pixels handled by the asm loop */
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            /* Prime mm4/mm5 with the previous pixels ("left neighbors")
             * for the first iteration by duplicating the first byte. */
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            /* Two successive PAVGBs toward the same register approximate the
             * (3*a + b) >> 2 weighting of the scalar code. */
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            /* Reload the "left neighbor" bytes for the next iteration. */
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1; /* C-only build: scalar loop does everything */

        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
#endif

        /* Scalar tail (or full row without MMX2/3DNow). */
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line: horizontal interpolation only
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1627
/**
 * De-interleave packed UYVY into planar YV12 (4:2:0).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * Same structure as yuy2toyv12 with the mask/shift roles swapped, because in
 * UYVY the chroma occupies the low byte of each 16-bit pair and luma the
 * high byte.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        /* Even line: split 32 input bytes into 16 Y, 8 U, 8 V per iteration. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            /* Split the interleaved UV vectors into separate U and V runs. */
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src += srcStride;

        /* Odd line: extract Y only (high byte of each pair), chroma dropped. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1731
/**
 * Convert packed 24 bpp BGR to planar YV12 (4:2:0).
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 *
 * Per pair of lines the code runs (a) a luma pass over each of the two rows
 * using the ff_bgr2YCoeff table, then (b) a single chroma pass that averages
 * the two rows (2x2 block per chroma sample on MMX2/3DNow, 2x1 on plain MMX)
 * and applies ff_bgr2UCoeff / ff_bgr2VCoeff.  The last (up to two) rows fall
 * through to a scalar C loop using the RY/GY/BY... macros.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height-2; y+=2) {
        long i;
        /* Luma pass, run once for each of the two rows of the pair. */
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"       /* negative pixel counter */
                "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"       /* zero for byte->word unpack */
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" /* REG_d = 3*count (byte offset) */
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"REG_d") \n\t"
                /* Load 4 pixels, widen to 16 bit, dot with the Y coefficients. */
                "movd (%0, %%"REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"  /* horizontal add via w1111 */
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                /* Same again for the next 4 pixels. */
                "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"REG_d" \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" /* +16 luma offset */

                MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
                "add $8, %%"REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2; /* rewind to the first row of the pair for chroma */
        /* Chroma pass: averages the two rows, emits chromWidth U and V bytes. */
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d" \n\t"  /* REG_d = 6*count (2 pixels per chroma) */
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_d") \n\t"
            PREFETCH" 64(%1, %%"REG_d") \n\t"
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
            /* 2x2 average via PAVGB: vertical first, then horizontal
             * (the psrlq $24 aligns the right-hand pixel of each pair). */
            "movq (%0, %%"REG_d"), %%mm0 \n\t"
            "movq (%1, %%"REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            /* Plain MMX: widen and sum the 2x2 block, then >>2 to average. */
            "movd (%0, %%"REG_d"), %%mm0 \n\t"
            "movd (%1, %%"REG_d"), %%mm1 \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            /* Dot the averaged pixels with the U (mm6) and V coefficients. */
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
            /* Second pair of 2x2 blocks (pixels 4..7 of the group). */
            "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t" /* mm5 was clobbered above: restore */
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"REG_d" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            /* Rearrange U0..3 / V0..3, bias by 128, store 4 bytes to each plane. */
            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"REG_a") \n\t"
            "add $4, %%"REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    /* Scalar fallback for the remaining (up to two) rows; chroma is taken
     * from the first row of each pair only. */
    for (; y<height; y+=2) {
        long i;
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;

        /* Second row of the pair: luma only. */
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
2026
/**
 * Byte-interleave two planes: dest[2n] = src1[n], dest[2n+1] = src2[n],
 * one row at a time.  The SSE2 path handles 16 source bytes per iteration,
 * the MMX path 16 as well (two 8-byte chunks); a scalar loop finishes each
 * row from the last multiple of 16.
 * NOTE(review): the SSE2 path uses aligned loads (movdqa) and movntdq, so it
 * assumes src1/src2/dest plus their strides are 16-byte aligned — confirm
 * against the callers.  The loop bound width-15 also assumes width >= 16.
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            /* Load the same 16 src1 bytes twice; punpckl/punpckh then expand
             * them against src2 into 32 interleaved output bytes. */
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movq (%1, %%"REG_a"), %%mm0 \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%"REG_a"), %%mm4 \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* Scalar tail for the last width%16 bytes of the row. */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
2096
/**
 * 2x upscale of two planes (e.g. the U and V planes of YVU9):
 * every source byte is duplicated horizontally and every source row is
 * reused for two destination rows.  The destination planes are
 * width/2 x height/2; the sources are read at half that vertical
 * resolution (srcStrideN*(y>>1)).
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;  /* output plane dimensions */
    /* warm the cache with the second row of each source plane */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    /* first plane: src1 -> dst1 */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);  /* each source row feeds two output rows */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        /* SIMD main loop: expand 32 input bytes into 64 output bytes per
         * iteration; punpck(l/h)bw of a register with itself duplicates
         * every byte (aabbcc...) */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
        /* scalar tail: duplicate the remaining (<32) bytes */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: src2 -> dst2, same expansion */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    /* leave MMX state and flush the non-temporal write buffers */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
}
2190
2191 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2192 uint8_t *dst,
2193 long width, long height,
2194 long srcStride1, long srcStride2,
2195 long srcStride3, long dstStride)
2196 {
2197 x86_reg x;
2198 long y,w,h;
2199 w=width/2; h=height;
2200 for (y=0;y<h;y++) {
2201 const uint8_t* yp=src1+srcStride1*y;
2202 const uint8_t* up=src2+srcStride2*(y>>2);
2203 const uint8_t* vp=src3+srcStride3*(y>>2);
2204 uint8_t* d=dst+dstStride*y;
2205 x=0;
2206 for (;x<w-7;x+=8) {
2207 __asm__ volatile(
2208 PREFETCH" 32(%1, %0) \n\t"
2209 PREFETCH" 32(%2, %0) \n\t"
2210 PREFETCH" 32(%3, %0) \n\t"
2211 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2212 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2213 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2214 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2215 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2216 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2217 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2218 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2219 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2220 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2221
2222 "movq %%mm1, %%mm6 \n\t"
2223 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2224 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2225 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2226 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2227 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2228
2229 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2230 "movq 8(%1, %0, 4), %%mm0 \n\t"
2231 "movq %%mm0, %%mm3 \n\t"
2232 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2233 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2234 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2235 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2236
2237 "movq %%mm4, %%mm6 \n\t"
2238 "movq 16(%1, %0, 4), %%mm0 \n\t"
2239 "movq %%mm0, %%mm3 \n\t"
2240 "punpcklbw %%mm5, %%mm4 \n\t"
2241 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2242 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2243 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2244 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2245
2246 "punpckhbw %%mm5, %%mm6 \n\t"
2247 "movq 24(%1, %0, 4), %%mm0 \n\t"
2248 "movq %%mm0, %%mm3 \n\t"
2249 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2250 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2251 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2252 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2253
2254 : "+r" (x)
2255 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2256 :"memory");
2257 }
2258 for (; x<w; x++) {
2259 const long x2 = x<<2;
2260 d[8*x+0] = yp[x2];
2261 d[8*x+1] = up[x];
2262 d[8*x+2] = yp[x2+1];
2263 d[8*x+3] = vp[x];
2264 d[8*x+4] = yp[x2+2];
2265 d[8*x+5] = up[x];
2266 d[8*x+6] = yp[x2+3];
2267 d[8*x+7] = vp[x];
2268 }
2269 }
2270 __asm__(
2271 EMMS" \n\t"
2272 SFENCE" \n\t"
2273 ::: "memory"
2274 );
2275 }
2276
/**
 * Copy every second byte of src (src[0], src[2], ...) into dst:
 * dst[i] = src[2*i] for i in [0, count).
 * Used to pull the luma bytes out of packed YUYV (or, called with
 * src+1, out of UYVY).
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    /* bias the pointers so the loops can run a negative index up to 0 */
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;  /* main loop emits 16 bytes/iteration; the +15/-15
                       * leaves the remainder to the scalar tail */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  /* mm7 = 0x00FF... even-byte mask */
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"  /* keep even-offset bytes */
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"  /* compact 16 words to 16 bytes */
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    /* scalar tail for the remaining (<16) output bytes */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2313
/**
 * De-interleave the even bytes of groups of 4:
 * dst0[i] = src[4*i], dst1[i] = src[4*i+2] for i in [0, count).
 * E.g. extracts the U and V samples from packed UYVY.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* bias the pointers so the loops can run a negative index up to 0 */
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;  /* main loop emits 8 bytes per plane per iteration */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  /* mm7 = 0x00FF... even-byte mask */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"  /* keep bytes at even offsets */
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* mm0/mm2: bytes at 4*i+2 */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  /* mm1/mm3: bytes at 4*i */
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"  /* -> dst0 */
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    /* scalar tail for the remaining (<8) samples */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2359
/**
 * Like extract_even2, but averages two interleaved source rows:
 * dst0[i] = avg(src0[4*i],   src1[4*i]),
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2]).
 * Used for the vertical chroma downsampling of UYVY -> YUV420.
 * NOTE(review): the SIMD path uses PAVGB, which rounds up
 * ((a+b+1)>>1), while the scalar tail truncates ((a+b)>>1) — results
 * can differ by 1 LSB between the two paths.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* bias the pointers so the loops can run a negative index up to 0 */
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;  /* main loop emits 8 bytes per plane per iteration */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"  /* mm7 = 0x00FF... even-byte mask */
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"  /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"  /* mm0/mm2: bytes at 4*i+2 */
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"  /* mm1/mm3: bytes at 4*i */
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"  /* -> dst0 */
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar tail (and whole job when PAVGB is unavailable) */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2412
/**
 * De-interleave the odd bytes of groups of 4:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for i in [0, count).
 * E.g. extracts the U and V samples from packed YUYV.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* bias the pointers so the loops can run a negative index up to 0 */
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;  /* main loop emits 8 bytes per plane per iteration */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  /* mm7 = 0x00FF... mask */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* shift odd-offset bytes down */
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* mm0/mm2: bytes at 4*i+3 */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  /* mm1/mm3: bytes at 4*i+1 */
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"  /* -> dst0 */
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;  /* the scalar tail reads even offsets of src+1 == odd offsets */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2459
/**
 * Like extract_odd2, but averages two interleaved source rows:
 * dst0[i] = avg(src0[4*i+1], src1[4*i+1]),
 * dst1[i] = avg(src0[4*i+3], src1[4*i+3]).
 * Used for the vertical chroma downsampling of YUYV -> YUV420.
 * NOTE(review): the SIMD path uses PAVGB, which rounds up
 * ((a+b+1)>>1), while the scalar tail truncates ((a+b)>>1) — results
 * can differ by 1 LSB between the two paths.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* bias the pointers so the loops can run a negative index up to 0 */
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;  /* main loop emits 8 bytes per plane per iteration */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"  /* mm7 = 0x00FF... mask */
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"  /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"  /* shift odd-offset bytes down */
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"  /* mm0/mm2: bytes at 4*i+3 */
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"  /* mm1/mm3: bytes at 4*i+1 */
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"  /* -> dst0 */
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;  /* scalar tail reads even offsets of src+1 == odd offsets */
    src1++;
    /* scalar tail (and whole job when PAVGB is unavailable) */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2514
2515 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2516 long width, long height,
2517 long lumStride, long chromStride, long srcStride)
2518 {
2519 long y;
2520 const long chromWidth= -((-width)>>1);
2521
2522 for (y=0; y<height; y++) {
2523 RENAME(extract_even)(src, ydst, width);
2524 if(y&1) {
2525 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2526 udst+= chromStride;
2527 vdst+= chromStride;
2528 }
2529
2530 src += srcStride;
2531 ydst+= lumStride;
2532 }
2533 __asm__(
2534 EMMS" \n\t"
2535 SFENCE" \n\t"
2536 ::: "memory"
2537 );
2538 }
2539
2540 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2541 long width, long height,
2542 long lumStride, long chromStride, long srcStride)
2543 {
2544 long y;
2545 const long chromWidth= -((-width)>>1);
2546
2547 for (y=0; y<height; y++) {
2548 RENAME(extract_even)(src, ydst, width);
2549 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2550
2551 src += srcStride;
2552 ydst+= lumStride;
2553 udst+= chromStride;
2554 vdst+= chromStride;
2555 }
2556 __asm__(
2557 EMMS" \n\t"
2558 SFENCE" \n\t"
2559 ::: "memory"
2560 );
2561 }
2562
2563 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2564 long width, long height,
2565 long lumStride, long chromStride, long srcStride)
2566 {
2567 long y;
2568 const long chromWidth= -((-width)>>1);
2569
2570 for (y=0; y<height; y++) {
2571 RENAME(extract_even)(src+1, ydst, width);
2572 if(y&1) {
2573 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2574 udst+= chromStride;
2575 vdst+= chromStride;
2576 }
2577
2578 src += srcStride;
2579 ydst+= lumStride;
2580 }
2581 __asm__(
2582 EMMS" \n\t"
2583 SFENCE" \n\t"
2584 ::: "memory"
2585 );
2586 }
2587
2588 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2589 long width, long height,
2590 long lumStride, long chromStride, long srcStride)
2591 {
2592 long y;
2593 const long chromWidth= -((-width)>>1);
2594
2595 for (y=0; y<height; y++) {
2596 RENAME(extract_even)(src+1, ydst, width);
2597 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2598
2599 src += srcStride;
2600 ydst+= lumStride;
2601 udst+= chromStride;
2602 vdst+= chromStride;
2603 }
2604 __asm__(
2605 EMMS" \n\t"
2606 SFENCE" \n\t"
2607 ::: "memory"
2608 );
2609 }
2610
2611 static inline void RENAME(rgb2rgb_init)(void)
2612 {
2613 rgb15to16 = RENAME(rgb15to16);
2614 rgb15tobgr24 = RENAME(rgb15tobgr24);
2615 rgb15to32 = RENAME(rgb15to32);
2616 rgb16tobgr24 = RENAME(rgb16tobgr24);
2617 rgb16to32 = RENAME(rgb16to32);
2618 rgb16to15 = RENAME(rgb16to15);
2619 rgb24tobgr16 = RENAME(rgb24tobgr16);
2620 rgb24tobgr15 = RENAME(rgb24tobgr15);
2621 rgb24tobgr32 = RENAME(rgb24tobgr32);
2622 rgb32to16 = RENAME(rgb32to16);
2623 rgb32to15 = RENAME(rgb32to15);
2624 rgb32tobgr24 = RENAME(rgb32tobgr24);
2625 rgb24to15 = RENAME(rgb24to15);
2626 rgb24to16 = RENAME(rgb24to16);
2627 rgb24tobgr24 = RENAME(rgb24tobgr24);
2628 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2629 rgb32tobgr16 = RENAME(rgb32tobgr16);
2630 rgb32tobgr15 = RENAME(rgb32tobgr15);
2631 yv12toyuy2 = RENAME(yv12toyuy2);
2632 yv12touyvy = RENAME(yv12touyvy);
2633 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2634 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2635 yuy2toyv12 = RENAME(yuy2toyv12);
2636 planar2x = RENAME(planar2x);
2637 rgb24toyv12 = RENAME(rgb24toyv12);
2638 interleaveBytes = RENAME(interleaveBytes);
2639 vu9_to_vu12 = RENAME(vu9_to_vu12);
2640 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2641
2642 uyvytoyuv420 = RENAME(uyvytoyuv420);
2643 uyvytoyuv422 = RENAME(uyvytoyuv422);
2644 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2645 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2646 }