swscale: Eliminate rgb24toyv12_c() duplication.
[libav.git] / libswscale / x86 / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27 #include <stddef.h>
28
29 #undef PREFETCH
30 #undef MOVNTQ
31 #undef EMMS
32 #undef SFENCE
33 #undef MMREG_SIZE
34 #undef PAVGB
35
36 #if COMPILE_TEMPLATE_SSE2
37 #define MMREG_SIZE 16
38 #else
39 #define MMREG_SIZE 8
40 #endif
41
42 #if COMPILE_TEMPLATE_AMD3DNOW
43 #define PREFETCH "prefetch"
44 #define PAVGB "pavgusb"
45 #elif COMPILE_TEMPLATE_MMX2
46 #define PREFETCH "prefetchnta"
47 #define PAVGB "pavgb"
48 #else
49 #define PREFETCH " # nop"
50 #endif
51
52 #if COMPILE_TEMPLATE_AMD3DNOW
53 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #define EMMS "femms"
55 #else
56 #define EMMS "emms"
57 #endif
58
59 #if COMPILE_TEMPLATE_MMX2
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
62 #else
63 #define MOVNTQ "movq"
64 #define SFENCE " # nop"
65 #endif
66
/*
 * Expand packed 24bpp pixels to 32bpp by inserting a filler/alpha byte
 * after every 3-byte pixel (channel swapping is implied by the caller's
 * choice of this wrapper, not done here).
 * MMX loop handles 8 pixels (24 src / 32 dst bytes) per iteration; the
 * remainder is finished in plain C with alpha forced to 255.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23; /* last position where a full 24-byte chunk still fits */
    /* mm7 = mask32a, OR'ed into every output dword to set the filler byte */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory"); /* drain non-temporal stores (MOVNTQ) */
    __asm__ volatile(EMMS:::"memory");   /* leave MMX state */
    /* scalar tail: copy 3 bytes, append opaque alpha */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
111
/*
 * Store eight 32bpp pixels as 24 packed bytes at memory operand %0.
 * Expects mm0/mm1/mm4/mm5 to hold the pixel data and mm2/mm3/mm6/mm7 to
 * hold copies of them (see the callers, which set mm2=mm0 etc. first).
 * The filler byte of each pixel is squeezed out via the mask24* constants
 * and the result written with three MOVNTQs. Clobbers mm0-mm7.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm1, 8%0 \n\t" \
    MOVNTQ" %%mm4, 16%0"
147
148
/*
 * Drop the 4th (filler/alpha) byte of each 32bpp pixel, producing packed
 * 24bpp output. 8 pixels (32 src / 24 dst bytes) per MMX iteration using
 * STORE_BGR24_MMX; scalar tail copies 3 bytes and skips the 4th.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31; /* need a full 32-byte chunk per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            /* STORE_BGR24_MMX wants copies in mm2/mm3/mm6/mm7 */
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory"); /* drain non-temporal stores */
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++; /* skip filler byte */
    }
}
185
186 /*
187 original by Strepto/Astral
188 ported to gcc & bugfixed: A'rpi
189 MMX2, 3DNOW optimization by Nick Kurshev
190 32-bit C version, and and&add trick by Michael Niedermayer
191 */
/*
 * RGB555 -> RGB565: widen green from 5 to 6 bits by shifting G and R one
 * bit left. Implemented with the and&add trick: x + (x & 0x7FE07FE0)
 * doubles the G+R field in place without disturbing blue.
 * 8 pixels per MMX iteration, then a dword loop, then at most one word.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm4 = mask15s (the G|R field mask used by the and&add trick) */
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* two pixels at a time in C */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* possible final single pixel */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
235
/*
 * RGB565 -> RGB555: narrow green from 6 to 5 bits by shifting the R|G
 * field right one bit and recombining with the untouched blue bits:
 * ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F).
 * 8 pixels per MMX iteration, then a dword loop, then at most one word.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* mm7 = mask15rg (R|G field after the shift), mm6 = mask15b (blue) */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* two pixels at a time in C */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* possible final single pixel */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
284
/*
 * 32bpp -> RGB565. Two implementations are kept:
 *  - the active (#if 1) one packs B|R with pmaddwd against mul3216 and
 *    merges the green field, 4 pixels per iteration;
 *  - the disabled one uses per-channel shift+mask.
 * Scalar tail converts one pixel at a time.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        /* mm5 = mask3216g, mm6 = mask3216br, mm7 = mul3216 */
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 5-6-5 pack of one 32bpp pixel */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
376
/*
 * 32bpp -> BGR565 (R and B swapped relative to rgb32to16).
 * Per-channel shift+mask, 4 pixels per MMX iteration; scalar tail.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_16mask, mm6 = green_16mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
431
/*
 * 32bpp -> RGB555. Same structure as rgb32to16 with 5-bit green:
 * active (#if 1) pmaddwd/mul3215 variant plus a disabled shift+mask
 * variant; scalar tail converts the remainder.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        /* mm5 = mask3215g, mm6 = mask3216br, mm7 = mul3215 */
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 5-5-5 pack of one 32bpp pixel */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
523
/*
 * 32bpp -> BGR555 (R and B swapped relative to rgb32to15).
 * Per-channel shift+mask, 4 pixels per MMX iteration; scalar tail.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
578
/*
 * Packed 24bpp -> 565 with R/B swapped relative to rgb24to16.
 * Loads 4 pixels (12 bytes) via movd/punpckldq at byte offsets 0/3/6/9,
 * then packs with shift+mask; scalar tail reads bytes as b,g,r.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_16mask, mm6 = green_16mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11; /* need a full 12-byte chunk per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
635
/*
 * Packed 24bpp -> RGB565. Same 12-bytes-per-iteration load pattern as
 * rgb24tobgr16 but with the opposite channel shifts; scalar tail reads
 * bytes as r,g,b.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_16mask, mm6 = green_16mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
692
/*
 * Packed 24bpp -> 555 with R/B swapped relative to rgb24to15.
 * 12 source bytes (4 pixels) per MMX iteration; scalar tail reads
 * bytes as b,g,r.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11; /* need a full 12-byte chunk per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
749
/*
 * Packed 24bpp -> RGB555. Same load pattern as rgb24tobgr15 with the
 * opposite channel shifts; scalar tail reads bytes as r,g,b.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; blue mask comes in as %2 */
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
806
807 /*
808 I use less accurate approximation here by simply left-shifting the input
809 value and filling the low order bits with zeroes. This method improves PNG
810 compression but this scheme cannot reproduce white exactly, since it does
811 not generate an all-ones maximum value; the net effect is to darken the
812 image slightly.
813
814 The better method should be "left bit replication":
815
816 4 3 2 1 0
817 ---------
818 1 1 0 1 1
819
820 7 6 5 4 3 2 1 0
821 ----------------
822 1 1 0 1 1 1 1 0
823 |=======| |===|
824 | leftmost bits repeated to fill open bits
825 |
826 original bits
827 */
/*
 * RGB555 -> packed 24bpp. Each MMX iteration expands 8 pixels to 32bpp
 * across two asm statements, then repacks to 24 bytes via STORE_BGR24_MMX.
 * NOTE(review): the second asm statement relies on mm0-mm7 surviving
 * unchanged from the first — this works only because no MMX clobbers are
 * declared; fragile but intentional ("borrowed 32 to 24").
 * Scalar tail expands one 555 pixel by left-shifting each channel
 * (low bits zero-filled; see the accuracy note above this function).
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7; /* 8 input pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* first 4 pixels: isolate B/G/R fields and align to byte lanes */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            /* widen words to dwords (mmx_null = zeros) and merge channels */
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* park the first 4 expanded pixels in mm6/mm7 */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* second 4 pixels: same expansion from offset 8 */
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            /* shuffle the 8 expanded pixels into the layout STORE_BGR24_MMX expects */
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
930
/*
 * RGB565 -> packed 24bpp. Identical structure to rgb15tobgr24 but with
 * 565 masks/shifts (green is 6 bits, so G>>3 and R>>8).
 * NOTE(review): same cross-asm-statement register dependency as
 * rgb15tobgr24 — mm0-mm7 must survive between the two asm blocks.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7; /* 8 input pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* first 4 pixels: isolate B/G/R fields and align to byte lanes */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            /* widen words to dwords (mmx_null = zeros) and merge channels */
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* park the first 4 expanded pixels in mm6/mm7 */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* second 4 pixels: same expansion from offset 8 */
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            /* shuffle the 8 expanded pixels into the layout STORE_BGR24_MMX expects */
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1032
1033 /*
1034 * mm0 = 00 B3 00 B2 00 B1 00 B0
1035 * mm1 = 00 G3 00 G2 00 G1 00 G0
1036 * mm2 = 00 R3 00 R2 00 R1 00 R0
1037 * mm6 = FF FF FF FF FF FF FF FF
1038 * mm7 = 00 00 00 00 00 00 00 00
1039 */
/* Interleave the planar B/G/R words of 4 pixels (register layout shown
 * above) into 4 packed 32bpp pixels with 0xFF in the 4th byte, and store
 * them (16 bytes) at memory operand %0. Clobbers mm0-mm3. */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * RGB555 -> 32bpp with alpha forced to 255. mm7 is zeroed (for word
 * widening in PACK_RGB32) and mm6 set to all-ones (the FF alpha source).
 * 4 pixels per MMX iteration; scalar tail.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    /* mm7 = 0 */
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); /* mm6 = all ones */
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}
1093
/*
 * RGB565 -> 32bpp with alpha forced to 255. Same structure as rgb15to32
 * but with 565 masks and shifts (G>>3, R>>8).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    /* mm7 = 0 */
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); /* mm6 = all ones */
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}
1135
/*
 * Swap bytes 0 and 2 inside every 32-bit pixel (e.g. RGBA<->BGRA).
 * Uses the negative-index-counting-up trick: idx starts at 15-src_size
 * (negative for src_size>15) and the asm loop runs until it goes
 * non-negative, so s+idx / d+idx walk the buffers; the C loop below
 * handles the last <=16 bytes, 4 at a time.
 * MMX2 path uses pshufw; plain MMX path emulates the swap with
 * shifts and masks (mask32b/mask32r/mmx_one set up mm6/mm7).
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMX2
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    /* scalar tail: swap low/high bytes of each dword, keep the 0xff00ff00 lanes */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1197
/*
 * Convert packed 24-bit RGB to 24-bit BGR (swap the first and third
 * byte of every 3-byte pixel; see the scalar loop at the bottom).
 * The asm loop processes 8 pixels (24 bytes) per iteration using the
 * mask24r/g/b constants to reassemble the shuffled triplets.
 * src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
    /* mmx_size runs from 23-src_size up towards (and past) zero in
     * steps of 24; src/dst are biased by -mmx_size below */
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t" /* fewer than 24 bytes: leave it all to the C loop */
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished: src_size was a multiple of 24 bytes (8 pixels)

    /* redo the last partially-overlapped group in C: back up by the
     * number of bytes the asm loop did not (cleanly) cover */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1264
/*
 * Interleave separate Y, U and V planes into packed YUYV (YUY2).
 * Processes 16 luma pixels (8 chroma pairs) per asm iteration, so the
 * width is assumed to be a multiple of 16 (see the callers' doc).
 * vertLumPerChroma is the number of luma lines sharing one chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2 input); it must be a power of two,
 * since the line test below uses it as a bit mask.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma only every vertLumPerChroma-th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1317
1318 /**
1319 * Height should be a multiple of 2 and width should be a multiple of 16.
1320 * (If this is a problem for anyone then tell me, and I will fix it.)
1321 */
1322 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1323 long width, long height,
1324 long lumStride, long chromStride, long dstStride)
1325 {
1326 //FIXME interpolate chroma
1327 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1328 }
1329
/*
 * Interleave separate Y, U and V planes into packed UYVY.
 * Identical structure to yuvPlanartoyuy2() above, but the punpck
 * operands are swapped so the chroma byte comes first in each pair.
 * Processes 16 luma pixels per asm iteration (width must be a
 * multiple of 16).  vertLumPerChroma: number of luma lines sharing one
 * chroma line (2 for 4:2:0, 1 for 4:2:2); must be a power of two.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma only every vertLumPerChroma-th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1382
1383 /**
1384 * Height should be a multiple of 2 and width should be a multiple of 16
1385 * (If this is a problem for anyone then tell me, and I will fix it.)
1386 */
1387 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1388 long width, long height,
1389 long lumStride, long chromStride, long dstStride)
1390 {
1391 //FIXME interpolate chroma
1392 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1393 }
1394
1395 /**
1396 * Width should be a multiple of 16.
1397 */
1398 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1399 long width, long height,
1400 long lumStride, long chromStride, long dstStride)
1401 {
1402 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1403 }
1404
1405 /**
1406 * Width should be a multiple of 16.
1407 */
1408 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1409 long width, long height,
1410 long lumStride, long chromStride, long dstStride)
1411 {
1412 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1413 }
1414
1415 /**
1416 * Height should be a multiple of 2 and width should be a multiple of 16.
1417 * (If this is a problem for anyone then tell me, and I will fix it.)
1418 */
1419 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1420 long width, long height,
1421 long lumStride, long chromStride, long srcStride)
1422 {
1423 long y;
1424 const x86_reg chromWidth= width>>1;
1425 for (y=0; y<height; y+=2) {
1426 __asm__ volatile(
1427 "xor %%"REG_a", %%"REG_a" \n\t"
1428 "pcmpeqw %%mm7, %%mm7 \n\t"
1429 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1430 ".p2align 4 \n\t"
1431 "1: \n\t"
1432 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1433 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1434 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1435 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1436 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1437 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1438 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1439 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1440 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1441 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1442 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1443
1444 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1445
1446 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1447 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1448 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1449 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1450 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1451 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1452 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1453 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1454 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1455 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1456
1457 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1458
1459 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1460 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1461 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1462 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1463 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1464 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1465 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1466 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1467
1468 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1469 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1470
1471 "add $8, %%"REG_a" \n\t"
1472 "cmp %4, %%"REG_a" \n\t"
1473 " jb 1b \n\t"
1474 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1475 : "memory", "%"REG_a
1476 );
1477
1478 ydst += lumStride;
1479 src += srcStride;
1480
1481 __asm__ volatile(
1482 "xor %%"REG_a", %%"REG_a" \n\t"
1483 ".p2align 4 \n\t"
1484 "1: \n\t"
1485 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1486 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1487 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1488 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1489 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1490 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1491 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1492 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1493 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1494 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1495 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1496
1497 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1498 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1499
1500 "add $8, %%"REG_a" \n\t"
1501 "cmp %4, %%"REG_a" \n\t"
1502 " jb 1b \n\t"
1503
1504 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1505 : "memory", "%"REG_a
1506 );
1507 udst += chromStride;
1508 vdst += chromStride;
1509 ydst += lumStride;
1510 src += srcStride;
1511 }
1512 __asm__ volatile(EMMS" \n\t"
1513 SFENCE" \n\t"
1514 :::"memory");
1515 }
1516
/*
 * Upscale a single plane by 2x in both directions using bilinear
 * 1:3/3:1 weighting, i.e. interior output samples are (3*a + b) >> 2,
 * matching the C fallback loops below.  First and last lines, and the
 * first and last column, are handled separately at the edges.
 * On MMX2/3DNow the inner loop does 8 input pixels at a time; the two
 * consecutive PAVGB ops per operand approximate the (3*a + b) / 4
 * weighting (pavg rounds, so results may differ from the C path by
 * rounding — this matches the original behavior).
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
        /* width handled by the asm loop; the C loop below finishes the
         * remaining (srcWidth & 15) columns plus the right edge */
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            /* prime mm4/mm5 with the left-edge pixels (byte 0 duplicated) */
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            /* two chained pavg per operand -> ~(3*a + b)/4 weighting */
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            /* reload the "previous pixel" operands one byte back */
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        /* plain C: handle the left edge here, the rest in the loop below */
        const x86_reg mmxSize=1;

        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
#endif

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1627
1628 /**
1629 * Height should be a multiple of 2 and width should be a multiple of 16.
1630 * (If this is a problem for anyone then tell me, and I will fix it.)
1631 * Chrominance data is only taken from every second line, others are ignored.
1632 * FIXME: Write HQ version.
1633 */
1634 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1635 long width, long height,
1636 long lumStride, long chromStride, long srcStride)
1637 {
1638 long y;
1639 const x86_reg chromWidth= width>>1;
1640 for (y=0; y<height; y+=2) {
1641 __asm__ volatile(
1642 "xor %%"REG_a", %%"REG_a" \n\t"
1643 "pcmpeqw %%mm7, %%mm7 \n\t"
1644 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1645 ".p2align 4 \n\t"
1646 "1: \n\t"
1647 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1648 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1649 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1650 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1651 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1652 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1653 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1654 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1655 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1656 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1657 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1658
1659 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1660
1661 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1662 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1663 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1664 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1665 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1666 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1667 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1668 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1669 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1670 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1671
1672 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1673
1674 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1675 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1676 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1677 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1678 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1679 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1680 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1681 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1682
1683 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1684 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1685
1686 "add $8, %%"REG_a" \n\t"
1687 "cmp %4, %%"REG_a" \n\t"
1688 " jb 1b \n\t"
1689 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1690 : "memory", "%"REG_a
1691 );
1692
1693 ydst += lumStride;
1694 src += srcStride;
1695
1696 __asm__ volatile(
1697 "xor %%"REG_a", %%"REG_a" \n\t"
1698 ".p2align 4 \n\t"
1699 "1: \n\t"
1700 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1701 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1702 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1703 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1704 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1705 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1706 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1707 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1708 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1709 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1710 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1711
1712 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1713 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1714
1715 "add $8, %%"REG_a" \n\t"
1716 "cmp %4, %%"REG_a" \n\t"
1717 " jb 1b \n\t"
1718
1719 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1720 : "memory", "%"REG_a
1721 );
1722 udst += chromStride;
1723 vdst += chromStride;
1724 ydst += lumStride;
1725 src += srcStride;
1726 }
1727 __asm__ volatile(EMMS" \n\t"
1728 SFENCE" \n\t"
1729 :::"memory");
1730 }
1731
1732 /**
1733 * Height should be a multiple of 2 and width should be a multiple of 2.
1734 * (If this is a problem for anyone then tell me, and I will fix it.)
1735 * Chrominance data is only taken from every second line,
1736 * others are ignored in the C version.
1737 * FIXME: Write HQ version.
1738 */
1739 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1740 long width, long height,
1741 long lumStride, long chromStride, long srcStride)
1742 {
1743 long y;
1744 const x86_reg chromWidth= width>>1;
1745 for (y=0; y<height-2; y+=2) {
1746 long i;
1747 for (i=0; i<2; i++) {
1748 __asm__ volatile(
1749 "mov %2, %%"REG_a" \n\t"
1750 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1751 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1752 "pxor %%mm7, %%mm7 \n\t"
1753 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1754 ".p2align 4 \n\t"
1755 "1: \n\t"
1756 PREFETCH" 64(%0, %%"REG_d") \n\t"
1757 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1758 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1759 "punpcklbw %%mm7, %%mm0 \n\t"
1760 "punpcklbw %%mm7, %%mm1 \n\t"
1761 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1762 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1763 "punpcklbw %%mm7, %%mm2 \n\t"
1764 "punpcklbw %%mm7, %%mm3 \n\t"
1765 "pmaddwd %%mm6, %%mm0 \n\t"
1766 "pmaddwd %%mm6, %%mm1 \n\t"
1767 "pmaddwd %%mm6, %%mm2 \n\t"
1768 "pmaddwd %%mm6, %%mm3 \n\t"
1769 #ifndef FAST_BGR2YV12
1770 "psrad $8, %%mm0 \n\t"
1771 "psrad $8, %%mm1 \n\t"
1772 "psrad $8, %%mm2 \n\t"
1773 "psrad $8, %%mm3 \n\t"
1774 #endif
1775 "packssdw %%mm1, %%mm0 \n\t"
1776 "packssdw %%mm3, %%mm2 \n\t"
1777 "pmaddwd %%mm5, %%mm0 \n\t"
1778 "pmaddwd %%mm5, %%mm2 \n\t"
1779 "packssdw %%mm2, %%mm0 \n\t"
1780 "psraw $7, %%mm0 \n\t"
1781
1782 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1783 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1784 "punpcklbw %%mm7, %%mm4 \n\t"
1785 "punpcklbw %%mm7, %%mm1 \n\t"
1786 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1787 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1788 "punpcklbw %%mm7, %%mm2 \n\t"
1789 "punpcklbw %%mm7, %%mm3 \n\t"
1790 "pmaddwd %%mm6, %%mm4 \n\t"
1791 "pmaddwd %%mm6, %%mm1 \n\t"
1792 "pmaddwd %%mm6, %%mm2 \n\t"
1793 "pmaddwd %%mm6, %%mm3 \n\t"
1794 #ifndef FAST_BGR2YV12
1795 "psrad $8, %%mm4 \n\t"
1796 "psrad $8, %%mm1 \n\t"
1797 "psrad $8, %%mm2 \n\t"
1798 "psrad $8, %%mm3 \n\t"
1799 #endif
1800 "packssdw %%mm1, %%mm4 \n\t"
1801 "packssdw %%mm3, %%mm2 \n\t"
1802 "pmaddwd %%mm5, %%mm4 \n\t"
1803 "pmaddwd %%mm5, %%mm2 \n\t"
1804 "add $24, %%"REG_d" \n\t"
1805 "packssdw %%mm2, %%mm4 \n\t"
1806 "psraw $7, %%mm4 \n\t"
1807
1808 "packuswb %%mm4, %%mm0 \n\t"
1809 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1810
1811 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1812 "add $8, %%"REG_a" \n\t"
1813 " js 1b \n\t"
1814 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1815 : "%"REG_a, "%"REG_d
1816 );
1817 ydst += lumStride;
1818 src += srcStride;
1819 }
1820 src -= srcStride*2;
1821 __asm__ volatile(
1822 "mov %4, %%"REG_a" \n\t"
1823 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1824 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1825 "pxor %%mm7, %%mm7 \n\t"
1826 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1827 "add %%"REG_d", %%"REG_d" \n\t"
1828 ".p2align 4 \n\t"
1829 "1: \n\t"
1830 PREFETCH" 64(%0, %%"REG_d") \n\t"
1831 PREFETCH" 64(%1, %%"REG_d") \n\t"
1832 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1833 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1834 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1835 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1836 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1837 PAVGB" %%mm1, %%mm0 \n\t"
1838 PAVGB" %%mm3, %%mm2 \n\t"
1839 "movq %%mm0, %%mm1 \n\t"
1840 "movq %%mm2, %%mm3 \n\t"
1841 "psrlq $24, %%mm0 \n\t"
1842 "psrlq $24, %%mm2 \n\t"
1843 PAVGB" %%mm1, %%mm0 \n\t"
1844 PAVGB" %%mm3, %%mm2 \n\t"
1845 "punpcklbw %%mm7, %%mm0 \n\t"
1846 "punpcklbw %%mm7, %%mm2 \n\t"
1847 #else
1848 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1849 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1850 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1851 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1852 "punpcklbw %%mm7, %%mm0 \n\t"
1853 "punpcklbw %%mm7, %%mm1 \n\t"
1854 "punpcklbw %%mm7, %%mm2 \n\t"
1855 "punpcklbw %%mm7, %%mm3 \n\t"
1856 "paddw %%mm1, %%mm0 \n\t"
1857 "paddw %%mm3, %%mm2 \n\t"
1858 "paddw %%mm2, %%mm0 \n\t"
1859 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1860 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1861 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1862 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1863 "punpcklbw %%mm7, %%mm4 \n\t"
1864 "punpcklbw %%mm7, %%mm1 \n\t"
1865 "punpcklbw %%mm7, %%mm2 \n\t"
1866 "punpcklbw %%mm7, %%mm3 \n\t"
1867 "paddw %%mm1, %%mm4 \n\t"
1868 "paddw %%mm3, %%mm2 \n\t"
1869 "paddw %%mm4, %%mm2 \n\t"
1870 "psrlw $2, %%mm0 \n\t"
1871 "psrlw $2, %%mm2 \n\t"
1872 #endif
1873 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1874 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1875
1876 "pmaddwd %%mm0, %%mm1 \n\t"
1877 "pmaddwd %%mm2, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm2 \n\t"
1880 #ifndef FAST_BGR2YV12
1881 "psrad $8, %%mm0 \n\t"
1882 "psrad $8, %%mm1 \n\t"
1883 "psrad $8, %%mm2 \n\t"
1884 "psrad $8, %%mm3 \n\t"
1885 #endif
1886 "packssdw %%mm2, %%mm0 \n\t"
1887 "packssdw %%mm3, %%mm1 \n\t"
1888 "pmaddwd %%mm5, %%mm0 \n\t"
1889 "pmaddwd %%mm5, %%mm1 \n\t"
1890 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1891 "psraw $7, %%mm0 \n\t"
1892
1893 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1894 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1895 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1896 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1897 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1898 PAVGB" %%mm1, %%mm4 \n\t"
1899 PAVGB" %%mm3, %%mm2 \n\t"
1900 "movq %%mm4, %%mm1 \n\t"
1901 "movq %%mm2, %%mm3 \n\t"
1902 "psrlq $24, %%mm4 \n\t"
1903 "psrlq $24, %%mm2 \n\t"
1904 PAVGB" %%mm1, %%mm4 \n\t"
1905 PAVGB" %%mm3, %%mm2 \n\t"
1906 "punpcklbw %%mm7, %%mm4 \n\t"
1907 "punpcklbw %%mm7, %%mm2 \n\t"
1908 #else
1909 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1910 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1911 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1912 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1913 "punpcklbw %%mm7, %%mm4 \n\t"
1914 "punpcklbw %%mm7, %%mm1 \n\t"
1915 "punpcklbw %%mm7, %%mm2 \n\t"
1916 "punpcklbw %%mm7, %%mm3 \n\t"
1917 "paddw %%mm1, %%mm4 \n\t"
1918 "paddw %%mm3, %%mm2 \n\t"
1919 "paddw %%mm2, %%mm4 \n\t"
1920 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1921 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1922 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1923 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1924 "punpcklbw %%mm7, %%mm5 \n\t"
1925 "punpcklbw %%mm7, %%mm1 \n\t"
1926 "punpcklbw %%mm7, %%mm2 \n\t"
1927 "punpcklbw %%mm7, %%mm3 \n\t"
1928 "paddw %%mm1, %%mm5 \n\t"
1929 "paddw %%mm3, %%mm2 \n\t"
1930 "paddw %%mm5, %%mm2 \n\t"
1931 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1932 "psrlw $2, %%mm4 \n\t"
1933 "psrlw $2, %%mm2 \n\t"
1934 #endif
1935 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1936 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1937
1938 "pmaddwd %%mm4, %%mm1 \n\t"
1939 "pmaddwd %%mm2, %%mm3 \n\t"
1940 "pmaddwd %%mm6, %%mm4 \n\t"
1941 "pmaddwd %%mm6, %%mm2 \n\t"
1942 #ifndef FAST_BGR2YV12
1943 "psrad $8, %%mm4 \n\t"
1944 "psrad $8, %%mm1 \n\t"
1945 "psrad $8, %%mm2 \n\t"
1946 "psrad $8, %%mm3 \n\t"
1947 #endif
1948 "packssdw %%mm2, %%mm4 \n\t"
1949 "packssdw %%mm3, %%mm1 \n\t"
1950 "pmaddwd %%mm5, %%mm4 \n\t"
1951 "pmaddwd %%mm5, %%mm1 \n\t"
1952 "add $24, %%"REG_d" \n\t"
1953 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1954 "psraw $7, %%mm4 \n\t"
1955
1956 "movq %%mm0, %%mm1 \n\t"
1957 "punpckldq %%mm4, %%mm0 \n\t"
1958 "punpckhdq %%mm4, %%mm1 \n\t"
1959 "packsswb %%mm1, %%mm0 \n\t"
1960 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1961 "movd %%mm0, (%2, %%"REG_a") \n\t"
1962 "punpckhdq %%mm0, %%mm0 \n\t"
1963 "movd %%mm0, (%3, %%"REG_a") \n\t"
1964 "add $4, %%"REG_a" \n\t"
1965 " js 1b \n\t"
1966 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1967 : "%"REG_a, "%"REG_d
1968 );
1969
1970 udst += chromStride;
1971 vdst += chromStride;
1972 src += srcStride*2;
1973 }
1974
1975 __asm__ volatile(EMMS" \n\t"
1976 SFENCE" \n\t"
1977 :::"memory");
1978
1979 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1980 }
1981
/*
 * Interleave two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i]
 * for every line (see the scalar tail loop).  The SIMD loops handle
 * width rounded down to a multiple of 16; the C loop finishes the rest.
 * NOTE(review): the SSE2 path uses movdqa on src1, which requires
 * src1 + REG_a to be 16-byte aligned — presumably callers guarantee
 * aligned planes/strides; confirm before reusing with arbitrary input.
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            /* load the same 16 src1 bytes twice: low half interleaved
             * into xmm0, high half into xmm1 */
            "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%"REG_a") \n\t"
            PREFETCH" 64(%2, %%"REG_a") \n\t"
            "movq (%1, %%"REG_a"), %%mm0 \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%"REG_a"), %%mm4 \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp %3, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* scalar tail: remaining width & 15 bytes */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
2051
/*
 * Upsample two planes (src1 -> dst1, src2 -> dst2) by 2x in each
 * direction using pixel replication: every source byte is written
 * twice horizontally (punpcklbw/punpckhbw with itself) and every
 * source line twice vertically (srcStride*(y>>1)).
 * w/h are half the given width/height; the asm loop handles 32 input
 * bytes per iteration, a scalar loop finishes each line.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    /* first plane */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1); /* each source line used twice */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                /* unpack each register with itself: duplicates every byte */
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; /* scalar tail */
    }
    /* second plane: identical procedure */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
2145
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    /* Interleave planar YVU9 (4:1:0 subsampled chroma: one chroma sample per
     * 4 luma samples horizontally and per 4 rows vertically) into packed
     * YUY2 (Y U Y V).  src1 = luma, src2 = U plane, src3 = V plane, based on
     * the output byte order below (d[8x+1] = up[x], d[8x+3] = vp[x]).
     * Each step of x emits 4 output pixels (8 bytes). */
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2); /* chroma rows shared by 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        /* Scalar tail: one chroma sample (up[x], vp[x]) covers 4 luma samples. */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    /* Leave MMX state and flush non-temporal stores. */
    __asm__(
            EMMS" \n\t"
            SFENCE" \n\t"
            ::: "memory"
        );
}
2231
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    /* dst[i] = src[2*i] for i in [0, count): copy every second byte.
     * Pointers are biased past the end so a single negative counter serves
     * as both loop index and addressing offset ("count up to zero" idiom).
     * Used e.g. to extract the luma bytes from packed YUYV/UYVY rows. */
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15; /* SIMD loop handles 16 outputs/iteration; last 15 left to the tail */
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = 0x00FF00FF... low-byte mask */
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t" /* keep the even-offset bytes */
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t" /* narrow words back to bytes */
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    /* Scalar tail (also the full path when count < 16). */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2268
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Deinterleave the even bytes of 4-byte groups:
     *   dst0[i] = src[4*i + 0], dst1[i] = src[4*i + 2]  for i in [0, count).
     * With UYVY input this extracts the U and V planes.  Pointers are biased
     * so one negative counter is both index and addressing offset. */
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7; /* SIMD loop emits 8 outputs/iteration; tail handles the rest */
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* mm7 = low-byte mask 0x00FF... */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t" /* keep bytes at even offsets (0 and 2 of each quad) */
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* mm0/mm2: offset-2 bytes -> dst1 */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* mm1/mm3: offset-0 bytes -> dst0 */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    /* Scalar tail. */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2314
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Like extract_even2, but averages two source rows first:
     *   dst0[i] = avg(src0[4*i+0], src1[4*i+0])
     *   dst1[i] = avg(src0[4*i+2], src1[4*i+2])
     * Used for vertical chroma averaging when going 4:2:2 -> 4:2:0.
     * NOTE(review): the PAVGB path rounds up ((a+b+1)>>1) while the scalar
     * fallback truncates ((a+b)>>1) — a one-LSB discrepancy on tail pixels;
     * appears intentional/accepted upstream, confirm before relying on it. */
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7; /* SIMD loop emits 8 outputs/iteration */
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with the second row */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t" /* keep even-offset bytes */
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* offset-2 bytes -> dst1 */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* offset-0 bytes -> dst0 */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar tail (and the whole job when PAVGB is unavailable). */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2367
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Deinterleave the odd bytes of 4-byte groups:
     *   dst0[i] = src[4*i + 1], dst1[i] = src[4*i + 3]  for i in [0, count).
     * With YUYV input this extracts the U and V planes. */
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7; /* SIMD loop emits 8 outputs/iteration */
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* shift odd-offset bytes into the low byte of each word */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* offset-3 bytes -> dst1 */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* offset-1 bytes -> dst0 */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++; /* bias so the even-offset tail loop below reads the odd bytes */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2414
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Like extract_odd2, but averages two source rows first:
     *   dst0[i] = avg(src0[4*i+1], src1[4*i+1])
     *   dst1[i] = avg(src0[4*i+3], src1[4*i+3])
     * Used for vertical chroma averaging (YUYV -> YUV420).
     * NOTE(review): PAVGB rounds up while the scalar fallback truncates —
     * one-LSB difference on tail pixels; confirm this is acceptable. */
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7; /* SIMD loop emits 8 outputs/iteration */
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* low-byte mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with the second row */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* bring odd-offset bytes down */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* offset-3 bytes -> dst1 */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* offset-1 bytes -> dst0 */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++; /* bias so the even-offset tail loop reads the odd bytes */
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2469
2470 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2471 long width, long height,
2472 long lumStride, long chromStride, long srcStride)
2473 {
2474 long y;
2475 const long chromWidth= -((-width)>>1);
2476
2477 for (y=0; y<height; y++) {
2478 RENAME(extract_even)(src, ydst, width);
2479 if(y&1) {
2480 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2481 udst+= chromStride;
2482 vdst+= chromStride;
2483 }
2484
2485 src += srcStride;
2486 ydst+= lumStride;
2487 }
2488 __asm__(
2489 EMMS" \n\t"
2490 SFENCE" \n\t"
2491 ::: "memory"
2492 );
2493 }
2494
2495 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2496 long width, long height,
2497 long lumStride, long chromStride, long srcStride)
2498 {
2499 long y;
2500 const long chromWidth= -((-width)>>1);
2501
2502 for (y=0; y<height; y++) {
2503 RENAME(extract_even)(src, ydst, width);
2504 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2505
2506 src += srcStride;
2507 ydst+= lumStride;
2508 udst+= chromStride;
2509 vdst+= chromStride;
2510 }
2511 __asm__(
2512 EMMS" \n\t"
2513 SFENCE" \n\t"
2514 ::: "memory"
2515 );
2516 }
2517
2518 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2519 long width, long height,
2520 long lumStride, long chromStride, long srcStride)
2521 {
2522 long y;
2523 const long chromWidth= -((-width)>>1);
2524
2525 for (y=0; y<height; y++) {
2526 RENAME(extract_even)(src+1, ydst, width);
2527 if(y&1) {
2528 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2529 udst+= chromStride;
2530 vdst+= chromStride;
2531 }
2532
2533 src += srcStride;
2534 ydst+= lumStride;
2535 }
2536 __asm__(
2537 EMMS" \n\t"
2538 SFENCE" \n\t"
2539 ::: "memory"
2540 );
2541 }
2542
2543 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2544 long width, long height,
2545 long lumStride, long chromStride, long srcStride)
2546 {
2547 long y;
2548 const long chromWidth= -((-width)>>1);
2549
2550 for (y=0; y<height; y++) {
2551 RENAME(extract_even)(src+1, ydst, width);
2552 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2553
2554 src += srcStride;
2555 ydst+= lumStride;
2556 udst+= chromStride;
2557 vdst+= chromStride;
2558 }
2559 __asm__(
2560 EMMS" \n\t"
2561 SFENCE" \n\t"
2562 ::: "memory"
2563 );
2564 }
2565
2566 static inline void RENAME(rgb2rgb_init)(void)
2567 {
2568 rgb15to16 = RENAME(rgb15to16);
2569 rgb15tobgr24 = RENAME(rgb15tobgr24);
2570 rgb15to32 = RENAME(rgb15to32);
2571 rgb16tobgr24 = RENAME(rgb16tobgr24);
2572 rgb16to32 = RENAME(rgb16to32);
2573 rgb16to15 = RENAME(rgb16to15);
2574 rgb24tobgr16 = RENAME(rgb24tobgr16);
2575 rgb24tobgr15 = RENAME(rgb24tobgr15);
2576 rgb24tobgr32 = RENAME(rgb24tobgr32);
2577 rgb32to16 = RENAME(rgb32to16);
2578 rgb32to15 = RENAME(rgb32to15);
2579 rgb32tobgr24 = RENAME(rgb32tobgr24);
2580 rgb24to15 = RENAME(rgb24to15);
2581 rgb24to16 = RENAME(rgb24to16);
2582 rgb24tobgr24 = RENAME(rgb24tobgr24);
2583 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2584 rgb32tobgr16 = RENAME(rgb32tobgr16);
2585 rgb32tobgr15 = RENAME(rgb32tobgr15);
2586 yv12toyuy2 = RENAME(yv12toyuy2);
2587 yv12touyvy = RENAME(yv12touyvy);
2588 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2589 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2590 yuy2toyv12 = RENAME(yuy2toyv12);
2591 planar2x = RENAME(planar2x);
2592 rgb24toyv12 = RENAME(rgb24toyv12);
2593 interleaveBytes = RENAME(interleaveBytes);
2594 vu9_to_vu12 = RENAME(vu9_to_vu12);
2595 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2596
2597 uyvytoyuv420 = RENAME(uyvytoyuv420);
2598 uyvytoyuv422 = RENAME(uyvytoyuv422);
2599 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2600 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2601 }