rgb2rgb_template: add MMX/SSE2/AVX-optimized deinterleaveBytes
[libav.git] / libswscale / x86 / rgb2rgb_template.c
CommitLineData
c0038328
LB
1/*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <stddef.h>
28
2b677ffc
DB
29#include "libavutil/attributes.h"
30
c0038328
LB
/* Per-template instruction selection: this file is compiled several times
 * (3DNow, MMXEXT, plain MMX, SSE2), and these macros pick the matching
 * prefetch/average/state-clear/streaming-store mnemonics for each variant.
 * " # nop" expands to an assembler comment, i.e. the op is compiled out. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
/* MMXEXT has streaming (non-temporal) stores; plain MMX falls back to movq
 * and needs no store fence. */
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

/* The MMX-register routines below are excluded from the SSE2 build. */
#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

/* Expand packed 24-bit pixels to 32-bit pixels, forcing the fourth (alpha)
 * byte to 0xFF. The MMX loop builds 8 output pixels (24 src bytes -> 32 dst
 * bytes) per iteration; mm7 holds mask32a, OR-ed in to set the alpha bytes
 * (the scalar tail writes 255 for the same byte). */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;  /* last position where a full 24-byte chunk fits */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            /* each movd+punpckldq pair packs two 3-byte pixels into one
             * quadword (the overlapping 4-byte loads are intentional) */
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: fewer than 8 pixels remain */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
109
/* Repack eight 32-bit pixels held in mm0..mm7 (low copies in mm0/mm1/mm4/mm5,
 * duplicates in mm2/mm3/mm6/mm7) into 24 bytes of packed 24-bit pixels and
 * stream them to the destination in operand %0. The mask24* constants select
 * the surviving 3-byte groups while the shifts close the gaps left by the
 * dropped fourth byte of each pixel. */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
c0038328
LB
145
146
/* Drop the fourth byte of each 32-bit pixel, producing packed 24-bit pixels.
 * The MMX loop handles 8 pixels (32 src bytes -> 24 dst bytes) per iteration
 * via STORE_BGR24_MMX; the scalar tail copies 3 bytes and skips 1. */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;  /* need a full 32-byte chunk per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            /* STORE_BGR24_MMX expects duplicates in mm2/mm3/mm6/mm7 */
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;  /* discard the alpha/padding byte */
    }
}
182
183/*
184 original by Strepto/Astral
185 ported to gcc & bugfixed: A'rpi
652f5185 186 MMXEXT, 3DNOW optimization by Nick Kurshev
c0038328
LB
187 32-bit C version, and and&add trick by Michael Niedermayer
188*/
/* Convert 15-bit (x555) pixels to 16-bit (565): keep the low 5-bit field and
 * shift the upper two fields up by one bit. Uses the and+add trick:
 * (x & 0x7FFF) + (x & 0x7FE0) doubles bits 5..14, moving them to 6..15. */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C tail (two pixels at a time), then at most one odd pixel */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
231
/* Convert 16-bit (565) pixels to 15-bit (x555): keep the low 5-bit field and
 * shift the upper fields down one bit, dropping the extra green LSB. */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"   /* shifted red+green fields */
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"   /* untouched low field */
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C tail (two pixels at a time), then at most one odd pixel */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
279
/* Convert 32-bit pixels to 16-bit 565. The whole main loop lives in one asm
 * statement: pmaddwd with mul3216 scales/combines two channel fields at once
 * (masked by mask3216g/mask3216br), then shifts merge them into 565 words.
 * Four pixels (16 src bytes -> 8 dst bytes) per iteration. */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"        /* enter at the loop condition */
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 8-8-8 -> 5-6-5 by masking and shifting each field */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
329
/* Convert 32-bit pixels to 16-bit 565 with R and B swapped relative to
 * rgb32to16. Four pixels per MMX iteration: each channel is isolated by a
 * shift + mask (red_16mask/green_16mask/blue_16mask) and the two pixel pairs
 * are merged with psllq $16. */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: note the swapped placement of the low and high bytes
     * compared with rgb32to16's tail */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
384
/* Convert 32-bit pixels to 15-bit x555. Same pmaddwd technique as
 * rgb32to16, but with mask3215g/mul3215 and shift counts adjusted for the
 * 5-bit green field. Four pixels per iteration. */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"        /* enter at the loop condition */
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 8-8-8 -> 5-5-5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
434
/* Convert 32-bit pixels to 15-bit x555 with R and B swapped relative to
 * rgb32to15. Shift+mask per channel (red_15mask/green_15mask/blue_15mask),
 * four pixels per MMX iteration. */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: swapped field placement vs. rgb32to15 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
489
/* Convert packed 24-bit pixels to 16-bit 565, swapping R and B. Four pixels
 * (12 src bytes -> 8 dst bytes) per MMX iteration; the overlapping 4-byte
 * loads at offsets 0/3/6/9 gather the 3-byte pixels. */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;  /* need 12 source bytes per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: first byte lands in the low 5 bits */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
546
/* Convert packed 24-bit pixels to 16-bit 565 (channel order preserved).
 * Same gather pattern as rgb24tobgr16 but the shift/mask assignment is
 * mirrored so the first source byte ends up in the high 5 bits. */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: first byte lands in the high 5 bits */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
603
/* Convert packed 24-bit pixels to 15-bit x555, swapping R and B. Four
 * pixels (12 src bytes -> 8 dst bytes) per MMX iteration. */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;  /* need 12 source bytes per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 5-5-5 packing, first byte in the low 5 bits */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
660
/* Convert packed 24-bit pixels to 15-bit x555 (channel order preserved).
 * Mirrored shift/mask assignment relative to rgb24tobgr15. */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: 5-5-5 packing, first byte in the high 5 bits */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
717
718/*
719 I use less accurate approximation here by simply left-shifting the input
720 value and filling the low order bits with zeroes. This method improves PNG
721 compression but this scheme cannot reproduce white exactly, since it does
722 not generate an all-ones maximum value; the net effect is to darken the
723 image slightly.
724
725 The better method should be "left bit replication":
726
727 4 3 2 1 0
728 ---------
729 1 1 0 1 1
730
731 7 6 5 4 3 2 1 0
732 ----------------
733 1 1 0 1 1 1 1 0
734 |=======| |===|
735 | leftmost bits repeated to fill open bits
736 |
737 original bits
738*/
/* Expand 15-bit x555 pixels to packed 24-bit pixels. Each 8-pixel MMX
 * iteration: isolate the three 5-bit fields (mask15b/mask15g/mask15r),
 * left-shift them to their 8-bit positions (low bits zero-filled, see the
 * block comment above), widen to 32-bit via punpck with mmx_null, then the
 * second asm reuses STORE_BGR24_MMX to squeeze out the padding byte.
 * Register state (mm0..mm7) is carried BETWEEN the two asm statements, so
 * nothing may be scheduled in between. */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* 8 pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            /* first 4 pixels */
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* park the first 4 expanded pixels in mm6/mm7 */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* second 4 pixels */
            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            /* rearrange the 8 pixels into the layout STORE_BGR24_MMX needs */
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: expand each 5-bit field by left-shifting into 8 bits */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
840
/* Expand 16-bit 565 pixels to packed 24-bit pixels. Identical structure to
 * rgb15tobgr24, but with the 565 masks (mask16b/mask16g/mask16r) and shift
 * counts for the 6-bit green field. As there, mm0..mm7 carry live data from
 * the first asm statement into the second. */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* 8 pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            /* first 4 pixels */
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            /* park the first 4 expanded pixels in mm6/mm7 */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            /* second 4 pixels */
            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: expand 5/6/5-bit fields by left-shifting into 8 bits */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
941
/*
 * Interleave three word-expanded channel registers into four 32-bit pixels
 * with an all-ones fourth (alpha) byte, and stream the result to (%0).
 * Expected register state on entry:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

c0038328 960
/* Expand 15-bit x555 pixels to 32-bit pixels with the fourth byte set to
 * 0xFF. Four pixels per MMX iteration: isolate each 5-bit field, shift to
 * its 8-bit position, then PACK_RGB32 interleaves the channels (mm7 = zero,
 * mm6 = all-ones alpha, set up before the loop). */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;  /* 4 pixels (8 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: same expansion plus an explicit 255 alpha byte */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}
1001
/* Expand 16-bit 565 pixels to 32-bit pixels with the fourth byte set to
 * 0xFF. Same structure as rgb15to32 but with 565 masks and shift counts. */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;  /* 4 pixels (8 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: same expansion plus an explicit 255 alpha byte */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}
1042
/* Swap bytes 0 and 2 within every 32-bit word (shuffle order 2,1,0,3),
 * e.g. RGBA <-> BGRA. Runs the index from the negative value 15 - src_size
 * up toward zero so a single register serves as both counter and offset;
 * the jns at the top skips straight to the scalar loop for tiny buffers.
 * MMXEXT uses pshufw; plain MMX emulates it with shifts and masks. */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;   /* bias pointers so s[idx] walks the buffer */
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"     /* mm7/mm6: complementary byte masks */
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"  /* 177 = 0b10110001: swap words in pairs */
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    /* scalar tail: rotate each remaining 32-bit word's low/high bytes */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1104
b8e89339 1105static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
c0038328
LB
1106{
1107 unsigned i;
c0038328
LB
1108 x86_reg mmx_size= 23 - src_size;
1109 __asm__ volatile (
1110 "test %%"REG_a", %%"REG_a" \n\t"
1111 "jns 2f \n\t"
1112 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1113 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1114 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1115 ".p2align 4 \n\t"
1116 "1: \n\t"
1117 PREFETCH" 32(%1, %%"REG_a") \n\t"
1118 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1119 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1120 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1121 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1122 "pand %%mm5, %%mm0 \n\t"
1123 "pand %%mm6, %%mm1 \n\t"
1124 "pand %%mm7, %%mm2 \n\t"
1125 "por %%mm0, %%mm1 \n\t"
1126 "por %%mm2, %%mm1 \n\t"
1127 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1128 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1129 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1130 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1131 "pand %%mm7, %%mm0 \n\t"
1132 "pand %%mm5, %%mm1 \n\t"
1133 "pand %%mm6, %%mm2 \n\t"
1134 "por %%mm0, %%mm1 \n\t"
1135 "por %%mm2, %%mm1 \n\t"
1136 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1137 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1138 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1139 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1140 "pand %%mm6, %%mm0 \n\t"
1141 "pand %%mm7, %%mm1 \n\t"
1142 "pand %%mm5, %%mm2 \n\t"
1143 "por %%mm0, %%mm1 \n\t"
1144 "por %%mm2, %%mm1 \n\t"
1145 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1146 "add $24, %%"REG_a" \n\t"
1147 " js 1b \n\t"
1148 "2: \n\t"
1149 : "+a" (mmx_size)
1150 : "r" (src-mmx_size), "r"(dst-mmx_size)
1151 );
1152
1153 __asm__ volatile(SFENCE:::"memory");
1154 __asm__ volatile(EMMS:::"memory");
1155
1156 if (mmx_size==23) return; //finished, was multiple of 8
1157
1158 src+= src_size;
1159 dst+= src_size;
1160 src_size= 23-mmx_size;
1161 src-= src_size;
1162 dst-= src_size;
c0038328
LB
1163 for (i=0; i<src_size; i+=3) {
1164 register uint8_t x;
1165 x = src[i + 2];
1166 dst[i + 1] = src[i + 1];
1167 dst[i + 2] = src[i + 0];
1168 dst[i + 0] = x;
1169 }
1170}
1171
1172static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1173 int width, int height,
1174 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
c0038328 1175{
b8e89339 1176 int y;
c0038328
LB
1177 const x86_reg chromWidth= width>>1;
1178 for (y=0; y<height; y++) {
c0038328
LB
1179 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1180 __asm__ volatile(
1181 "xor %%"REG_a", %%"REG_a" \n\t"
1182 ".p2align 4 \n\t"
1183 "1: \n\t"
1184 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1185 PREFETCH" 32(%2, %%"REG_a") \n\t"
1186 PREFETCH" 32(%3, %%"REG_a") \n\t"
1187 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1188 "movq %%mm0, %%mm2 \n\t" // U(0)
1189 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1190 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1191 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1192
1193 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1194 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1195 "movq %%mm3, %%mm4 \n\t" // Y(0)
1196 "movq %%mm5, %%mm6 \n\t" // Y(8)
1197 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1198 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1199 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1200 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1201
1202 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1203 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1204 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1205 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1206
1207 "add $8, %%"REG_a" \n\t"
1208 "cmp %4, %%"REG_a" \n\t"
1209 " jb 1b \n\t"
1210 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1211 : "%"REG_a
1212 );
c0038328
LB
1213 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1214 usrc += chromStride;
1215 vsrc += chromStride;
1216 }
1217 ysrc += lumStride;
1218 dst += dstStride;
1219 }
c0038328
LB
1220 __asm__(EMMS" \n\t"
1221 SFENCE" \n\t"
1222 :::"memory");
c0038328
LB
1223}
1224
1225/**
1226 * Height should be a multiple of 2 and width should be a multiple of 16.
1227 * (If this is a problem for anyone then tell me, and I will fix it.)
1228 */
1229static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1230 int width, int height,
1231 int lumStride, int chromStride, int dstStride)
c0038328
LB
1232{
1233 //FIXME interpolate chroma
1234 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1235}
1236
1237static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1238 int width, int height,
1239 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
c0038328 1240{
b8e89339 1241 int y;
c0038328
LB
1242 const x86_reg chromWidth= width>>1;
1243 for (y=0; y<height; y++) {
c0038328
LB
1244 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1245 __asm__ volatile(
1246 "xor %%"REG_a", %%"REG_a" \n\t"
1247 ".p2align 4 \n\t"
1248 "1: \n\t"
1249 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1250 PREFETCH" 32(%2, %%"REG_a") \n\t"
1251 PREFETCH" 32(%3, %%"REG_a") \n\t"
1252 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1253 "movq %%mm0, %%mm2 \n\t" // U(0)
1254 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1255 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1256 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1257
1258 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1259 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1260 "movq %%mm0, %%mm4 \n\t" // Y(0)
1261 "movq %%mm2, %%mm6 \n\t" // Y(8)
1262 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1263 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1264 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1265 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1266
1267 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1268 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1269 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1270 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1271
1272 "add $8, %%"REG_a" \n\t"
1273 "cmp %4, %%"REG_a" \n\t"
1274 " jb 1b \n\t"
1275 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1276 : "%"REG_a
1277 );
c0038328
LB
1278 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1279 usrc += chromStride;
1280 vsrc += chromStride;
1281 }
1282 ysrc += lumStride;
1283 dst += dstStride;
1284 }
c0038328
LB
1285 __asm__(EMMS" \n\t"
1286 SFENCE" \n\t"
1287 :::"memory");
c0038328
LB
1288}
1289
1290/**
1291 * Height should be a multiple of 2 and width should be a multiple of 16
1292 * (If this is a problem for anyone then tell me, and I will fix it.)
1293 */
1294static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1295 int width, int height,
1296 int lumStride, int chromStride, int dstStride)
c0038328
LB
1297{
1298 //FIXME interpolate chroma
1299 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1300}
1301
1302/**
1303 * Width should be a multiple of 16.
1304 */
1305static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1306 int width, int height,
1307 int lumStride, int chromStride, int dstStride)
c0038328
LB
1308{
1309 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1310}
1311
1312/**
1313 * Width should be a multiple of 16.
1314 */
1315static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
b8e89339
AK
1316 int width, int height,
1317 int lumStride, int chromStride, int dstStride)
c0038328
LB
1318{
1319 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1320}
1321
1322/**
1323 * Height should be a multiple of 2 and width should be a multiple of 16.
1324 * (If this is a problem for anyone then tell me, and I will fix it.)
1325 */
1326static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
b8e89339
AK
1327 int width, int height,
1328 int lumStride, int chromStride, int srcStride)
c0038328 1329{
b8e89339 1330 int y;
c0038328
LB
1331 const x86_reg chromWidth= width>>1;
1332 for (y=0; y<height; y+=2) {
c0038328
LB
1333 __asm__ volatile(
1334 "xor %%"REG_a", %%"REG_a" \n\t"
1335 "pcmpeqw %%mm7, %%mm7 \n\t"
1336 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1337 ".p2align 4 \n\t"
1338 "1: \n\t"
1339 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1340 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1341 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1342 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1343 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1344 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1345 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1346 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1347 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1348 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1349 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1350
1351 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1352
1353 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1354 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1355 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1356 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1357 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1358 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1359 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1360 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1361 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1362 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1363
1364 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1365
1366 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1367 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1368 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1369 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1370 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1371 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1372 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1373 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1374
1375 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1376 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1377
1378 "add $8, %%"REG_a" \n\t"
1379 "cmp %4, %%"REG_a" \n\t"
1380 " jb 1b \n\t"
1381 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1382 : "memory", "%"REG_a
1383 );
1384
1385 ydst += lumStride;
1386 src += srcStride;
1387
1388 __asm__ volatile(
1389 "xor %%"REG_a", %%"REG_a" \n\t"
1390 ".p2align 4 \n\t"
1391 "1: \n\t"
1392 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1393 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1394 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1395 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1396 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1397 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1398 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1399 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1400 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1401 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1402 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1403
1404 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1405 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1406
1407 "add $8, %%"REG_a" \n\t"
1408 "cmp %4, %%"REG_a" \n\t"
1409 " jb 1b \n\t"
1410
1411 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1412 : "memory", "%"REG_a
1413 );
c0038328
LB
1414 udst += chromStride;
1415 vdst += chromStride;
1416 ydst += lumStride;
1417 src += srcStride;
1418 }
c0038328
LB
1419 __asm__ volatile(EMMS" \n\t"
1420 SFENCE" \n\t"
1421 :::"memory");
c0038328 1422}
522d65ba 1423#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328 1424
239fdf1b 1425#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
b8e89339 1426static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
c0038328 1427{
b8e89339 1428 int x,y;
c0038328
LB
1429
1430 dst[0]= src[0];
1431
1432 // first line
1433 for (x=0; x<srcWidth-1; x++) {
1434 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1435 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1436 }
1437 dst[2*srcWidth-1]= src[srcWidth-1];
1438
1439 dst+= dstStride;
1440
1441 for (y=1; y<srcHeight; y++) {
c0038328
LB
1442 const x86_reg mmxSize= srcWidth&~15;
1443 __asm__ volatile(
1444 "mov %4, %%"REG_a" \n\t"
1445 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1446 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1447 "movq %%mm4, %%mm2 \n\t"
1448 "psllq $8, %%mm4 \n\t"
1449 "pand %%mm0, %%mm2 \n\t"
1450 "por %%mm2, %%mm4 \n\t"
1451 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1452 "movq %%mm5, %%mm3 \n\t"
1453 "psllq $8, %%mm5 \n\t"
1454 "pand %%mm0, %%mm3 \n\t"
1455 "por %%mm3, %%mm5 \n\t"
1456 "1: \n\t"
1457 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1458 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1459 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1460 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1461 PAVGB" %%mm0, %%mm5 \n\t"
1462 PAVGB" %%mm0, %%mm3 \n\t"
1463 PAVGB" %%mm0, %%mm5 \n\t"
1464 PAVGB" %%mm0, %%mm3 \n\t"
1465 PAVGB" %%mm1, %%mm4 \n\t"
1466 PAVGB" %%mm1, %%mm2 \n\t"
1467 PAVGB" %%mm1, %%mm4 \n\t"
1468 PAVGB" %%mm1, %%mm2 \n\t"
1469 "movq %%mm5, %%mm7 \n\t"
1470 "movq %%mm4, %%mm6 \n\t"
1471 "punpcklbw %%mm3, %%mm5 \n\t"
1472 "punpckhbw %%mm3, %%mm7 \n\t"
1473 "punpcklbw %%mm2, %%mm4 \n\t"
1474 "punpckhbw %%mm2, %%mm6 \n\t"
c0038328
LB
1475 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1476 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1477 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1478 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
c0038328
LB
1479 "add $8, %%"REG_a" \n\t"
1480 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1481 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1482 " js 1b \n\t"
1483 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1484 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1485 "g" (-mmxSize)
1486 : "%"REG_a
1487 );
c0038328
LB
1488
1489 for (x=mmxSize-1; x<srcWidth-1; x++) {
1490 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1491 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1492 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1493 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1494 }
1495 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1496 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1497
1498 dst+=dstStride*2;
1499 src+=srcStride;
1500 }
1501
1502 // last line
c0038328
LB
1503 dst[0]= src[0];
1504
1505 for (x=0; x<srcWidth-1; x++) {
1506 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1507 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1508 }
1509 dst[2*srcWidth-1]= src[srcWidth-1];
c0038328 1510
c0038328
LB
1511 __asm__ volatile(EMMS" \n\t"
1512 SFENCE" \n\t"
1513 :::"memory");
c0038328 1514}
239fdf1b 1515#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
c0038328 1516
522d65ba 1517#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
1518/**
1519 * Height should be a multiple of 2 and width should be a multiple of 16.
1520 * (If this is a problem for anyone then tell me, and I will fix it.)
1521 * Chrominance data is only taken from every second line, others are ignored.
1522 * FIXME: Write HQ version.
1523 */
1524static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
b8e89339
AK
1525 int width, int height,
1526 int lumStride, int chromStride, int srcStride)
c0038328 1527{
b8e89339 1528 int y;
c0038328
LB
1529 const x86_reg chromWidth= width>>1;
1530 for (y=0; y<height; y+=2) {
c0038328
LB
1531 __asm__ volatile(
1532 "xor %%"REG_a", %%"REG_a" \n\t"
1533 "pcmpeqw %%mm7, %%mm7 \n\t"
1534 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1535 ".p2align 4 \n\t"
1536 "1: \n\t"
1537 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1538 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1539 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1540 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1541 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1542 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1543 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1544 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1545 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1546 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1547 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1548
1549 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1550
1551 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1552 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1553 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1554 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1555 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1556 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1557 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1558 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1559 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1560 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1561
1562 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1563
1564 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1565 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1566 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1567 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1568 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1569 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1570 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1571 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1572
1573 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1574 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1575
1576 "add $8, %%"REG_a" \n\t"
1577 "cmp %4, %%"REG_a" \n\t"
1578 " jb 1b \n\t"
1579 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1580 : "memory", "%"REG_a
1581 );
1582
1583 ydst += lumStride;
1584 src += srcStride;
1585
1586 __asm__ volatile(
1587 "xor %%"REG_a", %%"REG_a" \n\t"
1588 ".p2align 4 \n\t"
1589 "1: \n\t"
1590 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1591 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1592 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1593 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1594 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1595 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1596 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1597 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1598 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1599 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1600 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1601
1602 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1603 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1604
1605 "add $8, %%"REG_a" \n\t"
1606 "cmp %4, %%"REG_a" \n\t"
1607 " jb 1b \n\t"
1608
1609 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1610 : "memory", "%"REG_a
1611 );
c0038328
LB
1612 udst += chromStride;
1613 vdst += chromStride;
1614 ydst += lumStride;
1615 src += srcStride;
1616 }
c0038328
LB
1617 __asm__ volatile(EMMS" \n\t"
1618 SFENCE" \n\t"
1619 :::"memory");
c0038328 1620}
522d65ba 1621#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328
LB
1622
1623/**
1624 * Height should be a multiple of 2 and width should be a multiple of 2.
1625 * (If this is a problem for anyone then tell me, and I will fix it.)
1626 * Chrominance data is only taken from every second line,
1627 * others are ignored in the C version.
1628 * FIXME: Write HQ version.
1629 */
1630static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
b8e89339
AK
1631 int width, int height,
1632 int lumStride, int chromStride, int srcStride)
c0038328 1633{
b8e89339 1634 int y;
c0038328 1635 const x86_reg chromWidth= width>>1;
c0038328 1636 for (y=0; y<height-2; y+=2) {
b8e89339 1637 int i;
c0038328
LB
1638 for (i=0; i<2; i++) {
1639 __asm__ volatile(
1640 "mov %2, %%"REG_a" \n\t"
1641 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1642 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
1644 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1645 ".p2align 4 \n\t"
1646 "1: \n\t"
1647 PREFETCH" 64(%0, %%"REG_d") \n\t"
1648 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1649 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1653 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660#ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1665#endif
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
1672
1673 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1674 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1678 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685#ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1690#endif
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "add $24, %%"REG_d" \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
1698
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1701
1702 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1703 "add $8, %%"REG_a" \n\t"
1704 " js 1b \n\t"
1705 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1706 : "%"REG_a, "%"REG_d
1707 );
1708 ydst += lumStride;
1709 src += srcStride;
1710 }
1711 src -= srcStride*2;
1712 __asm__ volatile(
1713 "mov %4, %%"REG_a" \n\t"
1714 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1715 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1716 "pxor %%mm7, %%mm7 \n\t"
1717 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1718 "add %%"REG_d", %%"REG_d" \n\t"
1719 ".p2align 4 \n\t"
1720 "1: \n\t"
1721 PREFETCH" 64(%0, %%"REG_d") \n\t"
1722 PREFETCH" 64(%1, %%"REG_d") \n\t"
239fdf1b 1723#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
1724 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1725 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1726 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1727 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1728 PAVGB" %%mm1, %%mm0 \n\t"
1729 PAVGB" %%mm3, %%mm2 \n\t"
1730 "movq %%mm0, %%mm1 \n\t"
1731 "movq %%mm2, %%mm3 \n\t"
1732 "psrlq $24, %%mm0 \n\t"
1733 "psrlq $24, %%mm2 \n\t"
1734 PAVGB" %%mm1, %%mm0 \n\t"
1735 PAVGB" %%mm3, %%mm2 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm2 \n\t"
1738#else
1739 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1740 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1741 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1742 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1743 "punpcklbw %%mm7, %%mm0 \n\t"
1744 "punpcklbw %%mm7, %%mm1 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1746 "punpcklbw %%mm7, %%mm3 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm3, %%mm2 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1750 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1751 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1752 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1753 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1754 "punpcklbw %%mm7, %%mm4 \n\t"
1755 "punpcklbw %%mm7, %%mm1 \n\t"
1756 "punpcklbw %%mm7, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm3 \n\t"
1758 "paddw %%mm1, %%mm4 \n\t"
1759 "paddw %%mm3, %%mm2 \n\t"
1760 "paddw %%mm4, %%mm2 \n\t"
1761 "psrlw $2, %%mm0 \n\t"
1762 "psrlw $2, %%mm2 \n\t"
1763#endif
1764 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1765 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1766
1767 "pmaddwd %%mm0, %%mm1 \n\t"
1768 "pmaddwd %%mm2, %%mm3 \n\t"
1769 "pmaddwd %%mm6, %%mm0 \n\t"
1770 "pmaddwd %%mm6, %%mm2 \n\t"
1771#ifndef FAST_BGR2YV12
1772 "psrad $8, %%mm0 \n\t"
1773 "psrad $8, %%mm1 \n\t"
1774 "psrad $8, %%mm2 \n\t"
1775 "psrad $8, %%mm3 \n\t"
1776#endif
1777 "packssdw %%mm2, %%mm0 \n\t"
1778 "packssdw %%mm3, %%mm1 \n\t"
1779 "pmaddwd %%mm5, %%mm0 \n\t"
1780 "pmaddwd %%mm5, %%mm1 \n\t"
1781 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1782 "psraw $7, %%mm0 \n\t"
1783
239fdf1b 1784#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
1785 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1786 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1787 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1788 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1789 PAVGB" %%mm1, %%mm4 \n\t"
1790 PAVGB" %%mm3, %%mm2 \n\t"
1791 "movq %%mm4, %%mm1 \n\t"
1792 "movq %%mm2, %%mm3 \n\t"
1793 "psrlq $24, %%mm4 \n\t"
1794 "psrlq $24, %%mm2 \n\t"
1795 PAVGB" %%mm1, %%mm4 \n\t"
1796 PAVGB" %%mm3, %%mm2 \n\t"
1797 "punpcklbw %%mm7, %%mm4 \n\t"
1798 "punpcklbw %%mm7, %%mm2 \n\t"
1799#else
1800 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1801 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1802 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1803 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1804 "punpcklbw %%mm7, %%mm4 \n\t"
1805 "punpcklbw %%mm7, %%mm1 \n\t"
1806 "punpcklbw %%mm7, %%mm2 \n\t"
1807 "punpcklbw %%mm7, %%mm3 \n\t"
1808 "paddw %%mm1, %%mm4 \n\t"
1809 "paddw %%mm3, %%mm2 \n\t"
1810 "paddw %%mm2, %%mm4 \n\t"
1811 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1812 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1813 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1814 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1815 "punpcklbw %%mm7, %%mm5 \n\t"
1816 "punpcklbw %%mm7, %%mm1 \n\t"
1817 "punpcklbw %%mm7, %%mm2 \n\t"
1818 "punpcklbw %%mm7, %%mm3 \n\t"
1819 "paddw %%mm1, %%mm5 \n\t"
1820 "paddw %%mm3, %%mm2 \n\t"
1821 "paddw %%mm5, %%mm2 \n\t"
1822 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1823 "psrlw $2, %%mm4 \n\t"
1824 "psrlw $2, %%mm2 \n\t"
1825#endif
1826 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1827 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1828
1829 "pmaddwd %%mm4, %%mm1 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "pmaddwd %%mm6, %%mm4 \n\t"
1832 "pmaddwd %%mm6, %%mm2 \n\t"
1833#ifndef FAST_BGR2YV12
1834 "psrad $8, %%mm4 \n\t"
1835 "psrad $8, %%mm1 \n\t"
1836 "psrad $8, %%mm2 \n\t"
1837 "psrad $8, %%mm3 \n\t"
1838#endif
1839 "packssdw %%mm2, %%mm4 \n\t"
1840 "packssdw %%mm3, %%mm1 \n\t"
1841 "pmaddwd %%mm5, %%mm4 \n\t"
1842 "pmaddwd %%mm5, %%mm1 \n\t"
1843 "add $24, %%"REG_d" \n\t"
1844 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1845 "psraw $7, %%mm4 \n\t"
1846
1847 "movq %%mm0, %%mm1 \n\t"
1848 "punpckldq %%mm4, %%mm0 \n\t"
1849 "punpckhdq %%mm4, %%mm1 \n\t"
1850 "packsswb %%mm1, %%mm0 \n\t"
1851 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1852 "movd %%mm0, (%2, %%"REG_a") \n\t"
1853 "punpckhdq %%mm0, %%mm0 \n\t"
1854 "movd %%mm0, (%3, %%"REG_a") \n\t"
1855 "add $4, %%"REG_a" \n\t"
1856 " js 1b \n\t"
1857 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1858 : "%"REG_a, "%"REG_d
1859 );
1860
1861 udst += chromStride;
1862 vdst += chromStride;
1863 src += srcStride*2;
1864 }
1865
1866 __asm__ volatile(EMMS" \n\t"
1867 SFENCE" \n\t"
1868 :::"memory");
6216fc70 1869
7dc303a6 1870 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
c0038328 1871}
522d65ba 1872#endif /* !COMPILE_TEMPLATE_SSE2 */
c0038328 1873
7597e6ef 1874#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
c0038328 1875static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
b8e89339
AK
1876 int width, int height, int src1Stride,
1877 int src2Stride, int dstStride)
c0038328 1878{
b8e89339 1879 int h;
c0038328
LB
1880
1881 for (h=0; h < height; h++) {
b8e89339 1882 int w;
c0038328 1883
c0038328
LB
1884#if COMPILE_TEMPLATE_SSE2
1885 __asm__(
1886 "xor %%"REG_a", %%"REG_a" \n\t"
1887 "1: \n\t"
1888 PREFETCH" 64(%1, %%"REG_a") \n\t"
1889 PREFETCH" 64(%2, %%"REG_a") \n\t"
1890 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1891 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1892 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1893 "punpcklbw %%xmm2, %%xmm0 \n\t"
1894 "punpckhbw %%xmm2, %%xmm1 \n\t"
1895 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1896 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1897 "add $16, %%"REG_a" \n\t"
1898 "cmp %3, %%"REG_a" \n\t"
1899 " jb 1b \n\t"
1900 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1901 : "memory", "%"REG_a""
1902 );
1903#else
1904 __asm__(
1905 "xor %%"REG_a", %%"REG_a" \n\t"
1906 "1: \n\t"
1907 PREFETCH" 64(%1, %%"REG_a") \n\t"
1908 PREFETCH" 64(%2, %%"REG_a") \n\t"
1909 "movq (%1, %%"REG_a"), %%mm0 \n\t"
1910 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1911 "movq %%mm0, %%mm1 \n\t"
1912 "movq %%mm2, %%mm3 \n\t"
1913 "movq (%2, %%"REG_a"), %%mm4 \n\t"
1914 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1915 "punpcklbw %%mm4, %%mm0 \n\t"
1916 "punpckhbw %%mm4, %%mm1 \n\t"
1917 "punpcklbw %%mm5, %%mm2 \n\t"
1918 "punpckhbw %%mm5, %%mm3 \n\t"
1919 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1920 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1921 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1922 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1923 "add $16, %%"REG_a" \n\t"
1924 "cmp %3, %%"REG_a" \n\t"
1925 " jb 1b \n\t"
1926 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1927 : "memory", "%"REG_a
1928 );
1929#endif
1930 for (w= (width&(~15)); w < width; w++) {
1931 dest[2*w+0] = src1[w];
1932 dest[2*w+1] = src2[w];
1933 }
c0038328
LB
1934 dest += dstStride;
1935 src1 += src1Stride;
1936 src2 += src2Stride;
1937 }
c0038328
LB
1938 __asm__(
1939 EMMS" \n\t"
1940 SFENCE" \n\t"
1941 ::: "memory"
1942 );
c0038328 1943}
7597e6ef 1944#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
c0038328 1945
91c98185
MN
1946#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1947void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1948 const uint8_t *src, const uint8_t *unused, int w,
1949 uint32_t *unused2);
1950static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1951 int width, int height, int srcStride,
1952 int dst1Stride, int dst2Stride)
1953{
1954 int h;
1955
1956 for (h = 0; h < height; h++) {
1957 RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
1958 src += srcStride;
1959 dst1 += dst1Stride;
1960 dst2 += dst2Stride;
1961 }
1962 __asm__(
1963 EMMS" \n\t"
1964 SFENCE" \n\t"
1965 ::: "memory"
1966 );
1967}
1968#endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */
1969
522d65ba
RB
1970#if !COMPILE_TEMPLATE_SSE2
1971#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
/* Upscale two quarter-resolution chroma planes (VU9) to half resolution
 * (VU12): every source byte is duplicated horizontally (punpckl/hbw of a
 * register with itself) and every source row is reused for two output rows
 * (row index y>>1).  Only w=width/2 by h=height/2 output pixels per plane
 * are produced; the SIMD loop handles 32 input bytes (64 output bytes) per
 * iteration and a scalar loop finishes the tail. */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    /* warm the cache with the second input row of each plane */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    /* first plane: src1 -> dst1 */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);   /* each source row feeds two output rows */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"  32(%1,%2)        \n\t"
                "movq       (%1,%2), %%mm0  \n\t"
                "movq      8(%1,%2), %%mm2  \n\t"
                "movq     16(%1,%2), %%mm4  \n\t"
                "movq     24(%1,%2), %%mm6  \n\t"
                "movq      %%mm0, %%mm1     \n\t"
                "movq      %%mm2, %%mm3     \n\t"
                "movq      %%mm4, %%mm5     \n\t"
                "movq      %%mm6, %%mm7     \n\t"
                /* duplicate each byte: low halves ... */
                "punpcklbw %%mm0, %%mm0     \n\t"
                "punpckhbw %%mm1, %%mm1     \n\t"   /* ... and high halves */
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                /* store 64 output bytes at d + 2*x */
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];   /* scalar tail */
    }
    /* second plane: src2 -> dst2 (same algorithm) */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"  32(%1,%2)        \n\t"
                "movq       (%1,%2), %%mm0  \n\t"
                "movq      8(%1,%2), %%mm2  \n\t"
                "movq     16(%1,%2), %%mm4  \n\t"
                "movq     24(%1,%2), %%mm6  \n\t"
                "movq      %%mm0, %%mm1     \n\t"
                "movq      %%mm2, %%mm3     \n\t"
                "movq      %%mm4, %%mm5     \n\t"
                "movq      %%mm6, %%mm7     \n\t"
                "punpcklbw %%mm0, %%mm0     \n\t"
                "punpckhbw %%mm1, %%mm1     \n\t"
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];   /* scalar tail */
    }
    /* restore FPU/MMX state and order any non-temporal stores */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}
2063
/* Interleave three planes into packed YUY2-style output: src1 supplies the
 * luma bytes (full resolution), src2 fills the byte-1 (U) positions and
 * src3 the byte-3 (V) positions.  Chroma rows are reused for four output
 * rows (srcStride*(y>>2)) and each chroma byte serves four luma samples,
 * i.e. the sources are subsampled 4x in both directions (YUV9/YVU9 layout).
 * The SIMD loop emits 64 output bytes per iteration; a scalar tail follows. */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);   /* chroma row reused for 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"  32(%1, %0)          \n\t"
                PREFETCH"  32(%2, %0)          \n\t"
                PREFETCH"  32(%3, %0)          \n\t"
                "movq      (%1, %0, 4), %%mm0  \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1  \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2  \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3  \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4  \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5  \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw       %%mm1, %%mm1  \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2  \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4  \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5  \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq            %%mm1, %%mm6  \n\t"
                "punpcklbw       %%mm2, %%mm1  \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0  \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3  \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0,  (%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)   \n\t"

                "punpckhbw       %%mm2, %%mm6  \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0  \n\t"
                "movq            %%mm0, %%mm3  \n\t"
                "punpcklbw       %%mm6, %%mm0  \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3  \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)  \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)  \n\t"

                "movq            %%mm4, %%mm6  \n\t"
                "movq    16(%1, %0, 4), %%mm0  \n\t"
                "movq            %%mm0, %%mm3  \n\t"
                "punpcklbw       %%mm5, %%mm4  \n\t"
                "punpcklbw       %%mm4, %%mm0  \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3  \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)  \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)  \n\t"

                "punpckhbw       %%mm5, %%mm6  \n\t"
                "movq    24(%1, %0, 4), %%mm0  \n\t"
                "movq            %%mm0, %%mm3  \n\t"
                "punpcklbw       %%mm6, %%mm0  \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3  \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)  \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)  \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        /* scalar tail: 4 luma samples + one repeated U/V pair per x */
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    /* restore FPU/MMX state and order any non-temporal stores */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328
LB
2150
/* Copy the even-indexed bytes of src (2*count input bytes) to dst (count
 * output bytes).  Both pointers are advanced past the end of their data and
 * the index runs from -count up to 0, so the asm loop can terminate on the
 * sign flag alone ("add; js 1b") without a separate compare. */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;   /* leave up to 15 trailing bytes to the scalar loop */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"   /* mm7 = 0x00FF per word: even-byte mask */
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"   /* narrow words back to bytes */
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;   /* undo the bias for the scalar tail */
    }

    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2187
522d65ba 2188#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
2189static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2190{
2191 dst0+= count;
2192 dst1+= count;
2193 src += 4*count;
2194 count= - count;
c0038328
LB
2195 if(count <= -8) {
2196 count += 7;
2197 __asm__ volatile(
2198 "pcmpeqw %%mm7, %%mm7 \n\t"
2199 "psrlw $8, %%mm7 \n\t"
2200 "1: \n\t"
2201 "movq -28(%1, %0, 4), %%mm0 \n\t"
2202 "movq -20(%1, %0, 4), %%mm1 \n\t"
2203 "movq -12(%1, %0, 4), %%mm2 \n\t"
2204 "movq -4(%1, %0, 4), %%mm3 \n\t"
2205 "pand %%mm7, %%mm0 \n\t"
2206 "pand %%mm7, %%mm1 \n\t"
2207 "pand %%mm7, %%mm2 \n\t"
2208 "pand %%mm7, %%mm3 \n\t"
2209 "packuswb %%mm1, %%mm0 \n\t"
2210 "packuswb %%mm3, %%mm2 \n\t"
2211 "movq %%mm0, %%mm1 \n\t"
2212 "movq %%mm2, %%mm3 \n\t"
2213 "psrlw $8, %%mm0 \n\t"
2214 "psrlw $8, %%mm2 \n\t"
2215 "pand %%mm7, %%mm1 \n\t"
2216 "pand %%mm7, %%mm3 \n\t"
2217 "packuswb %%mm2, %%mm0 \n\t"
2218 "packuswb %%mm3, %%mm1 \n\t"
2219 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2220 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2221 "add $8, %0 \n\t"
2222 " js 1b \n\t"
2223 : "+r"(count)
2224 : "r"(src), "r"(dst0), "r"(dst1)
2225 );
2226 count -= 7;
2227 }
c0038328
LB
2228 while(count<0) {
2229 dst0[count]= src[4*count+0];
2230 dst1[count]= src[4*count+2];
2231 count++;
2232 }
2233}
522d65ba 2234#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328
LB
2235
/* Like extract_even2, but averages the corresponding bytes of two input
 * rows (src0, src1) before splitting bytes 0/2 of each 4-byte group into
 * dst0/dst1 — used for vertical chroma averaging in uyvytoyuv420.
 * The SIMD path is only compiled when PAVGB is available (MMXEXT/3DNow!).
 * NOTE(review): PAVGB rounds the average up while the scalar tail uses
 * (a+b)>>1, which rounds down, so tail pixels may differ by one LSB from
 * the SIMD path — presumably an accepted approximation; confirm. */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;   /* leave up to 7 trailing pixels to the scalar loop */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"   /* 0x00FF per word mask */
            "1:                                 \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"   /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                             \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2288
522d65ba 2289#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
2290static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2291{
2292 dst0+= count;
2293 dst1+= count;
2294 src += 4*count;
2295 count= - count;
c0038328
LB
2296 if(count <= -8) {
2297 count += 7;
2298 __asm__ volatile(
2299 "pcmpeqw %%mm7, %%mm7 \n\t"
2300 "psrlw $8, %%mm7 \n\t"
2301 "1: \n\t"
2302 "movq -28(%1, %0, 4), %%mm0 \n\t"
2303 "movq -20(%1, %0, 4), %%mm1 \n\t"
2304 "movq -12(%1, %0, 4), %%mm2 \n\t"
2305 "movq -4(%1, %0, 4), %%mm3 \n\t"
2306 "psrlw $8, %%mm0 \n\t"
2307 "psrlw $8, %%mm1 \n\t"
2308 "psrlw $8, %%mm2 \n\t"
2309 "psrlw $8, %%mm3 \n\t"
2310 "packuswb %%mm1, %%mm0 \n\t"
2311 "packuswb %%mm3, %%mm2 \n\t"
2312 "movq %%mm0, %%mm1 \n\t"
2313 "movq %%mm2, %%mm3 \n\t"
2314 "psrlw $8, %%mm0 \n\t"
2315 "psrlw $8, %%mm2 \n\t"
2316 "pand %%mm7, %%mm1 \n\t"
2317 "pand %%mm7, %%mm3 \n\t"
2318 "packuswb %%mm2, %%mm0 \n\t"
2319 "packuswb %%mm3, %%mm1 \n\t"
2320 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2321 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2322 "add $8, %0 \n\t"
2323 " js 1b \n\t"
2324 : "+r"(count)
2325 : "r"(src), "r"(dst0), "r"(dst1)
2326 );
2327 count -= 7;
2328 }
c0038328
LB
2329 src++;
2330 while(count<0) {
2331 dst0[count]= src[4*count+0];
2332 dst1[count]= src[4*count+2];
2333 count++;
2334 }
2335}
522d65ba 2336#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328
LB
2337
/* Like extract_odd2, but averages the corresponding bytes of two input
 * rows (src0, src1) before splitting bytes 1/3 of each 4-byte group into
 * dst0/dst1 — used for vertical chroma averaging in yuyvtoyuv420.
 * SIMD path only with PAVGB available (MMXEXT/3DNow!).
 * NOTE(review): PAVGB rounds up, the scalar tail's (a+b)>>1 rounds down,
 * so tail pixels may differ by one LSB from the SIMD path — confirm this
 * is acceptable. */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;   /* leave up to 7 trailing pixels to the scalar loop */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"   /* 0x00FF per word mask */
            "1:                                 \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"   /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"   /* keep the odd bytes */
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                             \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;   /* shift both rows to the odd bytes for the scalar tail */
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2392
2393static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
b8e89339
AK
2394 int width, int height,
2395 int lumStride, int chromStride, int srcStride)
c0038328 2396{
b8e89339
AK
2397 int y;
2398 const int chromWidth= -((-width)>>1);
c0038328
LB
2399
2400 for (y=0; y<height; y++) {
2401 RENAME(extract_even)(src, ydst, width);
2402 if(y&1) {
2403 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2404 udst+= chromStride;
2405 vdst+= chromStride;
2406 }
2407
2408 src += srcStride;
2409 ydst+= lumStride;
2410 }
c0038328
LB
2411 __asm__(
2412 EMMS" \n\t"
2413 SFENCE" \n\t"
2414 ::: "memory"
2415 );
c0038328
LB
2416}
2417
522d65ba 2418#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328 2419static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
b8e89339
AK
2420 int width, int height,
2421 int lumStride, int chromStride, int srcStride)
c0038328 2422{
b8e89339
AK
2423 int y;
2424 const int chromWidth= -((-width)>>1);
c0038328
LB
2425
2426 for (y=0; y<height; y++) {
2427 RENAME(extract_even)(src, ydst, width);
2428 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2429
2430 src += srcStride;
2431 ydst+= lumStride;
2432 udst+= chromStride;
2433 vdst+= chromStride;
2434 }
c0038328
LB
2435 __asm__(
2436 EMMS" \n\t"
2437 SFENCE" \n\t"
2438 ::: "memory"
2439 );
c0038328 2440}
522d65ba 2441#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
c0038328
LB
2442
2443static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
b8e89339
AK
2444 int width, int height,
2445 int lumStride, int chromStride, int srcStride)
c0038328 2446{
b8e89339
AK
2447 int y;
2448 const int chromWidth= -((-width)>>1);
c0038328
LB
2449
2450 for (y=0; y<height; y++) {
2451 RENAME(extract_even)(src+1, ydst, width);
2452 if(y&1) {
2453 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2454 udst+= chromStride;
2455 vdst+= chromStride;
2456 }
2457
2458 src += srcStride;
2459 ydst+= lumStride;
2460 }
c0038328
LB
2461 __asm__(
2462 EMMS" \n\t"
2463 SFENCE" \n\t"
2464 ::: "memory"
2465 );
c0038328
LB
2466}
2467
522d65ba 2468#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328 2469static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
b8e89339
AK
2470 int width, int height,
2471 int lumStride, int chromStride, int srcStride)
c0038328 2472{
b8e89339
AK
2473 int y;
2474 const int chromWidth= -((-width)>>1);
c0038328
LB
2475
2476 for (y=0; y<height; y++) {
2477 RENAME(extract_even)(src+1, ydst, width);
2478 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2479
2480 src += srcStride;
2481 ydst+= lumStride;
2482 udst+= chromStride;
2483 vdst+= chromStride;
2484 }
c0038328
LB
2485 __asm__(
2486 EMMS" \n\t"
2487 SFENCE" \n\t"
2488 ::: "memory"
2489 );
c0038328 2490}
522d65ba
RB
2491#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2492#endif /* !COMPILE_TEMPLATE_SSE2 */
c0038328 2493
2b677ffc 2494static av_cold void RENAME(rgb2rgb_init)(void)
c0038328 2495{
522d65ba
RB
2496#if !COMPILE_TEMPLATE_SSE2
2497#if !COMPILE_TEMPLATE_AMD3DNOW
c0038328
LB
2498 rgb15to16 = RENAME(rgb15to16);
2499 rgb15tobgr24 = RENAME(rgb15tobgr24);
2500 rgb15to32 = RENAME(rgb15to32);
2501 rgb16tobgr24 = RENAME(rgb16tobgr24);
2502 rgb16to32 = RENAME(rgb16to32);
2503 rgb16to15 = RENAME(rgb16to15);
2504 rgb24tobgr16 = RENAME(rgb24tobgr16);
2505 rgb24tobgr15 = RENAME(rgb24tobgr15);
2506 rgb24tobgr32 = RENAME(rgb24tobgr32);
2507 rgb32to16 = RENAME(rgb32to16);
2508 rgb32to15 = RENAME(rgb32to15);
2509 rgb32tobgr24 = RENAME(rgb32tobgr24);
2510 rgb24to15 = RENAME(rgb24to15);
2511 rgb24to16 = RENAME(rgb24to16);
2512 rgb24tobgr24 = RENAME(rgb24tobgr24);
2513 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2514 rgb32tobgr16 = RENAME(rgb32tobgr16);
2515 rgb32tobgr15 = RENAME(rgb32tobgr15);
2516 yv12toyuy2 = RENAME(yv12toyuy2);
2517 yv12touyvy = RENAME(yv12touyvy);
2518 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2519 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2520 yuy2toyv12 = RENAME(yuy2toyv12);
c0038328
LB
2521 vu9_to_vu12 = RENAME(vu9_to_vu12);
2522 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
c0038328 2523 uyvytoyuv422 = RENAME(uyvytoyuv422);
c0038328 2524 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
1e607213 2525#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
522d65ba 2526
239fdf1b 2527#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
522d65ba 2528 planar2x = RENAME(planar2x);
239fdf1b 2529#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
522d65ba
RB
2530 rgb24toyv12 = RENAME(rgb24toyv12);
2531
2532 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2533 uyvytoyuv420 = RENAME(uyvytoyuv420);
1e607213 2534#endif /* !COMPILE_TEMPLATE_SSE2 */
522d65ba 2535
7597e6ef 2536#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
522d65ba 2537 interleaveBytes = RENAME(interleaveBytes);
7597e6ef 2538#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
91c98185
MN
2539#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
2540 deinterleaveBytes = RENAME(deinterleaveBytes);
2541#endif
c0038328 2542}