/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
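
/* The routines below are named through RENAME() and pick their SIMD code paths
 * with the HAVE_MMX / HAVE_MMX2 / HAVE_AMD3DNOW / HAVE_ALTIVEC macros, so this
 * template appears to be meant for instantiation several times with different
 * CPU feature sets selected. */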

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
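
/* Vertical scaling to planar (YV12) output: the two macros below walk the
 * coefficient/pointer filter list prepared in the SwsContext (the loop stops
 * when the next source pointer is zero), accumulate the products of the 16-bit
 * intermediate lines with the filter coefficients (pmulhw here, pmaddwd pairs
 * in the _ACCURATE variant), then round, pack to unsigned bytes and store the
 * result with MOVNTQ. */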
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
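
/* YSCALEYUV2YV121 converts one already-filtered line of 16-bit intermediate
 * samples to 8 bits per pixel (arithmetic shift right by 7); the _ACCURATE
 * variant first adds the rounding constant 64 that it builds in %%mm7. */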
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
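
/* The YSCALEYUV2PACKEDX* macros perform the same vertical filtering for packed
 * output: the chroma pass accumulates U into %%mm3 and V into %%mm4 (the
 * _ACCURATE variant parks them in U_TEMP/V_TEMP and reloads them), and the
 * luma pass leaves two groups of Y in %%mm1 and %%mm7, ready for the RGB/YUY2
 * writer macros further below. */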
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
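
/* YSCALEYUV2RGBX converts the filtered Y (%%mm1/%%mm7) and U/V (%%mm3/%%mm4)
 * to RGB using the offsets and coefficients stored in the SwsContext
 * (U_OFFSET, Y_COEFF, ...); it leaves eight pixels packed as bytes in
 * %%mm2=B, %%mm4=G and %%mm5=R for the WRITE* macros. */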
6e1c66bc 398#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
406 ASMALIGN(4)\
407 "1: \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
430 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 433
6e1c66bc 434#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 435
df57ab14 436#define REAL_YSCALEYUV2RGB_UV(index, c) \
2da0d70d
DB
437 "xor "#index", "#index" \n\t"\
438 ASMALIGN(4)\
439 "1: \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
df57ab14 460
786dcfef
CS
461#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
462 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
2da0d70d
DB
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
df57ab14
CS
474
475#define REAL_YSCALEYUV2RGB_COEFF(c) \
2da0d70d
DB
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
40494418 503
786dcfef 504#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
df57ab14
CS
505
506#define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
786dcfef 508 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
df57ab14 509 REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab 510
6e1c66bc 511#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
6a4970ab 523
6e1c66bc 524#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 525
6e1c66bc 526#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
527 "xor "#index", "#index" \n\t"\
528 ASMALIGN(4)\
529 "1: \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
40494418 572
6e1c66bc 573#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 574
6e1c66bc 575#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
6e1c66bc 591#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 592
497d4f99 593// do vertical chrominance interpolation
6e1c66bc 594#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
595 "xor "#index", "#index" \n\t"\
596 ASMALIGN(4)\
597 "1: \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
40494418 644
6e1c66bc 645#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 646
9c77b26b
CS
647#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
648 "movq "#b", "#q2" \n\t" /* B */\
649 "movq "#r", "#t" \n\t" /* R */\
650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 660\
9c77b26b
CS
661 MOVNTQ( q0, (dst, index, 4))\
662 MOVNTQ( b, 8(dst, index, 4))\
663 MOVNTQ( q2, 16(dst, index, 4))\
664 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 665\
2da0d70d
DB
666 "add $8, "#index" \n\t"\
667 "cmp "#dstw", "#index" \n\t"\
668 " jb 1b \n\t"
9c77b26b 669#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 670
27a90b04 671#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
673 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
675 "psrlq $3, %%mm2 \n\t"\
d604bab9 676\
2da0d70d
DB
677 "movq %%mm2, %%mm1 \n\t"\
678 "movq %%mm4, %%mm3 \n\t"\
d604bab9 679\
2da0d70d
DB
680 "punpcklbw %%mm7, %%mm3 \n\t"\
681 "punpcklbw %%mm5, %%mm2 \n\t"\
682 "punpckhbw %%mm7, %%mm4 \n\t"\
683 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 684\
2da0d70d
DB
685 "psllq $3, %%mm3 \n\t"\
686 "psllq $3, %%mm4 \n\t"\
d604bab9 687\
2da0d70d
DB
688 "por %%mm3, %%mm2 \n\t"\
689 "por %%mm4, %%mm1 \n\t"\
d604bab9 690\
2da0d70d
DB
691 MOVNTQ(%%mm2, (dst, index, 2))\
692 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 693\
2da0d70d
DB
694 "add $8, "#index" \n\t"\
695 "cmp "#dstw", "#index" \n\t"\
696 " jb 1b \n\t"
27a90b04 697#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 698
27a90b04 699#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 "psrlq $3, %%mm2 \n\t"\
704 "psrlq $1, %%mm5 \n\t"\
d604bab9 705\
2da0d70d
DB
706 "movq %%mm2, %%mm1 \n\t"\
707 "movq %%mm4, %%mm3 \n\t"\
d604bab9 708\
2da0d70d
DB
709 "punpcklbw %%mm7, %%mm3 \n\t"\
710 "punpcklbw %%mm5, %%mm2 \n\t"\
711 "punpckhbw %%mm7, %%mm4 \n\t"\
712 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 713\
2da0d70d
DB
714 "psllq $2, %%mm3 \n\t"\
715 "psllq $2, %%mm4 \n\t"\
d604bab9 716\
2da0d70d
DB
717 "por %%mm3, %%mm2 \n\t"\
718 "por %%mm4, %%mm1 \n\t"\
d604bab9 719\
2da0d70d
DB
720 MOVNTQ(%%mm2, (dst, index, 2))\
721 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 722\
2da0d70d
DB
723 "add $8, "#index" \n\t"\
724 "cmp "#dstw", "#index" \n\t"\
725 " jb 1b \n\t"
27a90b04 726#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 727
6542b44e 728#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 "movq %%mm2, %%mm1 \n\t" /* B */\
731 "movq %%mm5, %%mm6 \n\t" /* R */\
732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 742\
2da0d70d
DB
743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 751\
2da0d70d
DB
752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 765\
2da0d70d
DB
766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 774\
2da0d70d
DB
775 MOVNTQ(%%mm0, (dst))\
776 MOVNTQ(%%mm2, 8(dst))\
777 MOVNTQ(%%mm3, 16(dst))\
778 "add $24, "#dst" \n\t"\
d604bab9 779\
2da0d70d
DB
780 "add $8, "#index" \n\t"\
781 "cmp "#dstw", "#index" \n\t"\
782 " jb 1b \n\t"
d604bab9 783
6542b44e 784#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 "movq %%mm2, %%mm1 \n\t" /* B */\
787 "movq %%mm5, %%mm6 \n\t" /* R */\
788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 798\
2da0d70d
DB
799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 803\
2da0d70d
DB
804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 808\
2da0d70d
DB
809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 813\
2da0d70d
DB
814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 MOVNTQ(%%mm0, (dst))\
99d2cb72 819\
2da0d70d
DB
820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 825\
2da0d70d
DB
826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 830\
2da0d70d 831 "add $24, "#dst" \n\t"\
99d2cb72 832\
2da0d70d
DB
833 "add $8, "#index" \n\t"\
834 "cmp "#dstw", "#index" \n\t"\
835 " jb 1b \n\t"
99d2cb72 836
6542b44e 837#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 844\
2da0d70d
DB
845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 848\
2da0d70d
DB
849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 "por %%mm1, %%mm6 \n\t"\
851 "por %%mm3, %%mm6 \n\t"\
852 MOVNTQ(%%mm6, (dst))\
99d2cb72 853\
2da0d70d
DB
854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 858\
5802683a 859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 862\
2da0d70d
DB
863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 "por %%mm3, %%mm6 \n\t"\
865 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 866\
2da0d70d
DB
867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 870\
2da0d70d
DB
871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 874\
2da0d70d
DB
875 "por %%mm1, %%mm3 \n\t"\
876 "por %%mm3, %%mm6 \n\t"\
877 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 878\
2da0d70d 879 "add $24, "#dst" \n\t"\
99d2cb72 880\
2da0d70d
DB
881 "add $8, "#index" \n\t"\
882 "cmp "#dstw", "#index" \n\t"\
883 " jb 1b \n\t"
99d2cb72 884
b63f641e 885#if HAVE_MMX2
7630f2e0 886#undef WRITEBGR24
6e1c66bc 887#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 888#else
7630f2e0 889#undef WRITEBGR24
6e1c66bc 890#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
99d2cb72
MN
891#endif
892
6e1c66bc 893#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
894 "packuswb %%mm3, %%mm3 \n\t"\
895 "packuswb %%mm4, %%mm4 \n\t"\
896 "packuswb %%mm7, %%mm1 \n\t"\
897 "punpcklbw %%mm4, %%mm3 \n\t"\
898 "movq %%mm1, %%mm7 \n\t"\
899 "punpcklbw %%mm3, %%mm1 \n\t"\
900 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 901\
2da0d70d
DB
902 MOVNTQ(%%mm1, (dst, index, 2))\
903 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 904\
2da0d70d
DB
905 "add $8, "#index" \n\t"\
906 "cmp "#dstw", "#index" \n\t"\
907 " jb 1b \n\t"
6e1c66bc 908#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
909
910
77a49659 911static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
912 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
913 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 914{
b63f641e 915#if HAVE_MMX
f433c8ab 916 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
917 if (c->flags & SWS_ACCURATE_RND){
918 if (uDest){
919 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
920 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
921 }
bca11e75 922
14014d47
MN
923 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
924 }else{
925 if (uDest){
926 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
927 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
928 }
2da0d70d 929
14014d47
MN
930 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
931 }
f433c8ab
MN
932 return;
933 }
934#endif
b63f641e 935#if HAVE_ALTIVEC
a2faa401 936yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
937 chrFilter, chrSrc, chrFilterSize,
938 dest, uDest, vDest, dstW, chrDstW);
a2faa401 939#else //HAVE_ALTIVEC
5859233b 940yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
941 chrFilter, chrSrc, chrFilterSize,
942 dest, uDest, vDest, dstW, chrDstW);
a2faa401 943#endif //!HAVE_ALTIVEC
c1b0bfb4 944}
2add307d 945
6118e52e 946static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
947 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
948 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
949{
950yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
951 chrFilter, chrSrc, chrFilterSize,
952 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
953}
954
bf2bdde6 955static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
2da0d70d 956 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4 957{
f433c8ab 958 int i;
b63f641e 959#if HAVE_MMX
f433c8ab 960 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
961 long p= uDest ? 3 : 1;
962 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
963 uint8_t *dst[3]= {dest, uDest, vDest};
d0ce212a 964 x86_reg counter[3] = {dstW, chrDstW, chrDstW};
2da0d70d 965
14014d47
MN
966 if (c->flags & SWS_ACCURATE_RND){
967 while(p--){
7ad6469e 968 __asm__ volatile(
14014d47
MN
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
975 }else{
976 while(p--){
7ad6469e 977 __asm__ volatile(
14014d47
MN
978 YSCALEYUV2YV121
979 :: "r" (src[p]), "r" (dst[p] + counter[p]),
980 "g" (-counter[p])
981 : "%"REG_a
982 );
983 }
d78c1ea1 984 }
f433c8ab
MN
985 return;
986 }
987#endif
2da0d70d
DB
988 for (i=0; i<dstW; i++)
989 {
a1f3ffa3 990 int val= (lumSrc[i]+64)>>7;
2da0d70d
DB
991
992 if (val&256){
993 if (val<0) val=0;
994 else val=255;
995 }
996
997 dest[i]= val;
998 }
999
1b0a4572 1000 if (uDest)
2da0d70d
DB
1001 for (i=0; i<chrDstW; i++)
1002 {
a1f3ffa3
MN
1003 int u=(chrSrc[i ]+64)>>7;
1004 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d
DB
1005
1006 if ((u|v)&256){
1007 if (u<0) u=0;
1008 else if (u>255) u=255;
1009 if (v<0) v=0;
1010 else if (v>255) v=255;
1011 }
1012
1013 uDest[i]= u;
1014 vDest[i]= v;
1015 }
38858470
MN
1016}
1017
c1b0bfb4 1018
d604bab9
MN
1019/**
1020 * vertical scale YV12 to RGB
1021 */
25593e29 1022static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
1023 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1024 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1025{
b63f641e 1026#if HAVE_MMX
d0ce212a 1027 x86_reg dummy=0;
f433c8ab 1028 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1029 if (c->flags & SWS_ACCURATE_RND){
1030 switch(c->dstFormat){
1031 case PIX_FMT_RGB32:
1032 YSCALEYUV2PACKEDX_ACCURATE
1033 YSCALEYUV2RGBX
f8a138be 1034 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1035 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d 1036
14014d47
MN
1037 YSCALEYUV2PACKEDX_END
1038 return;
1039 case PIX_FMT_BGR24:
1040 YSCALEYUV2PACKEDX_ACCURATE
1041 YSCALEYUV2RGBX
40494418 1042 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1043 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1044 "add %4, %%"REG_c" \n\t"
1045 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1046
1047
14014d47
MN
1048 :: "r" (&c->redDither),
1049 "m" (dummy), "m" (dummy), "m" (dummy),
1050 "r" (dest), "m" (dstW)
1051 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1052 );
1053 return;
1054 case PIX_FMT_RGB555:
1055 YSCALEYUV2PACKEDX_ACCURATE
1056 YSCALEYUV2RGBX
40494418 1057 "pxor %%mm7, %%mm7 \n\t"
14014d47 1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1059#ifdef DITHER1XBPP
88e2a9ae
CEH
1060 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1061 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1062 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1063#endif
1064
14014d47
MN
1065 WRITERGB15(%4, %5, %%REGa)
1066 YSCALEYUV2PACKEDX_END
1067 return;
1068 case PIX_FMT_RGB565:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
40494418 1071 "pxor %%mm7, %%mm7 \n\t"
14014d47 1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1073#ifdef DITHER1XBPP
88e2a9ae
CEH
1074 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1075 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1076 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1077#endif
1078
14014d47
MN
1079 WRITERGB16(%4, %5, %%REGa)
1080 YSCALEYUV2PACKEDX_END
1081 return;
1082 case PIX_FMT_YUYV422:
1083 YSCALEYUV2PACKEDX_ACCURATE
1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085
1086 "psraw $3, %%mm3 \n\t"
1087 "psraw $3, %%mm4 \n\t"
1088 "psraw $3, %%mm1 \n\t"
1089 "psraw $3, %%mm7 \n\t"
1090 WRITEYUY2(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 }
1094 }else{
1095 switch(c->dstFormat)
1096 {
1097 case PIX_FMT_RGB32:
1098 YSCALEYUV2PACKEDX
1099 YSCALEYUV2RGBX
f8a138be 1100 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1101 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47
MN
1102 YSCALEYUV2PACKEDX_END
1103 return;
1104 case PIX_FMT_BGR24:
1105 YSCALEYUV2PACKEDX
1106 YSCALEYUV2RGBX
40494418 1107 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1108 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1109 "add %4, %%"REG_c" \n\t"
1110 WRITEBGR24(%%REGc, %5, %%REGa)
1111
1112 :: "r" (&c->redDither),
1113 "m" (dummy), "m" (dummy), "m" (dummy),
1114 "r" (dest), "m" (dstW)
1115 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1116 );
1117 return;
1118 case PIX_FMT_RGB555:
1119 YSCALEYUV2PACKEDX
1120 YSCALEYUV2RGBX
40494418 1121 "pxor %%mm7, %%mm7 \n\t"
14014d47 1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1123#ifdef DITHER1XBPP
88e2a9ae
CEH
1124 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1125 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1126 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1127#endif
1128
14014d47
MN
1129 WRITERGB15(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1131 return;
1132 case PIX_FMT_RGB565:
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
40494418 1135 "pxor %%mm7, %%mm7 \n\t"
14014d47 1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1137#ifdef DITHER1XBPP
88e2a9ae
CEH
1138 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1139 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1140 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1141#endif
1142
14014d47
MN
1143 WRITERGB16(%4, %5, %%REGa)
1144 YSCALEYUV2PACKEDX_END
1145 return;
1146 case PIX_FMT_YUYV422:
1147 YSCALEYUV2PACKEDX
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa)
1155 YSCALEYUV2PACKEDX_END
1156 return;
1157 }
bca11e75
MN
1158 }
1159 }
bc279024 1160#endif /* HAVE_MMX */
b63f641e 1161#if HAVE_ALTIVEC
2da0d70d 1162 /* The following list of supported dstFormat values should
780daf2b 1163 match what's found in the body of ff_yuv2packedX_altivec() */
12794f73
KS
1164 if (!(c->flags & SWS_BITEXACT) &&
1165 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
2da0d70d 1166 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
12794f73 1167 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
780daf2b
DB
1168 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1169 chrFilter, chrSrc, chrFilterSize,
1170 dest, dstW, dstY);
2da0d70d
DB
1171 else
1172#endif
1173 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174 chrFilter, chrSrc, chrFilterSize,
1175 dest, dstW, dstY);
c1b0bfb4
MN
1176}
1177
c1b0bfb4
MN
1178/**
1179 * vertical bilinear scale YV12 to RGB
1180 */
25593e29 1181static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1182 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1183{
ac0ad729
MN
1184 int yalpha1=4095- yalpha;
1185 int uvalpha1=4095-uvalpha;
2da0d70d 1186 int i;
d604bab9 1187
b63f641e 1188#if HAVE_MMX
f433c8ab 1189 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1190 switch(c->dstFormat)
1191 {
1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193 case PIX_FMT_RGB32:
7ad6469e 1194 __asm__ volatile(
2da0d70d
DB
1195 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1196 "mov %4, %%"REG_b" \n\t"
1197 "push %%"REG_BP" \n\t"
1198 YSCALEYUV2RGB(%%REGBP, %5)
f8a138be 1199 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1200 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d
DB
1201 "pop %%"REG_BP" \n\t"
1202 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1203
1204 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1205 "a" (&c->redDither)
1206 );
1207 return;
1208 case PIX_FMT_BGR24:
7ad6469e 1209 __asm__ volatile(
2da0d70d
DB
1210 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1211 "mov %4, %%"REG_b" \n\t"
1212 "push %%"REG_BP" \n\t"
1213 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1214 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1215 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1216 "pop %%"REG_BP" \n\t"
1217 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1218 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1219 "a" (&c->redDither)
1220 );
1221 return;
27a90b04 1222 case PIX_FMT_RGB555:
7ad6469e 1223 __asm__ volatile(
2da0d70d
DB
1224 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1225 "mov %4, %%"REG_b" \n\t"
1226 "push %%"REG_BP" \n\t"
1227 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1228 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1230#ifdef DITHER1XBPP
88e2a9ae
CEH
1231 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1232 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1234#endif
1235
27a90b04 1236 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1237 "pop %%"REG_BP" \n\t"
1238 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1239
1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1241 "a" (&c->redDither)
1242 );
1243 return;
27a90b04 1244 case PIX_FMT_RGB565:
7ad6469e 1245 __asm__ volatile(
2da0d70d
DB
1246 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1247 "mov %4, %%"REG_b" \n\t"
1248 "push %%"REG_BP" \n\t"
1249 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1250 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1252#ifdef DITHER1XBPP
88e2a9ae
CEH
1253 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1254 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1255 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1256#endif
1257
27a90b04 1258 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1259 "pop %%"REG_BP" \n\t"
1260 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1261 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1262 "a" (&c->redDither)
1263 );
1264 return;
1265 case PIX_FMT_YUYV422:
7ad6469e 1266 __asm__ volatile(
2da0d70d
DB
1267 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1268 "mov %4, %%"REG_b" \n\t"
1269 "push %%"REG_BP" \n\t"
1270 YSCALEYUV2PACKED(%%REGBP, %5)
1271 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1272 "pop %%"REG_BP" \n\t"
1273 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 "a" (&c->redDither)
1276 );
1277 return;
1278 default: break;
1279 }
f433c8ab 1280 }
cf7d1c1a 1281#endif //HAVE_MMX
ec1bca2a 1282YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1283}
1284
1285/**
1286 * YV12 to RGB without scaling or interpolating
1287 */
25593e29 1288static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1290{
2da0d70d
DB
1291 const int yalpha1=0;
1292 int i;
6a4970ab 1293
8a322796 1294 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1295 const int yalpha= 4096; //FIXME ...
96034638 1296
2da0d70d
DB
1297 if (flags&SWS_FULL_CHR_H_INT)
1298 {
1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300 return;
1301 }
397c035e 1302
b63f641e 1303#if HAVE_MMX
f433c8ab 1304 if(!(flags & SWS_BITEXACT)){
14014d47 1305 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1306 {
14014d47
MN
1307 switch(dstFormat)
1308 {
1309 case PIX_FMT_RGB32:
7ad6469e 1310 __asm__ volatile(
14014d47
MN
1311 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1312 "mov %4, %%"REG_b" \n\t"
1313 "push %%"REG_BP" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP, %5)
f8a138be 1315 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1316 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47
MN
1317 "pop %%"REG_BP" \n\t"
1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1319
1320 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1321 "a" (&c->redDither)
1322 );
1323 return;
1324 case PIX_FMT_BGR24:
7ad6469e 1325 __asm__ volatile(
14014d47
MN
1326 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1327 "mov %4, %%"REG_b" \n\t"
1328 "push %%"REG_BP" \n\t"
1329 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1330 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1331 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1332 "pop %%"REG_BP" \n\t"
1333 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1334
1335 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336 "a" (&c->redDither)
1337 );
1338 return;
1339 case PIX_FMT_RGB555:
7ad6469e 1340 __asm__ volatile(
14014d47
MN
1341 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1342 "mov %4, %%"REG_b" \n\t"
1343 "push %%"REG_BP" \n\t"
1344 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1345 "pxor %%mm7, %%mm7 \n\t"
14014d47 1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1347#ifdef DITHER1XBPP
88e2a9ae
CEH
1348 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1351#endif
14014d47
MN
1352 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1353 "pop %%"REG_BP" \n\t"
1354 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1355
14014d47
MN
1356 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1357 "a" (&c->redDither)
1358 );
1359 return;
1360 case PIX_FMT_RGB565:
7ad6469e 1361 __asm__ volatile(
14014d47
MN
1362 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1363 "mov %4, %%"REG_b" \n\t"
1364 "push %%"REG_BP" \n\t"
1365 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1366 "pxor %%mm7, %%mm7 \n\t"
14014d47 1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1368#ifdef DITHER1XBPP
88e2a9ae
CEH
1369 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1370 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1371 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1372#endif
1373
14014d47
MN
1374 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1375 "pop %%"REG_BP" \n\t"
1376 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1377
14014d47
MN
1378 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379 "a" (&c->redDither)
1380 );
1381 return;
1382 case PIX_FMT_YUYV422:
7ad6469e 1383 __asm__ volatile(
14014d47
MN
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2PACKED1(%%REGBP, %5)
1388 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1389 "pop %%"REG_BP" \n\t"
1390 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1391
1392 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1393 "a" (&c->redDither)
1394 );
1395 return;
1396 }
2da0d70d 1397 }
14014d47 1398 else
2da0d70d 1399 {
14014d47
MN
1400 switch(dstFormat)
1401 {
1402 case PIX_FMT_RGB32:
7ad6469e 1403 __asm__ volatile(
14014d47
MN
1404 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1405 "mov %4, %%"REG_b" \n\t"
1406 "push %%"REG_BP" \n\t"
1407 YSCALEYUV2RGB1b(%%REGBP, %5)
f8a138be 1408 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1409 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47
MN
1410 "pop %%"REG_BP" \n\t"
1411 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1412
1413 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "a" (&c->redDither)
1415 );
1416 return;
1417 case PIX_FMT_BGR24:
7ad6469e 1418 __asm__ volatile(
14014d47
MN
1419 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1420 "mov %4, %%"REG_b" \n\t"
1421 "push %%"REG_BP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1423 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1424 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1425 "pop %%"REG_BP" \n\t"
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1427
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429 "a" (&c->redDither)
1430 );
1431 return;
1432 case PIX_FMT_RGB555:
7ad6469e 1433 __asm__ volatile(
14014d47
MN
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_b" \n\t"
1436 "push %%"REG_BP" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1438 "pxor %%mm7, %%mm7 \n\t"
14014d47 1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1440#ifdef DITHER1XBPP
88e2a9ae
CEH
1441 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1442 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1443 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1444#endif
14014d47
MN
1445 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1446 "pop %%"REG_BP" \n\t"
1447 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1448
14014d47
MN
1449 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1450 "a" (&c->redDither)
1451 );
1452 return;
1453 case PIX_FMT_RGB565:
7ad6469e 1454 __asm__ volatile(
14014d47
MN
1455 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1456 "mov %4, %%"REG_b" \n\t"
1457 "push %%"REG_BP" \n\t"
1458 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1459 "pxor %%mm7, %%mm7 \n\t"
14014d47 1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1461#ifdef DITHER1XBPP
88e2a9ae
CEH
1462 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1463 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1464 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1465#endif
1466
14014d47
MN
1467 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1468 "pop %%"REG_BP" \n\t"
1469 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1470
14014d47
MN
1471 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1472 "a" (&c->redDither)
1473 );
1474 return;
1475 case PIX_FMT_YUYV422:
7ad6469e 1476 __asm__ volatile(
14014d47
MN
1477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1478 "mov %4, %%"REG_b" \n\t"
1479 "push %%"REG_BP" \n\t"
1480 YSCALEYUV2PACKED1b(%%REGBP, %5)
1481 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1482 "pop %%"REG_BP" \n\t"
1483 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1484
1485 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1486 "a" (&c->redDither)
1487 );
1488 return;
1489 }
2da0d70d
DB
1490 }
1491 }
bc279024 1492#endif /* HAVE_MMX */
e5091488 1493 if (uvalpha < 2048)
2da0d70d 1494 {
ec1bca2a 1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1496 }else{
ec1bca2a 1497 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1498 }
d604bab9
MN
1499}
1500
8a322796 1501//FIXME yuy2* can read up to 7 samples too much
6ff0ad6b 1502
896a22b8 1503static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1e621b18 1504{
b63f641e 1505#if HAVE_MMX
7ad6469e 1506 __asm__ volatile(
2da0d70d
DB
1507 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1508 "mov %0, %%"REG_a" \n\t"
1509 "1: \n\t"
1510 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1511 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1512 "pand %%mm2, %%mm0 \n\t"
1513 "pand %%mm2, %%mm1 \n\t"
1514 "packuswb %%mm1, %%mm0 \n\t"
1515 "movq %%mm0, (%2, %%"REG_a") \n\t"
1516 "add $8, %%"REG_a" \n\t"
1517 " js 1b \n\t"
d0ce212a 1518 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1519 : "%"REG_a
1520 );
1e621b18 1521#else
2da0d70d
DB
1522 int i;
1523 for (i=0; i<width; i++)
1524 dst[i]= src[2*i];
1e621b18
MN
1525#endif
1526}
1527
896a22b8 1528static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1529{
b63f641e 1530#if HAVE_MMX
7ad6469e 1531 __asm__ volatile(
2da0d70d
DB
1532 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1533 "mov %0, %%"REG_a" \n\t"
1534 "1: \n\t"
1535 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1536 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1537 "psrlw $8, %%mm0 \n\t"
1538 "psrlw $8, %%mm1 \n\t"
1539 "packuswb %%mm1, %%mm0 \n\t"
1540 "movq %%mm0, %%mm1 \n\t"
1541 "psrlw $8, %%mm0 \n\t"
1542 "pand %%mm4, %%mm1 \n\t"
1543 "packuswb %%mm0, %%mm0 \n\t"
1544 "packuswb %%mm1, %%mm1 \n\t"
1545 "movd %%mm0, (%3, %%"REG_a") \n\t"
1546 "movd %%mm1, (%2, %%"REG_a") \n\t"
1547 "add $4, %%"REG_a" \n\t"
1548 " js 1b \n\t"
d0ce212a 1549 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1550 : "%"REG_a
1551 );
1e621b18 1552#else
2da0d70d
DB
1553 int i;
1554 for (i=0; i<width; i++)
1555 {
1556 dstU[i]= src1[4*i + 1];
1557 dstV[i]= src1[4*i + 3];
1558 }
1559#endif
1560 assert(src1 == src2);
1e621b18
MN
1561}
1562
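/* Reference sketch (not used by the scaler): unpack one YUYV 4:2:2 line into
 * planar Y/U/V in plain C.  It just spells out the byte layout that yuy2ToY
 * and yuy2ToUV above rely on: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 | ... - every even
 * byte is luma, bytes 1 and 3 of each 4-byte group are U and V.  UYVY
 * (handled further down) is the same with luma and chroma positions swapped. */
static inline void RENAME(yuyvUnpackLine_ref)(uint8_t *dstY, uint8_t *dstU, uint8_t *dstV,
                                              uint8_t *src, long width)
{
    long i;
    for (i=0; i<width/2; i++)
    {
        dstY[2*i+0]= src[4*i+0];
        dstU[i    ]= src[4*i+1];
        dstY[2*i+1]= src[4*i+2];
        dstV[i    ]= src[4*i+3];
    }
}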
4cf16bbe
DB
1563/* This is almost identical to the previous, and exists only because
1564 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
896a22b8 1565static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
7322a67c 1566{
b63f641e 1567#if HAVE_MMX
7ad6469e 1568 __asm__ volatile(
2da0d70d
DB
1569 "mov %0, %%"REG_a" \n\t"
1570 "1: \n\t"
1571 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1572 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1573 "psrlw $8, %%mm0 \n\t"
1574 "psrlw $8, %%mm1 \n\t"
1575 "packuswb %%mm1, %%mm0 \n\t"
1576 "movq %%mm0, (%2, %%"REG_a") \n\t"
1577 "add $8, %%"REG_a" \n\t"
1578 " js 1b \n\t"
d0ce212a 1579 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1580 : "%"REG_a
1581 );
7322a67c 1582#else
2da0d70d
DB
1583 int i;
1584 for (i=0; i<width; i++)
1585 dst[i]= src[2*i+1];
7322a67c
MN
1586#endif
1587}
1588
896a22b8 1589static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
7322a67c 1590{
b63f641e 1591#if HAVE_MMX
7ad6469e 1592 __asm__ volatile(
2da0d70d
DB
1593 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1594 "mov %0, %%"REG_a" \n\t"
1595 "1: \n\t"
1596 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1597 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%3, %%"REG_a") \n\t"
1607 "movd %%mm1, (%2, %%"REG_a") \n\t"
1608 "add $4, %%"REG_a" \n\t"
1609 " js 1b \n\t"
d0ce212a 1610 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1611 : "%"REG_a
1612 );
7322a67c 1613#else
2da0d70d
DB
1614 int i;
1615 for (i=0; i<width; i++)
1616 {
1617 dstU[i]= src1[4*i + 0];
1618 dstV[i]= src1[4*i + 2];
1619 }
1620#endif
1621 assert(src1 == src2);
7322a67c
MN
1622}
1623
214892ee 1624#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
896a22b8 1625static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1626{\
1627 int i;\
1628 for (i=0; i<width; i++)\
1629 {\
1630 int b= (((type*)src)[i]>>shb)&maskb;\
1631 int g= (((type*)src)[i]>>shg)&maskg;\
1632 int r= (((type*)src)[i]>>shr)&maskr;\
1633\
1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1635 }\
1e621b18
MN
1636}
1637
214892ee
MN
1638BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1639BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1640BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1641BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1642BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1643BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1644
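/* Reference sketch (not used by the scaler) of what the bgr32ToY instantiation
 * above expands to.  Note the trick with the middle channel: it is masked but
 * not shifted down (it stays <<8), and the other two coefficients are
 * pre-shifted by 8 instead, so all three products carry the same extra factor
 * of 256, which the +8 in the final shift removes again.  The rounding term
 * 33<<(S-1) is 16<<S plus one half, i.e. the +16 luma offset with rounding. */
static inline void RENAME(bgr32ToY_ref)(uint8_t *dst, uint8_t *src, long width)
{
    int i;
    for (i=0; i<width; i++)
    {
        uint32_t pix= ((uint32_t*)src)[i];
        int b=  pix      & 0x00FF;      // bits  0..7
        int g=  pix      & 0xFF00;      // bits  8..15, kept <<8
        int r= (pix>>16) & 0x00FF;      // bits 16..23
        dst[i]= ((RY<<8)*r + GY*g + (BY<<8)*b
                 + (33<<(RGB2YUV_SHIFT+8-1))) >> (RGB2YUV_SHIFT+8);
    }
}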
f8a138be 1645#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
896a22b8 1646static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1647{\
1648 int i;\
1649 for (i=0; i<width; i++)\
1650 {\
ba83d862
MN
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1654\
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1657 }\
ba83d862 1658}\
896a22b8 1659static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1660{\
1661 int i;\
1662 for (i=0; i<width; i++)\
1663 {\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
bcff32d1 1666 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
ba83d862
MN
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
f8a138be 1669 g&= maskg|(2*maskg);\
ba83d862
MN
1670\
1671 g>>=shg;\
1672\
6b79dbce
MN
1673 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1674 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1675 }\
2f60f629
MN
1676}
1677
f8a138be
CS
1678BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1679BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1680BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1681BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1682BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1683BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
a0baa07a 1684
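/* The *_half variants above average two horizontally adjacent pixels before
 * the RGB->UV dot product (used when the chroma planes are horizontally
 * subsampled).  The mask juggling works because pix0+pix1-g is arithmetically
 * equal to (pix0&(maskr|maskb)) + (pix1&(maskr|maskb)), so the two-pixel red
 * and blue sums land in the non-overlapping ranges maskr|(2*maskr) and
 * maskb|(2*maskb).  A plain reference sketch, equivalent to what the
 * bgr32ToUV_half instantiation expands to (the channel sums are twice the
 * 8-bit range, hence the extra >>1; 257<<S is the +128 offset for the doubled
 * sum plus one half for rounding): */
static inline void RENAME(bgr32ToUV_half_ref)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width)
{
    int i;
    for (i=0; i<width; i++)
    {
        uint32_t p0= ((uint32_t*)src)[2*i+0];
        uint32_t p1= ((uint32_t*)src)[2*i+1];
        int b= ( p0      & 0xFF) + ( p1      & 0xFF);
        int g= ((p0>> 8) & 0xFF) + ((p1>> 8) & 0xFF);
        int r= ((p0>>16) & 0xFF) + ((p1>>16) & 0xFF);

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}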
b63f641e 1685#if HAVE_MMX
a35acd7f 1686static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1687{
1688
1689 if(srcFormat == PIX_FMT_BGR24){
7ad6469e 1690 __asm__ volatile(
ff9a056d
MN
1691 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1692 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1693 :
dfb09bd1
MN
1694 );
1695 }else{
7ad6469e 1696 __asm__ volatile(
ff9a056d
MN
1697 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1698 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1699 :
dfb09bd1
MN
1700 );
1701 }
1702
7ad6469e 1703 __asm__ volatile(
dfb09bd1
MN
1704 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1705 "mov %2, %%"REG_a" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1707 "1: \n\t"
1708 PREFETCH" 64(%0) \n\t"
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "movd 6(%0), %%mm2 \n\t"
1712 "movd 8(%0), %%mm3 \n\t"
1713 "add $12, %0 \n\t"
1714 "punpcklbw %%mm7, %%mm0 \n\t"
1715 "punpcklbw %%mm7, %%mm1 \n\t"
1716 "punpcklbw %%mm7, %%mm2 \n\t"
1717 "punpcklbw %%mm7, %%mm3 \n\t"
1718 "pmaddwd %%mm5, %%mm0 \n\t"
1719 "pmaddwd %%mm6, %%mm1 \n\t"
1720 "pmaddwd %%mm5, %%mm2 \n\t"
1721 "pmaddwd %%mm6, %%mm3 \n\t"
1722 "paddd %%mm1, %%mm0 \n\t"
1723 "paddd %%mm3, %%mm2 \n\t"
1724 "paddd %%mm4, %%mm0 \n\t"
1725 "paddd %%mm4, %%mm2 \n\t"
1726 "psrad $15, %%mm0 \n\t"
1727 "psrad $15, %%mm2 \n\t"
1728 "packssdw %%mm2, %%mm0 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "movd %%mm0, (%1, %%"REG_a") \n\t"
1731 "add $4, %%"REG_a" \n\t"
1732 " js 1b \n\t"
1733 : "+r" (src)
d0ce212a 1734 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1735 : "%"REG_a
2da0d70d 1736 );
dfb09bd1
MN
1737}
1738
a35acd7f 1739static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
dfb09bd1 1740{
7ad6469e 1741 __asm__ volatile(
dfb09bd1
MN
1742 "movq 24+%4, %%mm6 \n\t"
1743 "mov %3, %%"REG_a" \n\t"
1744 "pxor %%mm7, %%mm7 \n\t"
1745 "1: \n\t"
1746 PREFETCH" 64(%0) \n\t"
1747 "movd (%0), %%mm0 \n\t"
1748 "movd 2(%0), %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm0 \n\t"
1750 "punpcklbw %%mm7, %%mm1 \n\t"
1751 "movq %%mm0, %%mm2 \n\t"
1752 "movq %%mm1, %%mm3 \n\t"
1753 "pmaddwd %4, %%mm0 \n\t"
1754 "pmaddwd 8+%4, %%mm1 \n\t"
1755 "pmaddwd 16+%4, %%mm2 \n\t"
1756 "pmaddwd %%mm6, %%mm3 \n\t"
1757 "paddd %%mm1, %%mm0 \n\t"
1758 "paddd %%mm3, %%mm2 \n\t"
1759
1760 "movd 6(%0), %%mm1 \n\t"
1761 "movd 8(%0), %%mm3 \n\t"
1762 "add $12, %0 \n\t"
1763 "punpcklbw %%mm7, %%mm1 \n\t"
1764 "punpcklbw %%mm7, %%mm3 \n\t"
1765 "movq %%mm1, %%mm4 \n\t"
1766 "movq %%mm3, %%mm5 \n\t"
1767 "pmaddwd %4, %%mm1 \n\t"
1768 "pmaddwd 8+%4, %%mm3 \n\t"
1769 "pmaddwd 16+%4, %%mm4 \n\t"
1770 "pmaddwd %%mm6, %%mm5 \n\t"
1771 "paddd %%mm3, %%mm1 \n\t"
1772 "paddd %%mm5, %%mm4 \n\t"
1773
1774 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1775 "paddd %%mm3, %%mm0 \n\t"
1776 "paddd %%mm3, %%mm2 \n\t"
1777 "paddd %%mm3, %%mm1 \n\t"
1778 "paddd %%mm3, %%mm4 \n\t"
1779 "psrad $15, %%mm0 \n\t"
1780 "psrad $15, %%mm2 \n\t"
1781 "psrad $15, %%mm1 \n\t"
1782 "psrad $15, %%mm4 \n\t"
1783 "packssdw %%mm1, %%mm0 \n\t"
1784 "packssdw %%mm4, %%mm2 \n\t"
1785 "packuswb %%mm0, %%mm0 \n\t"
1786 "packuswb %%mm2, %%mm2 \n\t"
1787 "movd %%mm0, (%1, %%"REG_a") \n\t"
1788 "movd %%mm2, (%2, %%"REG_a") \n\t"
1789 "add $4, %%"REG_a" \n\t"
1790 " js 1b \n\t"
1791 : "+r" (src)
d0ce212a 1792 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1793 : "%"REG_a
1794 );
1795}
1796#endif
1797
896a22b8 1798static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1799{
b63f641e 1800#if HAVE_MMX
a35acd7f 1801 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1802#else
2da0d70d
DB
1803 int i;
1804 for (i=0; i<width; i++)
1805 {
1806 int b= src[i*3+0];
1807 int g= src[i*3+1];
1808 int r= src[i*3+2];
1e621b18 1809
e5091488 1810 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1811 }
bc279024 1812#endif /* HAVE_MMX */
1e621b18
MN
1813}
1814
896a22b8 1815static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1816{
b63f641e 1817#if HAVE_MMX
a35acd7f 1818 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1819#else
2da0d70d
DB
1820 int i;
1821 for (i=0; i<width; i++)
1822 {
dfb09bd1
MN
1823 int b= src1[3*i + 0];
1824 int g= src1[3*i + 1];
1825 int r= src1[3*i + 2];
2da0d70d 1826
dfb09bd1
MN
1827 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1828 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1829 }
bc279024 1830#endif /* HAVE_MMX */
2da0d70d 1831 assert(src1 == src2);
1e621b18
MN
1832}
1833
896a22b8 1834static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1835{
1836 int i;
1837 for (i=0; i<width; i++)
1838 {
1839 int b= src1[6*i + 0] + src1[6*i + 3];
1840 int g= src1[6*i + 1] + src1[6*i + 4];
1841 int r= src1[6*i + 2] + src1[6*i + 5];
1842
1843 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1844 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1845 }
1846 assert(src1 == src2);
1847}
1848
896a22b8 1849static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
a861d4d7 1850{
b63f641e 1851#if HAVE_MMX
a35acd7f 1852 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1853#else
2da0d70d
DB
1854 int i;
1855 for (i=0; i<width; i++)
1856 {
1857 int r= src[i*3+0];
1858 int g= src[i*3+1];
1859 int b= src[i*3+2];
1860
e5091488 1861 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1862 }
dfb09bd1 1863#endif
a861d4d7
MN
1864}
1865
896a22b8 1866static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
a861d4d7 1867{
b63f641e 1868#if HAVE_MMX
5155b839 1869 assert(src1==src2);
a35acd7f 1870 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 1871#else
5155b839
DB
1872 int i;
1873 assert(src1==src2);
2da0d70d
DB
1874 for (i=0; i<width; i++)
1875 {
dfb09bd1
MN
1876 int r= src1[3*i + 0];
1877 int g= src1[3*i + 1];
1878 int b= src1[3*i + 2];
2da0d70d 1879
dfb09bd1
MN
1880 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1881 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1882 }
dfb09bd1 1883#endif
a861d4d7
MN
1884}
1885
896a22b8 1886static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1887{
1888 int i;
1889 assert(src1==src2);
1890 for (i=0; i<width; i++)
1891 {
e09d7eef
MN
1892 int r= src1[6*i + 0] + src1[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4];
1894 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
1895
1896 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1897 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1898 }
1899}
1900
1e621b18 1901
97b93389 1902static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
e28630fc 1903{
2da0d70d
DB
1904 int i;
1905 for (i=0; i<width; i++)
1906 {
1907 int d= src[i];
e28630fc 1908
2da0d70d
DB
1909 dst[i]= pal[d] & 0xFF;
1910 }
e28630fc
MN
1911}
1912
97b93389 1913static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
e28630fc 1914{
2da0d70d
DB
1915 int i;
1916 assert(src1 == src2);
1917 for (i=0; i<width; i++)
1918 {
1919 int p= pal[src1[i]];
1920
1921 dstU[i]= p>>8;
1922 dstV[i]= p>>16;
1923 }
e28630fc
MN
1924}
1925
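/* palToY/palToUV above expect each palette entry to carry Y in the low byte,
 * U in bits 8..15 and V in bits 16..23 (c->pal_yuv is filled in that layout
 * elsewhere).  Minimal sketch of building one such entry from already
 * converted components: */
static inline uint32_t RENAME(palYuvEntry_ref)(uint8_t y, uint8_t u, uint8_t v)
{
    return (uint32_t)y | ((uint32_t)u<<8) | ((uint32_t)v<<16);
}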
896a22b8 1926static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3d05e078
MN
1927{
1928 int i, j;
1929 for (i=0; i<width/8; i++){
3a5ba0c3
LB
1930 int d= ~src[i];
1931 for(j=0; j<8; j++)
1932 dst[8*i+j]= ((d>>(7-j))&1)*255;
1933 }
1934}
1935
896a22b8 1936static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3a5ba0c3
LB
1937{
1938 int i, j;
1939 for (i=0; i<width/8; i++){
1940 int d= src[i];
78454dfc
MN
1941 for(j=0; j<8; j++)
1942 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
1943 }
1944}
1945
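/* The two 1 bpp expanders above only emit full bytes (width/8 groups, MSB
 * first).  A per-pixel reference that also covers a trailing partial byte,
 * written for the monoblack case (monowhite would invert src[i>>3] first): */
static inline void RENAME(monoblack2Y_ref)(uint8_t *dst, uint8_t *src, long width)
{
    long i;
    for (i=0; i<width; i++)
        dst[i]= ((src[i>>3] >> (7-(i&7))) & 1)*255;
}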
8a322796 1946// bilinear / bicubic scaling
077ea8a7 1947static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 1948 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 1949{
b63f641e 1950#if HAVE_MMX
2da0d70d
DB
1951 assert(filterSize % 4 == 0 && filterSize>0);
1952    if (filterSize==4) // Always true for upscaling, sometimes for downscaling, too.
1953 {
d0ce212a 1954 x86_reg counter= -2*dstW;
2da0d70d
DB
1955 filter-= counter*2;
1956 filterPos-= counter/2;
1957 dst-= counter/2;
7ad6469e 1958 __asm__ volatile(
83c89c78 1959#if defined(PIC)
2da0d70d
DB
1960 "push %%"REG_b" \n\t"
1961#endif
1962 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1963 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1964 "mov %%"REG_a", %%"REG_BP" \n\t"
1965 ASMALIGN(4)
1966 "1: \n\t"
1967 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1968 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1969 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1970 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1971 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1972 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1973 "punpcklbw %%mm7, %%mm0 \n\t"
1974 "punpcklbw %%mm7, %%mm2 \n\t"
1975 "pmaddwd %%mm1, %%mm0 \n\t"
1976 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
1977 "movq %%mm0, %%mm4 \n\t"
1978 "punpckldq %%mm3, %%mm0 \n\t"
1979 "punpckhdq %%mm3, %%mm4 \n\t"
1980 "paddd %%mm4, %%mm0 \n\t"
1981 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
1982 "packssdw %%mm0, %%mm0 \n\t"
1983 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1984 "add $4, %%"REG_BP" \n\t"
1985 " jnc 1b \n\t"
1986
1987 "pop %%"REG_BP" \n\t"
83c89c78 1988#if defined(PIC)
2da0d70d 1989 "pop %%"REG_b" \n\t"
83c89c78 1990#endif
2da0d70d
DB
1991 : "+a" (counter)
1992 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 1993#if !defined(PIC)
2da0d70d
DB
1994 : "%"REG_b
1995#endif
1996 );
1997 }
1998 else if (filterSize==8)
1999 {
d0ce212a 2000 x86_reg counter= -2*dstW;
2da0d70d
DB
2001 filter-= counter*4;
2002 filterPos-= counter/2;
2003 dst-= counter/2;
7ad6469e 2004 __asm__ volatile(
83c89c78 2005#if defined(PIC)
2da0d70d
DB
2006 "push %%"REG_b" \n\t"
2007#endif
2008 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2009 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2010 "mov %%"REG_a", %%"REG_BP" \n\t"
2011 ASMALIGN(4)
2012 "1: \n\t"
2013 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2014 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2015 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2016 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2017 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2018 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm0 \n\t"
2020 "punpcklbw %%mm7, %%mm2 \n\t"
2021 "pmaddwd %%mm1, %%mm0 \n\t"
2022 "pmaddwd %%mm2, %%mm3 \n\t"
2023
2024 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2025 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2026 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2027 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2028 "punpcklbw %%mm7, %%mm4 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "pmaddwd %%mm1, %%mm4 \n\t"
2031 "pmaddwd %%mm2, %%mm5 \n\t"
2032 "paddd %%mm4, %%mm0 \n\t"
2033 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2034 "movq %%mm0, %%mm4 \n\t"
2035 "punpckldq %%mm3, %%mm0 \n\t"
2036 "punpckhdq %%mm3, %%mm4 \n\t"
2037 "paddd %%mm4, %%mm0 \n\t"
2038 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2039 "packssdw %%mm0, %%mm0 \n\t"
2040 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2041 "add $4, %%"REG_BP" \n\t"
2042 " jnc 1b \n\t"
2043
2044 "pop %%"REG_BP" \n\t"
83c89c78 2045#if defined(PIC)
2da0d70d 2046 "pop %%"REG_b" \n\t"
83c89c78 2047#endif
2da0d70d
DB
2048 : "+a" (counter)
2049 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2050#if !defined(PIC)
2da0d70d
DB
2051 : "%"REG_b
2052#endif
2053 );
2054 }
2055 else
2056 {
2057 uint8_t *offset = src+filterSize;
d0ce212a 2058 x86_reg counter= -2*dstW;
2da0d70d
DB
2059 //filter-= counter*filterSize/2;
2060 filterPos-= counter/2;
2061 dst-= counter/2;
7ad6469e 2062 __asm__ volatile(
2da0d70d 2063 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2064 ASMALIGN(4)
2065 "1: \n\t"
2066 "mov %2, %%"REG_c" \n\t"
2067 "movzwl (%%"REG_c", %0), %%eax \n\t"
2068 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2069 "mov %5, %%"REG_c" \n\t"
2070 "pxor %%mm4, %%mm4 \n\t"
2071 "pxor %%mm5, %%mm5 \n\t"
2072 "2: \n\t"
2073 "movq (%1), %%mm1 \n\t"
2074 "movq (%1, %6), %%mm3 \n\t"
2075 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2076 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm0 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm0 \n\t"
2080 "pmaddwd %%mm2, %%mm3 \n\t"
2081 "paddd %%mm3, %%mm5 \n\t"
2082 "paddd %%mm0, %%mm4 \n\t"
2083 "add $8, %1 \n\t"
2084 "add $4, %%"REG_c" \n\t"
2085 "cmp %4, %%"REG_c" \n\t"
2086 " jb 2b \n\t"
2087 "add %6, %1 \n\t"
ef423a66
MN
2088 "movq %%mm4, %%mm0 \n\t"
2089 "punpckldq %%mm5, %%mm4 \n\t"
2090 "punpckhdq %%mm5, %%mm0 \n\t"
2091 "paddd %%mm0, %%mm4 \n\t"
2092 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2093 "packssdw %%mm4, %%mm4 \n\t"
2094 "mov %3, %%"REG_a" \n\t"
2095 "movd %%mm4, (%%"REG_a", %0) \n\t"
2096 "add $4, %0 \n\t"
2097 " jnc 1b \n\t"
2098
2099 : "+r" (counter), "+r" (filter)
2100 : "m" (filterPos), "m" (dst), "m"(offset),
d0ce212a 2101 "m" (src), "r" ((x86_reg)filterSize*2)
2da0d70d
DB
2102 : "%"REG_a, "%"REG_c, "%"REG_d
2103 );
2104 }
077ea8a7 2105#else
b63f641e 2106#if HAVE_ALTIVEC
2da0d70d 2107 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2108#else
2da0d70d
DB
2109 int i;
2110 for (i=0; i<dstW; i++)
2111 {
2112 int j;
2113 int srcPos= filterPos[i];
2114 int val=0;
2115 //printf("filterPos: %d\n", filterPos[i]);
2116 for (j=0; j<filterSize; j++)
2117 {
2118 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2119 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2120 }
2121 //filter += hFilterSize;
881c4294 2122 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2123 //dst[i] = val>>7;
2124 }
bc279024
DB
2125#endif /* HAVE_ALTIVEC */
2126#endif /* HAVE_MMX */
077ea8a7 2127}
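/* Layout of the (filter, filterPos) pair consumed by hScale above, as a
 * sketch: output pixel i reads filterSize consecutive 8-bit samples starting
 * at filterPos[i] and convolves them with filter[filterSize*i + j].  The
 * coefficients are assumed to be normalized so that they sum to roughly
 * 1<<14, which is what makes the >>7 above produce the 15-bit intermediate
 * format (a constant source value x comes out as x*128, matching the edge
 * handling in hyscale/hcscale below).  A trivial 2:1 averaging filter would
 * be built like this; the real filters come from the filter init code in
 * swscale.c, and the MMX path additionally wants filterSize%4==0: */
static inline void RENAME(buildHalvingFilter_ref)(int16_t *filter, int16_t *filterPos,
                                                  int dstW, int filterSize)
{
    int i;
    for (i=0; i<dstW; i++)   // assumes filterSize >= 2; unused taps are zeroed
    {
        int j;
        filterPos[i]= 2*i;                  // first source sample for output i
        filter[filterSize*i + 0]= 1<<13;    // two taps of 0.5 each ...
        filter[filterSize*i + 1]= 1<<13;    // ... summing to 1<<14
        for (j=2; j<filterSize; j++)
            filter[filterSize*i + j]= 0;
    }
}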
2ff198c1 2128 // *** horizontal scale Y line to temp buffer
6bc0c792 2129static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2130 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2131 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2132 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2133 int32_t *mmx2FilterPos, uint32_t *pal)
077ea8a7 2134{
2da0d70d 2135 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2136 {
896a22b8 2137 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2138 src= formatConvBuffer;
1e621b18 2139 }
2da0d70d 2140 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2141 {
896a22b8 2142 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2143 src= formatConvBuffer;
7322a67c 2144 }
2da0d70d 2145 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2146 {
896a22b8 2147 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2148 src= formatConvBuffer;
1e621b18 2149 }
9990e426
MN
2150 else if (srcFormat==PIX_FMT_RGB32_1)
2151 {
896a22b8 2152 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2153 src= formatConvBuffer;
2154 }
2da0d70d 2155 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2156 {
896a22b8 2157 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2158 src= formatConvBuffer;
1e621b18 2159 }
2da0d70d 2160 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2161 {
896a22b8 2162 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2163 src= formatConvBuffer;
6af250ea 2164 }
2da0d70d 2165 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2166 {
896a22b8 2167 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2168 src= formatConvBuffer;
b72034dd 2169 }
2da0d70d 2170 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2171 {
896a22b8 2172 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2173 src= formatConvBuffer;
a861d4d7 2174 }
9990e426
MN
2175 else if (srcFormat==PIX_FMT_BGR32_1)
2176 {
896a22b8 2177 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2178 src= formatConvBuffer;
2179 }
2da0d70d 2180 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2181 {
896a22b8 2182 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2183 src= formatConvBuffer;
a861d4d7 2184 }
2da0d70d 2185 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2186 {
896a22b8 2187 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2188 src= formatConvBuffer;
a43fb6b3 2189 }
2da0d70d 2190 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2191 {
896a22b8 2192 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2193 src= formatConvBuffer;
a43fb6b3 2194 }
2da0d70d 2195 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2196 {
e48a79c9 2197 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2198 src= formatConvBuffer;
e28630fc 2199 }
3a5ba0c3
LB
2200 else if (srcFormat==PIX_FMT_MONOBLACK)
2201 {
896a22b8 2202 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
3a5ba0c3
LB
2203 src= formatConvBuffer;
2204 }
2205 else if (srcFormat==PIX_FMT_MONOWHITE)
3d05e078 2206 {
896a22b8 2207 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2208 src= formatConvBuffer;
2209 }
1e621b18 2210
b63f641e 2211#if HAVE_MMX
8a322796 2212 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2213 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2214#else
2da0d70d 2215 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2216#endif
077ea8a7 2217 {
2da0d70d 2218 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2219 }
8a322796 2220 else // fast bilinear upscale / crap downscale
077ea8a7 2221 {
57f9a560 2222#if ARCH_X86 && CONFIG_GPL
b63f641e 2223#if HAVE_MMX2
2da0d70d 2224 int i;
83c89c78 2225#if defined(PIC)
2da0d70d 2226 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2227#endif
2da0d70d
DB
2228 if (canMMX2BeUsed)
2229 {
7ad6469e 2230 __asm__ volatile(
83c89c78 2231#if defined(PIC)
2da0d70d
DB
2232 "mov %%"REG_b", %5 \n\t"
2233#endif
2234 "pxor %%mm7, %%mm7 \n\t"
2235 "mov %0, %%"REG_c" \n\t"
2236 "mov %1, %%"REG_D" \n\t"
2237 "mov %2, %%"REG_d" \n\t"
2238 "mov %3, %%"REG_b" \n\t"
2239 "xor %%"REG_a", %%"REG_a" \n\t" // i
2240 PREFETCH" (%%"REG_c") \n\t"
2241 PREFETCH" 32(%%"REG_c") \n\t"
2242 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2243
b63f641e 2244#if ARCH_X86_64
6d606c4f
AJ
2245
2246#define FUNNY_Y_CODE \
2da0d70d
DB
2247 "movl (%%"REG_b"), %%esi \n\t"\
2248 "call *%4 \n\t"\
2249 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2250 "add %%"REG_S", %%"REG_c" \n\t"\
2251 "add %%"REG_a", %%"REG_D" \n\t"\
2252 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2253
2254#else
2255
2ff198c1 2256#define FUNNY_Y_CODE \
2da0d70d
DB
2257 "movl (%%"REG_b"), %%esi \n\t"\
2258 "call *%4 \n\t"\
2259 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2260 "add %%"REG_a", %%"REG_D" \n\t"\
2261 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2262
bc279024 2263#endif /* ARCH_X86_64 */
6d606c4f 2264
2ff198c1
MN
2265FUNNY_Y_CODE
2266FUNNY_Y_CODE
2267FUNNY_Y_CODE
2268FUNNY_Y_CODE
2269FUNNY_Y_CODE
2270FUNNY_Y_CODE
2271FUNNY_Y_CODE
2272FUNNY_Y_CODE
2273
83c89c78 2274#if defined(PIC)
2da0d70d 2275 "mov %5, %%"REG_b" \n\t"
83c89c78 2276#endif
2da0d70d
DB
2277 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2278 "m" (funnyYCode)
83c89c78 2279#if defined(PIC)
2da0d70d 2280 ,"m" (ebxsave)
83c89c78 2281#endif
2da0d70d 2282 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2283#if !defined(PIC)
2da0d70d
DB
2284 ,"%"REG_b
2285#endif
2286 );
2287 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2288 }
2289 else
2290 {
bc279024 2291#endif /* HAVE_MMX2 */
d0ce212a 2292 x86_reg xInc_shr16 = xInc >> 16;
2da0d70d
DB
2293 uint16_t xInc_mask = xInc & 0xffff;
2294 //NO MMX just normal asm ...
7ad6469e 2295 __asm__ volatile(
2da0d70d
DB
2296 "xor %%"REG_a", %%"REG_a" \n\t" // i
2297 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2298 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2299 ASMALIGN(4)
2300 "1: \n\t"
2301 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2302 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2303 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2304 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2305 "shll $16, %%edi \n\t"
2306 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2307 "mov %1, %%"REG_D" \n\t"
2308 "shrl $9, %%esi \n\t"
2309 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2310 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2311 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2312
2313 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2314 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2315 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2316 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2317 "shll $16, %%edi \n\t"
2318 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2319 "mov %1, %%"REG_D" \n\t"
2320 "shrl $9, %%esi \n\t"
2321 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2322 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2323 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2324
2325
2326 "add $2, %%"REG_a" \n\t"
2327 "cmp %2, %%"REG_a" \n\t"
2328 " jb 1b \n\t"
2329
2330
2331 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2332 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2333 );
b63f641e 2334#if HAVE_MMX2
2da0d70d 2335 } //if MMX2 can't be used
2ff198c1
MN
2336#endif
2337#else
2da0d70d
DB
2338 int i;
2339 unsigned int xpos=0;
2340 for (i=0;i<dstWidth;i++)
2341 {
2342 register unsigned int xx=xpos>>16;
2343 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2344 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2345 xpos+=xInc;
2346 }
b63f641e 2347#endif /* ARCH_X86 */
077ea8a7 2348 }
6bc0c792
MN
2349
2350 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2351 int i;
2352 //FIXME all pal and rgb srcFormats could do this conversion as well
2353 //FIXME all scalers more complex than bilinear could do half of this transform
2354 if(c->srcRange){
2355 for (i=0; i<dstWidth; i++)
2356 dst[i]= (dst[i]*14071 + 33561947)>>14;
2357 }else{
2358 for (i=0; i<dstWidth; i++)
aa13b0fc 2359 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2360 }
2361 }
2ff198c1
MN
2362}
2363
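/* Where the magic constants in the range conversion above come from, sketched
 * in plain 8-bit terms: full-range (JPEG) to limited-range (MPEG) luma is
 * y' = y*219/255 + 16 and the inverse is y = (y'-16)*255/219.  The
 * intermediate samples here are stored <<7 and the factors are kept in 14-bit
 * fixed point, so 14071/16384 ~= 219/255, 33561947>>14 ~= 16<<7 and
 * 19077/16384 ~= 255/219; the FFMIN(.,30189) clamps the input to just below
 * 236<<7 so the expanded result stays inside the 15-bit intermediate range.
 * (The chroma version in hcscale below does the same with 1799/2048 ~=
 * 224/255.)  Reference versions, illustration only: */
static inline int RENAME(lumaFullToLimited_ref)(int y8) // y8 in 0..255
{
    return (y8*219 + 127)/255 + 16;
}
static inline int RENAME(lumaLimitedToFull_ref)(int y8) // y8 in 16..235
{
    return ((y8-16)*255 + 109)/219;
}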
6bc0c792 2364inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2365 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2366 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2367 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2368 int32_t *mmx2FilterPos, uint32_t *pal)
2ff198c1 2369{
2da0d70d 2370 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2371 {
896a22b8 2372 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2373 src1= formatConvBuffer;
8b2fce0d 2374 src2= formatConvBuffer+VOFW;
1e621b18 2375 }
2da0d70d 2376 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2377 {
896a22b8 2378 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2379 src1= formatConvBuffer;
8b2fce0d 2380 src2= formatConvBuffer+VOFW;
7322a67c 2381 }
2da0d70d 2382 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2383 {
2f60f629 2384 if(c->chrSrcHSubSample)
896a22b8 2385 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2386 else
896a22b8 2387 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2388 src1= formatConvBuffer;
8b2fce0d 2389 src2= formatConvBuffer+VOFW;
1e621b18 2390 }
9990e426
MN
2391 else if (srcFormat==PIX_FMT_RGB32_1)
2392 {
2f60f629 2393 if(c->chrSrcHSubSample)
896a22b8 2394 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2395 else
896a22b8 2396 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+VOFW;
2399 }
2da0d70d 2400 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2401 {
2f60f629 2402 if(c->chrSrcHSubSample)
896a22b8 2403 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2404 else
896a22b8 2405 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2406 src1= formatConvBuffer;
8b2fce0d 2407 src2= formatConvBuffer+VOFW;
1e621b18 2408 }
2da0d70d 2409 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2410 {
2f60f629 2411 if(c->chrSrcHSubSample)
896a22b8 2412 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2413 else
896a22b8 2414 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2415 src1= formatConvBuffer;
8b2fce0d 2416 src2= formatConvBuffer+VOFW;
6af250ea 2417 }
2da0d70d 2418 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2419 {
2f60f629 2420 if(c->chrSrcHSubSample)
896a22b8 2421 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2422 else
896a22b8 2423 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2424 src1= formatConvBuffer;
8b2fce0d 2425 src2= formatConvBuffer+VOFW;
b72034dd 2426 }
2da0d70d 2427 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2428 {
2f60f629 2429 if(c->chrSrcHSubSample)
896a22b8 2430 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2431 else
896a22b8 2432 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2433 src1= formatConvBuffer;
8b2fce0d 2434 src2= formatConvBuffer+VOFW;
a861d4d7 2435 }
9990e426
MN
2436 else if (srcFormat==PIX_FMT_BGR32_1)
2437 {
2f60f629 2438 if(c->chrSrcHSubSample)
896a22b8 2439 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2440 else
896a22b8 2441 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2442 src1= formatConvBuffer;
2443 src2= formatConvBuffer+VOFW;
2444 }
2da0d70d 2445 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2446 {
2f60f629 2447 if(c->chrSrcHSubSample)
896a22b8 2448 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2449 else
896a22b8 2450 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2451 src1= formatConvBuffer;
8b2fce0d 2452 src2= formatConvBuffer+VOFW;
a861d4d7 2453 }
2da0d70d 2454 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2455 {
2f60f629 2456 if(c->chrSrcHSubSample)
896a22b8 2457 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2458 else
896a22b8 2459 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2460 src1= formatConvBuffer;
8b2fce0d 2461 src2= formatConvBuffer+VOFW;
a43fb6b3 2462 }
2da0d70d 2463 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2464 {
2f60f629 2465 if(c->chrSrcHSubSample)
896a22b8 2466 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2467 else
896a22b8 2468 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2469 src1= formatConvBuffer;
8b2fce0d 2470 src2= formatConvBuffer+VOFW;
a43fb6b3 2471 }
4bb9adcf 2472 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2473 {
2da0d70d 2474 return;
6ff0ad6b 2475 }
2da0d70d 2476 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2477 {
e48a79c9 2478 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2479 src1= formatConvBuffer;
8b2fce0d 2480 src2= formatConvBuffer+VOFW;
e28630fc 2481 }
1e621b18 2482
b63f641e 2483#if HAVE_MMX
8a322796 2484 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2485 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2486#else
2da0d70d 2487 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2488#endif
077ea8a7 2489 {
2da0d70d 2490 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2491 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2492 }
8a322796 2493 else // fast bilinear upscale / crap downscale
077ea8a7 2494 {
57f9a560 2495#if ARCH_X86 && CONFIG_GPL
b63f641e 2496#if HAVE_MMX2
2da0d70d 2497 int i;
83c89c78 2498#if defined(PIC)
2da0d70d 2499 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2500#endif
2da0d70d
DB
2501 if (canMMX2BeUsed)
2502 {
7ad6469e 2503 __asm__ volatile(
83c89c78 2504#if defined(PIC)
2da0d70d
DB
2505 "mov %%"REG_b", %6 \n\t"
2506#endif
2507 "pxor %%mm7, %%mm7 \n\t"
2508 "mov %0, %%"REG_c" \n\t"
2509 "mov %1, %%"REG_D" \n\t"
2510 "mov %2, %%"REG_d" \n\t"
2511 "mov %3, %%"REG_b" \n\t"
2512 "xor %%"REG_a", %%"REG_a" \n\t" // i
2513 PREFETCH" (%%"REG_c") \n\t"
2514 PREFETCH" 32(%%"REG_c") \n\t"
2515 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2516
b63f641e 2517#if ARCH_X86_64
6d606c4f
AJ
2518
2519#define FUNNY_UV_CODE \
2da0d70d
DB
2520 "movl (%%"REG_b"), %%esi \n\t"\
2521 "call *%4 \n\t"\
2522 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2523 "add %%"REG_S", %%"REG_c" \n\t"\
2524 "add %%"REG_a", %%"REG_D" \n\t"\
2525 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2526
2527#else
2528
b7dc6f66 2529#define FUNNY_UV_CODE \
2da0d70d
DB
2530 "movl (%%"REG_b"), %%esi \n\t"\
2531 "call *%4 \n\t"\
2532 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2533 "add %%"REG_a", %%"REG_D" \n\t"\
2534 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2535
bc279024 2536#endif /* ARCH_X86_64 */
6d606c4f 2537
b7dc6f66
MN
2538FUNNY_UV_CODE
2539FUNNY_UV_CODE
2540FUNNY_UV_CODE
2541FUNNY_UV_CODE
2da0d70d
DB
2542 "xor %%"REG_a", %%"REG_a" \n\t" // i
2543 "mov %5, %%"REG_c" \n\t" // src
2544 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2545 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2546 PREFETCH" (%%"REG_c") \n\t"
2547 PREFETCH" 32(%%"REG_c") \n\t"
2548 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2549
2550FUNNY_UV_CODE
2551FUNNY_UV_CODE
2552FUNNY_UV_CODE
2553FUNNY_UV_CODE
2554
83c89c78 2555#if defined(PIC)
2da0d70d 2556 "mov %6, %%"REG_b" \n\t"
83c89c78 2557#endif
2da0d70d
DB
2558 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2559 "m" (funnyUVCode), "m" (src2)
83c89c78 2560#if defined(PIC)
2da0d70d 2561 ,"m" (ebxsave)
83c89c78 2562#endif
2da0d70d 2563 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2564#if !defined(PIC)
2da0d70d
DB
2565 ,"%"REG_b
2566#endif
2567 );
2568 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2569 {
2570 //printf("%d %d %d\n", dstWidth, i, srcW);
2571 dst[i] = src1[srcW-1]*128;
8b2fce0d 2572 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2573 }
2574 }
2575 else
2576 {
bc279024 2577#endif /* HAVE_MMX2 */
d0ce212a 2578 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2579 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2580 __asm__ volatile(
2da0d70d
DB
2581 "xor %%"REG_a", %%"REG_a" \n\t" // i
2582 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2583 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2584 ASMALIGN(4)
2585 "1: \n\t"
2586 "mov %0, %%"REG_S" \n\t"
2587 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2588 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2589 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2590 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2591 "shll $16, %%edi \n\t"
2592 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2593 "mov %1, %%"REG_D" \n\t"
2594 "shrl $9, %%esi \n\t"
2595 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2596
2597 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2598 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2599 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2600 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2601 "shll $16, %%edi \n\t"
2602 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2603 "mov %1, %%"REG_D" \n\t"
2604 "shrl $9, %%esi \n\t"
8b2fce0d 2605 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2606
2607 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2608 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2609 "add $1, %%"REG_a" \n\t"
2610 "cmp %2, %%"REG_a" \n\t"
2611 " jb 1b \n\t"
2ff198c1 2612
8a322796
DB
2613/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2614 which is needed to support GCC 4.0. */
b63f641e 2615#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
e29c3f93 2616 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2617#else
e29c3f93 2618 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2619#endif
2da0d70d
DB
2620 "r" (src2)
2621 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2622 );
b63f641e 2623#if HAVE_MMX2
2da0d70d 2624 } //if MMX2 can't be used
2ff198c1
MN
2625#endif
2626#else
2da0d70d
DB
2627 int i;
2628 unsigned int xpos=0;
2629 for (i=0;i<dstWidth;i++)
2630 {
2631 register unsigned int xx=xpos>>16;
2632 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2633 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2634 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2635 /* slower
2636 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2637 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2638 */
2639 xpos+=xInc;
2640 }
b63f641e 2641#endif /* ARCH_X86 */
2da0d70d 2642 }
6bc0c792
MN
2643 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2644 int i;
2645 //FIXME all pal and rgb srcFormats could do this conversion as well
2646 //FIXME all scalers more complex than bilinear could do half of this transform
2647 if(c->srcRange){
2648 for (i=0; i<dstWidth; i++){
2649 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2650 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2651 }
2652 }else{
2653 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2654 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2655 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2656 }
2657 }
2658 }
077ea8a7
MN
2659}
2660
3e499f53 2661static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2662 int srcSliceH, uint8_t* dst[], int dstStride[]){
2663
2664 /* load a few things into local vars to make the code more readable and faster */
2665 const int srcW= c->srcW;
2666 const int dstW= c->dstW;
2667 const int dstH= c->dstH;
2668 const int chrDstW= c->chrDstW;
2669 const int chrSrcW= c->chrSrcW;
2670 const int lumXInc= c->lumXInc;
2671 const int chrXInc= c->chrXInc;
2672 const int dstFormat= c->dstFormat;
2673 const int srcFormat= c->srcFormat;
2674 const int flags= c->flags;
2675 const int canMMX2BeUsed= c->canMMX2BeUsed;
2676 int16_t *vLumFilterPos= c->vLumFilterPos;
2677 int16_t *vChrFilterPos= c->vChrFilterPos;
2678 int16_t *hLumFilterPos= c->hLumFilterPos;
2679 int16_t *hChrFilterPos= c->hChrFilterPos;
2680 int16_t *vLumFilter= c->vLumFilter;
2681 int16_t *vChrFilter= c->vChrFilter;
2682 int16_t *hLumFilter= c->hLumFilter;
2683 int16_t *hChrFilter= c->hChrFilter;
2684 int32_t *lumMmxFilter= c->lumMmxFilter;
2685 int32_t *chrMmxFilter= c->chrMmxFilter;
2686 const int vLumFilterSize= c->vLumFilterSize;
2687 const int vChrFilterSize= c->vChrFilterSize;
2688 const int hLumFilterSize= c->hLumFilterSize;
2689 const int hChrFilterSize= c->hChrFilterSize;
2690 int16_t **lumPixBuf= c->lumPixBuf;
2691 int16_t **chrPixBuf= c->chrPixBuf;
2692 const int vLumBufSize= c->vLumBufSize;
2693 const int vChrBufSize= c->vChrBufSize;
2694 uint8_t *funnyYCode= c->funnyYCode;
2695 uint8_t *funnyUVCode= c->funnyUVCode;
2696 uint8_t *formatConvBuffer= c->formatConvBuffer;
2697 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2698 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2699 int lastDstY;
e150ef8d 2700 uint32_t *pal=c->pal_yuv;
2da0d70d 2701
8a322796 2702 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2703 int dstY= c->dstY;
2704 int lumBufIndex= c->lumBufIndex;
2705 int chrBufIndex= c->chrBufIndex;
2706 int lastInLumBuf= c->lastInLumBuf;
2707 int lastInChrBuf= c->lastInChrBuf;
2708
2709 if (isPacked(c->srcFormat)){
2da0d70d
DB
2710 src[0]=
2711 src[1]=
2712 src[2]= src[0];
2713 srcStride[0]=
2714 srcStride[1]=
2715 srcStride[2]= srcStride[0];
2716 }
2717 srcStride[1]<<= c->vChrDrop;
2718 srcStride[2]<<= c->vChrDrop;
2719
2720 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2721 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2722
2723#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2724 {
2725 static volatile int i=0;
2726 i++;
2727 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2728 selfTest(src, srcStride, c->srcW, c->srcH);
2729 i--;
2730 }
c7a810cc 2731#endif
37079906 2732
2da0d70d
DB
2733 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2734 //dstStride[0],dstStride[1],dstStride[2]);
2735
2736 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2737 {
6683a37f
DP
2738 static int warnedAlready=0; //FIXME move this into the context perhaps
2739 if (flags & SWS_PRINT_INFO && !warnedAlready)
2da0d70d 2740 {
4b0c30b7 2741 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2742 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2743 warnedAlready=1;
2da0d70d
DB
2744 }
2745 }
2746
8a322796
DB
2747 /* Note the user might start scaling the picture in the middle so this
2748 will not get executed. This is not really intended but works
2749 currently, so people might do it. */
2da0d70d
DB
2750 if (srcSliceY ==0){
2751 lumBufIndex=0;
2752 chrBufIndex=0;
2753 dstY=0;
2754 lastInLumBuf= -1;
2755 lastInChrBuf= -1;
2756 }
2757
2758 lastDstY= dstY;
2759
2760 for (;dstY < dstH; dstY++){
2761 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2762 const int chrDstY= dstY>>c->chrDstVSubSample;
2763 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2764 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2765
2766 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2767 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2768 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2769 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2770
2771 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2772 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2773 //handle holes (FAST_BILINEAR & weird filters)
2774 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2775 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2776 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2777 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2778 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2779
2780 // Do we have enough lines in this slice to output the dstY line
2781 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2782 {
2783 //Do horizontal scaling
2784 while(lastInLumBuf < lastLumSrcY)
2785 {
2786 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2787 lumBufIndex++;
2788 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
2789 assert(lumBufIndex < 2*vLumBufSize);
2790 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2791 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 2792 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6bc0c792 2793 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
2794 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2795 funnyYCode, c->srcFormat, formatConvBuffer,
2796 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2797 lastInLumBuf++;
2798 }
2799 while(lastInChrBuf < lastChrSrcY)
2800 {
2801 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2802 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2803 chrBufIndex++;
fcc402b1
LB
2804 assert(chrBufIndex < 2*vChrBufSize);
2805 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2806 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2807 //FIXME replace parameters through context struct (some at least)
2808
2809 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2810 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
2811 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2812 funnyUVCode, c->srcFormat, formatConvBuffer,
2813 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2814 lastInChrBuf++;
2815 }
2816 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2817 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2818 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2819 }
2820 else // not enough lines left in this slice -> load the rest in the buffer
2821 {
2822 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2823 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2824 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2825 vChrBufSize, vLumBufSize);*/
2826
2827 //Do horizontal scaling
2828 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2829 {
2830 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2831 lumBufIndex++;
fcc402b1
LB
2832 assert(lumBufIndex < 2*vLumBufSize);
2833 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2834 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6bc0c792 2835 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
2836 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2837 funnyYCode, c->srcFormat, formatConvBuffer,
2838 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2839 lastInLumBuf++;
2840 }
2841 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2842 {
2843 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2844 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2845 chrBufIndex++;
fcc402b1
LB
2846 assert(chrBufIndex < 2*vChrBufSize);
2847 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
2848 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2849
2850 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2851 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
2852 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2853 funnyUVCode, c->srcFormat, formatConvBuffer,
2854 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2855 lastInChrBuf++;
2856 }
2857 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2858 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2859 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2860 break; //we can't output a dstY line so let's try with the next slice
2861 }
d3f41512 2862
b63f641e 2863#if HAVE_MMX
88e2a9ae 2864 c->blueDither= ff_dither8[dstY&1];
92c7b471 2865 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 2866 c->greenDither= ff_dither8[dstY&1];
92c7b471 2867 else
88e2a9ae
CEH
2868 c->greenDither= ff_dither4[dstY&1];
2869 c->redDither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
2870#endif
2871 if (dstY < dstH-2)
2872 {
2873 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2874 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
b63f641e 2875#if HAVE_MMX
2da0d70d
DB
2876 int i;
2877 if (flags & SWS_ACCURATE_RND){
1625216e 2878 int s= APCK_SIZE / 8;
2da0d70d 2879 for (i=0; i<vLumFilterSize; i+=2){
1625216e
MN
2880 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2881 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2882 lumMmxFilter[s*i+APCK_COEF/4 ]=
2883 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2da0d70d
DB
2884 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2885 }
2886 for (i=0; i<vChrFilterSize; i+=2){
1625216e
MN
2887 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2888 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2889 chrMmxFilter[s*i+APCK_COEF/4 ]=
2890 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2da0d70d 2891 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
bca11e75 2892 }
2da0d70d
DB
2893 }else{
2894 for (i=0; i<vLumFilterSize; i++)
2895 {
2896 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2897 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2898 lumMmxFilter[4*i+2]=
2899 lumMmxFilter[4*i+3]=
2900 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2901 }
2902 for (i=0; i<vChrFilterSize; i++)
2903 {
2904 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2905 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2906 chrMmxFilter[4*i+2]=
2907 chrMmxFilter[4*i+3]=
2908 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2909 }
2910 }
6542b44e 2911#endif
2da0d70d
DB
2912 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2913 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2914 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2915 RENAME(yuv2nv12X)(c,
2916 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2917 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2918 dest, uDest, dstW, chrDstW, dstFormat);
e3d2500f 2919 }
b0880d5d 2920 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2da0d70d
DB
2921 {
2922 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2923 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
8a322796 2924 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2da0d70d
DB
2925 {
2926 int16_t *lumBuf = lumPixBuf[0];
2927 int16_t *chrBuf= chrPixBuf[0];
bf2bdde6 2928 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2da0d70d
DB
2929 }
2930 else //General YV12
2931 {
2932 RENAME(yuv2yuvX)(c,
2933 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2934 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2935 dest, uDest, vDest, dstW, chrDstW);
2936 }
2937 }
2938 else
2939 {
fcc402b1
LB
2940 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2941 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
8a322796 2942 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2da0d70d
DB
2943 {
2944 int chrAlpha= vChrFilter[2*dstY+1];
f0faee4c
MN
2945 if(flags & SWS_FULL_CHR_H_INT){
2946 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2947 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2948 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2949 dest, dstW, dstY);
2950 }else{
14014d47
MN
2951 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2952 dest, dstW, chrAlpha, dstFormat, flags, dstY);
f0faee4c 2953 }
2da0d70d 2954 }
8a322796 2955 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2da0d70d
DB
2956 {
2957 int lumAlpha= vLumFilter[2*dstY+1];
2958 int chrAlpha= vChrFilter[2*dstY+1];
2959 lumMmxFilter[2]=
2960 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2961 chrMmxFilter[2]=
2962 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
f0faee4c
MN
2963 if(flags & SWS_FULL_CHR_H_INT){
2964 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2965 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2966 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2967 dest, dstW, dstY);
2968 }else{
14014d47
MN
2969 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2970 dest, dstW, lumAlpha, chrAlpha, dstY);
f0faee4c 2971 }
2da0d70d 2972 }
8a322796 2973 else //general RGB
2da0d70d 2974 {
f0faee4c
MN
2975 if(flags & SWS_FULL_CHR_H_INT){
2976 yuv2rgbXinC_full(c,
2977 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2978 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2979 dest, dstW, dstY);
2980 }else{
14014d47
MN
2981 RENAME(yuv2packedX)(c,
2982 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2983 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2984 dest, dstW, dstY);
f0faee4c 2985 }
2da0d70d
DB
2986 }
2987 }
2988 }
2989 else // hmm looks like we can't use MMX here without overwriting this array's tail
2990 {
2991 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2992 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2993 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2994 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2995 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2996 yuv2nv12XinC(
2997 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2998 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2999 dest, uDest, dstW, chrDstW, dstFormat);
3000 }
b0880d5d 3001 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
2da0d70d
DB
3002 {
3003 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3004 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3005 yuv2yuvXinC(
3006 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3007 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3008 dest, uDest, vDest, dstW, chrDstW);
3009 }
3010 else
3011 {
fcc402b1
LB
3012 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3013 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
f0faee4c
MN
3014 if(flags & SWS_FULL_CHR_H_INT){
3015 yuv2rgbXinC_full(c,
3016 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3017 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3018 dest, dstW, dstY);
3019 }else{
14014d47
MN
3020 yuv2packedXinC(c,
3021 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3022 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3023 dest, dstW, dstY);
f0faee4c 3024 }
2da0d70d
DB
3025 }
3026 }
3027 }
17f715fa 3028
b63f641e 3029#if HAVE_MMX
7ad6469e
DP
3030 __asm__ volatile(SFENCE:::"memory");
3031 __asm__ volatile(EMMS:::"memory");
17f715fa 3032#endif
2da0d70d
DB
3033 /* store changed local vars back in the context */
3034 c->dstY= dstY;
3035 c->lumBufIndex= lumBufIndex;
3036 c->chrBufIndex= chrBufIndex;
3037 c->lastInLumBuf= lastInLumBuf;
3038 c->lastInChrBuf= lastInChrBuf;
d4e24275 3039
2da0d70d 3040 return dstY - lastDstY;
627690b5 3041}
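/* The vertical stage in swScale reads its input lines through a small ring
 * buffer: lumPixBuf/chrPixBuf hold 2*vLumBufSize (resp. 2*vChrBufSize)
 * pointers where entry i and entry i+bufSize are expected to reference the
 * same scaled line, so a window of vLumFilterSize consecutive pointers taken
 * anywhere in the first half never wraps (see the asserts above).  A minimal
 * sketch of that indexing, mirroring the lumSrcPtr/chrSrcPtr computation: */
static inline int16_t *RENAME(ringLine_ref)(int16_t **pixBuf, int bufSize,
                                            int bufIndex, int lastInBuf, int srcY)
{
    /* pixBuf[i] == pixBuf[i + bufSize]; bufIndex is the slot of line lastInBuf,
       so line srcY (with lastInBuf - bufSize < srcY <= lastInBuf) sits
       lastInBuf - srcY slots back, kept non-negative by adding bufSize */
    return pixBuf[bufIndex + srcY - lastInBuf + bufSize];
}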