/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

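/* Vertical scaling to planar YV12: YSCALEYUV2YV12X accumulates pmulhw
 * products of the 16-bit source lines with their filter coefficients, while
 * YSCALEYUV2YV12X_ACCURATE uses pmaddwd on 32-bit accumulators for the
 * SWS_ACCURATE_RND case; YSCALEYUV2YV121 handles the trivial one-line case. */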
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
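/* The YSCALEYUV2PACKEDX* macros run the same vertical filter as above, but
 * for packed output: chroma is accumulated first (the _UV part), then luma
 * (the _YA part), leaving everything in registers for the WRITE* macros that
 * store RGB/BGR/YUY2 pixels. */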
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

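/* YSCALEYUV2RGBX converts the filtered Y/U/V words produced above into packed
 * byte values using the per-context coefficients (Y_COEFF, UB/UG/VG/VR_COEFF):
 * roughly B = y + ub*u, G = y - ug*u - vg*v, R = y + vr*v, computed for two
 * groups of four pixels and saturated with packuswb (see the B1/B2, G1/G2,
 * R1/R2 register comments at the end of the macro). */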
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

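/* YSCALEYUV2PACKED/YSCALEYUV2RGB above blend two source lines (buf0/buf1 and
 * uvbuf0/uvbuf1) with the yalpha/uvalpha weights instead of running a full
 * multi-tap filter; the *1 and *1b variants that follow cover the cases where
 * only one line, or a plain average of two chroma lines, is needed. */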
6e1c66bc 511#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
6a4970ab 523
6e1c66bc 524#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 525
6e1c66bc 526#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
527 "xor "#index", "#index" \n\t"\
528 ASMALIGN(4)\
529 "1: \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
40494418 572
6e1c66bc 573#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 574
6e1c66bc 575#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
6e1c66bc 591#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 592
497d4f99 593// do vertical chrominance interpolation
6e1c66bc 594#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
595 "xor "#index", "#index" \n\t"\
596 ASMALIGN(4)\
597 "1: \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
40494418 644
6e1c66bc 645#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 646
6858492e
CS
647#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
648 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
649 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
650 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
651 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
652 "packuswb %%mm1, %%mm7 \n\t"
653#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
654
9c77b26b
CS
655#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
656 "movq "#b", "#q2" \n\t" /* B */\
657 "movq "#r", "#t" \n\t" /* R */\
658 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
659 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
660 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
661 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
662 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
663 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
664 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
665 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
666 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
667 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 668\
9c77b26b
CS
669 MOVNTQ( q0, (dst, index, 4))\
670 MOVNTQ( b, 8(dst, index, 4))\
671 MOVNTQ( q2, 16(dst, index, 4))\
672 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 673\
2da0d70d
DB
674 "add $8, "#index" \n\t"\
675 "cmp "#dstw", "#index" \n\t"\
676 " jb 1b \n\t"
9c77b26b 677#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 678
27a90b04 679#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
680 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
681 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
682 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
683 "psrlq $3, %%mm2 \n\t"\
d604bab9 684\
2da0d70d
DB
685 "movq %%mm2, %%mm1 \n\t"\
686 "movq %%mm4, %%mm3 \n\t"\
d604bab9 687\
2da0d70d
DB
688 "punpcklbw %%mm7, %%mm3 \n\t"\
689 "punpcklbw %%mm5, %%mm2 \n\t"\
690 "punpckhbw %%mm7, %%mm4 \n\t"\
691 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 692\
2da0d70d
DB
693 "psllq $3, %%mm3 \n\t"\
694 "psllq $3, %%mm4 \n\t"\
d604bab9 695\
2da0d70d
DB
696 "por %%mm3, %%mm2 \n\t"\
697 "por %%mm4, %%mm1 \n\t"\
d604bab9 698\
2da0d70d
DB
699 MOVNTQ(%%mm2, (dst, index, 2))\
700 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 701\
2da0d70d
DB
702 "add $8, "#index" \n\t"\
703 "cmp "#dstw", "#index" \n\t"\
704 " jb 1b \n\t"
27a90b04 705#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 706
27a90b04 707#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
708 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
709 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
710 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
711 "psrlq $3, %%mm2 \n\t"\
712 "psrlq $1, %%mm5 \n\t"\
d604bab9 713\
2da0d70d
DB
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
d604bab9 716\
2da0d70d
DB
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 721\
2da0d70d
DB
722 "psllq $2, %%mm3 \n\t"\
723 "psllq $2, %%mm4 \n\t"\
d604bab9 724\
2da0d70d
DB
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
d604bab9 727\
2da0d70d
DB
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 730\
2da0d70d
DB
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
733 " jb 1b \n\t"
27a90b04 734#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 735
6542b44e 736#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
737 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
738 "movq %%mm2, %%mm1 \n\t" /* B */\
739 "movq %%mm5, %%mm6 \n\t" /* R */\
740 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
741 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
742 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
743 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
744 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
745 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
746 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
747 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
748 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
749 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 750\
2da0d70d
DB
751 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
752 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
753 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
754 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
755 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
756 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
757 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
758 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 759\
2da0d70d
DB
760 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
761 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
762 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
763 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
764 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
765 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
766 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
767 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
768 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
769 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
770 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
771 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
772 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 773\
2da0d70d
DB
774 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
775 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
776 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
777 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
778 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
779 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
780 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
781 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 782\
2da0d70d
DB
783 MOVNTQ(%%mm0, (dst))\
784 MOVNTQ(%%mm2, 8(dst))\
785 MOVNTQ(%%mm3, 16(dst))\
786 "add $24, "#dst" \n\t"\
d604bab9 787\
2da0d70d
DB
788 "add $8, "#index" \n\t"\
789 "cmp "#dstw", "#index" \n\t"\
790 " jb 1b \n\t"
d604bab9 791
6542b44e 792#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
793 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
794 "movq %%mm2, %%mm1 \n\t" /* B */\
795 "movq %%mm5, %%mm6 \n\t" /* R */\
796 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
797 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
798 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
799 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
800 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
801 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
802 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
803 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
804 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
805 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 806\
2da0d70d
DB
807 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
809 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
810 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 811\
2da0d70d
DB
812 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
813 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
814 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
815 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 816\
2da0d70d
DB
817 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
818 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
819 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
820 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 821\
2da0d70d
DB
822 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
823 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
824 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
825 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
826 MOVNTQ(%%mm0, (dst))\
99d2cb72 827\
2da0d70d
DB
828 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
829 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
830 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
831 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
832 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 833\
2da0d70d
DB
834 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
835 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
836 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
837 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 838\
2da0d70d 839 "add $24, "#dst" \n\t"\
99d2cb72 840\
2da0d70d
DB
841 "add $8, "#index" \n\t"\
842 "cmp "#dstw", "#index" \n\t"\
843 " jb 1b \n\t"
99d2cb72 844
6542b44e 845#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 846 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
847 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
848 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
849 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
850 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
851 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 852\
2da0d70d
DB
853 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
854 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
855 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 856\
2da0d70d
DB
857 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
858 "por %%mm1, %%mm6 \n\t"\
859 "por %%mm3, %%mm6 \n\t"\
860 MOVNTQ(%%mm6, (dst))\
99d2cb72 861\
2da0d70d
DB
862 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
863 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
864 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
865 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 866\
5802683a 867 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
868 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
869 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 870\
2da0d70d
DB
871 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
872 "por %%mm3, %%mm6 \n\t"\
873 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 874\
2da0d70d
DB
875 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
876 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
877 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 878\
2da0d70d
DB
879 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
880 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 881 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 882\
2da0d70d
DB
883 "por %%mm1, %%mm3 \n\t"\
884 "por %%mm3, %%mm6 \n\t"\
885 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 886\
2da0d70d 887 "add $24, "#dst" \n\t"\
99d2cb72 888\
2da0d70d
DB
889 "add $8, "#index" \n\t"\
890 "cmp "#dstw", "#index" \n\t"\
891 " jb 1b \n\t"
99d2cb72 892
b63f641e 893#if HAVE_MMX2
7630f2e0 894#undef WRITEBGR24
6e1c66bc 895#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 896#else
7630f2e0 897#undef WRITEBGR24
6e1c66bc 898#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
99d2cb72
MN
899#endif
900
6e1c66bc 901#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
902 "packuswb %%mm3, %%mm3 \n\t"\
903 "packuswb %%mm4, %%mm4 \n\t"\
904 "packuswb %%mm7, %%mm1 \n\t"\
905 "punpcklbw %%mm4, %%mm3 \n\t"\
906 "movq %%mm1, %%mm7 \n\t"\
907 "punpcklbw %%mm3, %%mm1 \n\t"\
908 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 909\
2da0d70d
DB
910 MOVNTQ(%%mm1, (dst, index, 2))\
911 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 912\
2da0d70d
DB
913 "add $8, "#index" \n\t"\
914 "cmp "#dstw", "#index" \n\t"\
915 " jb 1b \n\t"
6e1c66bc 916#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
917
918
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
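
/* Conceptually, each output sample written by yuv2yuvX is a rounded, clipped
 * weighted sum over the vertically adjacent source lines, roughly
 *     dest[i] = clip_uint8((sum_j lumFilter[j]*lumSrc[j][i] + rounding) >> shift);
 * yuv2yuvXinC() is the reference C implementation used when the SIMD paths
 * are not taken. */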

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= 4;
        uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
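    /* Plain C fallback (also used for SWS_BITEXACT): the 16-bit intermediates
     * carry 7 fractional bits, so each sample is rounded with +64, shifted
     * right by 7 and clamped to 0..255; the (val&256) test is a cheap
     * out-of-range check before clipping. */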
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++){
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
c1b0bfb4 1042
d604bab9
MN
1043/**
1044 * vertical scale YV12 to RGB
1045 */
25593e29 1046static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d 1047 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
6858492e 1048 int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1049{
b63f641e 1050#if HAVE_MMX
d0ce212a 1051 x86_reg dummy=0;
f433c8ab 1052 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1053 if (c->flags & SWS_ACCURATE_RND){
1054 switch(c->dstFormat){
1055 case PIX_FMT_RGB32:
6858492e
CS
1056 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1057 YSCALEYUV2PACKEDX_ACCURATE
1058 YSCALEYUV2RGBX
1059 "movq %%mm2, "U_TEMP"(%0) \n\t"
1060 "movq %%mm4, "V_TEMP"(%0) \n\t"
1061 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1062 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1063 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1064 "psraw $3, %%mm1 \n\t"
1065 "psraw $3, %%mm7 \n\t"
1066 "packuswb %%mm7, %%mm1 \n\t"
1067 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1068
1069 YSCALEYUV2PACKEDX_END
1070 }else{
14014d47
MN
1071 YSCALEYUV2PACKEDX_ACCURATE
1072 YSCALEYUV2RGBX
f8a138be 1073 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1074 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d 1075
14014d47 1076 YSCALEYUV2PACKEDX_END
6858492e 1077 }
14014d47
MN
1078 return;
1079 case PIX_FMT_BGR24:
1080 YSCALEYUV2PACKEDX_ACCURATE
1081 YSCALEYUV2RGBX
40494418 1082 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1083 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1084 "add %4, %%"REG_c" \n\t"
1085 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1086
1087
14014d47
MN
1088 :: "r" (&c->redDither),
1089 "m" (dummy), "m" (dummy), "m" (dummy),
1090 "r" (dest), "m" (dstW)
1091 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1092 );
1093 return;
1094 case PIX_FMT_RGB555:
1095 YSCALEYUV2PACKEDX_ACCURATE
1096 YSCALEYUV2RGBX
40494418 1097 "pxor %%mm7, %%mm7 \n\t"
14014d47 1098 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1099#ifdef DITHER1XBPP
88e2a9ae
CEH
1100 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1101 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1102 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1103#endif
1104
14014d47
MN
1105 WRITERGB15(%4, %5, %%REGa)
1106 YSCALEYUV2PACKEDX_END
1107 return;
1108 case PIX_FMT_RGB565:
1109 YSCALEYUV2PACKEDX_ACCURATE
1110 YSCALEYUV2RGBX
40494418 1111 "pxor %%mm7, %%mm7 \n\t"
14014d47 1112 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1113#ifdef DITHER1XBPP
88e2a9ae
CEH
1114 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1115 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1116 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1117#endif
1118
14014d47
MN
1119 WRITERGB16(%4, %5, %%REGa)
1120 YSCALEYUV2PACKEDX_END
1121 return;
1122 case PIX_FMT_YUYV422:
1123 YSCALEYUV2PACKEDX_ACCURATE
1124 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1125
1126 "psraw $3, %%mm3 \n\t"
1127 "psraw $3, %%mm4 \n\t"
1128 "psraw $3, %%mm1 \n\t"
1129 "psraw $3, %%mm7 \n\t"
1130 WRITEYUY2(%4, %5, %%REGa)
1131 YSCALEYUV2PACKEDX_END
1132 return;
1133 }
1134 }else{
1135 switch(c->dstFormat)
1136 {
1137 case PIX_FMT_RGB32:
6858492e
CS
1138 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1139 YSCALEYUV2PACKEDX
1140 YSCALEYUV2RGBX
1141 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1142 "psraw $3, %%mm1 \n\t"
1143 "psraw $3, %%mm7 \n\t"
1144 "packuswb %%mm7, %%mm1 \n\t"
1145 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1146 YSCALEYUV2PACKEDX_END
1147 }else{
14014d47
MN
1148 YSCALEYUV2PACKEDX
1149 YSCALEYUV2RGBX
f8a138be 1150 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1151 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47 1152 YSCALEYUV2PACKEDX_END
6858492e 1153 }
14014d47
MN
1154 return;
1155 case PIX_FMT_BGR24:
1156 YSCALEYUV2PACKEDX
1157 YSCALEYUV2RGBX
40494418 1158 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1159 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1160 "add %4, %%"REG_c" \n\t"
1161 WRITEBGR24(%%REGc, %5, %%REGa)
1162
1163 :: "r" (&c->redDither),
1164 "m" (dummy), "m" (dummy), "m" (dummy),
1165 "r" (dest), "m" (dstW)
1166 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1167 );
1168 return;
1169 case PIX_FMT_RGB555:
1170 YSCALEYUV2PACKEDX
1171 YSCALEYUV2RGBX
40494418 1172 "pxor %%mm7, %%mm7 \n\t"
14014d47 1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1174#ifdef DITHER1XBPP
88e2a9ae
CEH
1175 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1176 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1177 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1178#endif
1179
14014d47
MN
1180 WRITERGB15(%4, %5, %%REGa)
1181 YSCALEYUV2PACKEDX_END
1182 return;
1183 case PIX_FMT_RGB565:
1184 YSCALEYUV2PACKEDX
1185 YSCALEYUV2RGBX
40494418 1186 "pxor %%mm7, %%mm7 \n\t"
14014d47 1187 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1188#ifdef DITHER1XBPP
88e2a9ae
CEH
1189 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1190 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1191 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1192#endif
1193
14014d47
MN
1194 WRITERGB16(%4, %5, %%REGa)
1195 YSCALEYUV2PACKEDX_END
1196 return;
1197 case PIX_FMT_YUYV422:
1198 YSCALEYUV2PACKEDX
1199 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1200
1201 "psraw $3, %%mm3 \n\t"
1202 "psraw $3, %%mm4 \n\t"
1203 "psraw $3, %%mm1 \n\t"
1204 "psraw $3, %%mm7 \n\t"
1205 WRITEYUY2(%4, %5, %%REGa)
1206 YSCALEYUV2PACKEDX_END
1207 return;
1208 }
bca11e75
MN
1209 }
1210 }
bc279024 1211#endif /* HAVE_MMX */
#if HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
c1b0bfb4
MN
1227}
1228
c1b0bfb4
MN
1229/**
1230 * vertical bilinear scale YV12 to RGB
1231 */
25593e29 1232static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
6858492e 1233 uint16_t *abuf0, uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1234{
ac0ad729
MN
1235 int yalpha1=4095- yalpha;
1236 int uvalpha1=4095-uvalpha;
2da0d70d 1237 int i;
d604bab9 1238
b63f641e 1239#if HAVE_MMX
f433c8ab 1240 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1241 switch(c->dstFormat)
1242 {
1243 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1244 case PIX_FMT_RGB32:
6858492e
CS
1245 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1246#if ARCH_X86_64
1247 __asm__ volatile(
1248 "mov %4, %%"REG_b" \n\t"
1249 YSCALEYUV2RGB(%%REGBP, %5)
1250 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1251 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1253 "packuswb %%mm7, %%mm1 \n\t"
1254 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1255
1256 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1257 "a" (&c->redDither)
1258 ,"r" (abuf0), "r" (abuf1)
1259 : "%"REG_b, "%"REG_BP
1260 );
1261#else
1262 *(uint16_t **)(&c->u_temp)=abuf0;
1263 *(uint16_t **)(&c->v_temp)=abuf1;
1264 __asm__ volatile(
1265 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1266 "mov %4, %%"REG_b" \n\t"
1267 "push %%"REG_BP" \n\t"
1268 YSCALEYUV2RGB(%%REGBP, %5)
1269 "push %0 \n\t"
1270 "push %1 \n\t"
1271 "mov "U_TEMP"(%5), %0 \n\t"
1272 "mov "V_TEMP"(%5), %1 \n\t"
1273 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1274 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1275 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1276 "packuswb %%mm7, %%mm1 \n\t"
1277 "pop %1 \n\t"
1278 "pop %0 \n\t"
1279 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1280 "pop %%"REG_BP" \n\t"
1281 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1282
1283 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1284 "a" (&c->redDither)
1285 );
1286#endif
1287 }else{
7ad6469e 1288 __asm__ volatile(
2da0d70d
DB
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2RGB(%%REGBP, %5)
f8a138be 1293 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1294 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d
DB
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297
1298 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1299 "a" (&c->redDither)
1300 );
6858492e 1301 }
2da0d70d
DB
1302 return;
1303 case PIX_FMT_BGR24:
7ad6469e 1304 __asm__ volatile(
2da0d70d
DB
1305 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1306 "mov %4, %%"REG_b" \n\t"
1307 "push %%"REG_BP" \n\t"
1308 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1309 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1310 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1311 "pop %%"REG_BP" \n\t"
1312 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1313 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1314 "a" (&c->redDither)
1315 );
1316 return;
27a90b04 1317 case PIX_FMT_RGB555:
7ad6469e 1318 __asm__ volatile(
2da0d70d
DB
1319 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1320 "mov %4, %%"REG_b" \n\t"
1321 "push %%"REG_BP" \n\t"
1322 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1323 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1324 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1325#ifdef DITHER1XBPP
88e2a9ae
CEH
1326 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1327 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1328 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1329#endif
1330
27a90b04 1331 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1332 "pop %%"REG_BP" \n\t"
1333 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1334
1335 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336 "a" (&c->redDither)
1337 );
1338 return;
27a90b04 1339 case PIX_FMT_RGB565:
7ad6469e 1340 __asm__ volatile(
2da0d70d
DB
1341 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1342 "mov %4, %%"REG_b" \n\t"
1343 "push %%"REG_BP" \n\t"
1344 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1345 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1347#ifdef DITHER1XBPP
88e2a9ae
CEH
1348 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1351#endif
1352
27a90b04 1353 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1354 "pop %%"REG_BP" \n\t"
1355 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1356 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1357 "a" (&c->redDither)
1358 );
1359 return;
1360 case PIX_FMT_YUYV422:
7ad6469e 1361 __asm__ volatile(
2da0d70d
DB
1362 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1363 "mov %4, %%"REG_b" \n\t"
1364 "push %%"REG_BP" \n\t"
1365 YSCALEYUV2PACKED(%%REGBP, %5)
1366 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1367 "pop %%"REG_BP" \n\t"
1368 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1369 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1370 "a" (&c->redDither)
1371 );
1372 return;
1373 default: break;
1374 }
f433c8ab 1375 }
cf7d1c1a 1376#endif //HAVE_MMX
6858492e 1377YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1378}
1379
1380/**
1381 * YV12 to RGB without scaling or interpolating
1382 */
25593e29 1383static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
6858492e 1384 uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1385{
2da0d70d
DB
1386 const int yalpha1=0;
1387 int i;
6a4970ab 1388
8a322796 1389 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1390 const int yalpha= 4096; //FIXME ...
96034638 1391
2da0d70d
DB
1392 if (flags&SWS_FULL_CHR_H_INT)
1393 {
6858492e 1394 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
2da0d70d
DB
1395 return;
1396 }
397c035e 1397
b63f641e 1398#if HAVE_MMX
f433c8ab 1399 if(!(flags & SWS_BITEXACT)){
14014d47 1400 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1401 {
14014d47
MN
1402 switch(dstFormat)
1403 {
1404 case PIX_FMT_RGB32:
6858492e
CS
1405 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1406 __asm__ volatile(
1407 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1408 "mov %4, %%"REG_b" \n\t"
1409 "push %%"REG_BP" \n\t"
1410 YSCALEYUV2RGB1(%%REGBP, %5)
1411 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1412 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1413 "pop %%"REG_BP" \n\t"
1414 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1415
1416 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1417 "a" (&c->redDither)
1418 );
1419 }else{
7ad6469e 1420 __asm__ volatile(
14014d47
MN
1421 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_b" \n\t"
1423 "push %%"REG_BP" \n\t"
1424 YSCALEYUV2RGB1(%%REGBP, %5)
f8a138be 1425 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1426 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47
MN
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429
1430 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431 "a" (&c->redDither)
1432 );
6858492e 1433 }
14014d47
MN
1434 return;
1435 case PIX_FMT_BGR24:
7ad6469e 1436 __asm__ volatile(
14014d47
MN
1437 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1438 "mov %4, %%"REG_b" \n\t"
1439 "push %%"REG_BP" \n\t"
1440 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1441 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1442 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1443 "pop %%"REG_BP" \n\t"
1444 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445
1446 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447 "a" (&c->redDither)
1448 );
1449 return;
1450 case PIX_FMT_RGB555:
7ad6469e 1451 __asm__ volatile(
14014d47
MN
1452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1453 "mov %4, %%"REG_b" \n\t"
1454 "push %%"REG_BP" \n\t"
1455 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1456 "pxor %%mm7, %%mm7 \n\t"
14014d47 1457 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1458#ifdef DITHER1XBPP
88e2a9ae
CEH
1459 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1460 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1461 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1462#endif
14014d47
MN
1463 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1464 "pop %%"REG_BP" \n\t"
1465 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1466
14014d47
MN
1467 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1468 "a" (&c->redDither)
1469 );
1470 return;
1471 case PIX_FMT_RGB565:
7ad6469e 1472 __asm__ volatile(
14014d47
MN
1473 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1474 "mov %4, %%"REG_b" \n\t"
1475 "push %%"REG_BP" \n\t"
1476 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1477 "pxor %%mm7, %%mm7 \n\t"
14014d47 1478 /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1479#ifdef DITHER1XBPP
88e2a9ae
CEH
1480 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1481 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1482 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1483#endif
1484
14014d47
MN
1485 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1486 "pop %%"REG_BP" \n\t"
1487 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1488
14014d47
MN
1489 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1490 "a" (&c->redDither)
1491 );
1492 return;
1493 case PIX_FMT_YUYV422:
7ad6469e 1494 __asm__ volatile(
14014d47
MN
1495 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1496 "mov %4, %%"REG_b" \n\t"
1497 "push %%"REG_BP" \n\t"
1498 YSCALEYUV2PACKED1(%%REGBP, %5)
1499 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1500 "pop %%"REG_BP" \n\t"
1501 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1502
1503 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1504 "a" (&c->redDither)
1505 );
1506 return;
1507 }
2da0d70d 1508 }
14014d47 1509 else
2da0d70d 1510 {
14014d47
MN
1511 switch(dstFormat)
1512 {
1513 case PIX_FMT_RGB32:
6858492e
CS
1514 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1515 __asm__ volatile(
1516 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1517 "mov %4, %%"REG_b" \n\t"
1518 "push %%"REG_BP" \n\t"
1519 YSCALEYUV2RGB1b(%%REGBP, %5)
1520 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1521 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1522 "pop %%"REG_BP" \n\t"
1523 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1524
1525 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1526 "a" (&c->redDither)
1527 );
1528 }else{
7ad6469e 1529 __asm__ volatile(
14014d47
MN
1530 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1531 "mov %4, %%"REG_b" \n\t"
1532 "push %%"REG_BP" \n\t"
1533 YSCALEYUV2RGB1b(%%REGBP, %5)
f8a138be 1534 "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b 1535 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47
MN
1536 "pop %%"REG_BP" \n\t"
1537 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1538
1539 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540 "a" (&c->redDither)
1541 );
6858492e 1542 }
14014d47
MN
1543 return;
1544 case PIX_FMT_BGR24:
7ad6469e 1545 __asm__ volatile(
14014d47
MN
1546 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1547 "mov %4, %%"REG_b" \n\t"
1548 "push %%"REG_BP" \n\t"
1549 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1550 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1551 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1552 "pop %%"REG_BP" \n\t"
1553 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1554
1555 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1556 "a" (&c->redDither)
1557 );
1558 return;
1559 case PIX_FMT_RGB555:
7ad6469e 1560 __asm__ volatile(
14014d47
MN
1561 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1562 "mov %4, %%"REG_b" \n\t"
1563 "push %%"REG_BP" \n\t"
1564 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1565 "pxor %%mm7, %%mm7 \n\t"
14014d47 1566 /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1567#ifdef DITHER1XBPP
88e2a9ae
CEH
1568 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1569 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1570 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1571#endif
14014d47
MN
1572 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1573 "pop %%"REG_BP" \n\t"
1574 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1575
14014d47
MN
1576 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1577 "a" (&c->redDither)
1578 );
1579 return;
1580 case PIX_FMT_RGB565:
7ad6469e 1581 __asm__ volatile(
14014d47
MN
1582 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1583 "mov %4, %%"REG_b" \n\t"
1584 "push %%"REG_BP" \n\t"
1585 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1586 "pxor %%mm7, %%mm7 \n\t"
14014d47 1587 /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1588#ifdef DITHER1XBPP
88e2a9ae
CEH
1589 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1590 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1591 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1592#endif
1593
14014d47
MN
1594 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1595 "pop %%"REG_BP" \n\t"
1596 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1597
14014d47
MN
1598 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1599 "a" (&c->redDither)
1600 );
1601 return;
1602 case PIX_FMT_YUYV422:
7ad6469e 1603 __asm__ volatile(
14014d47
MN
1604 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1605 "mov %4, %%"REG_b" \n\t"
1606 "push %%"REG_BP" \n\t"
1607 YSCALEYUV2PACKED1b(%%REGBP, %5)
1608 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1609 "pop %%"REG_BP" \n\t"
1610 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1611
1612 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1613 "a" (&c->redDither)
1614 );
1615 return;
1616 }
2da0d70d
DB
1617 }
1618 }
bc279024 1619#endif /* HAVE_MMX */
e5091488 1620 if (uvalpha < 2048)
2da0d70d 1621 {
6858492e 1622 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1623 }else{
6858492e 1624 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1625 }
d604bab9
MN
1626}
1627
8a322796 1628//FIXME yuy2* can read up to 7 samples too many
6ff0ad6b 1629
896a22b8 1630static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1e621b18 1631{
b63f641e 1632#if HAVE_MMX
7ad6469e 1633 __asm__ volatile(
2da0d70d
DB
1634 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1635 "mov %0, %%"REG_a" \n\t"
1636 "1: \n\t"
1637 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1638 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1639 "pand %%mm2, %%mm0 \n\t"
1640 "pand %%mm2, %%mm1 \n\t"
1641 "packuswb %%mm1, %%mm0 \n\t"
1642 "movq %%mm0, (%2, %%"REG_a") \n\t"
1643 "add $8, %%"REG_a" \n\t"
1644 " js 1b \n\t"
d0ce212a 1645 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1646 : "%"REG_a
1647 );
1e621b18 1648#else
2da0d70d
DB
1649 int i;
1650 for (i=0; i<width; i++)
1651 dst[i]= src[2*i];
1e621b18
MN
1652#endif
1653}
1654
896a22b8 1655static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1656{
b63f641e 1657#if HAVE_MMX
7ad6469e 1658 __asm__ volatile(
2da0d70d
DB
1659 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1660 "mov %0, %%"REG_a" \n\t"
1661 "1: \n\t"
1662 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1663 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1664 "psrlw $8, %%mm0 \n\t"
1665 "psrlw $8, %%mm1 \n\t"
1666 "packuswb %%mm1, %%mm0 \n\t"
1667 "movq %%mm0, %%mm1 \n\t"
1668 "psrlw $8, %%mm0 \n\t"
1669 "pand %%mm4, %%mm1 \n\t"
1670 "packuswb %%mm0, %%mm0 \n\t"
1671 "packuswb %%mm1, %%mm1 \n\t"
1672 "movd %%mm0, (%3, %%"REG_a") \n\t"
1673 "movd %%mm1, (%2, %%"REG_a") \n\t"
1674 "add $4, %%"REG_a" \n\t"
1675 " js 1b \n\t"
d0ce212a 1676 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1677 : "%"REG_a
1678 );
1e621b18 1679#else
2da0d70d
DB
1680 int i;
1681 for (i=0; i<width; i++)
1682 {
1683 dstU[i]= src1[4*i + 1];
1684 dstV[i]= src1[4*i + 3];
1685 }
1686#endif
1687 assert(src1 == src2);
1e621b18
MN
1688}
1689
4cf16bbe
DB
1690/* This is almost identical to the previous one, and exists only because
1691 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
896a22b8 1692static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
7322a67c 1693{
b63f641e 1694#if HAVE_MMX
7ad6469e 1695 __asm__ volatile(
2da0d70d
DB
1696 "mov %0, %%"REG_a" \n\t"
1697 "1: \n\t"
1698 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1699 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1700 "psrlw $8, %%mm0 \n\t"
1701 "psrlw $8, %%mm1 \n\t"
1702 "packuswb %%mm1, %%mm0 \n\t"
1703 "movq %%mm0, (%2, %%"REG_a") \n\t"
1704 "add $8, %%"REG_a" \n\t"
1705 " js 1b \n\t"
d0ce212a 1706 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1707 : "%"REG_a
1708 );
7322a67c 1709#else
2da0d70d
DB
1710 int i;
1711 for (i=0; i<width; i++)
1712 dst[i]= src[2*i+1];
7322a67c
MN
1713#endif
1714}
1715
896a22b8 1716static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
7322a67c 1717{
b63f641e 1718#if HAVE_MMX
7ad6469e 1719 __asm__ volatile(
2da0d70d
DB
1720 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1721 "mov %0, %%"REG_a" \n\t"
1722 "1: \n\t"
1723 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1724 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1725 "pand %%mm4, %%mm0 \n\t"
1726 "pand %%mm4, %%mm1 \n\t"
1727 "packuswb %%mm1, %%mm0 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "psrlw $8, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm0, %%mm0 \n\t"
1732 "packuswb %%mm1, %%mm1 \n\t"
1733 "movd %%mm0, (%3, %%"REG_a") \n\t"
1734 "movd %%mm1, (%2, %%"REG_a") \n\t"
1735 "add $4, %%"REG_a" \n\t"
1736 " js 1b \n\t"
d0ce212a 1737 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1738 : "%"REG_a
1739 );
7322a67c 1740#else
2da0d70d
DB
1741 int i;
1742 for (i=0; i<width; i++)
1743 {
1744 dstU[i]= src1[4*i + 0];
1745 dstV[i]= src1[4*i + 2];
1746 }
1747#endif
1748 assert(src1 == src2);
7322a67c
MN
1749}
1750
214892ee 1751#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
896a22b8 1752static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1753{\
1754 int i;\
1755 for (i=0; i<width; i++)\
1756 {\
1757 int b= (((type*)src)[i]>>shb)&maskb;\
1758 int g= (((type*)src)[i]>>shg)&maskg;\
1759 int r= (((type*)src)[i]>>shr)&maskr;\
1760\
1761 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1762 }\
1e621b18
MN
1763}
1764
214892ee
MN
1765BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1766BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1767BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1768BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1769BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1770BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
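/* A hedged reading of the BGR2Y macro above (not part of the original comments):
 * RY/GY/BY are fixed-point luma coefficients, pre-shifted per format so they line
 * up with each masked channel, and 33<<((S)-1) equals 16.5*(1<<S), i.e. the
 * limited-range luma offset of 16 plus 0.5 for rounding before the final >>(S). */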
1771
6858492e
CS
1772static inline void RENAME(abgrToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
1773 int i;
1774 for (i=0; i<width; i++){
1775 dst[i]= src[4*i];
1776 }
1777}
1778
f8a138be 1779#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
896a22b8 1780static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1781{\
1782 int i;\
1783 for (i=0; i<width; i++)\
1784 {\
ba83d862
MN
1785 int b= (((type*)src)[i]&maskb)>>shb;\
1786 int g= (((type*)src)[i]&maskg)>>shg;\
1787 int r= (((type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1788\
1789 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1790 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1791 }\
ba83d862 1792}\
896a22b8 1793static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1794{\
1795 int i;\
1796 for (i=0; i<width; i++)\
1797 {\
1798 int pix0= ((type*)src)[2*i+0];\
1799 int pix1= ((type*)src)[2*i+1];\
bcff32d1 1800 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
ba83d862
MN
1801 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1802 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
f8a138be 1803 g&= maskg|(2*maskg);\
ba83d862
MN
1804\
1805 g>>=shg;\
1806\
6b79dbce
MN
1807 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1808 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1809 }\
2f60f629
MN
1810}
1811
f8a138be
CS
1812BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1813BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1814BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1815BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1816BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1817BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
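/* Similarly for BGR2UV (hedged reading, not original commentary): 257<<((S)-1) is
 * 128.5*(1<<S), the chroma offset of 128 plus rounding. The _half variants sum two
 * horizontally adjacent pixels per channel while still packed (the widened masks
 * leave room for the carry), then the extra +1 in the final shift halves the sum,
 * so chroma is effectively taken from a horizontally averaged pixel pair. */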
a0baa07a 1818
b63f641e 1819#if HAVE_MMX
a35acd7f 1820static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1821{
1822
1823 if(srcFormat == PIX_FMT_BGR24){
7ad6469e 1824 __asm__ volatile(
ff9a056d
MN
1825 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1826 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1827 :
dfb09bd1
MN
1828 );
1829 }else{
7ad6469e 1830 __asm__ volatile(
ff9a056d
MN
1831 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1832 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1833 :
dfb09bd1
MN
1834 );
1835 }
1836
7ad6469e 1837 __asm__ volatile(
dfb09bd1
MN
1838 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1839 "mov %2, %%"REG_a" \n\t"
1840 "pxor %%mm7, %%mm7 \n\t"
1841 "1: \n\t"
1842 PREFETCH" 64(%0) \n\t"
1843 "movd (%0), %%mm0 \n\t"
1844 "movd 2(%0), %%mm1 \n\t"
1845 "movd 6(%0), %%mm2 \n\t"
1846 "movd 8(%0), %%mm3 \n\t"
1847 "add $12, %0 \n\t"
1848 "punpcklbw %%mm7, %%mm0 \n\t"
1849 "punpcklbw %%mm7, %%mm1 \n\t"
1850 "punpcklbw %%mm7, %%mm2 \n\t"
1851 "punpcklbw %%mm7, %%mm3 \n\t"
1852 "pmaddwd %%mm5, %%mm0 \n\t"
1853 "pmaddwd %%mm6, %%mm1 \n\t"
1854 "pmaddwd %%mm5, %%mm2 \n\t"
1855 "pmaddwd %%mm6, %%mm3 \n\t"
1856 "paddd %%mm1, %%mm0 \n\t"
1857 "paddd %%mm3, %%mm2 \n\t"
1858 "paddd %%mm4, %%mm0 \n\t"
1859 "paddd %%mm4, %%mm2 \n\t"
1860 "psrad $15, %%mm0 \n\t"
1861 "psrad $15, %%mm2 \n\t"
1862 "packssdw %%mm2, %%mm0 \n\t"
1863 "packuswb %%mm0, %%mm0 \n\t"
1864 "movd %%mm0, (%1, %%"REG_a") \n\t"
1865 "add $4, %%"REG_a" \n\t"
1866 " js 1b \n\t"
1867 : "+r" (src)
d0ce212a 1868 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1869 : "%"REG_a
2da0d70d 1870 );
dfb09bd1
MN
1871}
1872
a35acd7f 1873static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
dfb09bd1 1874{
7ad6469e 1875 __asm__ volatile(
dfb09bd1
MN
1876 "movq 24+%4, %%mm6 \n\t"
1877 "mov %3, %%"REG_a" \n\t"
1878 "pxor %%mm7, %%mm7 \n\t"
1879 "1: \n\t"
1880 PREFETCH" 64(%0) \n\t"
1881 "movd (%0), %%mm0 \n\t"
1882 "movd 2(%0), %%mm1 \n\t"
1883 "punpcklbw %%mm7, %%mm0 \n\t"
1884 "punpcklbw %%mm7, %%mm1 \n\t"
1885 "movq %%mm0, %%mm2 \n\t"
1886 "movq %%mm1, %%mm3 \n\t"
1887 "pmaddwd %4, %%mm0 \n\t"
1888 "pmaddwd 8+%4, %%mm1 \n\t"
1889 "pmaddwd 16+%4, %%mm2 \n\t"
1890 "pmaddwd %%mm6, %%mm3 \n\t"
1891 "paddd %%mm1, %%mm0 \n\t"
1892 "paddd %%mm3, %%mm2 \n\t"
1893
1894 "movd 6(%0), %%mm1 \n\t"
1895 "movd 8(%0), %%mm3 \n\t"
1896 "add $12, %0 \n\t"
1897 "punpcklbw %%mm7, %%mm1 \n\t"
1898 "punpcklbw %%mm7, %%mm3 \n\t"
1899 "movq %%mm1, %%mm4 \n\t"
1900 "movq %%mm3, %%mm5 \n\t"
1901 "pmaddwd %4, %%mm1 \n\t"
1902 "pmaddwd 8+%4, %%mm3 \n\t"
1903 "pmaddwd 16+%4, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm5 \n\t"
1905 "paddd %%mm3, %%mm1 \n\t"
1906 "paddd %%mm5, %%mm4 \n\t"
1907
1908 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1909 "paddd %%mm3, %%mm0 \n\t"
1910 "paddd %%mm3, %%mm2 \n\t"
1911 "paddd %%mm3, %%mm1 \n\t"
1912 "paddd %%mm3, %%mm4 \n\t"
1913 "psrad $15, %%mm0 \n\t"
1914 "psrad $15, %%mm2 \n\t"
1915 "psrad $15, %%mm1 \n\t"
1916 "psrad $15, %%mm4 \n\t"
1917 "packssdw %%mm1, %%mm0 \n\t"
1918 "packssdw %%mm4, %%mm2 \n\t"
1919 "packuswb %%mm0, %%mm0 \n\t"
1920 "packuswb %%mm2, %%mm2 \n\t"
1921 "movd %%mm0, (%1, %%"REG_a") \n\t"
1922 "movd %%mm2, (%2, %%"REG_a") \n\t"
1923 "add $4, %%"REG_a" \n\t"
1924 " js 1b \n\t"
1925 : "+r" (src)
d0ce212a 1926 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1927 : "%"REG_a
1928 );
1929}
1930#endif
1931
896a22b8 1932static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1933{
b63f641e 1934#if HAVE_MMX
a35acd7f 1935 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1936#else
2da0d70d
DB
1937 int i;
1938 for (i=0; i<width; i++)
1939 {
1940 int b= src[i*3+0];
1941 int g= src[i*3+1];
1942 int r= src[i*3+2];
1e621b18 1943
e5091488 1944 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1945 }
bc279024 1946#endif /* HAVE_MMX */
1e621b18
MN
1947}
1948
896a22b8 1949static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1950{
b63f641e 1951#if HAVE_MMX
a35acd7f 1952 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1953#else
2da0d70d
DB
1954 int i;
1955 for (i=0; i<width; i++)
1956 {
dfb09bd1
MN
1957 int b= src1[3*i + 0];
1958 int g= src1[3*i + 1];
1959 int r= src1[3*i + 2];
2da0d70d 1960
dfb09bd1
MN
1961 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1962 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1963 }
bc279024 1964#endif /* HAVE_MMX */
2da0d70d 1965 assert(src1 == src2);
1e621b18
MN
1966}
1967
896a22b8 1968static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1969{
1970 int i;
1971 for (i=0; i<width; i++)
1972 {
1973 int b= src1[6*i + 0] + src1[6*i + 3];
1974 int g= src1[6*i + 1] + src1[6*i + 4];
1975 int r= src1[6*i + 2] + src1[6*i + 5];
1976
1977 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1978 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1979 }
1980 assert(src1 == src2);
1981}
1982
896a22b8 1983static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
a861d4d7 1984{
b63f641e 1985#if HAVE_MMX
a35acd7f 1986 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1987#else
2da0d70d
DB
1988 int i;
1989 for (i=0; i<width; i++)
1990 {
1991 int r= src[i*3+0];
1992 int g= src[i*3+1];
1993 int b= src[i*3+2];
1994
e5091488 1995 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1996 }
dfb09bd1 1997#endif
a861d4d7
MN
1998}
1999
896a22b8 2000static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
a861d4d7 2001{
b63f641e 2002#if HAVE_MMX
5155b839 2003 assert(src1==src2);
a35acd7f 2004 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 2005#else
5155b839
DB
2006 int i;
2007 assert(src1==src2);
2da0d70d
DB
2008 for (i=0; i<width; i++)
2009 {
dfb09bd1
MN
2010 int r= src1[3*i + 0];
2011 int g= src1[3*i + 1];
2012 int b= src1[3*i + 2];
2da0d70d 2013
dfb09bd1
MN
2014 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2015 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2016 }
dfb09bd1 2017#endif
a861d4d7
MN
2018}
2019
896a22b8 2020static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2021{
2022 int i;
2023 assert(src1==src2);
2024 for (i=0; i<width; i++)
2025 {
e09d7eef
MN
2026 int r= src1[6*i + 0] + src1[6*i + 3];
2027 int g= src1[6*i + 1] + src1[6*i + 4];
2028 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2029
2030 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2031 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2032 }
2033}
2034
1e621b18 2035
97b93389 2036static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
e28630fc 2037{
2da0d70d
DB
2038 int i;
2039 for (i=0; i<width; i++)
2040 {
2041 int d= src[i];
e28630fc 2042
2da0d70d
DB
2043 dst[i]= pal[d] & 0xFF;
2044 }
e28630fc
MN
2045}
2046
97b93389 2047static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
e28630fc 2048{
2da0d70d
DB
2049 int i;
2050 assert(src1 == src2);
2051 for (i=0; i<width; i++)
2052 {
2053 int p= pal[src1[i]];
2054
2055 dstU[i]= p>>8;
2056 dstV[i]= p>>16;
2057 }
e28630fc
MN
2058}
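/* Presumably (judging from swScale below, where pal points at c->pal_yuv) the
 * palette entries here are already converted to packed YUV: Y in bits 0-7, U in
 * bits 8-15, V in bits 16-23, which is why palToY masks with 0xFF and palToUV
 * shifts by 8 and 16. */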
2059
896a22b8 2060static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3d05e078
MN
2061{
2062 int i, j;
2063 for (i=0; i<width/8; i++){
3a5ba0c3
LB
2064 int d= ~src[i];
2065 for(j=0; j<8; j++)
2066 dst[8*i+j]= ((d>>(7-j))&1)*255;
2067 }
2068}
2069
896a22b8 2070static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3a5ba0c3
LB
2071{
2072 int i, j;
2073 for (i=0; i<width/8; i++){
2074 int d= src[i];
78454dfc
MN
2075 for(j=0; j<8; j++)
2076 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
2077 }
2078}
2079
8a322796 2080// bilinear / bicubic scaling
077ea8a7 2081static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 2082 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 2083{
b63f641e 2084#if HAVE_MMX
2da0d70d
DB
2085 assert(filterSize % 4 == 0 && filterSize>0);
2086 if (filterSize==4) // Always true for upscaling, sometimes for downscaling, too.
2087 {
d0ce212a 2088 x86_reg counter= -2*dstW;
2da0d70d
DB
2089 filter-= counter*2;
2090 filterPos-= counter/2;
2091 dst-= counter/2;
7ad6469e 2092 __asm__ volatile(
83c89c78 2093#if defined(PIC)
2da0d70d
DB
2094 "push %%"REG_b" \n\t"
2095#endif
2096 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2097 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2098 "mov %%"REG_a", %%"REG_BP" \n\t"
2099 ASMALIGN(4)
2100 "1: \n\t"
2101 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2102 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2103 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2104 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2105 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2106 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2107 "punpcklbw %%mm7, %%mm0 \n\t"
2108 "punpcklbw %%mm7, %%mm2 \n\t"
2109 "pmaddwd %%mm1, %%mm0 \n\t"
2110 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
2111 "movq %%mm0, %%mm4 \n\t"
2112 "punpckldq %%mm3, %%mm0 \n\t"
2113 "punpckhdq %%mm3, %%mm4 \n\t"
2114 "paddd %%mm4, %%mm0 \n\t"
2115 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2116 "packssdw %%mm0, %%mm0 \n\t"
2117 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2118 "add $4, %%"REG_BP" \n\t"
2119 " jnc 1b \n\t"
2120
2121 "pop %%"REG_BP" \n\t"
83c89c78 2122#if defined(PIC)
2da0d70d 2123 "pop %%"REG_b" \n\t"
83c89c78 2124#endif
2da0d70d
DB
2125 : "+a" (counter)
2126 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2127#if !defined(PIC)
2da0d70d
DB
2128 : "%"REG_b
2129#endif
2130 );
2131 }
2132 else if (filterSize==8)
2133 {
d0ce212a 2134 x86_reg counter= -2*dstW;
2da0d70d
DB
2135 filter-= counter*4;
2136 filterPos-= counter/2;
2137 dst-= counter/2;
7ad6469e 2138 __asm__ volatile(
83c89c78 2139#if defined(PIC)
2da0d70d
DB
2140 "push %%"REG_b" \n\t"
2141#endif
2142 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2143 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2144 "mov %%"REG_a", %%"REG_BP" \n\t"
2145 ASMALIGN(4)
2146 "1: \n\t"
2147 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2148 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2149 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2150 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2151 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2152 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2153 "punpcklbw %%mm7, %%mm0 \n\t"
2154 "punpcklbw %%mm7, %%mm2 \n\t"
2155 "pmaddwd %%mm1, %%mm0 \n\t"
2156 "pmaddwd %%mm2, %%mm3 \n\t"
2157
2158 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2159 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2160 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2161 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2162 "punpcklbw %%mm7, %%mm4 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
2164 "pmaddwd %%mm1, %%mm4 \n\t"
2165 "pmaddwd %%mm2, %%mm5 \n\t"
2166 "paddd %%mm4, %%mm0 \n\t"
2167 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2168 "movq %%mm0, %%mm4 \n\t"
2169 "punpckldq %%mm3, %%mm0 \n\t"
2170 "punpckhdq %%mm3, %%mm4 \n\t"
2171 "paddd %%mm4, %%mm0 \n\t"
2172 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2173 "packssdw %%mm0, %%mm0 \n\t"
2174 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2175 "add $4, %%"REG_BP" \n\t"
2176 " jnc 1b \n\t"
2177
2178 "pop %%"REG_BP" \n\t"
83c89c78 2179#if defined(PIC)
2da0d70d 2180 "pop %%"REG_b" \n\t"
83c89c78 2181#endif
2da0d70d
DB
2182 : "+a" (counter)
2183 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2184#if !defined(PIC)
2da0d70d
DB
2185 : "%"REG_b
2186#endif
2187 );
2188 }
2189 else
2190 {
2191 uint8_t *offset = src+filterSize;
d0ce212a 2192 x86_reg counter= -2*dstW;
2da0d70d
DB
2193 //filter-= counter*filterSize/2;
2194 filterPos-= counter/2;
2195 dst-= counter/2;
7ad6469e 2196 __asm__ volatile(
2da0d70d 2197 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2198 ASMALIGN(4)
2199 "1: \n\t"
2200 "mov %2, %%"REG_c" \n\t"
2201 "movzwl (%%"REG_c", %0), %%eax \n\t"
2202 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2203 "mov %5, %%"REG_c" \n\t"
2204 "pxor %%mm4, %%mm4 \n\t"
2205 "pxor %%mm5, %%mm5 \n\t"
2206 "2: \n\t"
2207 "movq (%1), %%mm1 \n\t"
2208 "movq (%1, %6), %%mm3 \n\t"
2209 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2210 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2211 "punpcklbw %%mm7, %%mm0 \n\t"
2212 "punpcklbw %%mm7, %%mm2 \n\t"
2213 "pmaddwd %%mm1, %%mm0 \n\t"
2214 "pmaddwd %%mm2, %%mm3 \n\t"
2215 "paddd %%mm3, %%mm5 \n\t"
2216 "paddd %%mm0, %%mm4 \n\t"
2217 "add $8, %1 \n\t"
2218 "add $4, %%"REG_c" \n\t"
2219 "cmp %4, %%"REG_c" \n\t"
2220 " jb 2b \n\t"
2221 "add %6, %1 \n\t"
ef423a66
MN
2222 "movq %%mm4, %%mm0 \n\t"
2223 "punpckldq %%mm5, %%mm4 \n\t"
2224 "punpckhdq %%mm5, %%mm0 \n\t"
2225 "paddd %%mm0, %%mm4 \n\t"
2226 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2227 "packssdw %%mm4, %%mm4 \n\t"
2228 "mov %3, %%"REG_a" \n\t"
2229 "movd %%mm4, (%%"REG_a", %0) \n\t"
2230 "add $4, %0 \n\t"
2231 " jnc 1b \n\t"
2232
2233 : "+r" (counter), "+r" (filter)
2234 : "m" (filterPos), "m" (dst), "m"(offset),
d0ce212a 2235 "m" (src), "r" ((x86_reg)filterSize*2)
2da0d70d
DB
2236 : "%"REG_a, "%"REG_c, "%"REG_d
2237 );
2238 }
077ea8a7 2239#else
b63f641e 2240#if HAVE_ALTIVEC
2da0d70d 2241 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2242#else
2da0d70d
DB
2243 int i;
2244 for (i=0; i<dstW; i++)
2245 {
2246 int j;
2247 int srcPos= filterPos[i];
2248 int val=0;
2249 //printf("filterPos: %d\n", filterPos[i]);
2250 for (j=0; j<filterSize; j++)
2251 {
2252 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2253 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2254 }
2255 //filter += hFilterSize;
881c4294 2256 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2257 //dst[i] = val>>7;
2258 }
bc279024
DB
2259#endif /* HAVE_ALTIVEC */
2260#endif /* HAVE_MMX */
077ea8a7 2261}
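/* A hedged note on the scalar hScale fallback above: for output pixel i,
 * filterPos[i] gives the first source sample and filter[filterSize*i + j] the
 * per-tap coefficients; the coefficients appear to be scaled so that they sum to
 * roughly 1<<14, hence the >>7 turns 8-bit input into the 15-bit intermediate
 * format, clipped to (1<<15)-1 because sharp (cubic) filters can overshoot. */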
2ff198c1 2262 // *** horizontal scale Y line to temp buffer
6bc0c792 2263static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2264 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2265 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2266 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
6858492e 2267 int32_t *mmx2FilterPos, uint32_t *pal, int isAlpha)
077ea8a7 2268{
2da0d70d 2269 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2270 {
896a22b8 2271 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2272 src= formatConvBuffer;
1e621b18 2273 }
2da0d70d 2274 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2275 {
896a22b8 2276 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2277 src= formatConvBuffer;
7322a67c 2278 }
2da0d70d 2279 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2280 {
6858492e
CS
2281 if (isAlpha)
2282 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2283 else
896a22b8 2284 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2285 src= formatConvBuffer;
1e621b18 2286 }
9990e426
MN
2287 else if (srcFormat==PIX_FMT_RGB32_1)
2288 {
6858492e
CS
2289 if (isAlpha)
2290 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2291 else
896a22b8 2292 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2293 src= formatConvBuffer;
2294 }
2da0d70d 2295 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2296 {
896a22b8 2297 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2298 src= formatConvBuffer;
1e621b18 2299 }
2da0d70d 2300 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2301 {
896a22b8 2302 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2303 src= formatConvBuffer;
6af250ea 2304 }
2da0d70d 2305 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2306 {
896a22b8 2307 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2308 src= formatConvBuffer;
b72034dd 2309 }
2da0d70d 2310 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2311 {
6858492e
CS
2312 if (isAlpha)
2313 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2314 else
896a22b8 2315 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2316 src= formatConvBuffer;
a861d4d7 2317 }
9990e426
MN
2318 else if (srcFormat==PIX_FMT_BGR32_1)
2319 {
6858492e
CS
2320 if (isAlpha)
2321 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2322 else
896a22b8 2323 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2324 src= formatConvBuffer;
2325 }
2da0d70d 2326 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2327 {
896a22b8 2328 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2329 src= formatConvBuffer;
a861d4d7 2330 }
2da0d70d 2331 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2332 {
896a22b8 2333 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2334 src= formatConvBuffer;
a43fb6b3 2335 }
2da0d70d 2336 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2337 {
896a22b8 2338 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2339 src= formatConvBuffer;
a43fb6b3 2340 }
2da0d70d 2341 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2342 {
e48a79c9 2343 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2344 src= formatConvBuffer;
e28630fc 2345 }
3a5ba0c3
LB
2346 else if (srcFormat==PIX_FMT_MONOBLACK)
2347 {
896a22b8 2348 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
3a5ba0c3
LB
2349 src= formatConvBuffer;
2350 }
2351 else if (srcFormat==PIX_FMT_MONOWHITE)
3d05e078 2352 {
896a22b8 2353 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2354 src= formatConvBuffer;
2355 }
1e621b18 2356
b63f641e 2357#if HAVE_MMX
8a322796 2358 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2359 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2360#else
2da0d70d 2361 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2362#endif
077ea8a7 2363 {
2da0d70d 2364 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2365 }
8a322796 2366 else // fast bilinear upscale / crap downscale
077ea8a7 2367 {
57f9a560 2368#if ARCH_X86 && CONFIG_GPL
b63f641e 2369#if HAVE_MMX2
2da0d70d 2370 int i;
83c89c78 2371#if defined(PIC)
2da0d70d 2372 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2373#endif
2da0d70d
DB
2374 if (canMMX2BeUsed)
2375 {
7ad6469e 2376 __asm__ volatile(
83c89c78 2377#if defined(PIC)
2da0d70d
DB
2378 "mov %%"REG_b", %5 \n\t"
2379#endif
2380 "pxor %%mm7, %%mm7 \n\t"
2381 "mov %0, %%"REG_c" \n\t"
2382 "mov %1, %%"REG_D" \n\t"
2383 "mov %2, %%"REG_d" \n\t"
2384 "mov %3, %%"REG_b" \n\t"
2385 "xor %%"REG_a", %%"REG_a" \n\t" // i
2386 PREFETCH" (%%"REG_c") \n\t"
2387 PREFETCH" 32(%%"REG_c") \n\t"
2388 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2389
b63f641e 2390#if ARCH_X86_64
6d606c4f
AJ
2391
2392#define FUNNY_Y_CODE \
2da0d70d
DB
2393 "movl (%%"REG_b"), %%esi \n\t"\
2394 "call *%4 \n\t"\
2395 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2396 "add %%"REG_S", %%"REG_c" \n\t"\
2397 "add %%"REG_a", %%"REG_D" \n\t"\
2398 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2399
2400#else
2401
2ff198c1 2402#define FUNNY_Y_CODE \
2da0d70d
DB
2403 "movl (%%"REG_b"), %%esi \n\t"\
2404 "call *%4 \n\t"\
2405 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2406 "add %%"REG_a", %%"REG_D" \n\t"\
2407 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2408
bc279024 2409#endif /* ARCH_X86_64 */
6d606c4f 2410
2ff198c1
MN
2411FUNNY_Y_CODE
2412FUNNY_Y_CODE
2413FUNNY_Y_CODE
2414FUNNY_Y_CODE
2415FUNNY_Y_CODE
2416FUNNY_Y_CODE
2417FUNNY_Y_CODE
2418FUNNY_Y_CODE
2419
83c89c78 2420#if defined(PIC)
2da0d70d 2421 "mov %5, %%"REG_b" \n\t"
83c89c78 2422#endif
2da0d70d
DB
2423 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2424 "m" (funnyYCode)
83c89c78 2425#if defined(PIC)
2da0d70d 2426 ,"m" (ebxsave)
83c89c78 2427#endif
2da0d70d 2428 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2429#if !defined(PIC)
2da0d70d
DB
2430 ,"%"REG_b
2431#endif
2432 );
2433 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2434 }
2435 else
2436 {
bc279024 2437#endif /* HAVE_MMX2 */
d0ce212a 2438 x86_reg xInc_shr16 = xInc >> 16;
2da0d70d
DB
2439 uint16_t xInc_mask = xInc & 0xffff;
2440 //NO MMX just normal asm ...
7ad6469e 2441 __asm__ volatile(
2da0d70d
DB
2442 "xor %%"REG_a", %%"REG_a" \n\t" // i
2443 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2444 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2445 ASMALIGN(4)
2446 "1: \n\t"
2447 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2448 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2449 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2450 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2451 "shll $16, %%edi \n\t"
2452 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2453 "mov %1, %%"REG_D" \n\t"
2454 "shrl $9, %%esi \n\t"
2455 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2456 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2457 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2458
2459 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2460 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2461 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2462 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2463 "shll $16, %%edi \n\t"
2464 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2465 "mov %1, %%"REG_D" \n\t"
2466 "shrl $9, %%esi \n\t"
2467 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2468 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2469 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2470
2471
2472 "add $2, %%"REG_a" \n\t"
2473 "cmp %2, %%"REG_a" \n\t"
2474 " jb 1b \n\t"
2475
2476
2477 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2478 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2479 );
b63f641e 2480#if HAVE_MMX2
2da0d70d 2481 } //if MMX2 can't be used
2ff198c1
MN
2482#endif
2483#else
2da0d70d
DB
2484 int i;
2485 unsigned int xpos=0;
2486 for (i=0;i<dstWidth;i++)
2487 {
2488 register unsigned int xx=xpos>>16;
2489 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2490 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2491 xpos+=xInc;
2492 }
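/* In the scalar loop above, xpos walks the source position in 16.16 fixed point
 * (xInc per output pixel): xx is the integer part and xalpha the top 7 bits of the
 * fraction, so dst[i] is a linear blend of src[xx] and src[xx+1] already scaled by
 * 128 (<<7) into the 15-bit intermediate format. */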
b63f641e 2493#endif /* ARCH_X86 */
077ea8a7 2494 }
6bc0c792 2495
6858492e 2496 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
6bc0c792
MN
2497 int i;
2498 //FIXME all pal and rgb srcFormats could do this conversion as well
2499 //FIXME all scalers more complex than bilinear could do half of this transform
2500 if(c->srcRange){
2501 for (i=0; i<dstWidth; i++)
2502 dst[i]= (dst[i]*14071 + 33561947)>>14;
2503 }else{
2504 for (i=0; i<dstWidth; i++)
aa13b0fc 2505 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2506 }
2507 }
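/* A hedged note on the block above: it converts the luma line between full (JPEG)
 * and limited (MPEG) range while still in the <<7 intermediate format. The
 * multipliers approximate 219/255 (full->limited) and 255/219 (limited->full), with
 * the 16<<7 offset and rounding folded into the additive constants; the FFMIN clamp
 * keeps the expanding direction from overflowing the 15-bit intermediate. */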
2ff198c1
MN
2508}
2509
6bc0c792 2510inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2511 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2512 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2513 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2514 int32_t *mmx2FilterPos, uint32_t *pal)
2ff198c1 2515{
2da0d70d 2516 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2517 {
896a22b8 2518 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2519 src1= formatConvBuffer;
8b2fce0d 2520 src2= formatConvBuffer+VOFW;
1e621b18 2521 }
2da0d70d 2522 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2523 {
896a22b8 2524 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2525 src1= formatConvBuffer;
8b2fce0d 2526 src2= formatConvBuffer+VOFW;
7322a67c 2527 }
2da0d70d 2528 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2529 {
2f60f629 2530 if(c->chrSrcHSubSample)
896a22b8 2531 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2532 else
896a22b8 2533 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2534 src1= formatConvBuffer;
8b2fce0d 2535 src2= formatConvBuffer+VOFW;
1e621b18 2536 }
9990e426
MN
2537 else if (srcFormat==PIX_FMT_RGB32_1)
2538 {
2f60f629 2539 if(c->chrSrcHSubSample)
896a22b8 2540 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2541 else
896a22b8 2542 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2543 src1= formatConvBuffer;
2544 src2= formatConvBuffer+VOFW;
2545 }
2da0d70d 2546 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2547 {
2f60f629 2548 if(c->chrSrcHSubSample)
896a22b8 2549 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2550 else
896a22b8 2551 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2552 src1= formatConvBuffer;
8b2fce0d 2553 src2= formatConvBuffer+VOFW;
1e621b18 2554 }
2da0d70d 2555 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2556 {
2f60f629 2557 if(c->chrSrcHSubSample)
896a22b8 2558 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2559 else
896a22b8 2560 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2561 src1= formatConvBuffer;
8b2fce0d 2562 src2= formatConvBuffer+VOFW;
6af250ea 2563 }
2da0d70d 2564 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2565 {
2f60f629 2566 if(c->chrSrcHSubSample)
896a22b8 2567 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2568 else
896a22b8 2569 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2570 src1= formatConvBuffer;
8b2fce0d 2571 src2= formatConvBuffer+VOFW;
b72034dd 2572 }
2da0d70d 2573 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2574 {
2f60f629 2575 if(c->chrSrcHSubSample)
896a22b8 2576 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2577 else
896a22b8 2578 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2579 src1= formatConvBuffer;
8b2fce0d 2580 src2= formatConvBuffer+VOFW;
a861d4d7 2581 }
9990e426
MN
2582 else if (srcFormat==PIX_FMT_BGR32_1)
2583 {
2f60f629 2584 if(c->chrSrcHSubSample)
896a22b8 2585 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2586 else
896a22b8 2587 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2588 src1= formatConvBuffer;
2589 src2= formatConvBuffer+VOFW;
2590 }
2da0d70d 2591 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2592 {
2f60f629 2593 if(c->chrSrcHSubSample)
896a22b8 2594 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2595 else
896a22b8 2596 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2597 src1= formatConvBuffer;
8b2fce0d 2598 src2= formatConvBuffer+VOFW;
a861d4d7 2599 }
2da0d70d 2600 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2601 {
2f60f629 2602 if(c->chrSrcHSubSample)
896a22b8 2603 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2604 else
896a22b8 2605 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2606 src1= formatConvBuffer;
8b2fce0d 2607 src2= formatConvBuffer+VOFW;
a43fb6b3 2608 }
2da0d70d 2609 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2610 {
2f60f629 2611 if(c->chrSrcHSubSample)
896a22b8 2612 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2613 else
896a22b8 2614 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2615 src1= formatConvBuffer;
8b2fce0d 2616 src2= formatConvBuffer+VOFW;
a43fb6b3 2617 }
4bb9adcf 2618 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2619 {
2da0d70d 2620 return;
6ff0ad6b 2621 }
2da0d70d 2622 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2623 {
e48a79c9 2624 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2625 src1= formatConvBuffer;
8b2fce0d 2626 src2= formatConvBuffer+VOFW;
e28630fc 2627 }
1e621b18 2628
b63f641e 2629#if HAVE_MMX
8a322796 2630 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2631 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2632#else
2da0d70d 2633 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2634#endif
077ea8a7 2635 {
2da0d70d 2636 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2637 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2638 }
8a322796 2639 else // fast bilinear upscale / crap downscale
077ea8a7 2640 {
57f9a560 2641#if ARCH_X86 && CONFIG_GPL
b63f641e 2642#if HAVE_MMX2
2da0d70d 2643 int i;
83c89c78 2644#if defined(PIC)
2da0d70d 2645 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2646#endif
2da0d70d
DB
2647 if (canMMX2BeUsed)
2648 {
7ad6469e 2649 __asm__ volatile(
83c89c78 2650#if defined(PIC)
2da0d70d
DB
2651 "mov %%"REG_b", %6 \n\t"
2652#endif
2653 "pxor %%mm7, %%mm7 \n\t"
2654 "mov %0, %%"REG_c" \n\t"
2655 "mov %1, %%"REG_D" \n\t"
2656 "mov %2, %%"REG_d" \n\t"
2657 "mov %3, %%"REG_b" \n\t"
2658 "xor %%"REG_a", %%"REG_a" \n\t" // i
2659 PREFETCH" (%%"REG_c") \n\t"
2660 PREFETCH" 32(%%"REG_c") \n\t"
2661 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2662
b63f641e 2663#if ARCH_X86_64
6d606c4f
AJ
2664
2665#define FUNNY_UV_CODE \
2da0d70d
DB
2666 "movl (%%"REG_b"), %%esi \n\t"\
2667 "call *%4 \n\t"\
2668 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2669 "add %%"REG_S", %%"REG_c" \n\t"\
2670 "add %%"REG_a", %%"REG_D" \n\t"\
2671 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2672
2673#else
2674
b7dc6f66 2675#define FUNNY_UV_CODE \
2da0d70d
DB
2676 "movl (%%"REG_b"), %%esi \n\t"\
2677 "call *%4 \n\t"\
2678 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2679 "add %%"REG_a", %%"REG_D" \n\t"\
2680 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2681
bc279024 2682#endif /* ARCH_X86_64 */
6d606c4f 2683
b7dc6f66
MN
2684FUNNY_UV_CODE
2685FUNNY_UV_CODE
2686FUNNY_UV_CODE
2687FUNNY_UV_CODE
2da0d70d
DB
2688 "xor %%"REG_a", %%"REG_a" \n\t" // i
2689 "mov %5, %%"REG_c" \n\t" // src
2690 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2691 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2692 PREFETCH" (%%"REG_c") \n\t"
2693 PREFETCH" 32(%%"REG_c") \n\t"
2694 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2695
2696FUNNY_UV_CODE
2697FUNNY_UV_CODE
2698FUNNY_UV_CODE
2699FUNNY_UV_CODE
2700
83c89c78 2701#if defined(PIC)
2da0d70d 2702 "mov %6, %%"REG_b" \n\t"
83c89c78 2703#endif
2da0d70d
DB
2704 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2705 "m" (funnyUVCode), "m" (src2)
83c89c78 2706#if defined(PIC)
2da0d70d 2707 ,"m" (ebxsave)
83c89c78 2708#endif
2da0d70d 2709 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2710#if !defined(PIC)
2da0d70d
DB
2711 ,"%"REG_b
2712#endif
2713 );
2714 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2715 {
2716 //printf("%d %d %d\n", dstWidth, i, srcW);
2717 dst[i] = src1[srcW-1]*128;
8b2fce0d 2718 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2719 }
2720 }
2721 else
2722 {
bc279024 2723#endif /* HAVE_MMX2 */
d0ce212a 2724 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2725 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2726 __asm__ volatile(
2da0d70d
DB
2727 "xor %%"REG_a", %%"REG_a" \n\t" // i
2728 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2729 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2730 ASMALIGN(4)
2731 "1: \n\t"
2732 "mov %0, %%"REG_S" \n\t"
2733 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2734 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2735 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2736 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2737 "shll $16, %%edi \n\t"
2738 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2739 "mov %1, %%"REG_D" \n\t"
2740 "shrl $9, %%esi \n\t"
2741 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2742
2743 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2744 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2745 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2746 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2747 "shll $16, %%edi \n\t"
2748 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2749 "mov %1, %%"REG_D" \n\t"
2750 "shrl $9, %%esi \n\t"
8b2fce0d 2751 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2752
2753 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2754 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2755 "add $1, %%"REG_a" \n\t"
2756 "cmp %2, %%"REG_a" \n\t"
2757 " jb 1b \n\t"
2ff198c1 2758
8a322796
DB
2759/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2760 which is needed to support GCC 4.0. */
b63f641e 2761#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
e29c3f93 2762 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2763#else
e29c3f93 2764 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2765#endif
2da0d70d
DB
2766 "r" (src2)
2767 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2768 );
b63f641e 2769#if HAVE_MMX2
2da0d70d 2770 } //if MMX2 can't be used
2ff198c1
MN
2771#endif
2772#else
2da0d70d
DB
2773 int i;
2774 unsigned int xpos=0;
2775 for (i=0;i<dstWidth;i++)
2776 {
2777 register unsigned int xx=xpos>>16;
2778 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2779 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2780 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2781 /* slower
2782 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2783 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2784 */
2785 xpos+=xInc;
2786 }
b63f641e 2787#endif /* ARCH_X86 */
2da0d70d 2788 }
6bc0c792
MN
2789 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2790 int i;
2791 //FIXME all pal and rgb srcFormats could do this conversion as well
2792 //FIXME all scalers more complex than bilinear could do half of this transform
2793 if(c->srcRange){
2794 for (i=0; i<dstWidth; i++){
2795 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2796 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2797 }
2798 }else{
2799 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2800 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2801 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2802 }
2803 }
2804 }
077ea8a7
MN
2805}
2806
3e499f53 2807static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2808 int srcSliceH, uint8_t* dst[], int dstStride[]){
2809
2810 /* load a few things into local vars to make the code more readable and faster */
2811 const int srcW= c->srcW;
2812 const int dstW= c->dstW;
2813 const int dstH= c->dstH;
2814 const int chrDstW= c->chrDstW;
2815 const int chrSrcW= c->chrSrcW;
2816 const int lumXInc= c->lumXInc;
2817 const int chrXInc= c->chrXInc;
2818 const int dstFormat= c->dstFormat;
2819 const int srcFormat= c->srcFormat;
2820 const int flags= c->flags;
2821 const int canMMX2BeUsed= c->canMMX2BeUsed;
2822 int16_t *vLumFilterPos= c->vLumFilterPos;
2823 int16_t *vChrFilterPos= c->vChrFilterPos;
2824 int16_t *hLumFilterPos= c->hLumFilterPos;
2825 int16_t *hChrFilterPos= c->hChrFilterPos;
2826 int16_t *vLumFilter= c->vLumFilter;
2827 int16_t *vChrFilter= c->vChrFilter;
2828 int16_t *hLumFilter= c->hLumFilter;
2829 int16_t *hChrFilter= c->hChrFilter;
2830 int32_t *lumMmxFilter= c->lumMmxFilter;
2831 int32_t *chrMmxFilter= c->chrMmxFilter;
6858492e 2832 int32_t *alpMmxFilter= c->alpMmxFilter;
2da0d70d
DB
2833 const int vLumFilterSize= c->vLumFilterSize;
2834 const int vChrFilterSize= c->vChrFilterSize;
2835 const int hLumFilterSize= c->hLumFilterSize;
2836 const int hChrFilterSize= c->hChrFilterSize;
2837 int16_t **lumPixBuf= c->lumPixBuf;
2838 int16_t **chrPixBuf= c->chrPixBuf;
6858492e 2839 int16_t **alpPixBuf= c->alpPixBuf;
2da0d70d
DB
2840 const int vLumBufSize= c->vLumBufSize;
2841 const int vChrBufSize= c->vChrBufSize;
2842 uint8_t *funnyYCode= c->funnyYCode;
2843 uint8_t *funnyUVCode= c->funnyUVCode;
2844 uint8_t *formatConvBuffer= c->formatConvBuffer;
2845 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2846 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2847 int lastDstY;
e150ef8d 2848 uint32_t *pal=c->pal_yuv;
2da0d70d 2849
8a322796 2850 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2851 int dstY= c->dstY;
2852 int lumBufIndex= c->lumBufIndex;
2853 int chrBufIndex= c->chrBufIndex;
2854 int lastInLumBuf= c->lastInLumBuf;
2855 int lastInChrBuf= c->lastInChrBuf;
2856
2857 if (isPacked(c->srcFormat)){
2da0d70d
DB
2858 src[0]=
2859 src[1]=
6858492e
CS
2860 src[2]=
2861 src[3]= src[0];
2da0d70d
DB
2862 srcStride[0]=
2863 srcStride[1]=
6858492e
CS
2864 srcStride[2]=
2865 srcStride[3]= srcStride[0];
2da0d70d
DB
2866 }
2867 srcStride[1]<<= c->vChrDrop;
2868 srcStride[2]<<= c->vChrDrop;
2869
2870 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2871 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2872
2873#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2874 {
2875 static volatile int i=0;
2876 i++;
2877 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2878 selfTest(src, srcStride, c->srcW, c->srcH);
2879 i--;
2880 }
c7a810cc 2881#endif
37079906 2882
2da0d70d
DB
2883 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2884 //dstStride[0],dstStride[1],dstStride[2]);
2885
6858492e 2886 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
2da0d70d 2887 {
6683a37f
DP
2888 static int warnedAlready=0; //FIXME move this into the context perhaps
2889 if (flags & SWS_PRINT_INFO && !warnedAlready)
2da0d70d 2890 {
4b0c30b7 2891 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2892 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2893 warnedAlready=1;
2da0d70d
DB
2894 }
2895 }
2896
8a322796
DB
2897 /* Note the user might start scaling the picture in the middle so this
2898 will not get executed. This is not really intended but works
2899 currently, so people might do it. */
2da0d70d
DB
2900 if (srcSliceY ==0){
2901 lumBufIndex=0;
2902 chrBufIndex=0;
2903 dstY=0;
2904 lastInLumBuf= -1;
2905 lastInChrBuf= -1;
2906 }
2907
2908 lastDstY= dstY;
2909
2910 for (;dstY < dstH; dstY++){
2911 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2912 const int chrDstY= dstY>>c->chrDstVSubSample;
2913 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2914 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
6858492e 2915 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2da0d70d
DB
2916
2917 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2918 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2919 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2920 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2921
2922 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2923 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2924 //handle holes (FAST_BILINEAR & weird filters)
2925 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2926 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2927 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2928 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2929 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2930
2931 // Do we have enough lines in this slice to output the dstY line
2932 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2933 {
2934 //Do horizontal scaling
2935 while(lastInLumBuf < lastLumSrcY)
2936 {
6858492e
CS
2937 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2938 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d
DB
2939 lumBufIndex++;
2940 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
2941 assert(lumBufIndex < 2*vLumBufSize);
2942 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2943 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 2944 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6858492e 2945 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2da0d70d
DB
2946 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2947 funnyYCode, c->srcFormat, formatConvBuffer,
6858492e
CS
2948 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
2949 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2950 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2951 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2952 funnyYCode, c->srcFormat, formatConvBuffer,
2953 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
2da0d70d
DB
2954 lastInLumBuf++;
2955 }
2956 while(lastInChrBuf < lastChrSrcY)
2957 {
2958 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2959 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2960 chrBufIndex++;
fcc402b1
LB
2961 assert(chrBufIndex < 2*vChrBufSize);
2962 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2963 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2964 //FIXME replace parameters through context struct (some at least)
2965
2966 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2967 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
2968 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2969 funnyUVCode, c->srcFormat, formatConvBuffer,
2970 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2971 lastInChrBuf++;
2972 }
2973 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2974 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2975 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2976 }
2977 else // not enough lines left in this slice -> load the rest in the buffer
2978 {
2979 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2980 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2981 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2982 vChrBufSize, vLumBufSize);*/
2983
2984 //Do horizontal scaling
2985 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2986 {
6858492e
CS
2987 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2988 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d 2989 lumBufIndex++;
fcc402b1
LB
2990 assert(lumBufIndex < 2*vLumBufSize);
2991 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2992 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6858492e 2993 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2da0d70d
DB
2994 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2995 funnyYCode, c->srcFormat, formatConvBuffer,
6858492e
CS
2996 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
2997 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2998 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2999 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3000 funnyYCode, c->srcFormat, formatConvBuffer,
3001 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
2da0d70d
DB
3002 lastInLumBuf++;
3003 }
3004 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3005 {
3006 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3007 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3008 chrBufIndex++;
fcc402b1
LB
3009 assert(chrBufIndex < 2*vChrBufSize);
3010 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3011 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3012
3013 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3014 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
3015 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3016 funnyUVCode, c->srcFormat, formatConvBuffer,
3017 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3018 lastInChrBuf++;
3019 }
3020 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3021 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3022 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3023 break; //we can't output a dstY line so let's try with the next slice
3024 }
d3f41512 3025
b63f641e 3026#if HAVE_MMX
88e2a9ae 3027 c->blueDither= ff_dither8[dstY&1];
92c7b471 3028 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 3029 c->greenDither= ff_dither8[dstY&1];
92c7b471 3030 else
88e2a9ae
CEH
3031 c->greenDither= ff_dither4[dstY&1];
3032 c->redDither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
3033#endif
3034 if (dstY < dstH-2)
3035 {