swscale: {}-related cosmetics.
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
8a322796
DB
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
d026b45e 22 */
783e9cc9 23
/*
 * CPU-specific instruction mnemonics used by the asm macros below.
 * The names are #undef'd first so the definitions that follow always take
 * effect (presumably because this template is included more than once with
 * different HAVE_* settings -- confirm against the including file).
 */
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec
MN
27#undef PREFETCH
28#undef PREFETCHW
48a05cec 29
/*
 * PREFETCH / PREFETCHW: prefetch mnemonic for the target CPU.
 * Without 3DNow! or MMX2 they expand to " # nop", i.e. an assembler
 * comment, so no instruction is emitted.
 */
f4406ec1 30#if HAVE_AMD3DNOW
48a05cec
MN
31#define PREFETCH "prefetch"
32#define PREFETCHW "prefetchw"
b63f641e 33#elif HAVE_MMX2
48a05cec
MN
34#define PREFETCH "prefetchnta"
35#define PREFETCHW "prefetcht0"
36#else
d904b5fc
NP
37#define PREFETCH " # nop"
38#define PREFETCHW " # nop"
48a05cec
MN
39#endif
40
/*
 * PAVGB(a,b): byte-average instruction (pavgb on MMX2, pavgusb on 3DNow!).
 * Note that PAVGB stays undefined when neither feature is available.
 */
b63f641e 41#if HAVE_MMX2
d604bab9 42#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
f4406ec1 43#elif HAVE_AMD3DNOW
d604bab9
MN
44#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45#endif
d3f41512 46
/*
 * MOVNTQ(a,b): 64-bit store; movntq when MMX2 is available, plain movq
 * otherwise. The MOVNTQ -> REAL_MOVNTQ indirection makes the preprocessor
 * expand macro arguments before REAL_MOVNTQ stringifies them with '#'.
 */
b63f641e 47#if HAVE_MMX2
6e1c66bc 48#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 49#else
6e1c66bc 50#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 51#endif
6e1c66bc 52#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 53
b63f641e 54#if HAVE_ALTIVEC
009d2d74 55#include "ppc/swscale_altivec_template.c"
a2faa401
RD
56#endif
57
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Expands to a complete __asm__ statement that filters several source
 * lines into one row of 8-bit output.
 *   x      - string literal: byte displacement added to every source load
 *   offset - string literal: displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1)
 *   width  - loop bound for the output index (operand %2)
 * Operand %0 is &c->redDither, so a variable named c must be in scope.
 *
 * The filter list is walked in 16-byte steps: a source-line pointer at +0
 * and a coefficient quadword at +8; a null pointer ends the list.
 * For each group of 8 samples: mm3/mm4 start from the value at
 * VROUNDER_OFFSET(%0), accumulate pmulhw(src, coeff) over all list entries,
 * are shifted right by 3, packed with unsigned saturation and stored to
 * dest[index]. The index advances by 8 until it is no longer below width
 * (unsigned compare). Clobbers REG_a, REG_d and REG_S.
 */
bca11e75 58#define YSCALEYUV2YV12X(x, offset, dest, width) \
7ad6469e 59 __asm__ volatile(\
2da0d70d
DB
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
66 "1: \n\t"\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
8b2fce0d
MN
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
2da0d70d
DB
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 " jnz 1b \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
88 "jb 1b \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
92 );
bca11e75
MN
93
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface and operands as YSCALEYUV2YV12X, but the filter sum is
 * kept in 32-bit lanes (mm4..mm7) instead of 16-bit ones.
 *
 * Each filter-list entry is APCK_SIZE bytes and supplies two source-line
 * pointers (at +0 and +APCK_PTR2) plus a coefficient quadword at
 * +APCK_COEF. Words from the two lines are interleaved and multiplied with
 * pmaddwd; presumably the coefficient quadword holds the two lines'
 * coefficients in matching interleaved order -- confirm where the list is
 * built. A null pointer at the start of the next entry ends the list.
 *
 * After the list is consumed the sums are shifted right by 16, packed to
 * words, offset by the value at VROUNDER_OFFSET(%0), shifted right by 3,
 * packed to unsigned bytes and stored to dest[index]. The index advances
 * by 8 until it is no longer below width. Clobbers REG_a, REG_d, REG_S.
 */
94#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
7ad6469e 95 __asm__ volatile(\
2da0d70d
DB
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 ASMALIGN(4) \
104 "1: \n\t"\
8b2fce0d
MN
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
1625216e 107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
8b2fce0d 108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
2da0d70d
DB
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
1625216e
MN
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
128 " jnz 1b \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
150 "jb 1b \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
154 );
c1b0bfb4
MN
155
/*
 * YSCALEYUV2YV121
 *
 * Asm string fragment (no __asm__ wrapper or operand list; the user of the
 * macro supplies those). Converts 16-bit samples to 8-bit without
 * filtering: loads 8 words from (%0 + 2*index), shifts each right by 7,
 * packs with unsigned saturation and stores 8 bytes to (%1 + index).
 *
 * The index starts at operand %2, advances by 8, and the loop runs until
 * the add produces a carry. NOTE(review): this implies %2 is a negative
 * count and %0/%1 point at the ends of the buffers -- confirm at the
 * call site. Uses REG_a, mm0 and mm1.
 */
156#define YSCALEYUV2YV121 \
2da0d70d
DB
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
159 "1: \n\t"\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
167 "jnc 1b \n\t"
c1b0bfb4 168
bf2bdde6
MN
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121 with the same operands and loop
 * structure. mm7 is set to 64 in every word (all-ones, >>15, <<6) and
 * added with signed saturation before the >>7, so the result is rounded
 * to nearest instead of truncated. Uses REG_a, mm0, mm1 and mm7.
 */
169#define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
175 "1: \n\t"\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
33a67bd6
MN
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
bf2bdde6
MN
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
185 "jnc 1b \n\t"
186
c1b0bfb4 187/*
2da0d70d
DB
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 193*/
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an __asm__ volatile( statement and emits the chroma half of the
 * vertical filter for packed output. The statement is left open: more asm
 * text is expected to follow and YSCALEYUV2PACKEDX_END closes it.
 *
 * Zeroes the pixel index (REG_a) and defines label 1, the top of the
 * per-8-pixel loop that the WRITE* macros jump back to. The chroma filter
 * list at CHR_MMX_FILTER_OFFSET(%0) is then walked in 16-byte entries
 * (source pointer at +0, coefficient at +8, null pointer terminates),
 * accumulating pmulhw products on top of the value at VROUNDER_OFFSET(%0).
 * Result: mm3 = filtered U, mm4 = filtered V (V is read at byte offset VOF
 * from U). Uses REG_a, REG_d, REG_S, mm0, mm2..mm5.
 */
df57ab14 194#define YSCALEYUV2PACKEDX_UV \
7ad6469e 195 __asm__ volatile(\
2da0d70d
DB
196 "xor %%"REG_a", %%"REG_a" \n\t"\
197 ASMALIGN(4)\
198 "nop \n\t"\
199 "1: \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
204 ASMALIGN(4)\
205 "2: \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
8b2fce0d 208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
2da0d70d
DB
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
216 " jnz 2b \n\t"\
df57ab14 217
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm fragment: vertical filter for one 16-bit plane (luma or alpha, going
 * by the name), parameterised on the MMX registers it uses.
 *   offset     - string literal: displacement from %0 to the filter list
 *   coeff      - register that receives each coefficient
 *   src1, src2 - scratch registers for the two source quadwords
 *   dst1, dst2 - accumulators; on exit they hold samples index..index+3
 *                and index+4..index+7
 * The list layout matches YSCALEYUV2PACKEDX_UV (16-byte entries, null
 * pointer terminates), but the index is scaled by 2 in the source address.
 * Accumulators start from the value at VROUNDER_OFFSET(%0).
 * Redefines local label 2 and uses REG_d and REG_S.
 */
fe91924d 218#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
df57ab14 219 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d 220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
2da0d70d
DB
223 ASMALIGN(4)\
224 "2: \n\t"\
fe91924d
CS
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
2da0d70d
DB
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
2da0d70d
DB
234 "test %%"REG_S", %%"REG_S" \n\t"\
235 " jnz 2b \n\t"\
236
df57ab14
CS
/*
 * YSCALEYUV2PACKEDX: chroma filter followed by the luma filter.
 * On exit mm3 = U, mm4 = V, mm1 = first four Y, mm7 = next four Y
 * (mm0, mm2 and mm5 are scratch). The asm statement opened by the _UV part
 * is still open.
 */
237#define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
fe91924d 239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
df57ab14 240
2da0d70d
DB
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Closes the asm statement opened by YSCALEYUV2PACKEDX[_ACCURATE]_UV.
 * Input operands: %0 = &c->redDither, %1..%3 = dummy (memory placeholders
 * that keep the operand numbering), %4 = dest, %5 = dstW.
 * Clobbers REG_a, REG_d and REG_S. Variables c, dummy, dest and dstW must
 * be in scope where the macro is used.
 */
241#define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
246 );
8422aa88 247
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX_UV. Opens the
 * __asm__ volatile( statement, zeroes the pixel index and defines label 1.
 *
 * The chroma filter list is walked in APCK_SIZE-byte entries, each giving
 * two source-line pointers (+0 and +APCK_PTR2) and a coefficient quadword
 * (+APCK_COEF); words of the two lines are interleaved and multiplied with
 * pmaddwd into 32-bit sums (mm4/mm5 for U, mm6/mm7 for V). A null pointer
 * at the start of the next entry ends the list.
 *
 * The sums are shifted right by 16, packed to words, offset by the value
 * at VROUNDER_OFFSET(%0) and spilled to U_TEMP(%0) and V_TEMP(%0), because
 * the luma pass that follows reuses the MMX registers.
 */
df57ab14 248#define YSCALEYUV2PACKEDX_ACCURATE_UV \
7ad6469e 249 __asm__ volatile(\
2da0d70d
DB
250 "xor %%"REG_a", %%"REG_a" \n\t"\
251 ASMALIGN(4)\
252 "nop \n\t"\
253 "1: \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
260 ASMALIGN(4)\
261 "2: \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
8b2fce0d 263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
1625216e 264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
1625216e
MN
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
285 " jnz 2b \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
df57ab14
CS
297
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Higher-precision vertical filter for one 16-bit plane (luma or alpha,
 * going by the name); offset is a string literal giving the displacement
 * from %0 to the filter list. The list layout and the pmaddwd scheme match
 * YSCALEYUV2PACKEDX_ACCURATE_UV, with the index scaled by 2 in the source
 * address.
 *
 * 32-bit sums are kept in mm1/mm5 (samples 0..3) and mm7/mm6 (samples
 * 4..7), then shifted right by 16, packed to words and offset by the value
 * at VROUNDER_OFFSET(%0). Finally the chroma saved by the _UV pass is
 * reloaded. On exit: mm1 = first four samples, mm7 = next four,
 * mm3 = U_TEMP, mm4 = V_TEMP.
 */
298#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d
DB
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
305 ASMALIGN(4)\
306 "2: \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
2da0d70d
DB
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
1625216e
MN
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
330 " jnz 2b \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 342
df57ab14
CS
/*
 * YSCALEYUV2PACKEDX_ACCURATE: higher-precision chroma pass followed by the
 * luma pass. Leaves the same register layout as YSCALEYUV2PACKEDX
 * (mm3 = U, mm4 = V, mm1/mm7 = Y) with the asm statement still open.
 */
343#define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346
/*
 * YSCALEYUV2RGBX
 *
 * Asm fragment: YUV -> RGB conversion for 8 pixels, using offsets and
 * coefficients stored relative to operand %0.
 * Input : mm3 = U, mm4 = V, mm1 = first four Y, mm7 = next four Y
 *         (the layout left by YSCALEYUV2PACKEDX[_ACCURATE]).
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes,
 *         each packed with unsigned saturation.
 * Each chroma word is duplicated (punpck*wd of a register with itself)
 * before being added to luma, so one chroma sample serves two pixels.
 * Overwrites mm0, mm3 and mm6 as scratch.
 */
8422aa88 347#define YSCALEYUV2RGBX \
2da0d70d
DB
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
d604bab9 382
/*
 * REAL_YSCALEYUV2PACKED(index, c)
 *
 * Asm fragment: vertical blend of two source lines for packed YUV output.
 *   index - register used as the pixel index (zeroed here)
 *   c     - register holding the base address for the *_OFFSET fields
 * Per the inline comments, %0/%1 are the luma lines buf0/buf1 and %2/%3
 * the chroma lines uvbuf0/uvbuf1; V is read at byte offset VOF from U
 * (the "+2048" in the inline comments presumably predates VOF).
 *
 * NOTE: before the loop, the coefficient quadwords at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted
 * right by 3 and written back, i.e. the memory behind c is modified.
 *
 * Loop body (label 1, closed by a later WRITE* macro):
 *   result = (line1 >> 7) + pmulhw(line0 - line1, coefficient)
 * On exit mm3 = U, mm4 = V, mm1 = first four Y, mm7 = next four Y.
 * YSCALEYUV2PACKED is the argument-expanding wrapper.
 */
6e1c66bc 383#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
391 ASMALIGN(4)\
392 "1: \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
415 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 418
6e1c66bc 419#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 420
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm fragment: chroma half of the two-line blend for RGB output.
 * Zeroes the index register and defines label 1 (the loop is closed by a
 * later WRITE* macro). Per the inline comments %2/%3 are uvbuf0/uvbuf1;
 * V is read at byte offset VOF from U.
 *   blended = (uvbuf1 >> 4) + pmulhw(uvbuf0 - uvbuf1, coefficient)
 * with the coefficient taken from CHR_MMX_FILTER_OFFSET+8(c).
 * U_OFFSET/V_OFFSET are then subtracted and the green contributions
 * computed. On exit: mm2 = U - offset, mm5 = V - offset, mm3 = ug,
 * mm4 = vg. mm0 is scratch.
 */
df57ab14 421#define REAL_YSCALEYUV2RGB_UV(index, c) \
2da0d70d
DB
422 "xor "#index", "#index" \n\t"\
423 ASMALIGN(4)\
424 "1: \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
df57ab14 445
786dcfef
CS
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm fragment: two-line blend of a 16-bit plane read through b1 and b2
 * (operands or registers holding the two line pointers):
 *   blended = (b2[i] >> 4) + pmulhw(b1[i] - b2[i], coefficient)
 * with the coefficient at LUM_MMX_FILTER_OFFSET+8(c).
 * On exit mm1 = samples index..index+3, mm7 = samples index+4..index+7;
 * mm0 and mm6 are scratch.
 */
446#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
2da0d70d
DB
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
df57ab14
CS
459
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm fragment: final stage of the YUV -> RGB conversion, using the
 * coefficients stored relative to register c.
 * Input : mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *         mm1 = first four Y, mm7 = next four Y.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes,
 *         packed with unsigned saturation.
 * Each chroma word is duplicated so one chroma sample serves two pixels.
 * Overwrites mm0, mm3 and mm6 as scratch.
 */
460#define REAL_YSCALEYUV2RGB_COEFF(c) \
2da0d70d
DB
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
40494418 488
/*
 * YSCALEYUV2RGB_YA: argument-expanding wrapper around
 * REAL_YSCALEYUV2RGB_YA (arguments are macro-expanded before they are
 * stringified).
 * YSCALEYUV2RGB(index, c): complete two-line blend plus YUV -> RGB
 * conversion for 8 pixels: chroma blend, luma blend from operands %0/%1,
 * then the coefficient stage. Leaves mm2 = B, mm4 = G, mm5 = R and the
 * loop at label 1 open.
 */
786dcfef 489#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
df57ab14
CS
490
491#define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
786dcfef 493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
df57ab14 494 REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab 495
/*
 * REAL_YSCALEYUV2PACKED1(index, c)
 *
 * Asm fragment: single-line (no vertical blend) setup for packed YUV
 * output. Zeroes the index register, defines label 1, then loads chroma
 * from %2 (V at byte offset VOF) and luma from %0 and shifts every sample
 * right by 7. On exit mm3 = U, mm4 = V, mm1 = first four Y, mm7 = next
 * four Y. The parameter c is not used in the expansion.
 * YSCALEYUV2PACKED1 is the argument-expanding wrapper.
 */
6e1c66bc 496#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
497 "xor "#index", "#index" \n\t"\
498 ASMALIGN(4)\
499 "1: \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
6a4970ab 508
6e1c66bc 509#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 510
/*
 * REAL_YSCALEYUV2RGB1(index, c)
 *
 * Asm fragment: single-line (no vertical blend) YUV -> RGB conversion for
 * 8 pixels. Zeroes the index register and defines label 1. Chroma is read
 * from %2 (V at byte offset VOF) and luma from %0; every sample is shifted
 * right by 4 before the offsets and coefficients stored relative to
 * register c are applied.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * mm0, mm1, mm3, mm6 and mm7 are overwritten.
 * YSCALEYUV2RGB1 is the argument-expanding wrapper.
 */
6e1c66bc 511#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
40494418 557
6e1c66bc 558#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 559
/*
 * REAL_YSCALEYUV2PACKED1b(index, c)
 *
 * Asm fragment: like REAL_YSCALEYUV2PACKED1, but chroma is the average of
 * the two chroma lines at %2 and %3: the lines are added and the sum is
 * logically shifted right by 8 (i.e. average >> 7). Luma comes from %0
 * only, shifted right by 7. Zeroes the index register and defines label 1.
 * On exit mm3 = U, mm4 = V, mm1 = first four Y, mm7 = next four Y;
 * mm2 and mm5 are scratch. The parameter c is not used in the expansion.
 * YSCALEYUV2PACKED1b is the argument-expanding wrapper.
 */
6e1c66bc 560#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
561 "xor "#index", "#index" \n\t"\
562 ASMALIGN(4)\
563 "1: \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
6e1c66bc 576#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 577
/*
 * REAL_YSCALEYUV2RGB1b(index, c)
 *
 * Asm fragment: like REAL_YSCALEYUV2RGB1, but chroma is the average of the
 * two chroma lines at %2 and %3: the lines are added and the sum is
 * logically shifted right by 5 (i.e. average >> 4; the paddw can wrap, see
 * the FIXME). Luma is read from %0 only and shifted right by 4.
 * Zeroes the index register and defines label 1.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * mm0, mm1, mm3, mm6 and mm7 are overwritten.
 * YSCALEYUV2RGB1b is the argument-expanding wrapper.
 */
497d4f99 578// do vertical chrominance interpolation
6e1c66bc 579#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
580 "xor "#index", "#index" \n\t"\
581 ASMALIGN(4)\
582 "1: \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
40494418 629
6e1c66bc 630#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 631
6858492e
CS
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index)
 *
 * Asm fragment: loads 8 16-bit alpha samples from operand %1 at
 * 2*index, shifts them right by 7 and packs them with unsigned saturation
 * into 8 bytes in mm7. mm1 is overwritten as scratch.
 * YSCALEYUV2RGB1_ALPHA is the argument-expanding wrapper.
 */
632#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639
9c77b26b
CS
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 *
 * Asm fragment: interleaves four registers of 8 bytes each into 8 32-bit
 * pixels and stores them, then closes the per-8-pixel loop.
 *   dst, dstw, index - destination base, loop bound and pixel index
 *   b, g, r, a       - registers with 8 bytes per channel; they end up in
 *                      memory in the order b, g, r, a within each pixel
 *   q0, q2, q3, t    - scratch registers
 * b and r are overwritten. Stores 32 bytes at dst + 4*index, adds 8 to
 * index and jumps back to label 1 while index is below dstw (unsigned).
 * WRITEBGR32 is the argument-expanding wrapper.
 */
640#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 653\
9c77b26b
CS
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 658\
2da0d70d
DB
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
661 " jb 1b \n\t"
9c77b26b 662#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 663
/*
 * REAL_WRITERGB16(dst, dstw, index)
 *
 * Asm fragment: packs 8 pixels from mm2 = B, mm4 = G, mm5 = R (8 bytes
 * each) into 16-bit words and stores them, then closes the per-8-pixel
 * loop. Each word ends up with R in bits 15..11, G in bits 10..5 and
 * B in bits 4..0.
 * The channels are masked with the external constants bF8 / bFC / bF8
 * (presumably 0xF8 and 0xFC in every byte -- confirm at their definition).
 * mm7 is used to zero-extend green and is not set here, so the caller
 * must have cleared it. mm1 and mm3 are scratch.
 * Stores 16 bytes at dst + 2*index, adds 8 to index and jumps back to
 * label 1 while index is below dstw (unsigned).
 * WRITERGB16 is the argument-expanding wrapper.
 */
27a90b04 664#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
d604bab9 669\
2da0d70d
DB
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
d604bab9 672\
2da0d70d
DB
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 677\
2da0d70d
DB
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
d604bab9 680\
2da0d70d
DB
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
d604bab9 683\
2da0d70d
DB
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 686\
2da0d70d
DB
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
689 " jb 1b \n\t"
27a90b04 690#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 691
/*
 * REAL_WRITERGB15(dst, dstw, index)
 *
 * Asm fragment: same structure as REAL_WRITERGB16, but all three channels
 * are masked with bF8 and packed as 5 bits each: R in bits 14..10
 * (R is shifted right by 1 before interleaving), G in bits 9..5 and
 * B in bits 4..0.
 * Input: mm2 = B, mm4 = G, mm5 = R; mm7 must already be zero (it is used
 * to zero-extend green and is not set here). mm1 and mm3 are scratch.
 * Stores 16 bytes at dst + 2*index, adds 8 to index and jumps back to
 * label 1 while index is below dstw (unsigned).
 * WRITERGB15 is the argument-expanding wrapper.
 */
27a90b04 692#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
d604bab9 698\
2da0d70d
DB
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
d604bab9 701\
2da0d70d
DB
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 706\
2da0d70d
DB
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
d604bab9 709\
2da0d70d
DB
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
d604bab9 712\
2da0d70d
DB
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 715\
2da0d70d
DB
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
718 " jb 1b \n\t"
27a90b04 719#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 720
/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * Asm fragment: packs 8 pixels from mm2 = B, mm4 = G, mm5 = R (mm7 must be
 * zero) into 24 bytes of 3-byte pixels and stores them, then closes the
 * per-8-pixel loop.
 * The channels are first interleaved into four quadwords of two 0RGB
 * pixels each; the padding bytes are then squeezed out with shifts and the
 * external masks bm00000111, bm11111000 and bm00001111.
 * Unlike the other WRITE* macros, dst itself is advanced (by 24 per
 * iteration) instead of being indexed, so it must be a writable register.
 * index is advanced by 8 and the loop jumps back to label 1 while index is
 * below dstw (unsigned). All of mm0..mm6 are overwritten.
 */
6542b44e 721#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 735\
2da0d70d
DB
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 744\
2da0d70d
DB
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 758\
2da0d70d
DB
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 767\
2da0d70d
DB
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
d604bab9 772\
2da0d70d
DB
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
775 " jb 1b \n\t"
d604bab9 776
6542b44e 777#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 791\
2da0d70d
DB
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 796\
2da0d70d
DB
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 801\
2da0d70d
DB
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 806\
2da0d70d
DB
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
99d2cb72 812\
2da0d70d
DB
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 818\
2da0d70d
DB
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 823\
2da0d70d 824 "add $24, "#dst" \n\t"\
99d2cb72 825\
2da0d70d
DB
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
828 " jb 1b \n\t"
99d2cb72 829
/*
 * MMX2 BGR24 writer (inline-asm fragment), built on pshufw and byte masks.
 *
 * Input: mm2=B, mm4=G, mm5=R (eight bytes each).  mm7 is documented as 0 on
 * entry but is immediately reloaded with the ff_M24C mask; mm0 is loaded with
 * ff_M24A and ff_M24B is used directly from memory.
 * Three quadwords are stored with MOVNTQ at dst, dst+8 and dst+16, then dst
 * is advanced by 24, index by 8, and control jumps back to the local label
 * "1" (provided by the surrounding asm) while index < dstw (unsigned compare).
 * mm4 is shifted right by 8 bits part-way through, so G is not preserved.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    /* 0xFF broadcasts the top word, so every word is B7 B6 */\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
99d2cb72 877
/*
 * WRITEBGR24 resolves to the fastest BGR24 writer for the instruction set
 * this template is compiled for.  The macro is dropped first because this
 * file undefines and redefines its helper macros (see the #undef list at the
 * top of the file).
 */
#undef WRITEBGR24
#if HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
885
/*
 * YUY2 (Y U Y V byte order) writer (inline-asm fragment).
 *
 * Input words: mm1 and mm7 carry the first and second group of four samples
 * that end up in the even output bytes (luma for YUYV), mm3 and mm4 carry the
 * samples that end up in output bytes 1, 5, ... and 3, 7, ... respectively
 * (U and V for YUYV).  Values are saturated to unsigned bytes by packuswb.
 * Sixteen bytes are stored with MOVNTQ at dst + 2*index, then index is
 * advanced by 8 and control jumps back to the local label "1" (provided by
 * the surrounding asm) while index < dstw (unsigned compare).
 * mm1, mm3, mm4 and mm7 are overwritten.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Extra expansion level so that macro arguments are expanded before they are stringified. */
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
902
903
7ac40327
RP
904static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
6858492e 906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
38858470 907{
b63f641e 908#if HAVE_MMX
f433c8ab 909 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
910 if (c->flags & SWS_ACCURATE_RND){
911 if (uDest){
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
914 }
6858492e
CS
915 if (CONFIG_SWSCALE_ALPHA && aDest){
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
917 }
bca11e75 918
14014d47
MN
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920 }else{
921 if (uDest){
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
924 }
6858492e
CS
925 if (CONFIG_SWSCALE_ALPHA && aDest){
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
927 }
2da0d70d 928
14014d47
MN
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930 }
f433c8ab
MN
931 return;
932 }
933#endif
b63f641e 934#if HAVE_ALTIVEC
a2faa401 935yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
a2faa401 938#else //HAVE_ALTIVEC
5859233b 939yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d 940 chrFilter, chrSrc, chrFilterSize,
6858492e 941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
a2faa401 942#endif //!HAVE_ALTIVEC
c1b0bfb4 943}
2add307d 944
7ac40327
RP
945static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
2da0d70d 947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
948{
949yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
952}
953
7ac40327 954static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
6858492e 955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
c1b0bfb4 956{
f433c8ab 957 int i;
b63f641e 958#if HAVE_MMX
f433c8ab 959 if(!(c->flags & SWS_BITEXACT)){
6858492e
CS
960 long p= 4;
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
2da0d70d 964
14014d47
MN
965 if (c->flags & SWS_ACCURATE_RND){
966 while(p--){
3164d25e
CS
967 if (dst[p]){
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
6858492e 975 }
14014d47
MN
976 }else{
977 while(p--){
3164d25e
CS
978 if (dst[p]){
979 __asm__ volatile(
980 YSCALEYUV2YV121
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
982 "g" (-counter[p])
983 : "%"REG_a
984 );
985 }
6858492e 986 }
d78c1ea1 987 }
f433c8ab
MN
988 return;
989 }
990#endif
2da0d70d
DB
991 for (i=0; i<dstW; i++)
992 {
a1f3ffa3 993 int val= (lumSrc[i]+64)>>7;
2da0d70d
DB
994
995 if (val&256){
996 if (val<0) val=0;
997 else val=255;
998 }
999
1000 dest[i]= val;
1001 }
1002
1b0a4572 1003 if (uDest)
2da0d70d
DB
1004 for (i=0; i<chrDstW; i++)
1005 {
a1f3ffa3
MN
1006 int u=(chrSrc[i ]+64)>>7;
1007 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d
DB
1008
1009 if ((u|v)&256){
1010 if (u<0) u=0;
1011 else if (u>255) u=255;
1012 if (v<0) v=0;
1013 else if (v>255) v=255;
1014 }
1015
1016 uDest[i]= u;
1017 vDest[i]= v;
1018 }
6858492e
CS
1019
1020 if (CONFIG_SWSCALE_ALPHA && aDest)
1021 for (i=0; i<dstW; i++){
1022 int val= (alpSrc[i]+64)>>7;
1023 aDest[i]= av_clip_uint8(val);
1024 }
38858470
MN
1025}
1026
c1b0bfb4 1027
d604bab9
MN
1028/**
1029 * vertical scale YV12 to RGB
1030 */
7ac40327
RP
1031static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1034{
b63f641e 1035#if HAVE_MMX
d0ce212a 1036 x86_reg dummy=0;
f433c8ab 1037 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1038 if (c->flags & SWS_ACCURATE_RND){
1039 switch(c->dstFormat){
1040 case PIX_FMT_RGB32:
6858492e
CS
1041 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1042 YSCALEYUV2PACKEDX_ACCURATE
1043 YSCALEYUV2RGBX
1044 "movq %%mm2, "U_TEMP"(%0) \n\t"
1045 "movq %%mm4, "V_TEMP"(%0) \n\t"
1046 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1047 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1049 "psraw $3, %%mm1 \n\t"
1050 "psraw $3, %%mm7 \n\t"
1051 "packuswb %%mm7, %%mm1 \n\t"
1052 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053
1054 YSCALEYUV2PACKEDX_END
1055 }else{
3164d25e
CS
1056 YSCALEYUV2PACKEDX_ACCURATE
1057 YSCALEYUV2RGBX
1058 "pcmpeqd %%mm7, %%mm7 \n\t"
1059 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d 1060
3164d25e 1061 YSCALEYUV2PACKEDX_END
6858492e 1062 }
14014d47
MN
1063 return;
1064 case PIX_FMT_BGR24:
1065 YSCALEYUV2PACKEDX_ACCURATE
1066 YSCALEYUV2RGBX
40494418 1067 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1068 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069 "add %4, %%"REG_c" \n\t"
1070 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1071
1072
14014d47
MN
1073 :: "r" (&c->redDither),
1074 "m" (dummy), "m" (dummy), "m" (dummy),
1075 "r" (dest), "m" (dstW)
1076 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 );
1078 return;
1079 case PIX_FMT_RGB555:
1080 YSCALEYUV2PACKEDX_ACCURATE
1081 YSCALEYUV2RGBX
40494418 1082 "pxor %%mm7, %%mm7 \n\t"
14014d47 1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1084#ifdef DITHER1XBPP
88e2a9ae
CEH
1085 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1088#endif
1089
14014d47
MN
1090 WRITERGB15(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 case PIX_FMT_RGB565:
1094 YSCALEYUV2PACKEDX_ACCURATE
1095 YSCALEYUV2RGBX
40494418 1096 "pxor %%mm7, %%mm7 \n\t"
14014d47 1097 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1098#ifdef DITHER1XBPP
88e2a9ae
CEH
1099 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1102#endif
1103
14014d47
MN
1104 WRITERGB16(%4, %5, %%REGa)
1105 YSCALEYUV2PACKEDX_END
1106 return;
1107 case PIX_FMT_YUYV422:
1108 YSCALEYUV2PACKEDX_ACCURATE
1109 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1110
1111 "psraw $3, %%mm3 \n\t"
1112 "psraw $3, %%mm4 \n\t"
1113 "psraw $3, %%mm1 \n\t"
1114 "psraw $3, %%mm7 \n\t"
1115 WRITEYUY2(%4, %5, %%REGa)
1116 YSCALEYUV2PACKEDX_END
1117 return;
1118 }
1119 }else{
1120 switch(c->dstFormat)
1121 {
1122 case PIX_FMT_RGB32:
6858492e
CS
1123 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124 YSCALEYUV2PACKEDX
1125 YSCALEYUV2RGBX
1126 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 "packuswb %%mm7, %%mm1 \n\t"
1130 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131 YSCALEYUV2PACKEDX_END
1132 }else{
3164d25e
CS
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "pcmpeqd %%mm7, %%mm7 \n\t"
1136 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137 YSCALEYUV2PACKEDX_END
6858492e 1138 }
14014d47
MN
1139 return;
1140 case PIX_FMT_BGR24:
1141 YSCALEYUV2PACKEDX
1142 YSCALEYUV2RGBX
40494418 1143 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1144 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1145 "add %4, %%"REG_c" \n\t"
1146 WRITEBGR24(%%REGc, %5, %%REGa)
1147
1148 :: "r" (&c->redDither),
1149 "m" (dummy), "m" (dummy), "m" (dummy),
1150 "r" (dest), "m" (dstW)
1151 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152 );
1153 return;
1154 case PIX_FMT_RGB555:
1155 YSCALEYUV2PACKEDX
1156 YSCALEYUV2RGBX
40494418 1157 "pxor %%mm7, %%mm7 \n\t"
14014d47 1158 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1159#ifdef DITHER1XBPP
88e2a9ae
CEH
1160 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1161 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1162 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1163#endif
1164
14014d47
MN
1165 WRITERGB15(%4, %5, %%REGa)
1166 YSCALEYUV2PACKEDX_END
1167 return;
1168 case PIX_FMT_RGB565:
1169 YSCALEYUV2PACKEDX
1170 YSCALEYUV2RGBX
40494418 1171 "pxor %%mm7, %%mm7 \n\t"
14014d47 1172 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1173#ifdef DITHER1XBPP
88e2a9ae
CEH
1174 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1175 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1176 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1177#endif
1178
14014d47
MN
1179 WRITERGB16(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1181 return;
1182 case PIX_FMT_YUYV422:
1183 YSCALEYUV2PACKEDX
1184 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1185
1186 "psraw $3, %%mm3 \n\t"
1187 "psraw $3, %%mm4 \n\t"
1188 "psraw $3, %%mm1 \n\t"
1189 "psraw $3, %%mm7 \n\t"
1190 WRITEYUY2(%4, %5, %%REGa)
1191 YSCALEYUV2PACKEDX_END
1192 return;
1193 }
bca11e75
MN
1194 }
1195 }
bc279024 1196#endif /* HAVE_MMX */
b63f641e 1197#if HAVE_ALTIVEC
2da0d70d 1198 /* The following list of supported dstFormat values should
780daf2b 1199 match what's found in the body of ff_yuv2packedX_altivec() */
d55ef636 1200 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
12794f73 1201 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
2da0d70d 1202 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
12794f73 1203 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
780daf2b
DB
1204 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205 chrFilter, chrSrc, chrFilterSize,
1206 dest, dstW, dstY);
2da0d70d
DB
1207 else
1208#endif
1209 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210 chrFilter, chrSrc, chrFilterSize,
6858492e 1211 alpSrc, dest, dstW, dstY);
c1b0bfb4
MN
1212}
1213
c1b0bfb4
MN
1214/**
1215 * vertical bilinear scale YV12 to RGB
1216 */
7ac40327
RP
1217static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1219{
ac0ad729
MN
1220 int yalpha1=4095- yalpha;
1221 int uvalpha1=4095-uvalpha;
2da0d70d 1222 int i;
d604bab9 1223
b63f641e 1224#if HAVE_MMX
f433c8ab 1225 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1226 switch(c->dstFormat)
1227 {
1228 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229 case PIX_FMT_RGB32:
6858492e
CS
1230 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231#if ARCH_X86_64
1232 __asm__ volatile(
6858492e
CS
1233 YSCALEYUV2RGB(%%REGBP, %5)
1234 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237 "packuswb %%mm7, %%mm1 \n\t"
04ef1d3f 1238 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
6858492e 1239
04ef1d3f 1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
6858492e
CS
1241 "a" (&c->redDither)
1242 ,"r" (abuf0), "r" (abuf1)
04ef1d3f 1243 : "%"REG_BP
6858492e
CS
1244 );
1245#else
1246 *(uint16_t **)(&c->u_temp)=abuf0;
1247 *(uint16_t **)(&c->v_temp)=abuf1;
1248 __asm__ volatile(
1249 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1250 "mov %4, %%"REG_b" \n\t"
1251 "push %%"REG_BP" \n\t"
1252 YSCALEYUV2RGB(%%REGBP, %5)
1253 "push %0 \n\t"
1254 "push %1 \n\t"
1255 "mov "U_TEMP"(%5), %0 \n\t"
1256 "mov "V_TEMP"(%5), %1 \n\t"
1257 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260 "packuswb %%mm7, %%mm1 \n\t"
1261 "pop %1 \n\t"
1262 "pop %0 \n\t"
1263 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264 "pop %%"REG_BP" \n\t"
1265 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1266
1267 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268 "a" (&c->redDither)
1269 );
1270#endif
1271 }else{
3164d25e
CS
1272 __asm__ volatile(
1273 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1274 "mov %4, %%"REG_b" \n\t"
1275 "push %%"REG_BP" \n\t"
1276 YSCALEYUV2RGB(%%REGBP, %5)
1277 "pcmpeqd %%mm7, %%mm7 \n\t"
1278 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279 "pop %%"REG_BP" \n\t"
1280 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1281
3164d25e
CS
1282 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283 "a" (&c->redDither)
1284 );
6858492e 1285 }
2da0d70d
DB
1286 return;
1287 case PIX_FMT_BGR24:
7ad6469e 1288 __asm__ volatile(
2da0d70d
DB
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1293 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1294 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298 "a" (&c->redDither)
1299 );
1300 return;
27a90b04 1301 case PIX_FMT_RGB555:
7ad6469e 1302 __asm__ volatile(
2da0d70d
DB
1303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304 "mov %4, %%"REG_b" \n\t"
1305 "push %%"REG_BP" \n\t"
1306 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1307 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1309#ifdef DITHER1XBPP
88e2a9ae
CEH
1310 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1311 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1312 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1313#endif
1314
27a90b04 1315 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1316 "pop %%"REG_BP" \n\t"
1317 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1318
1319 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320 "a" (&c->redDither)
1321 );
1322 return;
27a90b04 1323 case PIX_FMT_RGB565:
7ad6469e 1324 __asm__ volatile(
2da0d70d
DB
1325 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1326 "mov %4, %%"REG_b" \n\t"
1327 "push %%"REG_BP" \n\t"
1328 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1329 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1330 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1331#ifdef DITHER1XBPP
88e2a9ae
CEH
1332 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1333 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1334 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1335#endif
1336
27a90b04 1337 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1338 "pop %%"REG_BP" \n\t"
1339 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1340 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341 "a" (&c->redDither)
1342 );
1343 return;
1344 case PIX_FMT_YUYV422:
7ad6469e 1345 __asm__ volatile(
2da0d70d
DB
1346 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1347 "mov %4, %%"REG_b" \n\t"
1348 "push %%"REG_BP" \n\t"
1349 YSCALEYUV2PACKED(%%REGBP, %5)
1350 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351 "pop %%"REG_BP" \n\t"
1352 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354 "a" (&c->redDither)
1355 );
1356 return;
1357 default: break;
1358 }
f433c8ab 1359 }
cf7d1c1a 1360#endif //HAVE_MMX
6858492e 1361YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1362}
1363
1364/**
1365 * YV12 to RGB without scaling or interpolating
1366 */
7ac40327
RP
1367static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1369{
2da0d70d
DB
1370 const int yalpha1=0;
1371 int i;
6a4970ab 1372
7ac40327 1373 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1374 const int yalpha= 4096; //FIXME ...
96034638 1375
2da0d70d
DB
1376 if (flags&SWS_FULL_CHR_H_INT)
1377 {
6858492e 1378 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
2da0d70d
DB
1379 return;
1380 }
397c035e 1381
b63f641e 1382#if HAVE_MMX
f433c8ab 1383 if(!(flags & SWS_BITEXACT)){
14014d47 1384 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1385 {
14014d47
MN
1386 switch(dstFormat)
1387 {
1388 case PIX_FMT_RGB32:
6858492e
CS
1389 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1390 __asm__ volatile(
1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_b" \n\t"
1393 "push %%"REG_BP" \n\t"
1394 YSCALEYUV2RGB1(%%REGBP, %5)
1395 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397 "pop %%"REG_BP" \n\t"
1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1399
1400 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401 "a" (&c->redDither)
1402 );
1403 }else{
3164d25e
CS
1404 __asm__ volatile(
1405 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1406 "mov %4, %%"REG_b" \n\t"
1407 "push %%"REG_BP" \n\t"
1408 YSCALEYUV2RGB1(%%REGBP, %5)
1409 "pcmpeqd %%mm7, %%mm7 \n\t"
1410 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411 "pop %%"REG_BP" \n\t"
1412 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1413
3164d25e
CS
1414 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415 "a" (&c->redDither)
1416 );
6858492e 1417 }
14014d47
MN
1418 return;
1419 case PIX_FMT_BGR24:
7ad6469e 1420 __asm__ volatile(
14014d47
MN
1421 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_b" \n\t"
1423 "push %%"REG_BP" \n\t"
1424 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1425 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1426 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429
1430 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431 "a" (&c->redDither)
1432 );
1433 return;
1434 case PIX_FMT_RGB555:
7ad6469e 1435 __asm__ volatile(
14014d47
MN
1436 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_b" \n\t"
1438 "push %%"REG_BP" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1440 "pxor %%mm7, %%mm7 \n\t"
14014d47 1441 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1442#ifdef DITHER1XBPP
88e2a9ae
CEH
1443 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1444 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1445 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1446#endif
14014d47
MN
1447 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448 "pop %%"REG_BP" \n\t"
1449 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1450
14014d47
MN
1451 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452 "a" (&c->redDither)
1453 );
1454 return;
1455 case PIX_FMT_RGB565:
7ad6469e 1456 __asm__ volatile(
14014d47
MN
1457 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1458 "mov %4, %%"REG_b" \n\t"
1459 "push %%"REG_BP" \n\t"
1460 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1461 "pxor %%mm7, %%mm7 \n\t"
14014d47 1462 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1463#ifdef DITHER1XBPP
88e2a9ae
CEH
1464 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1465 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1466 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1467#endif
1468
14014d47
MN
1469 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470 "pop %%"REG_BP" \n\t"
1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1472
14014d47
MN
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474 "a" (&c->redDither)
1475 );
1476 return;
1477 case PIX_FMT_YUYV422:
7ad6469e 1478 __asm__ volatile(
14014d47
MN
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1480 "mov %4, %%"REG_b" \n\t"
1481 "push %%"REG_BP" \n\t"
1482 YSCALEYUV2PACKED1(%%REGBP, %5)
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484 "pop %%"REG_BP" \n\t"
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1486
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 "a" (&c->redDither)
1489 );
1490 return;
1491 }
2da0d70d 1492 }
14014d47 1493 else
2da0d70d 1494 {
14014d47
MN
1495 switch(dstFormat)
1496 {
1497 case PIX_FMT_RGB32:
6858492e
CS
1498 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1499 __asm__ volatile(
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1508
1509 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510 "a" (&c->redDither)
1511 );
1512 }else{
3164d25e
CS
1513 __asm__ volatile(
1514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1515 "mov %4, %%"REG_b" \n\t"
1516 "push %%"REG_BP" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP, %5)
1518 "pcmpeqd %%mm7, %%mm7 \n\t"
1519 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520 "pop %%"REG_BP" \n\t"
1521 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1522
3164d25e
CS
1523 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524 "a" (&c->redDither)
1525 );
6858492e 1526 }
14014d47
MN
1527 return;
1528 case PIX_FMT_BGR24:
7ad6469e 1529 __asm__ volatile(
14014d47
MN
1530 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1531 "mov %4, %%"REG_b" \n\t"
1532 "push %%"REG_BP" \n\t"
1533 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1534 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1535 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536 "pop %%"REG_BP" \n\t"
1537 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1538
1539 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540 "a" (&c->redDither)
1541 );
1542 return;
1543 case PIX_FMT_RGB555:
7ad6469e 1544 __asm__ volatile(
14014d47
MN
1545 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1546 "mov %4, %%"REG_b" \n\t"
1547 "push %%"REG_BP" \n\t"
1548 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1549 "pxor %%mm7, %%mm7 \n\t"
14014d47 1550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1551#ifdef DITHER1XBPP
88e2a9ae
CEH
1552 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1553 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1554 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1555#endif
14014d47
MN
1556 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557 "pop %%"REG_BP" \n\t"
1558 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1559
14014d47
MN
1560 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561 "a" (&c->redDither)
1562 );
1563 return;
1564 case PIX_FMT_RGB565:
7ad6469e 1565 __asm__ volatile(
14014d47
MN
1566 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1567 "mov %4, %%"REG_b" \n\t"
1568 "push %%"REG_BP" \n\t"
1569 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1570 "pxor %%mm7, %%mm7 \n\t"
14014d47 1571 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1572#ifdef DITHER1XBPP
88e2a9ae
CEH
1573 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1574 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1575 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1576#endif
1577
14014d47
MN
1578 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1581
14014d47
MN
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583 "a" (&c->redDither)
1584 );
1585 return;
1586 case PIX_FMT_YUYV422:
7ad6469e 1587 __asm__ volatile(
14014d47
MN
1588 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1589 "mov %4, %%"REG_b" \n\t"
1590 "push %%"REG_BP" \n\t"
1591 YSCALEYUV2PACKED1b(%%REGBP, %5)
1592 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593 "pop %%"REG_BP" \n\t"
1594 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1595
1596 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597 "a" (&c->redDither)
1598 );
1599 return;
1600 }
2da0d70d
DB
1601 }
1602 }
bc279024 1603#endif /* HAVE_MMX */
e5091488 1604 if (uvalpha < 2048)
2da0d70d 1605 {
6858492e 1606 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1607 }else{
6858492e 1608 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1609 }
d604bab9
MN
1610}
1611
//FIXME: the yuy2* functions can read up to 7 samples beyond the end of the input
6ff0ad6b 1613
7ac40327 1614static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1e621b18 1615{
b63f641e 1616#if HAVE_MMX
7ad6469e 1617 __asm__ volatile(
2da0d70d
DB
1618 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1619 "mov %0, %%"REG_a" \n\t"
1620 "1: \n\t"
1621 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1622 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1623 "pand %%mm2, %%mm0 \n\t"
1624 "pand %%mm2, %%mm1 \n\t"
1625 "packuswb %%mm1, %%mm0 \n\t"
1626 "movq %%mm0, (%2, %%"REG_a") \n\t"
1627 "add $8, %%"REG_a" \n\t"
1628 " js 1b \n\t"
d0ce212a 1629 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1630 : "%"REG_a
1631 );
1e621b18 1632#else
2da0d70d
DB
1633 int i;
1634 for (i=0; i<width; i++)
1635 dst[i]= src[2*i];
1e621b18
MN
1636#endif
1637}
1638
7ac40327 1639static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1640{
b63f641e 1641#if HAVE_MMX
7ad6469e 1642 __asm__ volatile(
2da0d70d
DB
1643 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1644 "mov %0, %%"REG_a" \n\t"
1645 "1: \n\t"
1646 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1647 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1648 "psrlw $8, %%mm0 \n\t"
1649 "psrlw $8, %%mm1 \n\t"
1650 "packuswb %%mm1, %%mm0 \n\t"
1651 "movq %%mm0, %%mm1 \n\t"
1652 "psrlw $8, %%mm0 \n\t"
1653 "pand %%mm4, %%mm1 \n\t"
1654 "packuswb %%mm0, %%mm0 \n\t"
1655 "packuswb %%mm1, %%mm1 \n\t"
1656 "movd %%mm0, (%3, %%"REG_a") \n\t"
1657 "movd %%mm1, (%2, %%"REG_a") \n\t"
1658 "add $4, %%"REG_a" \n\t"
1659 " js 1b \n\t"
d0ce212a 1660 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1661 : "%"REG_a
1662 );
1e621b18 1663#else
2da0d70d
DB
1664 int i;
1665 for (i=0; i<width; i++)
1666 {
1667 dstU[i]= src1[4*i + 1];
1668 dstV[i]= src1[4*i + 3];
1669 }
1670#endif
1671 assert(src1 == src2);
1e621b18
MN
1672}
1673
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma samples from one line of packed UYVY data.
 *
 * As the C fallback shows, every odd source byte (src[2*i+1]) is copied to
 * dst[i]; width is the number of samples produced. The MMX loop produces
 * 8 samples (16 source bytes) per iteration. The last parameter is not
 * referenced.
 */
7ac40327 1676static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
7322a67c 1677{
b63f641e 1678#if HAVE_MMX
7ad6469e 1679 __asm__ volatile(
2da0d70d
DB
1680 "mov %0, %%"REG_a" \n\t" // index starts at -width and counts up to 0
1681 "1: \n\t"
1682 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1683 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1684 "psrlw $8, %%mm0 \n\t" // keep the odd byte of every 16-bit word
1685 "psrlw $8, %%mm1 \n\t"
1686 "packuswb %%mm1, %%mm0 \n\t"
1687 "movq %%mm0, (%2, %%"REG_a") \n\t"
1688 "add $8, %%"REG_a" \n\t"
1689 " js 1b \n\t" // loop while the index is still negative
d0ce212a 1690 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1691 : "%"REG_a
1692 );
7322a67c 1693#else
2da0d70d
DB
1694 int i;
1695 for (i=0; i<width; i++)
1696 dst[i]= src[2*i+1];
7322a67c
MN
1697#endif
1698}
1699
/**
 * Extract the chroma samples from one line of packed UYVY data.
 *
 * As the C fallback shows, byte 0 of every 4-byte group is written to dstU
 * and byte 2 to dstV; width is the number of U (and of V) samples produced.
 * The MMX loop handles 4 U and 4 V samples (16 source bytes) per iteration.
 * src2 is only compared against src1 by the assert, and the last parameter
 * is not referenced.
 */
7ac40327 1700static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
7322a67c 1701{
b63f641e 1702#if HAVE_MMX
7ad6469e 1703 __asm__ volatile(
2da0d70d
DB
1704 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1705 "mov %0, %%"REG_a" \n\t" // index starts at -width and counts up to 0
1706 "1: \n\t"
1707 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1708 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1709 "pand %%mm4, %%mm0 \n\t" // mask instead of shift: this is the only difference from yuy2ToUV's loop
1710 "pand %%mm4, %%mm1 \n\t"
1711 "packuswb %%mm1, %%mm0 \n\t"
1712 "movq %%mm0, %%mm1 \n\t"
1713 "psrlw $8, %%mm0 \n\t" // odd chroma bytes -> stored through %3 (dstV)
1714 "pand %%mm4, %%mm1 \n\t" // masked chroma bytes -> stored through %2 (dstU)
1715 "packuswb %%mm0, %%mm0 \n\t"
1716 "packuswb %%mm1, %%mm1 \n\t"
1717 "movd %%mm0, (%3, %%"REG_a") \n\t"
1718 "movd %%mm1, (%2, %%"REG_a") \n\t"
1719 "add $4, %%"REG_a" \n\t"
1720 " js 1b \n\t" // loop while the index is still negative
d0ce212a 1721 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1722 : "%"REG_a
1723 );
7322a67c 1724#else
2da0d70d
DB
1725 int i;
1726 for (i=0; i<width; i++)
1727 {
1728 dstU[i]= src1[4*i + 0];
1729 dstV[i]= src1[4*i + 2];
1730 }
1731#endif
1732 assert(src1 == src2);
7322a67c
MN
1733}
1734
214892ee 1735#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
7ac40327 1736static inline void RENAME(name)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1737{\
1738 int i;\
1739 for (i=0; i<width; i++)\
1740 {\
7ac40327
RP
1741 int b= (((const type*)src)[i]>>shb)&maskb;\
1742 int g= (((const type*)src)[i]>>shg)&maskg;\
1743 int r= (((const type*)src)[i]>>shr)&maskr;\
214892ee
MN
1744\
1745 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1746 }\
1e621b18
MN
1747}
1748
214892ee
MN
1749BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1750BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1751BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1752BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1753BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1754BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1755
/**
 * Copy every fourth byte of src, starting with src[0], into dst.
 *
 * @param dst    receives width bytes
 * @param src    packed 4-byte pixels; the caller points it at the wanted byte
 * @param width  number of pixels to process
 * @param unused not referenced
 */
static inline void RENAME(abgrToA)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        dst[px] = src[4 * px];
        px++;
    }
}
1762
f8a138be 1763#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
7ac40327 1764static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1765{\
1766 int i;\
1767 for (i=0; i<width; i++)\
1768 {\
7ac40327
RP
1769 int b= (((const type*)src)[i]&maskb)>>shb;\
1770 int g= (((const type*)src)[i]&maskg)>>shg;\
1771 int r= (((const type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1772\
1773 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1774 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1775 }\
ba83d862 1776}\
7ac40327 1777static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1778{\
1779 int i;\
1780 for (i=0; i<width; i++)\
1781 {\
7ac40327
RP
1782 int pix0= ((const type*)src)[2*i+0];\
1783 int pix1= ((const type*)src)[2*i+1];\
bcff32d1 1784 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
ba83d862
MN
1785 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1786 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
f8a138be 1787 g&= maskg|(2*maskg);\
ba83d862
MN
1788\
1789 g>>=shg;\
1790\
6b79dbce
MN
1791 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1792 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1793 }\
2f60f629
MN
1794}
1795
f8a138be
CS
1796BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1797BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1798BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1799BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1800BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1801BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
a0baa07a 1802
b63f641e 1803#if HAVE_MMX
/**
 * MMX conversion of one line of packed 24-bit RGB/BGR pixels to 8-bit luma.
 *
 * srcFormat selects the coefficient pair: the ff_bgr24toY* constants for
 * PIX_FMT_BGR24, the ff_rgb24toY* constants for anything else.
 * The main loop consumes 12 source bytes and stores 4 luma bytes per
 * iteration, running while the index (which starts at -width) is negative.
 *
 * NOTE(review): mm5/mm6 are loaded by a separate asm statement and consumed
 * by the main loop below; neither statement lists the MMX registers it
 * touches, so this relies on the compiler not using them in between.
 */
7ac40327 1804static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1805{
1806
1807 if(srcFormat == PIX_FMT_BGR24){
7ad6469e 1808 __asm__ volatile(
ff9a056d
MN
1809 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1810 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1811 :
dfb09bd1
MN
1812 );
1813 }else{
7ad6469e 1814 __asm__ volatile(
ff9a056d
MN
1815 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1816 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1817 :
dfb09bd1
MN
1818 );
1819 }
1820
7ad6469e 1821 __asm__ volatile(
dfb09bd1
MN
1822 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1823 "mov %2, %%"REG_a" \n\t" // index starts at -width and counts up to 0
1824 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used to widen bytes to words
1825 "1: \n\t"
1826 PREFETCH" 64(%0) \n\t"
1827 "movd (%0), %%mm0 \n\t"
1828 "movd 2(%0), %%mm1 \n\t"
1829 "movd 6(%0), %%mm2 \n\t"
1830 "movd 8(%0), %%mm3 \n\t"
1831 "add $12, %0 \n\t" // 4 pixels * 3 bytes consumed per iteration
1832 "punpcklbw %%mm7, %%mm0 \n\t"
1833 "punpcklbw %%mm7, %%mm1 \n\t"
1834 "punpcklbw %%mm7, %%mm2 \n\t"
1835 "punpcklbw %%mm7, %%mm3 \n\t"
1836 "pmaddwd %%mm5, %%mm0 \n\t"
1837 "pmaddwd %%mm6, %%mm1 \n\t"
1838 "pmaddwd %%mm5, %%mm2 \n\t"
1839 "pmaddwd %%mm6, %%mm3 \n\t"
1840 "paddd %%mm1, %%mm0 \n\t"
1841 "paddd %%mm3, %%mm2 \n\t"
1842 "paddd %%mm4, %%mm0 \n\t" // add the offset constant
1843 "paddd %%mm4, %%mm2 \n\t"
1844 "psrad $15, %%mm0 \n\t"
1845 "psrad $15, %%mm2 \n\t"
1846 "packssdw %%mm2, %%mm0 \n\t"
1847 "packuswb %%mm0, %%mm0 \n\t"
1848 "movd %%mm0, (%1, %%"REG_a") \n\t" // store 4 luma bytes
1849 "add $4, %%"REG_a" \n\t"
1850 " js 1b \n\t"
1851 : "+r" (src)
d0ce212a 1852 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1853 : "%"REG_a
2da0d70d 1854 );
dfb09bd1
MN
1855}
1856
/**
 * MMX conversion of one line of packed 24-bit RGB/BGR pixels to 8-bit chroma.
 *
 * The coefficients come from row [srcFormat == PIX_FMT_RGB24] of
 * ff_bgr24toUV, passed as memory operand %4 and read at byte offsets
 * 0, 8, 16 and 24. The loop consumes 12 source bytes and stores 4 U bytes
 * (through %1 = dstU) and 4 V bytes (through %2 = dstV) per iteration,
 * running while the index (which starts at -width) is negative.
 */
7ac40327 1857static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
dfb09bd1 1858{
7ad6469e 1859 __asm__ volatile(
dfb09bd1
MN
1860 "movq 24+%4, %%mm6 \n\t"
1861 "mov %3, %%"REG_a" \n\t" // index starts at -width and counts up to 0
1862 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used to widen bytes to words
1863 "1: \n\t"
1864 PREFETCH" 64(%0) \n\t"
1865 "movd (%0), %%mm0 \n\t"
1866 "movd 2(%0), %%mm1 \n\t"
1867 "punpcklbw %%mm7, %%mm0 \n\t"
1868 "punpcklbw %%mm7, %%mm1 \n\t"
1869 "movq %%mm0, %%mm2 \n\t"
1870 "movq %%mm1, %%mm3 \n\t"
1871 "pmaddwd %4, %%mm0 \n\t"
1872 "pmaddwd 8+%4, %%mm1 \n\t"
1873 "pmaddwd 16+%4, %%mm2 \n\t"
1874 "pmaddwd %%mm6, %%mm3 \n\t"
1875 "paddd %%mm1, %%mm0 \n\t"
1876 "paddd %%mm3, %%mm2 \n\t"
1877
1878 "movd 6(%0), %%mm1 \n\t"
1879 "movd 8(%0), %%mm3 \n\t"
1880 "add $12, %0 \n\t" // 4 pixels * 3 bytes consumed per iteration
1881 "punpcklbw %%mm7, %%mm1 \n\t"
1882 "punpcklbw %%mm7, %%mm3 \n\t"
1883 "movq %%mm1, %%mm4 \n\t"
1884 "movq %%mm3, %%mm5 \n\t"
1885 "pmaddwd %4, %%mm1 \n\t"
1886 "pmaddwd 8+%4, %%mm3 \n\t"
1887 "pmaddwd 16+%4, %%mm4 \n\t"
1888 "pmaddwd %%mm6, %%mm5 \n\t"
1889 "paddd %%mm3, %%mm1 \n\t"
1890 "paddd %%mm5, %%mm4 \n\t"
1891
1892 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1893 "paddd %%mm3, %%mm0 \n\t"
1894 "paddd %%mm3, %%mm2 \n\t"
1895 "paddd %%mm3, %%mm1 \n\t"
1896 "paddd %%mm3, %%mm4 \n\t"
1897 "psrad $15, %%mm0 \n\t"
1898 "psrad $15, %%mm2 \n\t"
1899 "psrad $15, %%mm1 \n\t"
1900 "psrad $15, %%mm4 \n\t"
1901 "packssdw %%mm1, %%mm0 \n\t"
1902 "packssdw %%mm4, %%mm2 \n\t"
1903 "packuswb %%mm0, %%mm0 \n\t"
1904 "packuswb %%mm2, %%mm2 \n\t"
1905 "movd %%mm0, (%1, %%"REG_a") \n\t" // 4 bytes to dstU
1906 "movd %%mm2, (%2, %%"REG_a") \n\t" // 4 bytes to dstV
1907 "add $4, %%"REG_a" \n\t"
1908 " js 1b \n\t"
1909 : "+r" (src)
d0ce212a 1910 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1911 : "%"REG_a
1912 );
1913}
1914#endif
1915
7ac40327 1916static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1917{
b63f641e 1918#if HAVE_MMX
a35acd7f 1919 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1920#else
2da0d70d
DB
1921 int i;
1922 for (i=0; i<width; i++)
1923 {
1924 int b= src[i*3+0];
1925 int g= src[i*3+1];
1926 int r= src[i*3+2];
1e621b18 1927
e5091488 1928 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1929 }
bc279024 1930#endif /* HAVE_MMX */
1e621b18
MN
1931}
1932
7ac40327 1933static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1934{
b63f641e 1935#if HAVE_MMX
a35acd7f 1936 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1937#else
2da0d70d
DB
1938 int i;
1939 for (i=0; i<width; i++)
1940 {
dfb09bd1
MN
1941 int b= src1[3*i + 0];
1942 int g= src1[3*i + 1];
1943 int r= src1[3*i + 2];
2da0d70d 1944
dfb09bd1
MN
1945 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1946 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1947 }
bc279024 1948#endif /* HAVE_MMX */
2da0d70d 1949 assert(src1 == src2);
1e621b18
MN
1950}
1951
7ac40327 1952static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1953{
1954 int i;
1955 for (i=0; i<width; i++)
1956 {
1957 int b= src1[6*i + 0] + src1[6*i + 3];
1958 int g= src1[6*i + 1] + src1[6*i + 4];
1959 int r= src1[6*i + 2] + src1[6*i + 5];
1960
1961 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1962 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1963 }
1964 assert(src1 == src2);
1965}
1966
7ac40327 1967static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
a861d4d7 1968{
b63f641e 1969#if HAVE_MMX
a35acd7f 1970 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1971#else
2da0d70d
DB
1972 int i;
1973 for (i=0; i<width; i++)
1974 {
1975 int r= src[i*3+0];
1976 int g= src[i*3+1];
1977 int b= src[i*3+2];
1978
e5091488 1979 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1980 }
dfb09bd1 1981#endif
a861d4d7
MN
1982}
1983
7ac40327 1984static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
a861d4d7 1985{
b63f641e 1986#if HAVE_MMX
5155b839 1987 assert(src1==src2);
a35acd7f 1988 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 1989#else
5155b839
DB
1990 int i;
1991 assert(src1==src2);
2da0d70d
DB
1992 for (i=0; i<width; i++)
1993 {
dfb09bd1
MN
1994 int r= src1[3*i + 0];
1995 int g= src1[3*i + 1];
1996 int b= src1[3*i + 2];
2da0d70d 1997
dfb09bd1
MN
1998 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1999 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2000 }
dfb09bd1 2001#endif
a861d4d7
MN
2002}
2003
7ac40327 2004static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2005{
2006 int i;
2007 assert(src1==src2);
2008 for (i=0; i<width; i++)
2009 {
e09d7eef
MN
2010 int r= src1[6*i + 0] + src1[6*i + 3];
2011 int g= src1[6*i + 1] + src1[6*i + 4];
2012 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2013
2014 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2015 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2016 }
2017}
2018
1e621b18 2019
/**
 * Look up the luma of one line of palette indices.
 *
 * @param dst   receives width bytes, each the low 8 bits of the palette entry
 * @param src   width palette indices
 * @param width number of pixels
 * @param pal   palette, indexed by the source byte
 */
static inline void RENAME(palToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
{
    int n = 0;

    while (n < width) {
        const int index = src[n];

        dst[n] = pal[index] & 0xFF;
        n++;
    }
}
2030
7ac40327
RP
/**
 * Look up the chroma of one line of palette indices.
 *
 * For every index, bits 8..15 of the palette entry are written to dstU and
 * bits 16..23 to dstV. The entry is kept unsigned so that entries with
 * bit 31 set are never converted to, or shifted as, a negative int.
 *
 * @param dstU  receives width U bytes
 * @param dstV  receives width V bytes
 * @param src1  width palette indices
 * @param src2  must equal src1 (checked by assert), otherwise not used
 * @param width number of pixels
 * @param pal   palette, indexed by the source byte
 */
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV,
                                   const uint8_t *src1, const uint8_t *src2,
                                   long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        const uint32_t p= pal[src1[i]];

        dstU[i]= (p>> 8) & 0xFF;
        dstV[i]= (p>>16) & 0xFF;
    }
}
2045
/**
 * Expand one line of 1 bit-per-pixel data, where a 0 bit is white, to 8-bit
 * luma: each output byte is 255 for a cleared bit and 0 for a set bit.
 * Bits are taken most-significant first.
 *
 * All width pixels are written, including the last width % 8 pixels of a
 * line whose width is not a multiple of 8 (they come from the high bits of
 * one extra source byte).
 *
 * @param dst    receives width bytes
 * @param src    (width + 7) / 8 bytes of packed bits
 * @param width  number of pixels
 * @param unused not referenced
 */
static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    long byte, px = 0;

    for (byte = 0; px < width; byte++) {
        /* invert within 8 bits so no negative value is ever shifted */
        const int bits = src[byte] ^ 0xFF;
        int bit;

        for (bit = 7; bit >= 0 && px < width; bit--)
            dst[px++] = ((bits >> bit) & 1) * 255;
    }
}
2055
/**
 * Expand one line of 1 bit-per-pixel data, where a 0 bit is black, to 8-bit
 * luma: each output byte is 255 for a set bit and 0 for a cleared bit.
 * Bits are taken most-significant first.
 *
 * All width pixels are written, including the last width % 8 pixels of a
 * line whose width is not a multiple of 8 (they come from the high bits of
 * one extra source byte).
 *
 * @param dst    receives width bytes
 * @param src    (width + 7) / 8 bytes of packed bits
 * @param width  number of pixels
 * @param unused not referenced
 */
static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    long byte, px = 0;

    for (byte = 0; px < width; byte++) {
        const int bits = src[byte];
        int bit;

        for (bit = 7; bit >= 0 && px < width; bit--)
            dst[px++] = ((bits >> bit) & 1) * 255;
    }
}
2065
8a322796 2066// bilinear / bicubic scaling
7ac40327
RP
2067static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2068 const int16_t *filter, const int16_t *filterPos, long filterSize)
2ff198c1 2069{
b63f641e 2070#if HAVE_MMX
2da0d70d
DB
2071 assert(filterSize % 4 == 0 && filterSize>0);
2072 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2073 {
d0ce212a 2074 x86_reg counter= -2*dstW;
2da0d70d
DB
2075 filter-= counter*2;
2076 filterPos-= counter/2;
2077 dst-= counter/2;
7ad6469e 2078 __asm__ volatile(
83c89c78 2079#if defined(PIC)
2da0d70d
DB
2080 "push %%"REG_b" \n\t"
2081#endif
2082 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2083 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2084 "mov %%"REG_a", %%"REG_BP" \n\t"
2085 ASMALIGN(4)
2086 "1: \n\t"
2087 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2088 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2089 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2090 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2091 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2092 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2093 "punpcklbw %%mm7, %%mm0 \n\t"
2094 "punpcklbw %%mm7, %%mm2 \n\t"
2095 "pmaddwd %%mm1, %%mm0 \n\t"
2096 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
2097 "movq %%mm0, %%mm4 \n\t"
2098 "punpckldq %%mm3, %%mm0 \n\t"
2099 "punpckhdq %%mm3, %%mm4 \n\t"
2100 "paddd %%mm4, %%mm0 \n\t"
2101 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2102 "packssdw %%mm0, %%mm0 \n\t"
2103 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2104 "add $4, %%"REG_BP" \n\t"
2105 " jnc 1b \n\t"
2106
2107 "pop %%"REG_BP" \n\t"
83c89c78 2108#if defined(PIC)
2da0d70d 2109 "pop %%"REG_b" \n\t"
83c89c78 2110#endif
2da0d70d
DB
2111 : "+a" (counter)
2112 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2113#if !defined(PIC)
2da0d70d
DB
2114 : "%"REG_b
2115#endif
2116 );
2117 }
2118 else if (filterSize==8)
2119 {
d0ce212a 2120 x86_reg counter= -2*dstW;
2da0d70d
DB
2121 filter-= counter*4;
2122 filterPos-= counter/2;
2123 dst-= counter/2;
7ad6469e 2124 __asm__ volatile(
83c89c78 2125#if defined(PIC)
2da0d70d
DB
2126 "push %%"REG_b" \n\t"
2127#endif
2128 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2129 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2130 "mov %%"REG_a", %%"REG_BP" \n\t"
2131 ASMALIGN(4)
2132 "1: \n\t"
2133 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2134 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2135 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2136 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2137 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2138 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2139 "punpcklbw %%mm7, %%mm0 \n\t"
2140 "punpcklbw %%mm7, %%mm2 \n\t"
2141 "pmaddwd %%mm1, %%mm0 \n\t"
2142 "pmaddwd %%mm2, %%mm3 \n\t"
2143
2144 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2145 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2146 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2147 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2148 "punpcklbw %%mm7, %%mm4 \n\t"
2149 "punpcklbw %%mm7, %%mm2 \n\t"
2150 "pmaddwd %%mm1, %%mm4 \n\t"
2151 "pmaddwd %%mm2, %%mm5 \n\t"
2152 "paddd %%mm4, %%mm0 \n\t"
2153 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2154 "movq %%mm0, %%mm4 \n\t"
2155 "punpckldq %%mm3, %%mm0 \n\t"
2156 "punpckhdq %%mm3, %%mm4 \n\t"
2157 "paddd %%mm4, %%mm0 \n\t"
2158 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2159 "packssdw %%mm0, %%mm0 \n\t"
2160 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2161 "add $4, %%"REG_BP" \n\t"
2162 " jnc 1b \n\t"
2163
2164 "pop %%"REG_BP" \n\t"
83c89c78 2165#if defined(PIC)
2da0d70d 2166 "pop %%"REG_b" \n\t"
83c89c78 2167#endif
2da0d70d
DB
2168 : "+a" (counter)
2169 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2170#if !defined(PIC)
2da0d70d
DB
2171 : "%"REG_b
2172#endif
2173 );
2174 }
2175 else
2176 {
2177 uint8_t *offset = src+filterSize;
d0ce212a 2178 x86_reg counter= -2*dstW;
2da0d70d
DB
2179 //filter-= counter*filterSize/2;
2180 filterPos-= counter/2;
2181 dst-= counter/2;
7ad6469e 2182 __asm__ volatile(
2da0d70d 2183 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2184 ASMALIGN(4)
2185 "1: \n\t"
2186 "mov %2, %%"REG_c" \n\t"
2187 "movzwl (%%"REG_c", %0), %%eax \n\t"
2188 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2189 "mov %5, %%"REG_c" \n\t"
2190 "pxor %%mm4, %%mm4 \n\t"
2191 "pxor %%mm5, %%mm5 \n\t"
2192 "2: \n\t"
2193 "movq (%1), %%mm1 \n\t"
2194 "movq (%1, %6), %%mm3 \n\t"
2195 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2196 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2197 "punpcklbw %%mm7, %%mm0 \n\t"
2198 "punpcklbw %%mm7, %%mm2 \n\t"
2199 "pmaddwd %%mm1, %%mm0 \n\t"
2200 "pmaddwd %%mm2, %%mm3 \n\t"
2201 "paddd %%mm3, %%mm5 \n\t"
2202 "paddd %%mm0, %%mm4 \n\t"
2203 "add $8, %1 \n\t"
2204 "add $4, %%"REG_c" \n\t"
2205 "cmp %4, %%"REG_c" \n\t"
2206 " jb 2b \n\t"
2207 "add %6, %1 \n\t"
ef423a66
MN
2208 "movq %%mm4, %%mm0 \n\t"
2209 "punpckldq %%mm5, %%mm4 \n\t"
2210 "punpckhdq %%mm5, %%mm0 \n\t"
2211 "paddd %%mm0, %%mm4 \n\t"
2212 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2213 "packssdw %%mm4, %%mm4 \n\t"
2214 "mov %3, %%"REG_a" \n\t"
2215 "movd %%mm4, (%%"REG_a", %0) \n\t"
2216 "add $4, %0 \n\t"
2217 " jnc 1b \n\t"
2218
2219 : "+r" (counter), "+r" (filter)
2220 : "m" (filterPos), "m" (dst), "m"(offset),
d0ce212a 2221 "m" (src), "r" ((x86_reg)filterSize*2)
2da0d70d
DB
2222 : "%"REG_a, "%"REG_c, "%"REG_d
2223 );
2224 }
077ea8a7 2225#else
b63f641e 2226#if HAVE_ALTIVEC
2da0d70d 2227 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2228#else
2da0d70d
DB
2229 int i;
2230 for (i=0; i<dstW; i++)
2231 {
2232 int j;
2233 int srcPos= filterPos[i];
2234 int val=0;
2235 //printf("filterPos: %d\n", filterPos[i]);
2236 for (j=0; j<filterSize; j++)
2237 {
2238 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2239 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2240 }
2241 //filter += hFilterSize;
881c4294 2242 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2243 //dst[i] = val>>7;
2244 }
bc279024
DB
2245#endif /* HAVE_ALTIVEC */
2246#endif /* HAVE_MMX */
077ea8a7 2247}
392b6567
RP
2248
2249static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
7ac40327 2250 int dstWidth, const uint8_t *src, int srcW,
392b6567
RP
2251 int xInc)
2252{
2253 int i;
2254 unsigned int xpos=0;
2255 for (i=0;i<dstWidth;i++)
2256 {
2257 register unsigned int xx=xpos>>16;
2258 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2259 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2260 xpos+=xInc;
2261 }
2262}
2263
/*
 * Horizontally scale one luma (or alpha) line into dst.
 *
 * 1. For the packed, paletted and 1 bpp source formats tested below, the
 *    line is first converted to 8-bit samples in formatConvBuffer, and src
 *    is redirected to that buffer. For the four 32-bit RGB formats, isAlpha
 *    selects the alpha byte instead of the computed luma.
 * 2. The line is scaled with hScale, unless SWS_FAST_BILINEAR is set (and,
 *    in MMX builds, c->canMMX2BeUsed is nonzero); in that case the fast
 *    bilinear code is used: run-time generated MMX2 code, plain x86 asm, or
 *    hyscale_fast.
 * 3. If this is not an alpha line, c->srcRange differs from c->dstRange and
 *    the destination is not RGB/BGR, the samples are range-converted.
 */
2ff198c1 2264 // *** horizontal scale Y line to temp buffer
7ac40327
RP
2265static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2266 int flags, const int16_t *hLumFilter,
2267 const int16_t *hLumFilterPos, int hLumFilterSize,
95b5770b
RP
2268 int srcFormat, uint8_t *formatConvBuffer,
2269 uint32_t *pal, int isAlpha)
077ea8a7 2270{
95b5770b
RP
2271 int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
2272 int16_t *mmx2Filter = c->lumMmx2Filter;
2273 int canMMX2BeUsed = c->canMMX2BeUsed;
2274 void *funnyYCode = c->funnyYCode;
2275
// 16-bit gray reuses the packed-YUV extractors to pick one byte of each sample.
2da0d70d 2276 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2277 {
896a22b8 2278 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2279 src= formatConvBuffer;
1e621b18 2280 }
2da0d70d 2281 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2282 {
896a22b8 2283 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2284 src= formatConvBuffer;
7322a67c 2285 }
2da0d70d 2286 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2287 {
6858492e
CS
2288 if (isAlpha)
2289 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2290 else
3164d25e 2291 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2292 src= formatConvBuffer;
1e621b18 2293 }
9990e426
MN
2294 else if (srcFormat==PIX_FMT_RGB32_1)
2295 {
6858492e
CS
2296 if (isAlpha)
2297 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2298 else
3164d25e 2299 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2300 src= formatConvBuffer;
2301 }
2da0d70d 2302 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2303 {
896a22b8 2304 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2305 src= formatConvBuffer;
1e621b18 2306 }
2da0d70d 2307 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2308 {
896a22b8 2309 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2310 src= formatConvBuffer;
6af250ea 2311 }
2da0d70d 2312 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2313 {
896a22b8 2314 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2315 src= formatConvBuffer;
b72034dd 2316 }
2da0d70d 2317 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2318 {
6858492e
CS
2319 if (isAlpha)
2320 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2321 else
3164d25e 2322 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2323 src= formatConvBuffer;
a861d4d7 2324 }
9990e426
MN
2325 else if (srcFormat==PIX_FMT_BGR32_1)
2326 {
6858492e
CS
2327 if (isAlpha)
2328 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2329 else
3164d25e 2330 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2331 src= formatConvBuffer;
2332 }
2da0d70d 2333 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2334 {
896a22b8 2335 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2336 src= formatConvBuffer;
a861d4d7 2337 }
2da0d70d 2338 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2339 {
896a22b8 2340 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2341 src= formatConvBuffer;
a43fb6b3 2342 }
2da0d70d 2343 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2344 {
896a22b8 2345 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2346 src= formatConvBuffer;
a43fb6b3 2347 }
2da0d70d 2348 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2349 {
e48a79c9 2350 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2351 src= formatConvBuffer;
e28630fc 2352 }
3a5ba0c3
LB
2353 else if (srcFormat==PIX_FMT_MONOBLACK)
2354 {
896a22b8 2355 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
3a5ba0c3
LB
2356 src= formatConvBuffer;
2357 }
2358 else if (srcFormat==PIX_FMT_MONOWHITE)
3d05e078 2359 {
896a22b8 2360 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2361 src= formatConvBuffer;
2362 }
1e621b18 2363
b63f641e 2364#if HAVE_MMX
8a322796 2365 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2366 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2367#else
2da0d70d 2368 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2369#endif
077ea8a7 2370 {
2da0d70d 2371 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2372 }
8a322796 2373 else // fast bilinear upscale / crap downscale
077ea8a7 2374 {
57f9a560 2375#if ARCH_X86 && CONFIG_GPL
b63f641e 2376#if HAVE_MMX2
2da0d70d 2377 int i;
83c89c78 2378#if defined(PIC)
2da0d70d 2379 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2380#endif
2da0d70d
DB
2381 if (canMMX2BeUsed)
2382 {
7ad6469e 2383 __asm__ volatile(
83c89c78 2384#if defined(PIC)
2da0d70d
DB
2385 "mov %%"REG_b", %5 \n\t"
2386#endif
2387 "pxor %%mm7, %%mm7 \n\t"
2388 "mov %0, %%"REG_c" \n\t"
2389 "mov %1, %%"REG_D" \n\t"
2390 "mov %2, %%"REG_d" \n\t"
2391 "mov %3, %%"REG_b" \n\t"
2392 "xor %%"REG_a", %%"REG_a" \n\t" // i
2393 PREFETCH" (%%"REG_c") \n\t"
2394 PREFETCH" 32(%%"REG_c") \n\t"
2395 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2396
b63f641e 2397#if ARCH_X86_64
6d606c4f
AJ
2398
2399#define FUNNY_Y_CODE \
2da0d70d
DB
2400 "movl (%%"REG_b"), %%esi \n\t"\
2401 "call *%4 \n\t"\
2402 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2403 "add %%"REG_S", %%"REG_c" \n\t"\
2404 "add %%"REG_a", %%"REG_D" \n\t"\
2405 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2406
2407#else
2408
2ff198c1 2409#define FUNNY_Y_CODE \
2da0d70d
DB
2410 "movl (%%"REG_b"), %%esi \n\t"\
2411 "call *%4 \n\t"\
2412 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2413 "add %%"REG_a", %%"REG_D" \n\t"\
2414 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2415
bc279024 2416#endif /* ARCH_X86_64 */
6d606c4f 2417
// The generated code (operand %4 = funnyYCode) is called 8 times in a row.
2ff198c1
MN
2418FUNNY_Y_CODE
2419FUNNY_Y_CODE
2420FUNNY_Y_CODE
2421FUNNY_Y_CODE
2422FUNNY_Y_CODE
2423FUNNY_Y_CODE
2424FUNNY_Y_CODE
2425FUNNY_Y_CODE
2426
83c89c78 2427#if defined(PIC)
2da0d70d 2428 "mov %5, %%"REG_b" \n\t"
83c89c78 2429#endif
2da0d70d
DB
2430 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2431 "m" (funnyYCode)
83c89c78 2432#if defined(PIC)
2da0d70d 2433 ,"m" (ebxsave)
83c89c78 2434#endif
2da0d70d 2435 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2436#if !defined(PIC)
2da0d70d
DB
2437 ,"%"REG_b
2438#endif
2439 );
// Overwrite the rightmost outputs whose source position reaches the last source sample.
2440 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2441 }
2442 else
2443 {
bc279024 2444#endif /* HAVE_MMX2 */
d0ce212a 2445 x86_reg xInc_shr16 = xInc >> 16;
2da0d70d
DB
2446 uint16_t xInc_mask = xInc & 0xffff;
2447 //NO MMX just normal asm ...
7ad6469e 2448 __asm__ volatile(
2da0d70d
DB
2449 "xor %%"REG_a", %%"REG_a" \n\t" // i
2450 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2451 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2452 ASMALIGN(4)
2453 "1: \n\t"
2454 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2455 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2456 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2457 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2458 "shll $16, %%edi \n\t"
2459 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2460 "mov %1, %%"REG_D" \n\t"
2461 "shrl $9, %%esi \n\t"
2462 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2463 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2464 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2465
2466 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2467 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2468 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2469 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2470 "shll $16, %%edi \n\t"
2471 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2472 "mov %1, %%"REG_D" \n\t"
2473 "shrl $9, %%esi \n\t"
2474 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2475 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2476 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2477
2478
2479 "add $2, %%"REG_a" \n\t"
2480 "cmp %2, %%"REG_a" \n\t"
2481 " jb 1b \n\t"
2482
2483
2484 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2485 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2486 );
b63f641e 2487#if HAVE_MMX2
2da0d70d 2488 } //if MMX2 can't be used
2ff198c1
MN
2489#endif
2490#else
392b6567 2491 RENAME(hyscale_fast)(c, dst, dstWidth, src, srcW, xInc);
b63f641e 2492#endif /* ARCH_X86 && CONFIG_GPL */
077ea8a7 2493 }
6bc0c792 2494
6858492e 2495 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
6bc0c792
MN
2496 int i;
2497 //FIXME all pal and rgb srcFormats could do this conversion as well
2498 //FIXME all scalers more complex than bilinear could do half of this transform
2499 if(c->srcRange){
2500 for (i=0; i<dstWidth; i++)
2501 dst[i]= (dst[i]*14071 + 33561947)>>14;
2502 }else{
2503 for (i=0; i<dstWidth; i++)
aa13b0fc 2504 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14; // input is clamped to 30189 before scaling
6bc0c792
MN
2505 }
2506 }
2ff198c1
MN
2507}
2508
392b6567 2509static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
7ac40327
RP
2510 int dstWidth, const uint8_t *src1,
2511 const uint8_t *src2, int srcW, int xInc)
392b6567
RP
2512{
2513 int i;
2514 unsigned int xpos=0;
2515 for (i=0;i<dstWidth;i++)
2516 {
2517 register unsigned int xx=xpos>>16;
2518 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2519 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2520 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2521 /* slower
2522 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2523 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2524 */
2525 xpos+=xInc;
2526 }
2527}
2528
7ac40327
RP
2529inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2530 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2531 const int16_t *hChrFilterPos, int hChrFilterSize,
95b5770b
RP
2532 int srcFormat, uint8_t *formatConvBuffer,
2533 uint32_t *pal)
2ff198c1 2534{
95b5770b
RP
2535 int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
2536 int16_t *mmx2Filter = c->chrMmx2Filter;
2537 int canMMX2BeUsed = c->canMMX2BeUsed;
2538 void *funnyUVCode = c->funnyUVCode;
2539
2da0d70d 2540 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2541 {
896a22b8 2542 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2543 src1= formatConvBuffer;
8b2fce0d 2544 src2= formatConvBuffer+VOFW;
1e621b18 2545 }
2da0d70d 2546 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2547 {
896a22b8 2548 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2549 src1= formatConvBuffer;
8b2fce0d 2550 src2= formatConvBuffer+VOFW;
7322a67c 2551 }
2da0d70d 2552 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2553 {
2f60f629 2554 if(c->chrSrcHSubSample)
896a22b8 2555 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2556 else
896a22b8 2557 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2558 src1= formatConvBuffer;
8b2fce0d 2559 src2= formatConvBuffer+VOFW;
1e621b18 2560 }
9990e426
MN
2561 else if (srcFormat==PIX_FMT_RGB32_1)
2562 {
2f60f629 2563 if(c->chrSrcHSubSample)
896a22b8 2564 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2565 else
896a22b8 2566 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2567 src1= formatConvBuffer;
2568 src2= formatConvBuffer+VOFW;
2569 }
2da0d70d 2570 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2571 {
2f60f629 2572 if(c->chrSrcHSubSample)
896a22b8 2573 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2574 else
896a22b8 2575 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2576 src1= formatConvBuffer;
8b2fce0d 2577 src2= formatConvBuffer+VOFW;
1e621b18 2578 }
2da0d70d 2579 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2580 {
2f60f629 2581 if(c->chrSrcHSubSample)
896a22b8 2582 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2583 else
896a22b8 2584 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2585 src1= formatConvBuffer;
8b2fce0d 2586 src2= formatConvBuffer+VOFW;
6af250ea 2587 }
2da0d70d 2588 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2589 {
2f60f629 2590 if(c->chrSrcHSubSample)
896a22b8 2591 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2592 else
896a22b8 2593 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2594 src1= formatConvBuffer;
8b2fce0d 2595 src2= formatConvBuffer+VOFW;
b72034dd 2596 }
2da0d70d 2597 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2598 {
2f60f629 2599 if(c->chrSrcHSubSample)
896a22b8 2600 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2601 else
896a22b8 2602 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2603 src1= formatConvBuffer;
8b2fce0d 2604 src2= formatConvBuffer+VOFW;
a861d4d7 2605 }
9990e426
MN
2606 else if (srcFormat==PIX_FMT_BGR32_1)
2607 {
2f60f629 2608 if(c->chrSrcHSubSample)
896a22b8 2609 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2610 else
896a22b8 2611 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2612 src1= formatConvBuffer;
2613 src2= formatConvBuffer+VOFW;
2614 }
2da0d70d 2615 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2616 {
2f60f629 2617 if(c->chrSrcHSubSample)
896a22b8 2618 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2619 else
896a22b8 2620 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2621 src1= formatConvBuffer;
8b2fce0d 2622 src2= formatConvBuffer+VOFW;
a861d4d7 2623 }
2da0d70d 2624 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2625 {
2f60f629 2626 if(c->chrSrcHSubSample)
896a22b8 2627 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2628 else
896a22b8 2629 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2630 src1= formatConvBuffer;
8b2fce0d 2631 src2= formatConvBuffer+VOFW;
a43fb6b3 2632 }
2da0d70d 2633 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2634 {
2f60f629 2635 if(c->chrSrcHSubSample)
896a22b8 2636 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2637 else
896a22b8 2638 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2639 src1= formatConvBuffer;
8b2fce0d 2640 src2= formatConvBuffer+VOFW;
a43fb6b3 2641 }
4bb9adcf 2642 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2643 {
2da0d70d 2644 return;
6ff0ad6b 2645 }
2da0d70d 2646 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2647 {
e48a79c9 2648 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2649 src1= formatConvBuffer;
8b2fce0d 2650 src2= formatConvBuffer+VOFW;
e28630fc 2651 }
1e621b18 2652
b63f641e 2653#if HAVE_MMX
8a322796 2654 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2655 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2656#else
2da0d70d 2657 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2658#endif
077ea8a7 2659 {
2da0d70d 2660 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2661 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2662 }
8a322796 2663 else // fast bilinear upscale / crap downscale
077ea8a7 2664 {
57f9a560 2665#if ARCH_X86 && CONFIG_GPL
b63f641e 2666#if HAVE_MMX2
2da0d70d 2667 int i;
83c89c78 2668#if defined(PIC)
2da0d70d 2669 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2670#endif
2da0d70d
DB
2671 if (canMMX2BeUsed)
2672 {
7ad6469e 2673 __asm__ volatile(
83c89c78 2674#if defined(PIC)
2da0d70d
DB
2675 "mov %%"REG_b", %6 \n\t"
2676#endif
2677 "pxor %%mm7, %%mm7 \n\t"
2678 "mov %0, %%"REG_c" \n\t"
2679 "mov %1, %%"REG_D" \n\t"
2680 "mov %2, %%"REG_d" \n\t"
2681 "mov %3, %%"REG_b" \n\t"
2682 "xor %%"REG_a", %%"REG_a" \n\t" // i
2683 PREFETCH" (%%"REG_c") \n\t"
2684 PREFETCH" 32(%%"REG_c") \n\t"
2685 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2686
b63f641e 2687#if ARCH_X86_64
6d606c4f
AJ
2688
2689#define FUNNY_UV_CODE \
2da0d70d
DB
2690 "movl (%%"REG_b"), %%esi \n\t"\
2691 "call *%4 \n\t"\
2692 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2693 "add %%"REG_S", %%"REG_c" \n\t"\
2694 "add %%"REG_a", %%"REG_D" \n\t"\
2695 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2696
2697#else
2698
b7dc6f66 2699#define FUNNY_UV_CODE \
2da0d70d
DB
2700 "movl (%%"REG_b"), %%esi \n\t"\
2701 "call *%4 \n\t"\
2702 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2703 "add %%"REG_a", %%"REG_D" \n\t"\
2704 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2705
bc279024 2706#endif /* ARCH_X86_64 */
6d606c4f 2707
b7dc6f66
MN
2708FUNNY_UV_CODE
2709FUNNY_UV_CODE
2710FUNNY_UV_CODE
2711FUNNY_UV_CODE
2da0d70d
DB
2712 "xor %%"REG_a", %%"REG_a" \n\t" // i
2713 "mov %5, %%"REG_c" \n\t" // src
2714 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2715 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2716 PREFETCH" (%%"REG_c") \n\t"
2717 PREFETCH" 32(%%"REG_c") \n\t"
2718 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2719
2720FUNNY_UV_CODE
2721FUNNY_UV_CODE
2722FUNNY_UV_CODE
2723FUNNY_UV_CODE
2724
83c89c78 2725#if defined(PIC)
2da0d70d 2726 "mov %6, %%"REG_b" \n\t"
83c89c78 2727#endif
2da0d70d
DB
2728 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2729 "m" (funnyUVCode), "m" (src2)
83c89c78 2730#if defined(PIC)
2da0d70d 2731 ,"m" (ebxsave)
83c89c78 2732#endif
2da0d70d 2733 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2734#if !defined(PIC)
2da0d70d
DB
2735 ,"%"REG_b
2736#endif
2737 );
2738 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2739 {
2740 //printf("%d %d %d\n", dstWidth, i, srcW);
2741 dst[i] = src1[srcW-1]*128;
8b2fce0d 2742 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2743 }
2744 }
2745 else
2746 {
bc279024 2747#endif /* HAVE_MMX2 */
d0ce212a 2748 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2749 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2750 __asm__ volatile(
2da0d70d
DB
2751 "xor %%"REG_a", %%"REG_a" \n\t" // i
2752 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2753 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2754 ASMALIGN(4)
2755 "1: \n\t"
2756 "mov %0, %%"REG_S" \n\t"
2757 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2758 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2759 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2760 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2761 "shll $16, %%edi \n\t"
2762 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2763 "mov %1, %%"REG_D" \n\t"
2764 "shrl $9, %%esi \n\t"
2765 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2766
2767 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2768 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2769 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2770 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2771 "shll $16, %%edi \n\t"
2772 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2773 "mov %1, %%"REG_D" \n\t"
2774 "shrl $9, %%esi \n\t"
8b2fce0d 2775 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2776
2777 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2778 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2779 "add $1, %%"REG_a" \n\t"
2780 "cmp %2, %%"REG_a" \n\t"
2781 " jb 1b \n\t"
2ff198c1 2782
8a322796
DB
2783/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2784 which is needed to support GCC 4.0. */
b63f641e 2785#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
e29c3f93 2786 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2787#else
e29c3f93 2788 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2789#endif
2da0d70d
DB
2790 "r" (src2)
2791 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2792 );
b63f641e 2793#if HAVE_MMX2
2da0d70d 2794 } //if MMX2 can't be used
2ff198c1
MN
2795#endif
2796#else
392b6567 2797 RENAME(hcscale_fast)(c, dst, dstWidth, src1, src2, srcW, xInc);
b63f641e 2798#endif /* ARCH_X86 */
2da0d70d 2799 }
6bc0c792
MN
2800 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2801 int i;
2802 //FIXME all pal and rgb srcFormats could do this convertion as well
2802 //FIXME all pal and rgb srcFormats could do this conversion as well
2804 if(c->srcRange){
2805 for (i=0; i<dstWidth; i++){
2806 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2807 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2808 }
2809 }else{
2810 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2811 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2812 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2813 }
2814 }
2815 }
077ea8a7
MN
2816}
2817
3e499f53 2818static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2819 int srcSliceH, uint8_t* dst[], int dstStride[]){
2820
2821 /* load a few things into local vars to make the code more readable? and faster */
2822 const int srcW= c->srcW;
2823 const int dstW= c->dstW;
2824 const int dstH= c->dstH;
2825 const int chrDstW= c->chrDstW;
2826 const int chrSrcW= c->chrSrcW;
2827 const int lumXInc= c->lumXInc;
2828 const int chrXInc= c->chrXInc;
2829 const int dstFormat= c->dstFormat;
2830 const int srcFormat= c->srcFormat;
2831 const int flags= c->flags;
2da0d70d
DB
2832 int16_t *vLumFilterPos= c->vLumFilterPos;
2833 int16_t *vChrFilterPos= c->vChrFilterPos;
2834 int16_t *hLumFilterPos= c->hLumFilterPos;
2835 int16_t *hChrFilterPos= c->hChrFilterPos;
2836 int16_t *vLumFilter= c->vLumFilter;
2837 int16_t *vChrFilter= c->vChrFilter;
2838 int16_t *hLumFilter= c->hLumFilter;
2839 int16_t *hChrFilter= c->hChrFilter;
2840 int32_t *lumMmxFilter= c->lumMmxFilter;
2841 int32_t *chrMmxFilter= c->chrMmxFilter;
6858492e 2842 int32_t *alpMmxFilter= c->alpMmxFilter;
2da0d70d
DB
2843 const int vLumFilterSize= c->vLumFilterSize;
2844 const int vChrFilterSize= c->vChrFilterSize;
2845 const int hLumFilterSize= c->hLumFilterSize;
2846 const int hChrFilterSize= c->hChrFilterSize;
2847 int16_t **lumPixBuf= c->lumPixBuf;
2848 int16_t **chrPixBuf= c->chrPixBuf;
6858492e 2849 int16_t **alpPixBuf= c->alpPixBuf;
2da0d70d
DB
2850 const int vLumBufSize= c->vLumBufSize;
2851 const int vChrBufSize= c->vChrBufSize;
2da0d70d
DB
2852 uint8_t *formatConvBuffer= c->formatConvBuffer;
2853 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2854 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2855 int lastDstY;
e150ef8d 2856 uint32_t *pal=c->pal_yuv;
2da0d70d 2857
8a322796 2858 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2859 int dstY= c->dstY;
2860 int lumBufIndex= c->lumBufIndex;
2861 int chrBufIndex= c->chrBufIndex;
2862 int lastInLumBuf= c->lastInLumBuf;
2863 int lastInChrBuf= c->lastInChrBuf;
2864
2865 if (isPacked(c->srcFormat)){
2da0d70d
DB
2866 src[0]=
2867 src[1]=
6858492e
CS
2868 src[2]=
2869 src[3]= src[0];
2da0d70d
DB
2870 srcStride[0]=
2871 srcStride[1]=
6858492e
CS
2872 srcStride[2]=
2873 srcStride[3]= srcStride[0];
2da0d70d
DB
2874 }
2875 srcStride[1]<<= c->vChrDrop;
2876 srcStride[2]<<= c->vChrDrop;
2877
2878 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2879 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2880
2881#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2882 {
2883 static volatile int i=0;
2884 i++;
2885 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2886 selfTest(src, srcStride, c->srcW, c->srcH);
2887 i--;
2888 }
c7a810cc 2889#endif
37079906 2890
2da0d70d
DB
2891 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2892 //dstStride[0],dstStride[1],dstStride[2]);
2893
6858492e 2894 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
2da0d70d 2895 {
6683a37f
DP
2896 static int warnedAlready=0; //FIXME move this into the context perhaps
2897 if (flags & SWS_PRINT_INFO && !warnedAlready)
2da0d70d 2898 {
4b0c30b7 2899 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2900 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2901 warnedAlready=1;
2da0d70d
DB
2902 }
2903 }
2904
8a322796
DB
2905 /* Note the user might start scaling the picture in the middle so this
2906 will not get executed. This is not really intended but works
2907 currently, so people might do it. */
2da0d70d
DB
2908 if (srcSliceY ==0){
2909 lumBufIndex=0;
2910 chrBufIndex=0;
2911 dstY=0;
2912 lastInLumBuf= -1;
2913 lastInChrBuf= -1;
2914 }
2915
2916 lastDstY= dstY;
2917
2918 for (;dstY < dstH; dstY++){
2919 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2920 const int chrDstY= dstY>>c->chrDstVSubSample;
2921 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2922 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
6858492e 2923 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2da0d70d
DB
2924
2925 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2926 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2927 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2928 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2929
2930 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2931 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2932 //handle holes (FAST_BILINEAR & weird filters)
2933 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2934 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2935 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2936 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2937 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2938
2939 // Do we have enough lines in this slice to output the dstY line
2940 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2941 {
2942 //Do horizontal scaling
2943 while(lastInLumBuf < lastLumSrcY)
2944 {
6858492e
CS
2945 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2946 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d
DB
2947 lumBufIndex++;
2948 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
2949 assert(lumBufIndex < 2*vLumBufSize);
2950 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2951 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 2952 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6858492e 2953 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
95b5770b
RP
2954 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2955 c->srcFormat, formatConvBuffer,
2956 pal, 0);
6858492e
CS
2957 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2958 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
95b5770b
RP
2959 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2960 c->srcFormat, formatConvBuffer,
2961 pal, 1);
2da0d70d
DB
2962 lastInLumBuf++;
2963 }
2964 while(lastInChrBuf < lastChrSrcY)
2965 {
2966 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2967 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2968 chrBufIndex++;
fcc402b1
LB
2969 assert(chrBufIndex < 2*vChrBufSize);
2970 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2971 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2972 //FIXME replace parameters through context struct (some at least)
2973
2974 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2975 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
95b5770b
RP
2976 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2977 c->srcFormat, formatConvBuffer,
2978 pal);
2da0d70d
DB
2979 lastInChrBuf++;
2980 }
2981 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2982 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2983 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2984 }
2985 else // not enough lines left in this slice -> load the rest in the buffer
2986 {
2987 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2988 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2989 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2990 vChrBufSize, vLumBufSize);*/
2991
2992 //Do horizontal scaling
2993 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2994 {
6858492e
CS
2995 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2996 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d 2997 lumBufIndex++;
fcc402b1
LB
2998 assert(lumBufIndex < 2*vLumBufSize);
2999 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3000 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6858492e 3001 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
95b5770b
RP
3002 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
3003 c->srcFormat, formatConvBuffer,
3004 pal, 0);
6858492e
CS
3005 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
3006 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
95b5770b
RP
3007 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
3008 c->srcFormat, formatConvBuffer,
3009 pal, 1);
2da0d70d
DB
3010 lastInLumBuf++;
3011 }
3012 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3013 {
3014 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3015 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3016 chrBufIndex++;
fcc402b1
LB
3017 assert(chrBufIndex < 2*vChrBufSize);
3018 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3019 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3020
3021 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3022 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
95b5770b
RP
3023 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
3024 c->srcFormat, formatConvBuffer,
3025 pal);
2da0d70d
DB
3026 lastInChrBuf++;
3027 }
3028 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3029 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3030 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3031 break; //we can't output a dstY line so let's try with the next slice
3032 }
d3f41512 3033
b63f641e 3034#if HAVE_MMX
88e2a9ae 3035 c->blueDither= ff_dither8[dstY&1];
92c7b471 3036 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 3037 c->greenDither= ff_dither8[dstY&1];
92c7b471 3038 else
88e2a9ae
CEH
3039 c->greenDither= ff_dither4[dstY&1];