eac3dec: get rid of unnecessary left shifts in 16-bit * 24-bit
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
8a322796
DB
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
d026b45e 22 */
783e9cc9 23
/*
 * Instruction-selection macros for the inline assembly below.
 * Each name is #undef'd first and then redefined from the HAVE_* feature
 * flags (presumably because this template is included more than once with
 * different flags -- confirm against the including file).
 *
 *   PREFETCH / PREFETCHW  prefetch mnemonics; an assembler comment (" # nop")
 *                         when neither 3DNow! nor MMX2 is available.
 *   PAVGB(a,b)            byte-average instruction; NOTE: left undefined when
 *                         neither HAVE_MMX2 nor HAVE_AMD3DNOW is set.
 *   MOVNTQ(a,b)           8-byte store; "movntq" with MMX2, otherwise a plain
 *                         "movq". The REAL_ indirection lets macro arguments
 *                         be expanded before they are stringified.
 */
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec
MN
27#undef PREFETCH
28#undef PREFETCHW
48a05cec 29
f4406ec1 30#if HAVE_AMD3DNOW
48a05cec
MN
31#define PREFETCH "prefetch"
32#define PREFETCHW "prefetchw"
b63f641e 33#elif HAVE_MMX2
48a05cec
MN
34#define PREFETCH "prefetchnta"
35#define PREFETCHW "prefetcht0"
36#else
d904b5fc
NP
37#define PREFETCH " # nop"
38#define PREFETCHW " # nop"
48a05cec
MN
39#endif
40
b63f641e 41#if HAVE_MMX2
d604bab9 42#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
f4406ec1 43#elif HAVE_AMD3DNOW
d604bab9
MN
44#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45#endif
d3f41512 46
b63f641e 47#if HAVE_MMX2
6e1c66bc 48#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 49#else
6e1c66bc 50#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 51#endif
6e1c66bc 52#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 53
b63f641e 54#if HAVE_ALTIVEC
009d2d74 55#include "ppc/swscale_altivec_template.c"
a2faa401
RD
56#endif
57
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 * Expands to a complete asm statement that filters several source lines
 * into one line of 8-bit output, 8 output bytes per outer iteration.
 *
 *   x       string: byte displacement added to every source-line address
 *   offset  string: displacement from %0 to the filter list
 *   dest    output pointer (operand %1)
 *   width   loop bound (operand %2); the loop runs while REG_a < width
 *           (unsigned compare, REG_a advances by 8)
 * Operand %0 is &c->redDither, so 'c' must be in scope at the expansion site.
 *
 * The filter list is walked in 16-byte entries: a source pointer at +0 and
 * the coefficient words at +8; a null source pointer ends the list.
 * Both accumulators (mm3, mm4) start from the value at VROUNDER_OFFSET(%0),
 * sum the pmulhw products, are shifted right by 3 and packed to unsigned
 * bytes. The accumulator/list setup is repeated after the "cmp" because
 * label 1 (the inner-loop head) is also the outer-loop branch target; the
 * intervening movq/lea/mov do not modify the flags tested by "jb".
 * Clobbers REG_a, REG_d, REG_S (declared) and mm0, mm2-mm5 (not declared).
 */
bca11e75 58#define YSCALEYUV2YV12X(x, offset, dest, width) \
7ad6469e 59 __asm__ volatile(\
2da0d70d
DB
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
66 "1: \n\t"\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
8b2fce0d
MN
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
2da0d70d
DB
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 " jnz 1b \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
88 "jb 1b \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
92 );
bca11e75
MN
93
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Same interface and operands as YSCALEYUV2YV12X, but the products are
 * accumulated at 32-bit precision.
 *
 * Each inner iteration consumes one packed filter entry holding two taps:
 * source pointer 1 at +0, source pointer 2 at +APCK_PTR2, the interleaved
 * coefficient words at +APCK_COEF; the next entry starts APCK_SIZE bytes
 * later and a null first pointer ends the list. Samples from the two lines
 * are interleaved (punpck[lh]wd) so that pmaddwd yields
 * src1*coef1 + src2*coef2 per dword, summed in mm4..mm7.
 * After the list: >>16, pack to words with signed saturation, add the
 * rounder from VROUNDER_OFFSET(%0), >>3, pack to unsigned bytes and store
 * 8 bytes at (%1, REG_a). The outer loop runs while REG_a < %2 (unsigned).
 * Clobbers REG_a, REG_d, REG_S (declared) and mm0-mm7 (not declared).
 */
94#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
7ad6469e 95 __asm__ volatile(\
2da0d70d
DB
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 ASMALIGN(4) \
104 "1: \n\t"\
8b2fce0d
MN
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
1625216e 107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
8b2fce0d 108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
2da0d70d
DB
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
1625216e
MN
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
128 " jnz 1b \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
150 "jb 1b \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
154 );
c1b0bfb4
MN
155
/*
 * YSCALEYUV2YV121
 * Asm fragment (instruction strings only; the caller supplies the
 * __asm__ statement and operands) that converts 16-bit samples to bytes
 * without filtering: load 8 words from (%0, REG_a, 2), arithmetic >>7,
 * pack with unsigned saturation, store 8 bytes at (%1, REG_a).
 * REG_a starts at operand %2 and advances by 8 until the add produces a
 * carry ("jnc") -- presumably %2 is a negative count and %0/%1 point at
 * the end of their buffers; confirm against the caller.
 * Uses mm0, mm1 and REG_a.
 */
156#define YSCALEYUV2YV121 \
2da0d70d
DB
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
159 "1: \n\t"\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
167 "jnc 1b \n\t"
c1b0bfb4 168
bf2bdde6
MN
/*
 * YSCALEYUV2YV121_ACCURATE
 * Rounding variant of YSCALEYUV2YV121 with the same operands and loop.
 * mm7 is built as 64 in every word (all-ones >>15 gives 1, then <<6) and
 * added with signed saturation before the >>7, i.e. it adds half of the
 * divisor so the shift rounds to nearest instead of truncating.
 * Uses mm0, mm1, mm7 and REG_a.
 */
169#define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
175 "1: \n\t"\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
33a67bd6
MN
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
bf2bdde6
MN
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
185 "jnc 1b \n\t"
186
c1b0bfb4 187/*
2da0d70d
DB
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 193*/
/*
 * YSCALEYUV2PACKEDX_UV
 * Opens an "__asm__ volatile(" statement and emits the chroma half of the
 * vertical filter for packed output. The statement is deliberately left
 * open: further instruction strings follow and YSCALEYUV2PACKEDX_END
 * supplies the operands and the closing ");".
 *
 * Label 1 is the start of the per-pixel-group loop (the branch back to it
 * is emitted by the output-writing macros). Label 2 walks the filter list
 * at CHR_MMX_FILTER_OFFSET(%0): 16-byte entries with a source pointer at
 * +0 and coefficient words at +8, ended by a null pointer.
 * U samples are read at (ptr, REG_a) and V samples at VOF(ptr, REG_a).
 * Result: mm3 = U sum, mm4 = V sum, both seeded from VROUNDER_OFFSET(%0).
 * Uses REG_a, REG_d, REG_S, mm0, mm2-mm5.
 */
df57ab14 194#define YSCALEYUV2PACKEDX_UV \
7ad6469e 195 __asm__ volatile(\
2da0d70d
DB
196 "xor %%"REG_a", %%"REG_a" \n\t"\
197 ASMALIGN(4)\
198 "nop \n\t"\
199 "1: \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
204 ASMALIGN(4)\
205 "2: \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
8b2fce0d 208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
2da0d70d
DB
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
216 " jnz 2b \n\t"\
df57ab14 217
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 * Asm fragment: vertical filter for one 16-bit plane (luma or alpha, per
 * the name) with caller-chosen MMX registers.
 *
 *   offset      string: displacement from %0 to the filter list
 *   coeff       register receiving the coefficient words of each entry
 *   src1, src2  scratch registers for samples at (ptr, REG_a, 2) and
 *               8(ptr, REG_a, 2)
 *   dst1, dst2  accumulators; seeded from VROUNDER_OFFSET(%0) and left
 *               holding the two sums on exit
 * The list layout matches the chroma list: 16-byte entries, pointer at +0,
 * coefficients at +8, null pointer terminates. Reuses local label 2 and
 * overwrites REG_d and REG_S; REG_a is read only.
 */
fe91924d 218#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
df57ab14 219 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d 220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
2da0d70d
DB
223 ASMALIGN(4)\
224 "2: \n\t"\
fe91924d
CS
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
2da0d70d
DB
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
2da0d70d
DB
234 "test %%"REG_S", %%"REG_S" \n\t"\
235 " jnz 2b \n\t"\
236
df57ab14
CS
/*
 * YSCALEYUV2PACKEDX
 * Chroma filter followed by the luma filter (list at
 * LUM_MMX_FILTER_OFFSET). On exit mm3/mm4 hold the U/V sums and mm1/mm7
 * the two luma sums; mm0, mm2 and mm5 are scratch. The asm statement
 * opened by YSCALEYUV2PACKEDX_UV is still open afterwards.
 */
237#define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
fe91924d 239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
df57ab14 240
2da0d70d
DB
/*
 * YSCALEYUV2PACKEDX_END
 * Operand/clobber lists and closing ");" for the asm statement opened by
 * the YSCALEYUV2PACKEDX* macros. Inputs: %0 = &c->redDither,
 * %1..%3 = 'dummy' (memory operands that only keep the numbering of the
 * later operands), %4 = dest, %5 = dstW (memory). Requires 'c', 'dummy',
 * 'dest' and 'dstW' to be in scope at the expansion site.
 * Only REG_a, REG_d and REG_S are declared as clobbered; the MMX registers
 * written by the preceding fragments are not listed.
 */
241#define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
246 );
8422aa88 247
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX_UV. Opens the
 * "__asm__ volatile(" statement and leaves it open.
 *
 * The chroma list at CHR_MMX_FILTER_OFFSET(%0) is read as packed two-tap
 * entries (second pointer at +APCK_PTR2, interleaved coefficients at
 * +APCK_COEF, entry size APCK_SIZE, null first pointer terminates).
 * U products are summed in mm4/mm5 and V products in mm6/mm7 via pmaddwd.
 * After the list the sums are shifted >>16, packed to words with signed
 * saturation and the rounder from VROUNDER_OFFSET(%0) is added.
 * Because the following luma pass needs all eight MMX registers, the
 * results are spilled to U_TEMP(%0) and V_TEMP(%0) in the context.
 * Label 1 is the per-pixel-group loop head; label 2 is the tap loop.
 */
df57ab14 248#define YSCALEYUV2PACKEDX_ACCURATE_UV \
7ad6469e 249 __asm__ volatile(\
2da0d70d
DB
250 "xor %%"REG_a", %%"REG_a" \n\t"\
251 ASMALIGN(4)\
252 "nop \n\t"\
253 "1: \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
260 ASMALIGN(4)\
261 "2: \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
8b2fce0d 263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
1625216e 264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
1625216e
MN
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
285 " jnz 2b \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
df57ab14
CS
297
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 * 32-bit-accumulating vertical filter for a 16-bit plane, meant to follow
 * YSCALEYUV2PACKEDX_ACCURATE_UV inside the same asm statement.
 *
 *   offset  string: displacement from %0 to the packed two-tap filter list
 *           (same APCK_PTR2 / APCK_COEF / APCK_SIZE layout as the chroma
 *           pass)
 * The first four samples are summed in mm1/mm5 and the next four in
 * mm7/mm6; after >>16, signed pack and adding the rounder the results are
 * in mm1 and mm7. Finally the chroma values spilled by the UV pass are
 * reloaded: mm3 = U_TEMP(%0), mm4 = V_TEMP(%0).
 * Overwrites REG_d, REG_S and mm0-mm7; reuses local label 2.
 */
298#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d
DB
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
305 ASMALIGN(4)\
306 "2: \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
2da0d70d
DB
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
1625216e
MN
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
330 " jnz 2b \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 342
df57ab14
CS
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 * Accurate chroma pass followed by the accurate luma pass (list at
 * LUM_MMX_FILTER_OFFSET). Leaves the same register layout as
 * YSCALEYUV2PACKEDX: mm1/mm7 = luma, mm3 = U, mm4 = V; the asm statement
 * remains open.
 */
343#define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346
/*
 * YSCALEYUV2RGBX
 * Asm fragment converting filtered YUV words to packed RGB bytes.
 * Input:  mm1, mm7 = luma (two groups of 4 words), mm3 = U, mm4 = V.
 * All offsets and coefficients are read from the context at %0
 * (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR_COEFF, Y_COEFF).
 * The per-chroma contributions are duplicated (punpck[lh]wd of a register
 * with itself) so each chroma sample is applied to two adjacent luma
 * samples, then added to luma and packed with unsigned saturation.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * Also overwrites mm0, mm1, mm3, mm6, mm7.
 */
8422aa88 347#define YSCALEYUV2RGBX \
2da0d70d
DB
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
d604bab9 382
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 * Asm fragment: blend two source lines (bilinear vertical scaling) and
 * leave 7-bit-shifted results for a packed-YUV writer.
 *
 *   index  register used as the loop counter (zeroed here)
 *   c      register holding the context base address
 * Operands, per the inline comments: %0 = buf0, %1 = buf1 (luma lines),
 * %2 = uvbuf0, %3 = uvbuf1 (chroma lines, V located VOF bytes after U).
 *
 * NOTE: the prologue loads the coefficient words at
 * CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8, shifts them right
 * by 3 and stores them back, i.e. it modifies the context in place on
 * every expansion; the caller must have written fresh values first.
 *
 * Each result is (line0 - line1) * alpha >> 16 (pmulhw) plus line1 >> 7.
 * Output: mm1, mm7 = luma, mm3 = U, mm4 = V. Label 1 is the loop head; the
 * branch back is emitted by the writer macro that follows.
 * The YSCALEYUV2PACKED wrapper only forces expansion of its arguments
 * before they are stringified.
 */
6e1c66bc 383#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
391 ASMALIGN(4)\
392 "1: \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7 (mm3 still holds the uvbuf1 U samples)*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+VOF] >>7 (mm4 still holds the uvbuf1 V samples)*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7 (mm1 still holds the buf1 samples)*/\
415 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7, second group of four samples*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 418
6e1c66bc 419#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 420
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 * Asm fragment: chroma half of the two-line blend for RGB output.
 *
 *   index  register used as the loop counter (zeroed here; label 1 is the
 *          loop head)
 *   c      register holding the context base address
 * Reads U from (%2)/(%3) and V from VOF(%2)/VOF(%3) (uvbuf0/uvbuf1 per the
 * inline comments), blends them as (line0 - line1) * coeff >> 16 plus
 * line1 >> 4, subtracts U_OFFSET/V_OFFSET and starts the colour matrix.
 * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = U green term,
 * mm4 = V green term. mm0 is scratch.
 */
df57ab14 421#define REAL_YSCALEYUV2RGB_UV(index, c) \
2da0d70d
DB
422 "xor "#index", "#index" \n\t"\
423 ASMALIGN(4)\
424 "1: \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4 (mm3 still holds the uvbuf1 U samples)*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+VOF] >>4 (mm4 still holds the uvbuf1 V samples)*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
df57ab14 445
786dcfef
CS
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 * Asm fragment: two-line blend for a 16-bit plane (luma or alpha, per the
 * name).
 *
 *   index   loop-counter register (read only here)
 *   c       register holding the context base address
 *   b1, b2  operands/registers holding the two source line pointers
 * Computes (b1 - b2) * coeff >> 16 + (b2 >> 4) using the coefficient at
 * LUM_MMX_FILTER_OFFSET+8 of the context.
 * Output: mm1 = first four samples, mm7 = next four. mm0, mm6 are scratch.
 */
446#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
2da0d70d
DB
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4 (mm1 still holds the b2 samples)*/\
456 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4, second group of four samples*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
df57ab14
CS
459
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 * Asm fragment: second half of the YUV->RGB matrix, with the context base
 * address in register 'c'.
 * Input:  mm1, mm7 = luma; mm2 = U - offset; mm5 = V - offset;
 *         mm3, mm4 = green terms from U and V.
 * Applies UB_COEFF/VR_COEFF, subtracts Y_OFFSET and scales luma by
 * Y_COEFF, duplicates each chroma term for two adjacent pixels, adds luma
 * and packs with unsigned saturation.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * Also overwrites mm0, mm1, mm3, mm6, mm7.
 */
460#define REAL_YSCALEYUV2RGB_COEFF(c) \
2da0d70d
DB
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
40494418 488
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
 * are macro-expanded before being stringified.
 * YSCALEYUV2RGB(index, c) chains the three stages of the two-line
 * YUV->RGB conversion: chroma blend, luma blend from operands %0/%1, then
 * the colour matrix. Result: mm2 = B, mm4 = G, mm5 = R bytes.
 */
786dcfef 489#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
df57ab14
CS
490
491#define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
786dcfef 493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
df57ab14 494 REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab 495
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 * Asm fragment: single-source-line variant for packed-YUV output (no
 * blending). Zeroes 'index', opens loop label 1, and loads
 *   mm3 = U from (%2, index), mm4 = V from VOF(%2, index),
 *   mm1 / mm7 = two groups of four luma words from (%0, index, 2),
 * each shifted right arithmetically by 7. The 'c' argument is unused in
 * the expansion. The wrapper only forces argument expansion.
 */
6e1c66bc 496#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
497 "xor "#index", "#index" \n\t"\
498 ASMALIGN(4)\
499 "1: \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
6a4970ab 508
6e1c66bc 509#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 510
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 * Asm fragment: YUV->RGB conversion from a single source line (no
 * vertical blending).
 *
 *   index  loop-counter register (zeroed here; label 1 is the loop head)
 *   c      register holding the context base address
 * Chroma comes from (%2) and VOF(%2), luma from (%0); every sample is
 * shifted right by 4 before the offsets/coefficients from the context are
 * applied.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * Also overwrites mm0, mm1, mm3, mm6, mm7.
 */
6e1c66bc 511#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+VOF] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4, second group of four samples*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
40494418 557
6e1c66bc 558#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 559
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 * Asm fragment: like YSCALEYUV2PACKED1, but chroma is the average of two
 * lines. U/V from (%2) and (%3) are added and the sum is shifted right
 * logically by 8, which equals the two-line average shifted by 7.
 * Luma is taken from (%0) only and shifted right arithmetically by 7.
 * Output: mm1, mm7 = luma, mm3 = U, mm4 = V; mm2, mm5 are scratch.
 * The 'c' argument is unused in the expansion. Zeroes 'index' and opens
 * loop label 1. The wrapper only forces argument expansion.
 */
6e1c66bc 560#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
561 "xor "#index", "#index" \n\t"\
562 ASMALIGN(4)\
563 "1: \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
6e1c66bc 576#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 577
497d4f99 578// do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 * Asm fragment: YUV->RGB conversion using one luma line (%0) and the
 * average of two chroma lines (%2 and %3).
 *
 *   index  loop-counter register (zeroed here; label 1 is the loop head)
 *   c      register holding the context base address
 * The chroma sum is shifted right logically by 5 (average >> 4); the
 * existing FIXME notes that the 16-bit sum may overflow. Luma is shifted
 * right by 4. The colour matrix that follows is the same sequence used by
 * YSCALEYUV2RGB1.
 * Output: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes.
 * Also overwrites mm0, mm1, mm3, mm6, mm7.
 */
6e1c66bc 579#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
580 "xor "#index", "#index" \n\t"\
581 ASMALIGN(4)\
582 "1: \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4, second group of four samples*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
40494418 629
6e1c66bc 630#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 631
6858492e
CS
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index) / YSCALEYUV2RGB1_ALPHA(index)
 * Asm fragment: load eight 16-bit alpha samples from (%1, index, 2),
 * shift each right arithmetically by 7 and pack them with unsigned
 * saturation into the eight bytes of mm7. mm1 is overwritten as scratch.
 * Operand %1 is the alpha line here (abuf0 per the inline comments).
 * The wrapper only forces argument expansion.
 */
632#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639
9c77b26b
CS
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /
 * WRITEBGR32(...)
 * Asm fragment: interleave four byte planes into eight 32-bit pixels and
 * store them, then close the pixel loop.
 *
 *   dst         destination base register
 *   dstw        operand compared against index to end the loop
 *   index       pixel counter register (advanced by 8)
 *   b, g, r, a  MMX registers holding 8 bytes each; b and r are overwritten
 *   q0, q2, q3, t  MMX scratch registers
 * Each pixel is written with b in the lowest-addressed byte, followed by
 * g, r and a. 32 bytes are stored at (dst, index, 4) via MOVNTQ; the loop
 * branches back to label 1 while index < dstw (unsigned compare).
 * The wrapper only forces argument expansion.
 */
640#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 653\
9c77b26b
CS
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 658\
2da0d70d
DB
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
661 " jb 1b \n\t"
9c77b26b 662#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 663
/*
 * REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 * Asm fragment: pack eight pixels into 16-bit words and store them, then
 * close the pixel loop.
 * Input: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes; mm7 is interleaved
 * as the high byte of the widened G words, so it is expected to be zero
 * on entry (not set here -- confirm at the call site).
 * B and R keep their top 5 bits (mask bF8), G keeps its top 6 bits (mask
 * bFC). Each output word is (R & 0xF8) << 8 | (G & 0xFC) << 3 | B >> 3.
 * 16 bytes are stored at (dst, index, 2); index advances by 8 and the loop
 * branches back to label 1 while index < dstw (unsigned compare).
 * Overwrites mm1-mm5. The wrapper only forces argument expansion.
 */
27a90b04 664#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
d604bab9 669\
2da0d70d
DB
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
d604bab9 672\
2da0d70d
DB
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 677\
2da0d70d
DB
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
d604bab9 680\
2da0d70d
DB
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
d604bab9 683\
2da0d70d
DB
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 686\
2da0d70d
DB
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
689 " jb 1b \n\t"
27a90b04 690#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 691
/*
 * REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 * Asm fragment: 15-bit counterpart of WRITERGB16.
 * Input: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes; mm7 is interleaved
 * as the high byte of the widened G words, so it is expected to be zero
 * on entry (not set here -- confirm at the call site).
 * All three channels keep their top 5 bits (mask bF8). R is additionally
 * shifted right by one so that each output word is
 * (R & 0xF8) << 7 | (G & 0xF8) << 2 | B >> 3, leaving bit 15 clear.
 * 16 bytes are stored at (dst, index, 2); index advances by 8 and the loop
 * branches back to label 1 while index < dstw (unsigned compare).
 * Overwrites mm1-mm5. The wrapper only forces argument expansion.
 */
27a90b04 692#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
d604bab9 698\
2da0d70d
DB
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
d604bab9 701\
2da0d70d
DB
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 706\
2da0d70d
DB
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
d604bab9 709\
2da0d70d
DB
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
d604bab9 712\
2da0d70d
DB
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 715\
2da0d70d
DB
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
718 " jb 1b \n\t"
27a90b04 719#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 720
/*
 * WRITEBGR24OLD(dst, dstw, index)
 * Asm fragment: pack eight pixels into 24 bytes (3 bytes per pixel) using
 * plain MMX shifts and masks, store them, then close the pixel loop.
 * Input: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0 (as stated
 * in the first inline comment; mm7 is not set here).
 * The planes are first interleaved into four registers of two 0RGB dwords
 * each; the padding byte of every pixel is then squeezed out with the
 * bm00000111 / bm11111000 / bm00001111 byte masks and the pieces are
 * merged into three quadwords.
 * Unlike the 16/32-bit writers, the stores go to (dst), 8(dst), 16(dst)
 * and 'dst' itself is advanced by 24, so the dst register is modified.
 * index advances by 8; the loop branches back to label 1 while
 * index < dstw (unsigned compare). Overwrites mm0-mm6.
 */
6542b44e 721#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 735\
2da0d70d
DB
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 744\
2da0d70d
DB
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 758\
2da0d70d
DB
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 767\
2da0d70d
DB
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
d604bab9 772\
2da0d70d
DB
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
775 " jb 1b \n\t"
d604bab9 776
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX tail of a conversion loop that
 * stores 8 pixels as 24 packed bytes.
 *
 * On entry mm2, mm4 and mm5 hold 8 bytes each of B, G and R and mm7 is zero
 * (it is used to zero-extend R while interleaving). The bytes are first
 * interleaved into four "0RGB0RGB" quadwords (mm0..mm3), then the padding byte
 * of each pixel is squeezed out with shifts and punpckhdq, and the three
 * resulting quadwords are stored with MOVNTQ at dst, dst+8 and dst+16.
 * Finally dst is advanced by 24 and index by 8, and control jumps back to the
 * enclosing local label 1 while index is (unsigned) below dstw.
 *
 * All of mm0-mm7 are overwritten. The register diagrams in the per-line
 * comments list the most significant byte first.
 */
6542b44e 777#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 791\
2da0d70d
DB
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 796\
2da0d70d
DB
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 801\
2da0d70d
DB
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 806\
2da0d70d
DB
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
99d2cb72 812\
2da0d70d
DB
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 818\
2da0d70d
DB
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 823\
2da0d70d 824 "add $24, "#dst" \n\t"\
99d2cb72 825\
2da0d70d
DB
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
828 " jb 1b \n\t"
99d2cb72 829
/*
 * WRITEBGR24MMX2(dst, dstw, index): pshufw-based variant of the 24-bit store
 * loop tail (selected below when HAVE_MMX2 is set).
 *
 * On entry mm2, mm4 and mm5 hold 8 bytes each of B, G and R. The byte masks
 * ff_M24A and ff_M24C are loaded into mm0 and mm7 (so any previous mm7 value
 * is discarded); ff_M24B is used directly from memory. For each of the three
 * output quadwords the relevant words of B, G and R are replicated with
 * pshufw, masked so that every component keeps only the byte positions it
 * owns, and OR-ed together, then stored with MOVNTQ at dst, dst+8 and dst+16.
 * G is shifted by one byte first: a copy is shifted left for the first
 * quadword, and mm4 itself is shifted right in place before the second and
 * third.
 *
 * mm2 and mm5 are only read; mm0, mm1, mm3, mm4, mm6 and mm7 are overwritten.
 * Afterwards dst is advanced by 24 and index by 8, and control jumps back to
 * the enclosing local label 1 while index is (unsigned) below dstw.
 */
6542b44e 830#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 837\
2da0d70d
DB
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 841\
2da0d70d
DB
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
99d2cb72 846\
2da0d70d
DB
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 851\
5802683a 852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 855\
2da0d70d
DB
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 859\
2da0d70d
DB
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 863\
2da0d70d
DB
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 867\
2da0d70d
DB
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 871\
2da0d70d 872 "add $24, "#dst" \n\t"\
99d2cb72 873\
2da0d70d
DB
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
876 " jb 1b \n\t"
99d2cb72 877
/*
 * WRITEBGR24 is the 24-bit writer used by the conversion loops of this
 * template instance: the pshufw-based version when MMX2 is available, the
 * plain MMX version otherwise. Any definition left over from a previous
 * instantiation of the template is dropped first.
 */
#undef WRITEBGR24
#if HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
885
/*
 * REAL_WRITEYUY2(dst, dstw, index): loop tail that packs 8 pixels into 16
 * bytes of interleaved 4:2:2 output.
 *
 * The 16-bit values in mm3 and mm4 are packed to bytes (packuswb, i.e. with
 * unsigned saturation), as are mm1 (low half) and mm7 (high half), which end
 * up together in mm1. The bytes of mm3 and mm4 are interleaved, and the result
 * is interleaved with the bytes of mm1, giving the byte order
 * mm1[0] mm3[0] mm1[1] mm4[0] ... -- i.e. Y U Y V, presumably with
 * mm1/mm7 = luma, mm3 = U and mm4 = V as set up by the caller.
 * The two quadwords are stored with MOVNTQ at dst + 2*index and
 * dst + 2*index + 8; index is then advanced by 8 and control jumps back to
 * the enclosing local label 1 while index is (unsigned) below dstw.
 */
6e1c66bc 886#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
887 "packuswb %%mm3, %%mm3 \n\t"\
888 "packuswb %%mm4, %%mm4 \n\t"\
889 "packuswb %%mm7, %%mm1 \n\t"\
890 "punpcklbw %%mm4, %%mm3 \n\t"\
891 "movq %%mm1, %%mm7 \n\t"\
892 "punpcklbw %%mm3, %%mm1 \n\t"\
893 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 894\
2da0d70d
DB
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 897\
2da0d70d
DB
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
900 " jb 1b \n\t"
/* Extra macro level: the arguments are macro-expanded here before
 * REAL_WRITEYUY2 stringifies them with #. */
6e1c66bc 901#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
902
903
7ac40327
RP
/**
 * Vertical filter pass producing 8-bit planar output (luma, optional chroma,
 * optional alpha).
 *
 * With MMX and without SWS_BITEXACT each plane is produced by one expansion of
 * YSCALEYUV2YV12X (or its _ACCURATE variant when SWS_ACCURATE_RND is set) and
 * the function returns. Otherwise the work is forwarded to
 * yuv2yuvX_altivec_real() (AltiVec builds; note that it is not given the alpha
 * arguments) or to yuv2yuvXinC().
 *
 * NOTE(review): the MMX macros are defined outside this part of the file; they
 * receive a filter offset constant rather than lumFilter/chrFilter, so they
 * presumably read pre-packed coefficients from the context -- confirm there.
 *
 * @param uDest  U plane; when NULL the MMX path skips both chroma planes
 * @param vDest  V plane
 * @param aDest  alpha plane, may be NULL
 */
904static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
6858492e 906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
38858470 907{
b63f641e 908#if HAVE_MMX
f433c8ab 909 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
910 if (c->flags & SWS_ACCURATE_RND){
911 if (uDest){
/* U and V use the same filter offset; they differ only in the first argument ("0" vs. VOF) */
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
914 }
6858492e
CS
915 if (CONFIG_SWSCALE_ALPHA && aDest){
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
917 }
bca11e75 918
14014d47
MN
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920 }else{
921 if (uDest){
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
924 }
6858492e
CS
925 if (CONFIG_SWSCALE_ALPHA && aDest){
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
927 }
2da0d70d 928
14014d47
MN
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930 }
f433c8ab
MN
931 return;
932 }
933#endif
/* Reached when MMX is not compiled in or SWS_BITEXACT was requested */
b63f641e 934#if HAVE_ALTIVEC
a2faa401 935yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
a2faa401 938#else //HAVE_ALTIVEC
5859233b 939yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d 940 chrFilter, chrSrc, chrFilterSize,
6858492e 941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
a2faa401 942#endif //!HAVE_ALTIVEC
c1b0bfb4 943}
2add307d 944
7ac40327
RP
945static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
2da0d70d 947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
948{
949yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
952}
953
7ac40327 954static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
6858492e 955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
c1b0bfb4 956{
f433c8ab 957 int i;
b63f641e 958#if HAVE_MMX
f433c8ab 959 if(!(c->flags & SWS_BITEXACT)){
6858492e
CS
960 long p= 4;
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
2da0d70d 964
14014d47
MN
965 if (c->flags & SWS_ACCURATE_RND){
966 while(p--){
3164d25e
CS
967 if (dst[p]){
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
6858492e 975 }
14014d47
MN
976 }else{
977 while(p--){
3164d25e
CS
978 if (dst[p]){
979 __asm__ volatile(
980 YSCALEYUV2YV121
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
982 "g" (-counter[p])
983 : "%"REG_a
984 );
985 }
6858492e 986 }
d78c1ea1 987 }
f433c8ab
MN
988 return;
989 }
990#endif
2da0d70d
DB
991 for (i=0; i<dstW; i++)
992 {
a1f3ffa3 993 int val= (lumSrc[i]+64)>>7;
2da0d70d
DB
994
995 if (val&256){
996 if (val<0) val=0;
997 else val=255;
998 }
999
1000 dest[i]= val;
1001 }
1002
1b0a4572 1003 if (uDest)
2da0d70d
DB
1004 for (i=0; i<chrDstW; i++)
1005 {
a1f3ffa3
MN
1006 int u=(chrSrc[i ]+64)>>7;
1007 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d
DB
1008
1009 if ((u|v)&256){
1010 if (u<0) u=0;
1011 else if (u>255) u=255;
1012 if (v<0) v=0;
1013 else if (v>255) v=255;
1014 }
1015
1016 uDest[i]= u;
1017 vDest[i]= v;
1018 }
6858492e
CS
1019
1020 if (CONFIG_SWSCALE_ALPHA && aDest)
1021 for (i=0; i<dstW; i++){
1022 int val= (alpSrc[i]+64)>>7;
1023 aDest[i]= av_clip_uint8(val);
1024 }
38858470
MN
1025}
1026
c1b0bfb4 1027
d604bab9
MN
1028/**
1029 * vertical scale YV12 to RGB
1030 */
7ac40327
RP
/*
 * Vertical filter pass with packed output (RGB32, BGR24, RGB555, RGB565 or
 * YUYV422 in the MMX code below).
 *
 * With MMX and without SWS_BITEXACT, one of two parallel switch statements is
 * used, built either from the YSCALEYUV2PACKEDX_ACCURATE macros (when
 * SWS_ACCURATE_RND is set) or from the YSCALEYUV2PACKEDX macros. Each handled
 * format returns from inside its case; any other format leaves the switch and
 * reaches the generic code at the end of the function, which uses
 * ff_yuv2packedX_altivec() where applicable and yuv2packedXinC() otherwise.
 *
 * In the asm, operand %0 is &c->redDither (the dither tables and temporaries
 * are addressed relative to it), %4 is dest and %5 is dstW; this operand list
 * is visible in the BGR24 cases and presumably the same inside
 * YSCALEYUV2PACKEDX_END.
 */
1031static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1034{
b63f641e 1035#if HAVE_MMX
d0ce212a 1036 x86_reg dummy=0;
/* dummy only fills the unused "m" operands %1..%3 of the asm statements */
f433c8ab 1037 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1038 if (c->flags & SWS_ACCURATE_RND){
1039 switch(c->dstFormat){
1040 case PIX_FMT_RGB32:
6858492e
CS
1041 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
/* With alpha: the colour registers are parked in the U/V/Y_TEMP slots while the alpha line is filtered */
1042 YSCALEYUV2PACKEDX_ACCURATE
1043 YSCALEYUV2RGBX
1044 "movq %%mm2, "U_TEMP"(%0) \n\t"
1045 "movq %%mm4, "V_TEMP"(%0) \n\t"
1046 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1047 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1049 "psraw $3, %%mm1 \n\t"
1050 "psraw $3, %%mm7 \n\t"
1051 "packuswb %%mm7, %%mm1 \n\t"
1052 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053
1054 YSCALEYUV2PACKEDX_END
1055 }else{
3164d25e
CS
1056 YSCALEYUV2PACKEDX_ACCURATE
1057 YSCALEYUV2RGBX
/* No alpha plane: mm7 is set to all ones and passed to WRITEBGR32 in the alpha position */
1058 "pcmpeqd %%mm7, %%mm7 \n\t"
1059 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d 1060
3164d25e 1061 YSCALEYUV2PACKEDX_END
6858492e 1062 }
14014d47
MN
1063 return;
/* 3 bytes per pixel: the write pointer REG_c = dest + 3*REG_a is computed by hand. The operand
 * list is written out here instead of using YSCALEYUV2PACKEDX_END, presumably so that REG_c can
 * be listed as clobbered. */
1064 case PIX_FMT_BGR24:
1065 YSCALEYUV2PACKEDX_ACCURATE
1066 YSCALEYUV2RGBX
40494418 1067 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1068 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069 "add %4, %%"REG_c" \n\t"
1070 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1071
1072
14014d47
MN
1073 :: "r" (&c->redDither),
1074 "m" (dummy), "m" (dummy), "m" (dummy),
1075 "r" (dest), "m" (dstW)
1076 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 );
1078 return;
1079 case PIX_FMT_RGB555:
1080 YSCALEYUV2PACKEDX_ACCURATE
1081 YSCALEYUV2RGBX
40494418 1082 "pxor %%mm7, %%mm7 \n\t"
14014d47 1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1084#ifdef DITHER1XBPP
88e2a9ae
CEH
1085 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1088#endif
1089
14014d47
MN
1090 WRITERGB15(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 case PIX_FMT_RGB565:
1094 YSCALEYUV2PACKEDX_ACCURATE
1095 YSCALEYUV2RGBX
40494418 1096 "pxor %%mm7, %%mm7 \n\t"
14014d47 1097 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1098#ifdef DITHER1XBPP
88e2a9ae
CEH
1099 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1102#endif
1103
14014d47
MN
1104 WRITERGB16(%4, %5, %%REGa)
1105 YSCALEYUV2PACKEDX_END
1106 return;
1107 case PIX_FMT_YUYV422:
1108 YSCALEYUV2PACKEDX_ACCURATE
1109 /* no RGB conversion here: mm1/mm7 and mm3/mm4 are shifted down by 3 and then packed by WRITEYUY2 */
1110
1111 "psraw $3, %%mm3 \n\t"
1112 "psraw $3, %%mm4 \n\t"
1113 "psraw $3, %%mm1 \n\t"
1114 "psraw $3, %%mm7 \n\t"
1115 WRITEYUY2(%4, %5, %%REGa)
1116 YSCALEYUV2PACKEDX_END
1117 return;
1118 }
1119 }else{
/* Same dispatch as above, built on the non-ACCURATE filter macros */
1120 switch(c->dstFormat)
1121 {
1122 case PIX_FMT_RGB32:
6858492e
CS
1123 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124 YSCALEYUV2PACKEDX
1125 YSCALEYUV2RGBX
1126 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 "packuswb %%mm7, %%mm1 \n\t"
1130 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131 YSCALEYUV2PACKEDX_END
1132 }else{
3164d25e
CS
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "pcmpeqd %%mm7, %%mm7 \n\t"
1136 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137 YSCALEYUV2PACKEDX_END
6858492e 1138 }
14014d47
MN
1139 return;
1140 case PIX_FMT_BGR24:
1141 YSCALEYUV2PACKEDX
1142 YSCALEYUV2RGBX
40494418 1143 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1144 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1145 "add %4, %%"REG_c" \n\t"
1146 WRITEBGR24(%%REGc, %5, %%REGa)
1147
1148 :: "r" (&c->redDither),
1149 "m" (dummy), "m" (dummy), "m" (dummy),
1150 "r" (dest), "m" (dstW)
1151 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152 );
1153 return;
1154 case PIX_FMT_RGB555:
1155 YSCALEYUV2PACKEDX
1156 YSCALEYUV2RGBX
40494418 1157 "pxor %%mm7, %%mm7 \n\t"
14014d47 1158 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1159#ifdef DITHER1XBPP
88e2a9ae
CEH
1160 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1161 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1162 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1163#endif
1164
14014d47
MN
1165 WRITERGB15(%4, %5, %%REGa)
1166 YSCALEYUV2PACKEDX_END
1167 return;
1168 case PIX_FMT_RGB565:
1169 YSCALEYUV2PACKEDX
1170 YSCALEYUV2RGBX
40494418 1171 "pxor %%mm7, %%mm7 \n\t"
14014d47 1172 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1173#ifdef DITHER1XBPP
88e2a9ae
CEH
1174 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1175 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1176 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1177#endif
1178
14014d47
MN
1179 WRITERGB16(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1181 return;
1182 case PIX_FMT_YUYV422:
1183 YSCALEYUV2PACKEDX
1184 /* no RGB conversion here: mm1/mm7 and mm3/mm4 are shifted down by 3 and then packed by WRITEYUY2 */
1185
1186 "psraw $3, %%mm3 \n\t"
1187 "psraw $3, %%mm4 \n\t"
1188 "psraw $3, %%mm1 \n\t"
1189 "psraw $3, %%mm7 \n\t"
1190 WRITEYUY2(%4, %5, %%REGa)
1191 YSCALEYUV2PACKEDX_END
1192 return;
1193 }
bca11e75
MN
1194 }
1195 }
bc279024 1196#endif /* HAVE_MMX */
/* Generic path: formats not handled above, SWS_BITEXACT, or builds without MMX */
b63f641e 1197#if HAVE_ALTIVEC
2da0d70d 1198 /* The following list of supported dstFormat values should
780daf2b 1199 match what's found in the body of ff_yuv2packedX_altivec() */
d55ef636 1200 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
12794f73 1201 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
2da0d70d 1202 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
12794f73 1203 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
780daf2b
DB
1204 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205 chrFilter, chrSrc, chrFilterSize,
1206 dest, dstW, dstY);
2da0d70d
DB
1207 else
1208#endif
1209 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210 chrFilter, chrSrc, chrFilterSize,
6858492e 1211 alpSrc, dest, dstW, dstY);
c1b0bfb4
MN
1212}
1213
c1b0bfb4
MN
1214/**
1215 * vertical bilinear scale YV12 to RGB
1216 */
7ac40327
RP
/*
 * Blend two source lines (buf0/buf1 for luma, uvbuf0/uvbuf1 for chroma and,
 * where used, abuf0/abuf1 for alpha) into one line of packed output.
 *
 * With MMX and without SWS_BITEXACT the formats RGB32, BGR24, RGB555, RGB565
 * and YUYV422 are written by inline asm; everything else falls through to the
 * C macro YSCALE_YUV_2_ANYRGB_C at the end.
 *
 * Register conventions shared by the asm statements (except the x86-64 alpha
 * variant): operands %0..%3 are buf0, buf1, uvbuf0, uvbuf1 pinned to the
 * c/d/S/D registers, %4 is dest (in memory) and %5 is &c->redDither in REG_a.
 * REG_b is saved to ESP_OFFSET(%5), loaded with dest and restored at the end;
 * REG_BP is pushed, passed to the WRITE* macros as the loop index and popped
 * again. Neither register is listed as clobbered, which is why they are saved
 * by hand.
 *
 * yalpha1/uvalpha1 and i are not referenced by the asm; they are presumably
 * used by the C macros at the end of the function.
 */
1217static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1219{
ac0ad729
MN
1220 int yalpha1=4095- yalpha;
1221 int uvalpha1=4095-uvalpha;
2da0d70d 1222 int i;
d604bab9 1223
b63f641e 1224#if HAVE_MMX
f433c8ab 1225 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1226 switch(c->dstFormat)
1227 {
1228 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229 case PIX_FMT_RGB32:
6858492e
CS
1230 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231#if ARCH_X86_64
/* x86-64: abuf0/abuf1 get registers of their own (%6/%7), dest is a register operand and REG_BP is declared as clobbered */
1232 __asm__ volatile(
6858492e
CS
1233 YSCALEYUV2RGB(%%REGBP, %5)
1234 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237 "packuswb %%mm7, %%mm1 \n\t"
04ef1d3f 1238 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
6858492e 1239
04ef1d3f 1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
6858492e
CS
1241 "a" (&c->redDither)
1242 ,"r" (abuf0), "r" (abuf1)
04ef1d3f 1243 : "%"REG_BP
6858492e
CS
1244 );
1245#else
/* Otherwise the alpha pointers are stored in c->u_temp / c->v_temp and temporarily loaded into %0/%1, whose values are pushed and popped around the alpha step */
1246 *(uint16_t **)(&c->u_temp)=abuf0;
1247 *(uint16_t **)(&c->v_temp)=abuf1;
1248 __asm__ volatile(
1249 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1250 "mov %4, %%"REG_b" \n\t"
1251 "push %%"REG_BP" \n\t"
1252 YSCALEYUV2RGB(%%REGBP, %5)
1253 "push %0 \n\t"
1254 "push %1 \n\t"
1255 "mov "U_TEMP"(%5), %0 \n\t"
1256 "mov "V_TEMP"(%5), %1 \n\t"
1257 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260 "packuswb %%mm7, %%mm1 \n\t"
1261 "pop %1 \n\t"
1262 "pop %0 \n\t"
1263 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264 "pop %%"REG_BP" \n\t"
1265 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1266
1267 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268 "a" (&c->redDither)
1269 );
1270#endif
1271 }else{
3164d25e
CS
1272 __asm__ volatile(
1273 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1274 "mov %4, %%"REG_b" \n\t"
1275 "push %%"REG_BP" \n\t"
1276 YSCALEYUV2RGB(%%REGBP, %5)
/* no alpha plane: mm7 is set to all ones and passed to WRITEBGR32 in the alpha position */
1277 "pcmpeqd %%mm7, %%mm7 \n\t"
1278 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279 "pop %%"REG_BP" \n\t"
1280 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1281
3164d25e
CS
1282 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283 "a" (&c->redDither)
1284 );
6858492e 1285 }
2da0d70d
DB
1286 return;
1287 case PIX_FMT_BGR24:
7ad6469e 1288 __asm__ volatile(
2da0d70d
DB
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1293 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1294 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298 "a" (&c->redDither)
1299 );
1300 return;
27a90b04 1301 case PIX_FMT_RGB555:
7ad6469e 1302 __asm__ volatile(
2da0d70d
DB
1303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304 "mov %4, %%"REG_b" \n\t"
1305 "push %%"REG_BP" \n\t"
1306 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1307 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1309#ifdef DITHER1XBPP
88e2a9ae
CEH
1310 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1311 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1312 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1313#endif
1314
27a90b04 1315 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1316 "pop %%"REG_BP" \n\t"
1317 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1318
1319 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320 "a" (&c->redDither)
1321 );
1322 return;
27a90b04 1323 case PIX_FMT_RGB565:
7ad6469e 1324 __asm__ volatile(
2da0d70d
DB
1325 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1326 "mov %4, %%"REG_b" \n\t"
1327 "push %%"REG_BP" \n\t"
1328 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1329 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1330 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1331#ifdef DITHER1XBPP
88e2a9ae
CEH
1332 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1333 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1334 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1335#endif
1336
27a90b04 1337 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1338 "pop %%"REG_BP" \n\t"
1339 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1340 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341 "a" (&c->redDither)
1342 );
1343 return;
1344 case PIX_FMT_YUYV422:
7ad6469e 1345 __asm__ volatile(
2da0d70d
DB
1346 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1347 "mov %4, %%"REG_b" \n\t"
1348 "push %%"REG_BP" \n\t"
1349 YSCALEYUV2PACKED(%%REGBP, %5)
1350 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351 "pop %%"REG_BP" \n\t"
1352 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354 "a" (&c->redDither)
1355 );
1356 return;
1357 default: break;
1358 }
f433c8ab 1359 }
cf7d1c1a 1360#endif //HAVE_MMX
/* C implementation for every case not handled (and returned from) above */
6858492e 1361YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1362}
1363
1364/**
1365 * YV12 to RGB without scaling or interpolating
1366 */
7ac40327
RP
/*
 * Packed output from a single luma line (buf0) and the chroma lines
 * uvbuf0/uvbuf1, i.e. without vertical interpolation of luma.
 *
 * - With SWS_FULL_CHR_H_INT the call is delegated to c->yuv2packed2(), passing
 *   buf0 (and abuf0) for both lines and a luma weight of 0.
 * - With MMX and without SWS_BITEXACT, RGB32, BGR24, RGB555, RGB565 and
 *   YUYV422 are written by inline asm. uvalpha < 2048 selects the
 *   YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros, otherwise the "1b" variants
 *   are used.
 * - Everything else is handled by the C macros at the end.
 *
 * The asm statements use the same conventions as in yuv2packed2: %0..%3 are
 * pinned to c/d/S/D, %4 is dest, %5 is &c->redDither in REG_a, REG_b is saved
 * in ESP_OFFSET(%5) and used for dest, and REG_BP is pushed/popped and used as
 * the loop index. In the RGB32-with-alpha cases operand %1 carries abuf0
 * instead of buf1.
 *
 * Unlike its siblings this function receives dstFormat and flags as
 * parameters instead of reading them from the context.
 */
1367static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1369{
2da0d70d
DB
1370 const int yalpha1=0;
1371 int i;
6a4970ab 1372
7ac40327 1373 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1374 const int yalpha= 4096; //FIXME ...
96034638 1375
2da0d70d
DB
1376 if (flags&SWS_FULL_CHR_H_INT)
1377 {
40fa5140 1378 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
2da0d70d
DB
1379 return;
1380 }
397c035e 1381
b63f641e 1382#if HAVE_MMX
f433c8ab 1383 if(!(flags & SWS_BITEXACT)){
14014d47 1384 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1385 {
14014d47
MN
1386 switch(dstFormat)
1387 {
1388 case PIX_FMT_RGB32:
6858492e
CS
1389 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
/* alpha variant: "d" (%1) carries abuf0 here, not buf1 */
1390 __asm__ volatile(
1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_b" \n\t"
1393 "push %%"REG_BP" \n\t"
1394 YSCALEYUV2RGB1(%%REGBP, %5)
1395 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397 "pop %%"REG_BP" \n\t"
1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1399
1400 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401 "a" (&c->redDither)
1402 );
1403 }else{
3164d25e
CS
1404 __asm__ volatile(
1405 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1406 "mov %4, %%"REG_b" \n\t"
1407 "push %%"REG_BP" \n\t"
1408 YSCALEYUV2RGB1(%%REGBP, %5)
1409 "pcmpeqd %%mm7, %%mm7 \n\t"
1410 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411 "pop %%"REG_BP" \n\t"
1412 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1413
3164d25e
CS
1414 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415 "a" (&c->redDither)
1416 );
6858492e 1417 }
14014d47
MN
1418 return;
1419 case PIX_FMT_BGR24:
7ad6469e 1420 __asm__ volatile(
14014d47
MN
1421 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_b" \n\t"
1423 "push %%"REG_BP" \n\t"
1424 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1425 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1426 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429
1430 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431 "a" (&c->redDither)
1432 );
1433 return;
1434 case PIX_FMT_RGB555:
7ad6469e 1435 __asm__ volatile(
14014d47
MN
1436 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_b" \n\t"
1438 "push %%"REG_BP" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1440 "pxor %%mm7, %%mm7 \n\t"
14014d47 1441 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1442#ifdef DITHER1XBPP
88e2a9ae
CEH
1443 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1444 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1445 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1446#endif
14014d47
MN
1447 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448 "pop %%"REG_BP" \n\t"
1449 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1450
14014d47
MN
1451 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452 "a" (&c->redDither)
1453 );
1454 return;
1455 case PIX_FMT_RGB565:
7ad6469e 1456 __asm__ volatile(
14014d47
MN
1457 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1458 "mov %4, %%"REG_b" \n\t"
1459 "push %%"REG_BP" \n\t"
1460 YSCALEYUV2RGB1(%%REGBP, %5)
40494418 1461 "pxor %%mm7, %%mm7 \n\t"
14014d47 1462 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1463#ifdef DITHER1XBPP
88e2a9ae
CEH
1464 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1465 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1466 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1467#endif
1468
14014d47
MN
1469 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470 "pop %%"REG_BP" \n\t"
1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1472
14014d47
MN
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474 "a" (&c->redDither)
1475 );
1476 return;
1477 case PIX_FMT_YUYV422:
7ad6469e 1478 __asm__ volatile(
14014d47
MN
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1480 "mov %4, %%"REG_b" \n\t"
1481 "push %%"REG_BP" \n\t"
1482 YSCALEYUV2PACKED1(%%REGBP, %5)
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484 "pop %%"REG_BP" \n\t"
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1486
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 "a" (&c->redDither)
1489 );
1490 return;
1491 }
2da0d70d 1492 }
14014d47 1493 else
2da0d70d 1494 {
/* uvalpha >= 2048: same dispatch using the "1b" macro variants */
14014d47
MN
1495 switch(dstFormat)
1496 {
1497 case PIX_FMT_RGB32:
6858492e
CS
1498 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1499 __asm__ volatile(
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1508
1509 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510 "a" (&c->redDither)
1511 );
1512 }else{
3164d25e
CS
1513 __asm__ volatile(
1514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1515 "mov %4, %%"REG_b" \n\t"
1516 "push %%"REG_BP" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP, %5)
1518 "pcmpeqd %%mm7, %%mm7 \n\t"
1519 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520 "pop %%"REG_BP" \n\t"
1521 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1522
3164d25e
CS
1523 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524 "a" (&c->redDither)
1525 );
6858492e 1526 }
14014d47
MN
1527 return;
1528 case PIX_FMT_BGR24:
7ad6469e 1529 __asm__ volatile(
14014d47
MN
1530 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1531 "mov %4, %%"REG_b" \n\t"
1532 "push %%"REG_BP" \n\t"
1533 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1534 "pxor %%mm7, %%mm7 \n\t"
14014d47
MN
1535 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536 "pop %%"REG_BP" \n\t"
1537 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1538
1539 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540 "a" (&c->redDither)
1541 );
1542 return;
1543 case PIX_FMT_RGB555:
7ad6469e 1544 __asm__ volatile(
14014d47
MN
1545 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1546 "mov %4, %%"REG_b" \n\t"
1547 "push %%"REG_BP" \n\t"
1548 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1549 "pxor %%mm7, %%mm7 \n\t"
14014d47 1550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1551#ifdef DITHER1XBPP
88e2a9ae
CEH
1552 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1553 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1554 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1555#endif
14014d47
MN
1556 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557 "pop %%"REG_BP" \n\t"
1558 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1559
14014d47
MN
1560 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561 "a" (&c->redDither)
1562 );
1563 return;
1564 case PIX_FMT_RGB565:
7ad6469e 1565 __asm__ volatile(
14014d47
MN
1566 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1567 "mov %4, %%"REG_b" \n\t"
1568 "push %%"REG_BP" \n\t"
1569 YSCALEYUV2RGB1b(%%REGBP, %5)
40494418 1570 "pxor %%mm7, %%mm7 \n\t"
14014d47 1571 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1572#ifdef DITHER1XBPP
88e2a9ae
CEH
1573 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1574 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1575 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1576#endif
1577
14014d47
MN
1578 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1581
14014d47
MN
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583 "a" (&c->redDither)
1584 );
1585 return;
1586 case PIX_FMT_YUYV422:
7ad6469e 1587 __asm__ volatile(
14014d47
MN
1588 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1589 "mov %4, %%"REG_b" \n\t"
1590 "push %%"REG_BP" \n\t"
1591 YSCALEYUV2PACKED1b(%%REGBP, %5)
1592 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593 "pop %%"REG_BP" \n\t"
1594 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1595
1596 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597 "a" (&c->redDither)
1598 );
1599 return;
1600 }
2da0d70d
DB
1601 }
1602 }
bc279024 1603#endif /* HAVE_MMX */
/* C implementation for every case not handled (and returned from) above; same uvalpha split as the asm */
e5091488 1604 if (uvalpha < 2048)
2da0d70d 1605 {
6858492e 1606 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1607 }else{
6858492e 1608 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1609 }
d604bab9
MN
1610}
1611
8a322796 1612//FIXME yuy2* can read up to 7 samples too much
6ff0ad6b 1613
7ac40327 1614static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1e621b18 1615{
b63f641e 1616#if HAVE_MMX
7ad6469e 1617 __asm__ volatile(
2da0d70d
DB
1618 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1619 "mov %0, %%"REG_a" \n\t"
1620 "1: \n\t"
1621 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1622 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1623 "pand %%mm2, %%mm0 \n\t"
1624 "pand %%mm2, %%mm1 \n\t"
1625 "packuswb %%mm1, %%mm0 \n\t"
1626 "movq %%mm0, (%2, %%"REG_a") \n\t"
1627 "add $8, %%"REG_a" \n\t"
1628 " js 1b \n\t"
d0ce212a 1629 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1630 : "%"REG_a
1631 );
1e621b18 1632#else
2da0d70d
DB
1633 int i;
1634 for (i=0; i<width; i++)
1635 dst[i]= src[2*i];
1e621b18
MN
1636#endif
1637}
1638
7ac40327 1639static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1640{
b63f641e 1641#if HAVE_MMX
7ad6469e 1642 __asm__ volatile(
2da0d70d
DB
1643 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1644 "mov %0, %%"REG_a" \n\t"
1645 "1: \n\t"
1646 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1647 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1648 "psrlw $8, %%mm0 \n\t"
1649 "psrlw $8, %%mm1 \n\t"
1650 "packuswb %%mm1, %%mm0 \n\t"
1651 "movq %%mm0, %%mm1 \n\t"
1652 "psrlw $8, %%mm0 \n\t"
1653 "pand %%mm4, %%mm1 \n\t"
1654 "packuswb %%mm0, %%mm0 \n\t"
1655 "packuswb %%mm1, %%mm1 \n\t"
1656 "movd %%mm0, (%3, %%"REG_a") \n\t"
1657 "movd %%mm1, (%2, %%"REG_a") \n\t"
1658 "add $4, %%"REG_a" \n\t"
1659 " js 1b \n\t"
d0ce212a 1660 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1661 : "%"REG_a
1662 );
1e621b18 1663#else
2da0d70d
DB
1664 int i;
1665 for (i=0; i<width; i++)
1666 {
1667 dstU[i]= src1[4*i + 1];
1668 dstV[i]= src1[4*i + 3];
1669 }
1670#endif
1671 assert(src1 == src2);
1e621b18
MN
1672}
1673
de1275d5
MN
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the second
 * (odd-addressed) byte of every sample:
 * dstU[i] = src1[2*i + 1], dstV[i] = src2[2*i + 1] for i in [0, width).
 * The last argument is not used.
 */
1674static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1675{
1676#if HAVE_MMX
/* REG_a runs from -width up to 0 in steps of 8; all four pointers are
 * pre-advanced by width. psrlw $8 moves the high byte of each 16-bit
 * word into the low byte before packing. */
1677 __asm__ volatile(
1678 "mov %0, %%"REG_a" \n\t"
1679 "1: \n\t"
1680 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1681 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1682 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1683 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1684 "psrlw $8, %%mm0 \n\t"
1685 "psrlw $8, %%mm1 \n\t"
1686 "psrlw $8, %%mm2 \n\t"
1687 "psrlw $8, %%mm3 \n\t"
1688 "packuswb %%mm1, %%mm0 \n\t"
1689 "packuswb %%mm3, %%mm2 \n\t"
1690 "movq %%mm0, (%3, %%"REG_a") \n\t"
1691 "movq %%mm2, (%4, %%"REG_a") \n\t"
1692 "add $8, %%"REG_a" \n\t"
1693 " js 1b \n\t"
1694 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1695 : "%"REG_a
1696 );
1697#else
1698 int i;
1699 for (i=0; i<width; i++)
1700 {
1701 dstU[i]= src1[2*i + 1];
1702 dstV[i]= src2[2*i + 1];
1703 }
1704#endif
1705}
1706
4cf16bbe
DB
1707/* This is almost identical to yuy2ToY/UV above, and exists only because
1708 * calling yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma plane from packed input that stores luma in the odd
 * bytes: dst[i] = src[2*i+1] for i in [0, width).
 * The last argument is not used.
 */
7ac40327 1709static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
7322a67c 1710{
b63f641e 1711#if HAVE_MMX
/* REG_a runs from -width up to 0 in steps of 8; src/dst are pre-advanced
 * by width. psrlw $8 selects the high byte of each 16-bit word. */
7ad6469e 1712 __asm__ volatile(
2da0d70d
DB
1713 "mov %0, %%"REG_a" \n\t"
1714 "1: \n\t"
1715 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1716 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1717 "psrlw $8, %%mm0 \n\t"
1718 "psrlw $8, %%mm1 \n\t"
1719 "packuswb %%mm1, %%mm0 \n\t"
1720 "movq %%mm0, (%2, %%"REG_a") \n\t"
1721 "add $8, %%"REG_a" \n\t"
1722 " js 1b \n\t"
d0ce212a 1723 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
2da0d70d
DB
1724 : "%"REG_a
1725 );
7322a67c 1726#else
2da0d70d
DB
1727 int i;
1728 for (i=0; i<width; i++)
1729 dst[i]= src[2*i+1];
7322a67c
MN
1730#endif
1731}
1732
/**
 * Extract both chroma planes from packed 4-bytes-per-chroma-pair input:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2] for i in [0, width).
 * Only src1 is read; src2 is merely asserted to be equal to it.
 * The last argument is not used.
 */
7ac40327 1733static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
7322a67c 1734{
b63f641e 1735#if HAVE_MMX
/* REG_a runs from -width up to 0 in steps of 4; pointers are pre-advanced
 * by width. Each iteration reads 16 source bytes and stores 4 bytes to
 * each of dstU (%2) and dstV (%3). */
7ad6469e 1736 __asm__ volatile(
2da0d70d
DB
1737 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1738 "mov %0, %%"REG_a" \n\t"
1739 "1: \n\t"
1740 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1741 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1742 "pand %%mm4, %%mm0 \n\t"
1743 "pand %%mm4, %%mm1 \n\t"
1744 "packuswb %%mm1, %%mm0 \n\t"
1745 "movq %%mm0, %%mm1 \n\t"
1746 "psrlw $8, %%mm0 \n\t"
1747 "pand %%mm4, %%mm1 \n\t"
1748 "packuswb %%mm0, %%mm0 \n\t"
1749 "packuswb %%mm1, %%mm1 \n\t"
1750 "movd %%mm0, (%3, %%"REG_a") \n\t"
1751 "movd %%mm1, (%2, %%"REG_a") \n\t"
1752 "add $4, %%"REG_a" \n\t"
1753 " js 1b \n\t"
d0ce212a 1754 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
2da0d70d
DB
1755 : "%"REG_a
1756 );
7322a67c 1757#else
2da0d70d
DB
1758 int i;
1759 for (i=0; i<width; i++)
1760 {
1761 dstU[i]= src1[4*i + 0];
1762 dstV[i]= src1[4*i + 2];
1763 }
1764#endif
/* checked after the conversion has already run */
1765 assert(src1 == src2);
7322a67c
MN
1766}
1767
de1275d5
MN
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the first
 * (even-addressed) byte of every sample:
 * dstU[i] = src1[2*i], dstV[i] = src2[2*i] for i in [0, width).
 * The last argument is not used.
 */
1768static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1769{
1770#if HAVE_MMX
/* REG_a runs from -width up to 0 in steps of 8; all four pointers are
 * pre-advanced by width.
 * NOTE(review): bm01010101 is presumably a mask keeping the low byte of
 * every 16-bit word - confirm against its definition. */
1771 __asm__ volatile(
1772 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1773 "mov %0, %%"REG_a" \n\t"
1774 "1: \n\t"
1775 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1776 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1777 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1778 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1779 "pand %%mm4, %%mm0 \n\t"
1780 "pand %%mm4, %%mm1 \n\t"
1781 "pand %%mm4, %%mm2 \n\t"
1782 "pand %%mm4, %%mm3 \n\t"
1783 "packuswb %%mm1, %%mm0 \n\t"
1784 "packuswb %%mm3, %%mm2 \n\t"
1785 "movq %%mm0, (%3, %%"REG_a") \n\t"
1786 "movq %%mm2, (%4, %%"REG_a") \n\t"
1787 "add $8, %%"REG_a" \n\t"
1788 " js 1b \n\t"
1789 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1790 : "%"REG_a
1791 );
1792#else
1793 int i;
1794 for (i=0; i<width; i++)
1795 {
1796 dstU[i]= src1[2*i];
1797 dstV[i]= src2[2*i];
1798 }
1799#endif
1800}
1801
214892ee 1802#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
7ac40327 1803static inline void RENAME(name)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1804{\
1805 int i;\
1806 for (i=0; i<width; i++)\
1807 {\
7ac40327
RP
1808 int b= (((const type*)src)[i]>>shb)&maskb;\
1809 int g= (((const type*)src)[i]>>shg)&maskg;\
1810 int r= (((const type*)src)[i]>>shr)&maskr;\
214892ee
MN
1811\
1812 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1813 }\
1e621b18
MN
1814}
1815
214892ee
MN
1816BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1817BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1818BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1819BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1820BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1821BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1822
7ac40327 1823static inline void RENAME(abgrToA)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused){
6858492e
CS
1824 int i;
1825 for (i=0; i<width; i++){
1826 dst[i]= src[4*i];
1827 }
1828}
1829
f8a138be 1830#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
7ac40327 1831static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1832{\
1833 int i;\
1834 for (i=0; i<width; i++)\
1835 {\
7ac40327
RP
1836 int b= (((const type*)src)[i]&maskb)>>shb;\
1837 int g= (((const type*)src)[i]&maskg)>>shg;\
1838 int r= (((const type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1839\
1840 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1841 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1842 }\
ba83d862 1843}\
7ac40327 1844static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1845{\
1846 int i;\
1847 for (i=0; i<width; i++)\
1848 {\
7ac40327
RP
1849 int pix0= ((const type*)src)[2*i+0];\
1850 int pix1= ((const type*)src)[2*i+1];\
bcff32d1 1851 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
ba83d862
MN
1852 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1853 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
f8a138be 1854 g&= maskg|(2*maskg);\
ba83d862
MN
1855\
1856 g>>=shg;\
1857\
6b79dbce
MN
1858 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1859 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1860 }\
2f60f629
MN
1861}
1862
f8a138be
CS
1863BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1864BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1865BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1866BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1867BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1868BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
a0baa07a 1869
b63f641e 1870#if HAVE_MMX
/**
 * MMX conversion of packed 24-bit RGB/BGR to 8-bit luma.
 * srcFormat selects the coefficient tables: the ff_bgr24toY* pair when it
 * equals PIX_FMT_BGR24, the ff_rgb24toY* pair otherwise.
 * Each loop iteration consumes 12 source bytes and stores 4 output bytes.
 */
7ac40327 1871static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1872{
1873
/* NOTE(review): mm5/mm6 are loaded in their own asm statements and read
 * by the main loop below; this relies on nothing touching the MMX
 * registers between the statements. */
1874 if(srcFormat == PIX_FMT_BGR24){
7ad6469e 1875 __asm__ volatile(
ff9a056d
MN
1876 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1877 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1878 :
dfb09bd1
MN
1879 );
1880 }else{
7ad6469e 1881 __asm__ volatile(
ff9a056d
MN
1882 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1883 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1884 :
dfb09bd1
MN
1885 );
1886 }
1887
/* REG_a runs from -width up to 0 in steps of 4 and indexes dst+width;
 * src (%0) is advanced directly by 12 bytes per iteration. mm7 is kept
 * zero for byte-to-word unpacking, mm4 holds the additive offset. */
7ad6469e 1888 __asm__ volatile(
dfb09bd1
MN
1889 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1890 "mov %2, %%"REG_a" \n\t"
1891 "pxor %%mm7, %%mm7 \n\t"
1892 "1: \n\t"
1893 PREFETCH" 64(%0) \n\t"
1894 "movd (%0), %%mm0 \n\t"
1895 "movd 2(%0), %%mm1 \n\t"
1896 "movd 6(%0), %%mm2 \n\t"
1897 "movd 8(%0), %%mm3 \n\t"
1898 "add $12, %0 \n\t"
1899 "punpcklbw %%mm7, %%mm0 \n\t"
1900 "punpcklbw %%mm7, %%mm1 \n\t"
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm5, %%mm0 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm5, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 "paddd %%mm1, %%mm0 \n\t"
1908 "paddd %%mm3, %%mm2 \n\t"
1909 "paddd %%mm4, %%mm0 \n\t"
1910 "paddd %%mm4, %%mm2 \n\t"
1911 "psrad $15, %%mm0 \n\t"
1912 "psrad $15, %%mm2 \n\t"
1913 "packssdw %%mm2, %%mm0 \n\t"
1914 "packuswb %%mm0, %%mm0 \n\t"
1915 "movd %%mm0, (%1, %%"REG_a") \n\t"
1916 "add $4, %%"REG_a" \n\t"
1917 " js 1b \n\t"
1918 : "+r" (src)
d0ce212a 1919 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1920 : "%"REG_a
2da0d70d 1921 );
dfb09bd1
MN
1922}
1923
/**
 * MMX conversion of packed 24-bit RGB/BGR to two 8-bit chroma planes.
 * The coefficient row is ff_bgr24toUV[srcFormat == PIX_FMT_RGB24], passed
 * as memory operand %4; the asm addresses four consecutive quadwords of
 * it at byte offsets 0, 8, 16 and 24.
 * Each loop iteration consumes 12 source bytes and stores 4 bytes to
 * each of dstU (%1) and dstV (%2).
 */
7ac40327 1924static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
dfb09bd1 1925{
/* REG_a runs from -width up to 0 in steps of 4 and indexes the
 * pre-advanced destination pointers; src (%0) is advanced directly.
 * mm7 is kept zero for byte-to-word unpacking. */
7ad6469e 1926 __asm__ volatile(
dfb09bd1
MN
1927 "movq 24+%4, %%mm6 \n\t"
1928 "mov %3, %%"REG_a" \n\t"
1929 "pxor %%mm7, %%mm7 \n\t"
1930 "1: \n\t"
1931 PREFETCH" 64(%0) \n\t"
1932 "movd (%0), %%mm0 \n\t"
1933 "movd 2(%0), %%mm1 \n\t"
1934 "punpcklbw %%mm7, %%mm0 \n\t"
1935 "punpcklbw %%mm7, %%mm1 \n\t"
1936 "movq %%mm0, %%mm2 \n\t"
1937 "movq %%mm1, %%mm3 \n\t"
1938 "pmaddwd %4, %%mm0 \n\t"
1939 "pmaddwd 8+%4, %%mm1 \n\t"
1940 "pmaddwd 16+%4, %%mm2 \n\t"
1941 "pmaddwd %%mm6, %%mm3 \n\t"
1942 "paddd %%mm1, %%mm0 \n\t"
1943 "paddd %%mm3, %%mm2 \n\t"
1944
1945 "movd 6(%0), %%mm1 \n\t"
1946 "movd 8(%0), %%mm3 \n\t"
1947 "add $12, %0 \n\t"
1948 "punpcklbw %%mm7, %%mm1 \n\t"
1949 "punpcklbw %%mm7, %%mm3 \n\t"
1950 "movq %%mm1, %%mm4 \n\t"
1951 "movq %%mm3, %%mm5 \n\t"
1952 "pmaddwd %4, %%mm1 \n\t"
1953 "pmaddwd 8+%4, %%mm3 \n\t"
1954 "pmaddwd 16+%4, %%mm4 \n\t"
1955 "pmaddwd %%mm6, %%mm5 \n\t"
1956 "paddd %%mm3, %%mm1 \n\t"
1957 "paddd %%mm5, %%mm4 \n\t"
1958
1959 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1960 "paddd %%mm3, %%mm0 \n\t"
1961 "paddd %%mm3, %%mm2 \n\t"
1962 "paddd %%mm3, %%mm1 \n\t"
1963 "paddd %%mm3, %%mm4 \n\t"
1964 "psrad $15, %%mm0 \n\t"
1965 "psrad $15, %%mm2 \n\t"
1966 "psrad $15, %%mm1 \n\t"
1967 "psrad $15, %%mm4 \n\t"
1968 "packssdw %%mm1, %%mm0 \n\t"
1969 "packssdw %%mm4, %%mm2 \n\t"
1970 "packuswb %%mm0, %%mm0 \n\t"
1971 "packuswb %%mm2, %%mm2 \n\t"
1972 "movd %%mm0, (%1, %%"REG_a") \n\t"
1973 "movd %%mm2, (%2, %%"REG_a") \n\t"
1974 "add $4, %%"REG_a" \n\t"
1975 " js 1b \n\t"
1976 : "+r" (src)
d0ce212a 1977 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1978 : "%"REG_a
1979 );
1980}
1981#endif
1982
7ac40327 1983static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1984{
b63f641e 1985#if HAVE_MMX
a35acd7f 1986 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1987#else
2da0d70d
DB
1988 int i;
1989 for (i=0; i<width; i++)
1990 {
1991 int b= src[i*3+0];
1992 int g= src[i*3+1];
1993 int r= src[i*3+2];
1e621b18 1994
e5091488 1995 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1996 }
bc279024 1997#endif /* HAVE_MMX */
1e621b18
MN
1998}
1999
7ac40327 2000static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 2001{
b63f641e 2002#if HAVE_MMX
a35acd7f 2003 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 2004#else
2da0d70d
DB
2005 int i;
2006 for (i=0; i<width; i++)
2007 {
dfb09bd1
MN
2008 int b= src1[3*i + 0];
2009 int g= src1[3*i + 1];
2010 int r= src1[3*i + 2];
2da0d70d 2011
dfb09bd1
MN
2012 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2013 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2014 }
bc279024 2015#endif /* HAVE_MMX */
2da0d70d 2016 assert(src1 == src2);
1e621b18
MN
2017}
2018
7ac40327 2019static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2020{
2021 int i;
2022 for (i=0; i<width; i++)
2023 {
2024 int b= src1[6*i + 0] + src1[6*i + 3];
2025 int g= src1[6*i + 1] + src1[6*i + 4];
2026 int r= src1[6*i + 2] + src1[6*i + 5];
2027
2028 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2029 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2030 }
2031 assert(src1 == src2);
2032}
2033
7ac40327 2034static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
a861d4d7 2035{
b63f641e 2036#if HAVE_MMX
a35acd7f 2037 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 2038#else
2da0d70d
DB
2039 int i;
2040 for (i=0; i<width; i++)
2041 {
2042 int r= src[i*3+0];
2043 int g= src[i*3+1];
2044 int b= src[i*3+2];
2045
e5091488 2046 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2047 }
dfb09bd1 2048#endif
a861d4d7
MN
2049}
2050
7ac40327 2051static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
a861d4d7 2052{
b63f641e 2053#if HAVE_MMX
5155b839 2054 assert(src1==src2);
a35acd7f 2055 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 2056#else
5155b839
DB
2057 int i;
2058 assert(src1==src2);
2da0d70d
DB
2059 for (i=0; i<width; i++)
2060 {
dfb09bd1
MN
2061 int r= src1[3*i + 0];
2062 int g= src1[3*i + 1];
2063 int b= src1[3*i + 2];
2da0d70d 2064
dfb09bd1
MN
2065 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2066 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2067 }
dfb09bd1 2068#endif
a861d4d7
MN
2069}
2070
7ac40327 2071static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2072{
2073 int i;
2074 assert(src1==src2);
2075 for (i=0; i<width; i++)
2076 {
e09d7eef
MN
2077 int r= src1[6*i + 0] + src1[6*i + 3];
2078 int g= src1[6*i + 1] + src1[6*i + 4];
2079 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2080
2081 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2082 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2083 }
2084}
2085
1e621b18 2086
7ac40327 2087static inline void RENAME(palToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
e28630fc 2088{
2da0d70d
DB
2089 int i;
2090 for (i=0; i<width; i++)
2091 {
2092 int d= src[i];
e28630fc 2093
2da0d70d
DB
2094 dst[i]= pal[d] & 0xFF;
2095 }
e28630fc
MN
2096}
2097
7ac40327
RP
2098static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV,
2099 const uint8_t *src1, const uint8_t *src2,
2100 long width, uint32_t *pal)
e28630fc 2101{
2da0d70d
DB
2102 int i;
2103 assert(src1 == src2);
2104 for (i=0; i<width; i++)
2105 {
2106 int p= pal[src1[i]];
2107
2108 dstU[i]= p>>8;
2109 dstV[i]= p>>16;
2110 }
e28630fc
MN
2111}
2112
7ac40327 2113static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
3d05e078
MN
2114{
2115 int i, j;
2116 for (i=0; i<width/8; i++){
3a5ba0c3
LB
2117 int d= ~src[i];
2118 for(j=0; j<8; j++)
2119 dst[8*i+j]= ((d>>(7-j))&1)*255;
2120 }
2121}
2122
7ac40327 2123static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
3a5ba0c3
LB
2124{
2125 int i, j;
2126 for (i=0; i<width/8; i++){
2127 int d= src[i];
78454dfc
MN
2128 for(j=0; j<8; j++)
2129 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
2130 }
2131}
2132
8a322796 2133// bilinear / bicubic scaling
7ac40327
RP
/**
 * Horizontal FIR scaling of one line of 8-bit samples into 16-bit output.
 *
 * Reference behaviour (plain C path): for every i in [0, dstW)
 *   val    = sum over j in [0, filterSize) of
 *            src[filterPos[i] + j] * filter[filterSize*i + j]
 *   dst[i] = FFMIN(val >> 7, (1<<15)-1)
 *
 * The MMX path requires filterSize to be a positive multiple of 4
 * (asserted) and has dedicated loops for filterSize 4 and 8 plus a generic
 * loop. Every MMX loop produces two output samples per iteration.
 * srcW and xInc are not used by the MMX or plain C code; they are only
 * forwarded to the AltiVec implementation.
 */
2134static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2135 const int16_t *filter, const int16_t *filterPos, long filterSize)
2ff198c1 2136{
b63f641e 2137#if HAVE_MMX
2da0d70d
DB
2138 assert(filterSize % 4 == 0 && filterSize>0);
2139 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2140 {
/* counter is a negative byte offset (2 bytes per output sample); filter,
 * filterPos and dst are moved to their ends so the loop can index them
 * with the negative counter and stop when the add carries (jnc). */
d0ce212a 2141 x86_reg counter= -2*dstW;
2da0d70d
DB
2142 filter-= counter*2;
2143 filterPos-= counter/2;
2144 dst-= counter/2;
/* REG_b is saved/restored by hand when PIC is defined and listed as a
 * clobber otherwise. REG_BP takes over the counter so that REG_a and
 * REG_b are free to hold the two source positions. */
7ad6469e 2145 __asm__ volatile(
83c89c78 2146#if defined(PIC)
2da0d70d
DB
2147 "push %%"REG_b" \n\t"
2148#endif
2149 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2150 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2151 "mov %%"REG_a", %%"REG_BP" \n\t"
2152 ASMALIGN(4)
2153 "1: \n\t"
2154 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2155 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2156 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2157 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2158 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2159 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2160 "punpcklbw %%mm7, %%mm0 \n\t"
2161 "punpcklbw %%mm7, %%mm2 \n\t"
2162 "pmaddwd %%mm1, %%mm0 \n\t"
2163 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
2164 "movq %%mm0, %%mm4 \n\t"
2165 "punpckldq %%mm3, %%mm0 \n\t"
2166 "punpckhdq %%mm3, %%mm4 \n\t"
2167 "paddd %%mm4, %%mm0 \n\t"
2168 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2169 "packssdw %%mm0, %%mm0 \n\t"
2170 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2171 "add $4, %%"REG_BP" \n\t"
2172 " jnc 1b \n\t"
2173
2174 "pop %%"REG_BP" \n\t"
83c89c78 2175#if defined(PIC)
2da0d70d 2176 "pop %%"REG_b" \n\t"
83c89c78 2177#endif
2da0d70d
DB
2178 : "+a" (counter)
2179 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2180#if !defined(PIC)
2da0d70d
DB
2181 : "%"REG_b
2182#endif
2183 );
2184 }
2185 else if (filterSize==8)
2186 {
/* Same scheme as the 4-tap case; the filter holds 8 coefficients per
 * output sample, hence the larger bias and the scale factor 8 below. */
d0ce212a 2187 x86_reg counter= -2*dstW;
2da0d70d
DB
2188 filter-= counter*4;
2189 filterPos-= counter/2;
2190 dst-= counter/2;
7ad6469e 2191 __asm__ volatile(
83c89c78 2192#if defined(PIC)
2da0d70d
DB
2193 "push %%"REG_b" \n\t"
2194#endif
2195 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2196 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2197 "mov %%"REG_a", %%"REG_BP" \n\t"
2198 ASMALIGN(4)
2199 "1: \n\t"
2200 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2201 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2202 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2203 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2204 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2205 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2206 "punpcklbw %%mm7, %%mm0 \n\t"
2207 "punpcklbw %%mm7, %%mm2 \n\t"
2208 "pmaddwd %%mm1, %%mm0 \n\t"
2209 "pmaddwd %%mm2, %%mm3 \n\t"
2210
2211 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2212 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2213 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2214 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2215 "punpcklbw %%mm7, %%mm4 \n\t"
2216 "punpcklbw %%mm7, %%mm2 \n\t"
2217 "pmaddwd %%mm1, %%mm4 \n\t"
2218 "pmaddwd %%mm2, %%mm5 \n\t"
2219 "paddd %%mm4, %%mm0 \n\t"
2220 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2221 "movq %%mm0, %%mm4 \n\t"
2222 "punpckldq %%mm3, %%mm0 \n\t"
2223 "punpckhdq %%mm3, %%mm4 \n\t"
2224 "paddd %%mm4, %%mm0 \n\t"
2225 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2226 "packssdw %%mm0, %%mm0 \n\t"
2227 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2228 "add $4, %%"REG_BP" \n\t"
2229 " jnc 1b \n\t"
2230
2231 "pop %%"REG_BP" \n\t"
83c89c78 2232#if defined(PIC)
2da0d70d 2233 "pop %%"REG_b" \n\t"
83c89c78 2234#endif
2da0d70d
DB
2235 : "+a" (counter)
2236 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2237#if !defined(PIC)
2da0d70d
DB
2238 : "%"REG_b
2239#endif
2240 );
2241 }
2242 else
2243 {
/* Generic tap count. The inner loop (label 2) walks REG_c from src up to
 * offset (= src+filterSize) in steps of 4 taps, accumulating two output
 * samples in mm4/mm5; filter (%1) is advanced in place instead of being
 * indexed by the counter, and %6 is filterSize*2, the byte size of one
 * sample's coefficients. */
2244 uint8_t *offset = src+filterSize;
d0ce212a 2245 x86_reg counter= -2*dstW;
2da0d70d
DB
2246 //filter-= counter*filterSize/2;
2247 filterPos-= counter/2;
2248 dst-= counter/2;
7ad6469e 2249 __asm__ volatile(
2da0d70d 2250 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2251 ASMALIGN(4)
2252 "1: \n\t"
2253 "mov %2, %%"REG_c" \n\t"
2254 "movzwl (%%"REG_c", %0), %%eax \n\t"
2255 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2256 "mov %5, %%"REG_c" \n\t"
2257 "pxor %%mm4, %%mm4 \n\t"
2258 "pxor %%mm5, %%mm5 \n\t"
2259 "2: \n\t"
2260 "movq (%1), %%mm1 \n\t"
2261 "movq (%1, %6), %%mm3 \n\t"
2262 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2263 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2264 "punpcklbw %%mm7, %%mm0 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "pmaddwd %%mm1, %%mm0 \n\t"
2267 "pmaddwd %%mm2, %%mm3 \n\t"
2268 "paddd %%mm3, %%mm5 \n\t"
2269 "paddd %%mm0, %%mm4 \n\t"
2270 "add $8, %1 \n\t"
2271 "add $4, %%"REG_c" \n\t"
2272 "cmp %4, %%"REG_c" \n\t"
2273 " jb 2b \n\t"
2274 "add %6, %1 \n\t"
ef423a66
MN
2275 "movq %%mm4, %%mm0 \n\t"
2276 "punpckldq %%mm5, %%mm4 \n\t"
2277 "punpckhdq %%mm5, %%mm0 \n\t"
2278 "paddd %%mm0, %%mm4 \n\t"
2279 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2280 "packssdw %%mm4, %%mm4 \n\t"
2281 "mov %3, %%"REG_a" \n\t"
2282 "movd %%mm4, (%%"REG_a", %0) \n\t"
2283 "add $4, %0 \n\t"
2284 " jnc 1b \n\t"
2285
2286 : "+r" (counter), "+r" (filter)
2287 : "m" (filterPos), "m" (dst), "m"(offset),
d0ce212a 2288 "m" (src), "r" ((x86_reg)filterSize*2)
2da0d70d
DB
2289 : "%"REG_a, "%"REG_c, "%"REG_d
2290 );
2291 }
077ea8a7 2292#else
b63f641e 2293#if HAVE_ALTIVEC
2da0d70d 2294 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2295#else
2da0d70d
DB
2296 int i;
2297 for (i=0; i<dstW; i++)
2298 {
2299 int j;
2300 int srcPos= filterPos[i];
2301 int val=0;
2302 //printf("filterPos: %d\n", filterPos[i]);
2303 for (j=0; j<filterSize; j++)
2304 {
2305 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2306 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2307 }
2308 //filter += hFilterSize;
881c4294 2309 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2310 //dst[i] = val>>7;
2311 }
bc279024
DB
2312#endif /* HAVE_ALTIVEC */
2313#endif /* HAVE_MMX */
077ea8a7 2314}
392b6567
RP
2315
2316static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
7ac40327 2317 int dstWidth, const uint8_t *src, int srcW,
392b6567
RP
2318 int xInc)
2319{
2320 int i;
2321 unsigned int xpos=0;
2322 for (i=0;i<dstWidth;i++)
2323 {
2324 register unsigned int xx=xpos>>16;
2325 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2326 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2327 xpos+=xInc;
2328 }
2329}
2330
2ff198c1 2331 // *** horizontal scale Y line to temp buffer
7ac40327
RP
/**
 * Horizontally scale one luma (or alpha) line into the temporary buffer.
 *
 * Steps, in order:
 *  1. adjust src for 32-bit packed formats (alpha lives 3 bytes in for
 *     RGB32/BGR32; the _1 variants are shifted by ALT32_CORR);
 *  2. if an input converter is set (c->hascale_internal for alpha,
 *     c->hyscale_internal otherwise) run it into formatConvBuffer and scale
 *     from there;
 *  3. scale with c->hScale, or with a fast bilinear path when
 *     SWS_FAST_BILINEAR is requested (on MMX builds additionally only when
 *     canMMX2BeUsed is set);
 *  4. for non-alpha lines whose source and destination ranges differ and
 *     whose destination is not RGB/BGR, convert the range in place.
 */
2332static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2333 int flags, const int16_t *hLumFilter,
2334 const int16_t *hLumFilterPos, int hLumFilterSize,
95b5770b
RP
2335 int srcFormat, uint8_t *formatConvBuffer,
2336 uint32_t *pal, int isAlpha)
077ea8a7 2337{
95b5770b
RP
2338 int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
2339 int16_t *mmx2Filter = c->lumMmx2Filter;
2340 int canMMX2BeUsed = c->canMMX2BeUsed;
2341 void *funnyYCode = c->funnyYCode;
39e5f87b 2342 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
95b5770b 2343
40fa5140
RP
2344 if (isAlpha) {
2345 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
2346 src += 3;
2347 } else {
2348 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2349 src += ALT32_CORR;
9990e426 2350 }
40fa5140 2351
39e5f87b
CS
2352 if (internal_func) {
2353 internal_func(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2354 src= formatConvBuffer;
2355 }
1e621b18 2356
b63f641e 2357#if HAVE_MMX
8a322796 2358 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2359 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2360#else
2da0d70d 2361 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2362#endif
077ea8a7 2363 {
40fa5140 2364 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2365 }
8a322796 2366 else // fast bilinear upscale / crap downscale
077ea8a7 2367 {
57f9a560 2368#if ARCH_X86 && CONFIG_GPL
b63f641e 2369#if HAVE_MMX2
2da0d70d 2370 int i;
83c89c78 2371#if defined(PIC)
2da0d70d 2372 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2373#endif
2da0d70d
DB
2374 if (canMMX2BeUsed)
2375 {
/* Calls the code block at c->funnyYCode eight times (FUNNY_Y_CODE).
 * NOTE(review): that code is not visible here; presumably it is scaler
 * code generated at init time that reads from REG_c, writes to REG_D and
 * uses REG_d/REG_b for the mmx2 filter tables - confirm.
 * When PIC is defined REG_b is parked in ebxsave (%5) instead of being
 * listed as a clobber. */
7ad6469e 2376 __asm__ volatile(
83c89c78 2377#if defined(PIC)
2da0d70d
DB
2378 "mov %%"REG_b", %5 \n\t"
2379#endif
2380 "pxor %%mm7, %%mm7 \n\t"
2381 "mov %0, %%"REG_c" \n\t"
2382 "mov %1, %%"REG_D" \n\t"
2383 "mov %2, %%"REG_d" \n\t"
2384 "mov %3, %%"REG_b" \n\t"
2385 "xor %%"REG_a", %%"REG_a" \n\t" // i
2386 PREFETCH" (%%"REG_c") \n\t"
2387 PREFETCH" 32(%%"REG_c") \n\t"
2388 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2389
b63f641e 2390#if ARCH_X86_64
6d606c4f
AJ
2391
2392#define FUNNY_Y_CODE \
2da0d70d
DB
2393 "movl (%%"REG_b"), %%esi \n\t"\
2394 "call *%4 \n\t"\
2395 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2396 "add %%"REG_S", %%"REG_c" \n\t"\
2397 "add %%"REG_a", %%"REG_D" \n\t"\
2398 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2399
2400#else
2401
2ff198c1 2402#define FUNNY_Y_CODE \
2da0d70d
DB
2403 "movl (%%"REG_b"), %%esi \n\t"\
2404 "call *%4 \n\t"\
2405 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2406 "add %%"REG_a", %%"REG_D" \n\t"\
2407 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2408
bc279024 2409#endif /* ARCH_X86_64 */
6d606c4f 2410
2ff198c1
MN
2411FUNNY_Y_CODE
2412FUNNY_Y_CODE
2413FUNNY_Y_CODE
2414FUNNY_Y_CODE
2415FUNNY_Y_CODE
2416FUNNY_Y_CODE
2417FUNNY_Y_CODE
2418FUNNY_Y_CODE
2419
83c89c78 2420#if defined(PIC)
2da0d70d 2421 "mov %5, %%"REG_b" \n\t"
83c89c78 2422#endif
2da0d70d
DB
2423 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2424 "m" (funnyYCode)
83c89c78 2425#if defined(PIC)
2da0d70d 2426 ,"m" (ebxsave)
83c89c78 2427#endif
2da0d70d 2428 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2429#if !defined(PIC)
2da0d70d
DB
2430 ,"%"REG_b
2431#endif
2432 );
/* Overwrite the trailing outputs whose source position reaches the last
 * source sample with that sample scaled by 128. */
2433 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2434 }
2435 else
2436 {
bc279024 2437#endif /* HAVE_MMX2 */
/* Split the 16.16 step into its integer part and 16-bit fraction; the
 * asm adds the fraction to cx and propagates the carry into the integer
 * source index with adc. Two output samples are produced per iteration. */
d0ce212a 2438 x86_reg xInc_shr16 = xInc >> 16;
2da0d70d
DB
2439 uint16_t xInc_mask = xInc & 0xffff;
2440 //NO MMX just normal asm ...
7ad6469e 2441 __asm__ volatile(
2da0d70d
DB
2442 "xor %%"REG_a", %%"REG_a" \n\t" // i
2443 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2444 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2445 ASMALIGN(4)
2446 "1: \n\t"
2447 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2448 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2449 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2450 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2451 "shll $16, %%edi \n\t"
2452 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2453 "mov %1, %%"REG_D" \n\t"
2454 "shrl $9, %%esi \n\t"
2455 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2456 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2457 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2458
2459 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2460 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2461 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2462 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2463 "shll $16, %%edi \n\t"
2464 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2465 "mov %1, %%"REG_D" \n\t"
2466 "shrl $9, %%esi \n\t"
2467 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2468 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2469 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2470
2471
2472 "add $2, %%"REG_a" \n\t"
2473 "cmp %2, %%"REG_a" \n\t"
2474 " jb 1b \n\t"
2475
2476
2477 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2478 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2479 );
b63f641e 2480#if HAVE_MMX2
2da0d70d 2481 } //if MMX2 can't be used
2ff198c1
MN
2482#endif
2483#else
40fa5140 2484 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
b63f641e 2485#endif /* ARCH_X86 && CONFIG_GPL */
077ea8a7 2486 }
6bc0c792 2487
6858492e 2488 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
6bc0c792
MN
2489 int i;
2490 //FIXME all pal and rgb srcFormats could do this conversion as well
2491 //FIXME all scalers more complex than bilinear could do half of this transform
/* Fixed-point (>>14) linear remapping of the scaled samples; which of the
 * two formulas runs depends on c->srcRange. The second clamps its input
 * to 30189 before multiplying. */
2492 if(c->srcRange){
2493 for (i=0; i<dstWidth; i++)
2494 dst[i]= (dst[i]*14071 + 33561947)>>14;
2495 }else{
2496 for (i=0; i<dstWidth; i++)
aa13b0fc 2497 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2498 }
2499 }
2ff198c1
MN
2500}
2501
392b6567 2502static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
7ac40327
RP
2503 int dstWidth, const uint8_t *src1,
2504 const uint8_t *src2, int srcW, int xInc)
392b6567
RP
2505{
2506 int i;
2507 unsigned int xpos=0;
2508 for (i=0;i<dstWidth;i++)
2509 {
2510 register unsigned int xx=xpos>>16;
2511 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2512 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2513 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2514 /* slower
2515 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2516 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2517 */
2518 xpos+=xInc;
2519 }
2520}
2521
7ac40327
RP
2522inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2523 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2524 const int16_t *hChrFilterPos, int hChrFilterSize,
95b5770b
RP
2525 int srcFormat, uint8_t *formatConvBuffer,
2526 uint32_t *pal)
2ff198c1 2527{
95b5770b
RP
2528 int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
2529 int16_t *mmx2Filter = c->chrMmx2Filter;
2530 int canMMX2BeUsed = c->canMMX2BeUsed;
2531 void *funnyUVCode = c->funnyUVCode;
2532
40fa5140 2533 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2da0d70d 2534 return;
40fa5140 2535
f2671197 2536 if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
40fa5140
RP
2537 src1 += ALT32_CORR;
2538 src2 += ALT32_CORR;
6ff0ad6b 2539 }
40fa5140
RP
2540
2541 if (c->hcscale_internal) {
2542 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2543 src1= formatConvBuffer;
8b2fce0d 2544 src2= formatConvBuffer+VOFW;
e28630fc 2545 }
1e621b18 2546
b63f641e 2547#if HAVE_MMX
8a322796 2548 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2549 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2550#else
2da0d70d 2551 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2552#endif
077ea8a7 2553 {
40fa5140
RP
2554 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2555 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2556 }
8a322796 2557 else // fast bilinear upscale / crap downscale
077ea8a7 2558 {
57f9a560 2559#if ARCH_X86 && CONFIG_GPL
b63f641e 2560#if HAVE_MMX2
2da0d70d 2561 int i;
83c89c78 2562#if defined(PIC)
2da0d70d 2563 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2564#endif
2da0d70d
DB
2565 if (canMMX2BeUsed)
2566 {
7ad6469e 2567 __asm__ volatile(
83c89c78 2568#if defined(PIC)
2da0d70d
DB
2569 "mov %%"REG_b", %6 \n\t"
2570#endif
2571 "pxor %%mm7, %%mm7 \n\t"
2572 "mov %0, %%"REG_c" \n\t"
2573 "mov %1, %%"REG_D" \n\t"
2574 "mov %2, %%"REG_d" \n\t"
2575 "mov %3, %%"REG_b" \n\t"
2576 "xor %%"REG_a", %%"REG_a" \n\t" // i
2577 PREFETCH" (%%"REG_c") \n\t"
2578 PREFETCH" 32(%%"REG_c") \n\t"
2579 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2580
b63f641e 2581#if ARCH_X86_64
6d606c4f
AJ
2582
2583#define FUNNY_UV_CODE \
2da0d70d
DB
2584 "movl (%%"REG_b"), %%esi \n\t"\
2585 "call *%4 \n\t"\
2586 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2587 "add %%"REG_S", %%"REG_c" \n\t"\
2588 "add %%"REG_a", %%"REG_D" \n\t"\
2589 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2590
2591#else
2592
b7dc6f66 2593#define FUNNY_UV_CODE \
2da0d70d
DB
2594 "movl (%%"REG_b"), %%esi \n\t"\
2595 "call *%4 \n\t"\
2596 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2597 "add %%"REG_a", %%"REG_D" \n\t"\
2598 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2599
bc279024 2600#endif /* ARCH_X86_64 */
6d606c4f 2601
b7dc6f66
MN
2602FUNNY_UV_CODE
2603FUNNY_UV_CODE
2604FUNNY_UV_CODE
2605FUNNY_UV_CODE
2da0d70d
DB
2606 "xor %%"REG_a", %%"REG_a" \n\t" // i
2607 "mov %5, %%"REG_c" \n\t" // src
2608 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2609 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2610 PREFETCH" (%%"REG_c") \n\t"
2611 PREFETCH" 32(%%"REG_c") \n\t"
2612 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2613
2614FUNNY_UV_CODE
2615FUNNY_UV_CODE
2616FUNNY_UV_CODE
2617FUNNY_UV_CODE
2618
83c89c78 2619#if defined(PIC)
2da0d70d 2620 "mov %6, %%"REG_b" \n\t"
83c89c78 2621#endif
2da0d70d
DB
2622 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2623 "m" (funnyUVCode), "m" (src2)
83c89c78 2624#if defined(PIC)
2da0d70d 2625 ,"m" (ebxsave)
83c89c78 2626#endif
2da0d70d 2627 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2628#if !defined(PIC)
2da0d70d
DB
2629 ,"%"REG_b
2630#endif
2631 );
2632 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2633 {
2634 //printf("%d %d %d\n", dstWidth, i, srcW);
2635 dst[i] = src1[srcW-1]*128;
8b2fce0d 2636 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2637 }
2638 }
2639 else
2640 {
bc279024 2641#endif /* HAVE_MMX2 */
d0ce212a 2642 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2643 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2644 __asm__ volatile(
2da0d70d
DB
2645 "xor %%"REG_a", %%"REG_a" \n\t" // i
2646 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2647 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2648 ASMALIGN(4)
2649 "1: \n\t"
2650 "mov %0, %%"REG_S" \n\t"
2651 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2652 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2653 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2654 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2655 "shll $16, %%edi \n\t"
2656 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2657 "mov %1, %%"REG_D" \n\t"
2658 "shrl $9, %%esi \n\t"
2659 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2660
2661 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2662 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2663 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2664 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2665 "shll $16, %%edi \n\t"
2666 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2667 "mov %1, %%"REG_D" \n\t"
2668 "shrl $9, %%esi \n\t"
8b2fce0d 2669 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2670
2671 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2672 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2673 "add $1, %%"REG_a" \n\t"
2674 "cmp %2, %%"REG_a" \n\t"
2675 " jb 1b \n\t"
2ff198c1 2676
8a322796
DB
2677/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2678 which is needed to support GCC 4.0. */
b63f641e 2679#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
e29c3f93 2680 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2681#else
e29c3f93 2682 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2683#endif
2da0d70d
DB
2684 "r" (src2)
2685 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2686 );
b63f641e 2687#if HAVE_MMX2
2da0d70d 2688 } //if MMX2 can't be used
2ff198c1
MN
2689#endif
2690#else
40fa5140 2691 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
b63f641e 2692#endif /* ARCH_X86 */
2da0d70d 2693 }
6bc0c792
MN
2694 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2695 int i;
2696 //FIXME all pal and rgb srcFormats could do this conversion as well
2697 //FIXME all scalers more complex than bilinear could do half of this transform
2698 if(c->srcRange){
2699 for (i=0; i<dstWidth; i++){
2700 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2701 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2702 }
2703 }else{
2704 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2705 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2706 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2707 }
2708 }
2709 }
077ea8a7
MN
2710}
2711
3e499f53 2712static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2713 int srcSliceH, uint8_t* dst[], int dstStride[]){
2714
2715 /* load a few things into local vars to make the code more readable? and faster */
2716 const int srcW= c->srcW;
2717 const int dstW= c->dstW;
2718 const int dstH= c->dstH;
2719 const int chrDstW= c->chrDstW;
2720 const int chrSrcW= c->chrSrcW;
2721 const int lumXInc= c->lumXInc;
2722 const int chrXInc= c->chrXInc;
2723 const int dstFormat= c->dstFormat;
2724 const int srcFormat= c->srcFormat;
2725 const int flags= c->flags;
2da0d70d
DB
2726 int16_t *vLumFilterPos= c->vLumFilterPos;
2727 int16_t *vChrFilterPos= c->vChrFilterPos;
2728 int16_t *hLumFilterPos= c->hLumFilterPos;
2729 int16_t *hChrFilterPos= c->hChrFilterPos;
2730 int16_t *vLumFilter= c->vLumFilter;
2731 int16_t *vChrFilter= c->vChrFilter;
2732 int16_t *hLumFilter= c->hLumFilter;
2733 int16_t *hChrFilter= c->hChrFilter;
2734 int32_t *lumMmxFilter= c->lumMmxFilter;
2735 int32_t *chrMmxFilter= c->chrMmxFilter;
6858492e 2736 int32_t *alpMmxFilter= c->alpMmxFilter;
2da0d70d
DB
2737 const int vLumFilterSize= c->vLumFilterSize;
2738 const int vChrFilterSize= c->vChrFilterSize;
2739 const int hLumFilterSize= c->hLumFilterSize;
2740 const int hChrFilterSize= c->hChrFilterSize;
2741 int16_t **lumPixBuf= c->lumPixBuf;
2742 int16_t **chrPixBuf= c->chrPixBuf;
6858492e 2743 int16_t **alpPixBuf= c->alpPixBuf;
2da0d70d
DB
2744 const int vLumBufSize= c->vLumBufSize;
2745 const int vChrBufSize= c->vChrBufSize;
2da0d70d
DB
2746 uint8_t *formatConvBuffer= c->formatConvBuffer;
2747 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2748 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2749 int lastDstY;
e150ef8d 2750 uint32_t *pal=c->pal_yuv;
2da0d70d 2751
8a322796 2752 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2753 int dstY= c->dstY;
2754 int lumBufIndex= c->lumBufIndex;
2755 int chrBufIndex= c->chrBufIndex;
2756 int lastInLumBuf= c->lastInLumBuf;
2757 int lastInChrBuf= c->lastInChrBuf;
2758
2759 if (isPacked(c->srcFormat)){
2da0d70d
DB
2760 src[0]=
2761 src[1]=
6858492e
CS
2762 src[2]=
2763 src[3]= src[0];
2da0d70d
DB
2764 srcStride[0]=
2765 srcStride[1]=
6858492e
CS
2766 srcStride[2]=
2767 srcStride[3]= srcStride[0];
2da0d70d
DB
2768 }
2769 srcStride[1]<<= c->vChrDrop;
2770 srcStride[2]<<= c->vChrDrop;
2771
2772 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2773 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2774
2775#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2776 {
2777 static volatile int i=0;
2778 i++;
2779 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2780 selfTest(src, srcStride, c->srcW, c->srcH);
2781 i--;
2782 }
c7a810cc 2783#endif
37079906 2784
2da0d70d
DB
2785 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2786 //dstStride[0],dstStride[1],dstStride[2]);
2787
6858492e 2788 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
2da0d70d 2789 {
6683a37f
DP
2790 static int warnedAlready=0; //FIXME move this into the context perhaps
2791 if (flags & SWS_PRINT_INFO && !warnedAlready)
2da0d70d 2792 {
4b0c30b7 2793 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2794 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2795 warnedAlready=1;
2da0d70d
DB
2796 }
2797 }
2798
8a322796
DB
2799 /* Note the user might start scaling the picture in the middle so this
2800 will not get executed. This is not really intended but works
2801 currently, so people might do it. */
2da0d70d
DB
2802 if (srcSliceY ==0){
2803 lumBufIndex=0;
2804 chrBufIndex=0;
2805 dstY=0;
2806 lastInLumBuf= -1;
2807 lastInChrBuf= -1;
2808 }
2809
2810 lastDstY= dstY;
2811
2812 for (;dstY < dstH; dstY++){
2813 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2814 const int chrDstY= dstY>>c->chrDstVSubSample;
2815 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2816 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
6858492e 2817 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2da0d70d
DB
2818
2819 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2820 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2821 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2822 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2823
2824 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2825 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2826 //handle holes (FAST_BILINEAR & weird filters)
2827 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2828 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2829 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2830 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2831 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2832
2833 // Do we have enough lines in this slice to output the dstY line
2834 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2835 {
2836 //Do horizontal scaling
2837 while(lastInLumBuf < lastLumSrcY)
2838 {
6858492e
CS
2839 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2840 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d
DB
2841 lumBufIndex++;
2842 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
2843 assert(lumBufIndex < 2*vLumBufSize);
2844 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2845 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 2846 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6858492e 2847 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
95b5770b
RP
2848 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2849 c->srcFormat, formatConvBuffer,
2850 pal, 0);
6858492e
CS
2851 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2852 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
95b5770b
RP
2853 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2854 c->srcFormat, formatConvBuffer,
2855 pal, 1);
2da0d70d
DB
2856 lastInLumBuf++;
2857 }
2858 while(lastInChrBuf < lastChrSrcY)
2859 {
2860 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2861 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2862 chrBufIndex++;
fcc402b1
LB
2863 assert(chrBufIndex < 2*vChrBufSize);
2864 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2865 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2866 //FIXME replace parameters through context struct (some at least)
2867
2868 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2869 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
95b5770b
RP
2870 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2871 c->srcFormat, formatConvBuffer,
2872 pal);
2da0d70d
DB
2873 lastInChrBuf++;
2874 }
2875 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2876 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2877 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2878 }
2879 else // not enough lines left in this slice -> load the rest in the buffer
2880 {
2881 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2882 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2883 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2884 vChrBufSize, vLumBufSize);*/
2885
2886 //Do horizontal scaling
2887 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2888 {
6858492e
CS
2889 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2890 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2da0d70d 2891 lumBufIndex++;
fcc402b1
LB
2892 assert(lumBufIndex < 2*vLumBufSize);
2893 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2894 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6858492e 2895 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
95b5770b
RP
2896 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2897 c->srcFormat, formatConvBuffer,
2898 pal, 0);
6858492e
CS
2899 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2900 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
95b5770b
RP
2901 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2902 c->srcFormat, formatConvBuffer,
2903 pal, 1);
2da0d70d
DB
2904 lastInLumBuf++;
2905 }
2906 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2907 {
2908 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2909 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2910 chrBufIndex++;
fcc402b1
LB
2911 assert(chrBufIndex < 2*vChrBufSize);
2912 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
2913 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2914
2915 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2916 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
95b5770b
RP
2917 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2918 c->srcFormat, formatConvBuffer,
2919 pal);
2da0d70d
DB
2920 lastInChrBuf++;
2921 }
2922 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2923 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2924 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2925 break; //we can't output a dstY line so let's try with the next slice
2926 }
d3f41512 2927
b63f641e 2928#if HAVE_MMX
88e2a9ae 2929 c->blueDither= ff_dither8[dstY&1];
92c7b471 2930 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 2931 c->greenDither= ff_dither8[dstY&1];
92c7b471 2932 else
88e2a9ae
CEH
2933 c->greenDither= ff_dither4[dstY&1];
2934 c->redDither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
2935#endif
2936 if (dstY < dstH-2)
2937 {
7ac40327
RP
2938 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2939 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2940 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
b63f641e 2941#if HAVE_MMX
2da0d70d
DB
2942 int i;
2943 if (flags & SWS_ACCURATE_RND){
1625216e 2944 int s= APCK_SIZE / 8;
2da0d70d 2945 for (i=0; i<vLumFilterSize; i+=2){
1625216e
MN
2946 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2947 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2948 lumMmxFilter[s*i+APCK_COEF/4 ]=
2949 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2da0d70d 2950 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
6858492e
CS
2951 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2952 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2953 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2954 alpMmxFilter[s*i+APCK_COEF/4 ]=
2955 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2956 }
2da0d70d
DB
2957 }
2958 for (i=0; i<vChrFilterSize; i+=2){
1625216e
MN
2959 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2960 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2961 chrMmxFilter[s*i+APCK_COEF/4 ]=
2962 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2da0d70d 2963 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
bca11e75 2964 }
2da0d70d
DB
2965 }else{
2966 for (i=0; i<vLumFilterSize; i++)
2967 {
2968 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2969 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2970 lumMmxFilter[4*i+2]=
2971 lumMmxFilter[4*i+3]=
2972 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
6858492e
CS
2973 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2974 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2975 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2976 alpMmxFilter[4*i+2]=
2977 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2978 }
2da0d70d
DB
2979 }
2980 for (i=0; i<vChrFilterSize; i++)
2981 {
2982 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2983 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2984 chrMmxFilter[4*i+2]=
2985 chrMmxFilter[4*i+3]=
2986 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2987 }
2988 }
6542b44e 2989#endif
2da0d70d
DB
2990 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2991 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2992 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
40fa5140 2993 c->yuv2nv12X(c,
2da0d70d
DB
2994 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2995 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2996 dest, uDest, dstW, chrDstW, dstFormat);
e3d2500f 2997 }
b0880d5d 2998 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2da0d70d
DB
2999 {
3000 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3001 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
8a322796 3002 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2da0d70d
DB
3003 {
3004 int16_t *lumBuf = lumPixBuf[0];
3005 int16_t *chrBuf= chrPixBuf[0];
6858492e 3006 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
40fa5140 3007 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2da0d70d
DB
3008 }
3009 else //General YV12
3010 {
40fa5140 3011 c->yuv2yuvX(c,
2da0d70d
DB
3012 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3013 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3014 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2da0d70d
DB
3015 }
3016 }
3017 else
3018 {
fcc402b1
LB
3019 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3020 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
8a322796 3021 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2da0d70d
DB
3022 {
3023 int chrAlpha= vChrFilter[2*dstY+1];
f0faee4c
MN
3024 if(flags & SWS_FULL_CHR_H_INT){
3025 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3026 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3027 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3028 alpSrcPtr, dest, dstW, dstY);
f0faee4c 3029 }else{
40fa5140 3030 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
6858492e 3031 alpPixBuf ? *alpSrcPtr : NULL,
14014d47 3032 dest, dstW, chrAlpha, dstFormat, flags, dstY);
f0faee4c 3033 }
2da0d70d 3034 }
8a322796 3035 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2da0d70d
DB
3036 {
3037 int lumAlpha= vLumFilter[2*dstY+1];
3038 int chrAlpha= vChrFilter[2*dstY+1];
3039 lumMmxFilter[2]=
3040 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3041 chrMmxFilter[2]=
3042 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
f0faee4c
MN
3043 if(flags & SWS_FULL_CHR_H_INT){
3044 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3045 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3046 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3047 alpSrcPtr, dest, dstW, dstY);
f0faee4c 3048 }else{
40fa5140 3049 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
6858492e 3050 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
14014d47 3051 dest, dstW, lumAlpha, chrAlpha, dstY);
f0faee4c 3052 }
2da0d70d 3053 }
8a322796 3054 else //general RGB
2da0d70d 3055 {
f0faee4c
MN
3056 if(flags & SWS_FULL_CHR_H_INT){
3057 yuv2rgbXinC_full(c,
3058 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3059 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3060 alpSrcPtr, dest, dstW, dstY);
f0faee4c 3061 }else{
40fa5140 3062 c->yuv2packedX(c,
14014d47
MN
3063 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3064 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3065 alpSrcPtr, dest, dstW, dstY);
f0faee4c 3066 }
2da0d70d
DB
3067 }
3068 }
3069 }
3070 else // hmm looks like we can't use MMX here without overwriting this array's tail
3071 {
7ac40327
RP
3072 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3073 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3074 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2da0d70d
DB
3075 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3076 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3077 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3078 yuv2nv12XinC(
3079 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3080 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3081 dest, uDest, dstW, chrDstW, dstFormat);
3082 }
b0880d5d 3083 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
2da0d70d
DB
3084 {
3085 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3086 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3087 yuv2yuvXinC(
3088 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3089 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6858492e 3090 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);