try to avoid returning odd slices.
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
8a322796
DB
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
d026b45e 22 */
783e9cc9 23
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec 27#undef PREFETCH
48a05cec 28
94daf2e9 29#if COMPILE_TEMPLATE_AMD3DNOW
48a05cec 30#define PREFETCH "prefetch"
94daf2e9 31#elif COMPILE_TEMPLATE_MMX2
48a05cec 32#define PREFETCH "prefetchnta"
48a05cec 33#else
d904b5fc 34#define PREFETCH " # nop"
48a05cec
MN
35#endif
36
94daf2e9 37#if COMPILE_TEMPLATE_MMX2
d604bab9 38#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
94daf2e9 39#elif COMPILE_TEMPLATE_AMD3DNOW
d604bab9
MN
40#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41#endif
d3f41512 42
94daf2e9 43#if COMPILE_TEMPLATE_MMX2
6e1c66bc 44#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 45#else
6e1c66bc 46#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 47#endif
6e1c66bc 48#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 49
94daf2e9 50#if COMPILE_TEMPLATE_ALTIVEC
009d2d74 51#include "ppc/swscale_altivec_template.c"
a2faa401
RD
52#endif
53
bca11e75 54#define YSCALEYUV2YV12X(x, offset, dest, width) \
7ad6469e 55 __asm__ volatile(\
c255994b
RP
56 "xor %%"REG_a", %%"REG_a" \n\t"\
57 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
58 "movq %%mm3, %%mm4 \n\t"\
59 "lea " offset "(%0), %%"REG_d" \n\t"\
60 "mov (%%"REG_d"), %%"REG_S" \n\t"\
61 ASMALIGN(4) /* FIXME Unroll? */\
62 "1: \n\t"\
63 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
64 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
65 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
66 "add $16, %%"REG_d" \n\t"\
67 "mov (%%"REG_d"), %%"REG_S" \n\t"\
68 "test %%"REG_S", %%"REG_S" \n\t"\
69 "pmulhw %%mm0, %%mm2 \n\t"\
70 "pmulhw %%mm0, %%mm5 \n\t"\
71 "paddw %%mm2, %%mm3 \n\t"\
72 "paddw %%mm5, %%mm4 \n\t"\
73 " jnz 1b \n\t"\
74 "psraw $3, %%mm3 \n\t"\
75 "psraw $3, %%mm4 \n\t"\
76 "packuswb %%mm4, %%mm3 \n\t"\
77 MOVNTQ(%%mm3, (%1, %%REGa))\
78 "add $8, %%"REG_a" \n\t"\
79 "cmp %2, %%"REG_a" \n\t"\
80 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
81 "movq %%mm3, %%mm4 \n\t"\
82 "lea " offset "(%0), %%"REG_d" \n\t"\
83 "mov (%%"REG_d"), %%"REG_S" \n\t"\
84 "jb 1b \n\t"\
85 :: "r" (&c->redDither),\
86 "r" (dest), "g" (width)\
87 : "%"REG_a, "%"REG_d, "%"REG_S\
2da0d70d 88 );
bca11e75
MN
89
90#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
7ad6469e 91 __asm__ volatile(\
c255994b
RP
92 "lea " offset "(%0), %%"REG_d" \n\t"\
93 "xor %%"REG_a", %%"REG_a" \n\t"\
94 "pxor %%mm4, %%mm4 \n\t"\
95 "pxor %%mm5, %%mm5 \n\t"\
96 "pxor %%mm6, %%mm6 \n\t"\
97 "pxor %%mm7, %%mm7 \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 ASMALIGN(4) \
100 "1: \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
102 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
103 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
104 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
105 "movq %%mm0, %%mm3 \n\t"\
106 "punpcklwd %%mm1, %%mm0 \n\t"\
107 "punpckhwd %%mm1, %%mm3 \n\t"\
108 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
109 "pmaddwd %%mm1, %%mm0 \n\t"\
110 "pmaddwd %%mm1, %%mm3 \n\t"\
111 "paddd %%mm0, %%mm4 \n\t"\
112 "paddd %%mm3, %%mm5 \n\t"\
113 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
114 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
115 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
116 "test %%"REG_S", %%"REG_S" \n\t"\
117 "movq %%mm2, %%mm0 \n\t"\
118 "punpcklwd %%mm3, %%mm2 \n\t"\
119 "punpckhwd %%mm3, %%mm0 \n\t"\
120 "pmaddwd %%mm1, %%mm2 \n\t"\
121 "pmaddwd %%mm1, %%mm0 \n\t"\
122 "paddd %%mm2, %%mm6 \n\t"\
123 "paddd %%mm0, %%mm7 \n\t"\
124 " jnz 1b \n\t"\
125 "psrad $16, %%mm4 \n\t"\
126 "psrad $16, %%mm5 \n\t"\
127 "psrad $16, %%mm6 \n\t"\
128 "psrad $16, %%mm7 \n\t"\
129 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
130 "packssdw %%mm5, %%mm4 \n\t"\
131 "packssdw %%mm7, %%mm6 \n\t"\
132 "paddw %%mm0, %%mm4 \n\t"\
133 "paddw %%mm0, %%mm6 \n\t"\
134 "psraw $3, %%mm4 \n\t"\
135 "psraw $3, %%mm6 \n\t"\
136 "packuswb %%mm6, %%mm4 \n\t"\
137 MOVNTQ(%%mm4, (%1, %%REGa))\
138 "add $8, %%"REG_a" \n\t"\
139 "cmp %2, %%"REG_a" \n\t"\
140 "lea " offset "(%0), %%"REG_d" \n\t"\
141 "pxor %%mm4, %%mm4 \n\t"\
142 "pxor %%mm5, %%mm5 \n\t"\
143 "pxor %%mm6, %%mm6 \n\t"\
144 "pxor %%mm7, %%mm7 \n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "jb 1b \n\t"\
147 :: "r" (&c->redDither),\
148 "r" (dest), "g" (width)\
149 : "%"REG_a, "%"REG_d, "%"REG_S\
2da0d70d 150 );
c1b0bfb4
MN
151
152#define YSCALEYUV2YV121 \
2da0d70d
DB
153 "mov %2, %%"REG_a" \n\t"\
154 ASMALIGN(4) /* FIXME Unroll? */\
155 "1: \n\t"\
156 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
157 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
158 "psraw $7, %%mm0 \n\t"\
159 "psraw $7, %%mm1 \n\t"\
160 "packuswb %%mm1, %%mm0 \n\t"\
161 MOVNTQ(%%mm0, (%1, %%REGa))\
162 "add $8, %%"REG_a" \n\t"\
163 "jnc 1b \n\t"
c1b0bfb4 164
bf2bdde6
MN
165#define YSCALEYUV2YV121_ACCURATE \
166 "mov %2, %%"REG_a" \n\t"\
167 "pcmpeqw %%mm7, %%mm7 \n\t"\
168 "psrlw $15, %%mm7 \n\t"\
169 "psllw $6, %%mm7 \n\t"\
170 ASMALIGN(4) /* FIXME Unroll? */\
171 "1: \n\t"\
172 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
173 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
33a67bd6
MN
174 "paddsw %%mm7, %%mm0 \n\t"\
175 "paddsw %%mm7, %%mm1 \n\t"\
bf2bdde6
MN
176 "psraw $7, %%mm0 \n\t"\
177 "psraw $7, %%mm1 \n\t"\
178 "packuswb %%mm1, %%mm0 \n\t"\
179 MOVNTQ(%%mm0, (%1, %%REGa))\
180 "add $8, %%"REG_a" \n\t"\
181 "jnc 1b \n\t"
182
c1b0bfb4 183/*
2da0d70d
DB
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 189*/
df57ab14 190#define YSCALEYUV2PACKEDX_UV \
7ad6469e 191 __asm__ volatile(\
c255994b
RP
192 "xor %%"REG_a", %%"REG_a" \n\t"\
193 ASMALIGN(4)\
194 "nop \n\t"\
195 "1: \n\t"\
196 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
197 "mov (%%"REG_d"), %%"REG_S" \n\t"\
198 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
199 "movq %%mm3, %%mm4 \n\t"\
200 ASMALIGN(4)\
201 "2: \n\t"\
202 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
203 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
204 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
205 "add $16, %%"REG_d" \n\t"\
206 "mov (%%"REG_d"), %%"REG_S" \n\t"\
207 "pmulhw %%mm0, %%mm2 \n\t"\
208 "pmulhw %%mm0, %%mm5 \n\t"\
209 "paddw %%mm2, %%mm3 \n\t"\
210 "paddw %%mm5, %%mm4 \n\t"\
211 "test %%"REG_S", %%"REG_S" \n\t"\
212 " jnz 2b \n\t"\
df57ab14 213
fe91924d 214#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
df57ab14 215 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d 216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
217 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
218 "movq "#dst1", "#dst2" \n\t"\
2da0d70d
DB
219 ASMALIGN(4)\
220 "2: \n\t"\
fe91924d
CS
221 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
2da0d70d
DB
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
fe91924d
CS
226 "pmulhw "#coeff", "#src1" \n\t"\
227 "pmulhw "#coeff", "#src2" \n\t"\
228 "paddw "#src1", "#dst1" \n\t"\
229 "paddw "#src2", "#dst2" \n\t"\
2da0d70d
DB
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
df57ab14
CS
233#define YSCALEYUV2PACKEDX \
234 YSCALEYUV2PACKEDX_UV \
fe91924d 235 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
df57ab14 236
c255994b
RP
237#define YSCALEYUV2PACKEDX_END \
238 :: "r" (&c->redDither), \
239 "m" (dummy), "m" (dummy), "m" (dummy),\
240 "r" (dest), "m" (dstW) \
241 : "%"REG_a, "%"REG_d, "%"REG_S \
2da0d70d 242 );
8422aa88 243
df57ab14 244#define YSCALEYUV2PACKEDX_ACCURATE_UV \
7ad6469e 245 __asm__ volatile(\
c255994b
RP
246 "xor %%"REG_a", %%"REG_a" \n\t"\
247 ASMALIGN(4)\
248 "nop \n\t"\
249 "1: \n\t"\
250 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
251 "mov (%%"REG_d"), %%"REG_S" \n\t"\
252 "pxor %%mm4, %%mm4 \n\t"\
253 "pxor %%mm5, %%mm5 \n\t"\
254 "pxor %%mm6, %%mm6 \n\t"\
255 "pxor %%mm7, %%mm7 \n\t"\
256 ASMALIGN(4)\
257 "2: \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
259 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
260 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
261 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
262 "movq %%mm0, %%mm3 \n\t"\
263 "punpcklwd %%mm1, %%mm0 \n\t"\
264 "punpckhwd %%mm1, %%mm3 \n\t"\
265 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
266 "pmaddwd %%mm1, %%mm0 \n\t"\
267 "pmaddwd %%mm1, %%mm3 \n\t"\
268 "paddd %%mm0, %%mm4 \n\t"\
269 "paddd %%mm3, %%mm5 \n\t"\
270 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
271 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
272 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
273 "test %%"REG_S", %%"REG_S" \n\t"\
274 "movq %%mm2, %%mm0 \n\t"\
275 "punpcklwd %%mm3, %%mm2 \n\t"\
276 "punpckhwd %%mm3, %%mm0 \n\t"\
277 "pmaddwd %%mm1, %%mm2 \n\t"\
278 "pmaddwd %%mm1, %%mm0 \n\t"\
279 "paddd %%mm2, %%mm6 \n\t"\
280 "paddd %%mm0, %%mm7 \n\t"\
281 " jnz 2b \n\t"\
282 "psrad $16, %%mm4 \n\t"\
283 "psrad $16, %%mm5 \n\t"\
284 "psrad $16, %%mm6 \n\t"\
285 "psrad $16, %%mm7 \n\t"\
286 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
287 "packssdw %%mm5, %%mm4 \n\t"\
288 "packssdw %%mm7, %%mm6 \n\t"\
289 "paddw %%mm0, %%mm4 \n\t"\
290 "paddw %%mm0, %%mm6 \n\t"\
291 "movq %%mm4, "U_TEMP"(%0) \n\t"\
292 "movq %%mm6, "V_TEMP"(%0) \n\t"\
df57ab14
CS
293
294#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
295 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d
DB
296 "mov (%%"REG_d"), %%"REG_S" \n\t"\
297 "pxor %%mm1, %%mm1 \n\t"\
298 "pxor %%mm5, %%mm5 \n\t"\
299 "pxor %%mm7, %%mm7 \n\t"\
300 "pxor %%mm6, %%mm6 \n\t"\
301 ASMALIGN(4)\
302 "2: \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
304 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 305 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
306 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
307 "movq %%mm0, %%mm3 \n\t"\
308 "punpcklwd %%mm4, %%mm0 \n\t"\
309 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 310 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
2da0d70d
DB
311 "pmaddwd %%mm4, %%mm0 \n\t"\
312 "pmaddwd %%mm4, %%mm3 \n\t"\
313 "paddd %%mm0, %%mm1 \n\t"\
314 "paddd %%mm3, %%mm5 \n\t"\
315 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
1625216e
MN
316 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
317 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
318 "test %%"REG_S", %%"REG_S" \n\t"\
319 "movq %%mm2, %%mm0 \n\t"\
320 "punpcklwd %%mm3, %%mm2 \n\t"\
321 "punpckhwd %%mm3, %%mm0 \n\t"\
322 "pmaddwd %%mm4, %%mm2 \n\t"\
323 "pmaddwd %%mm4, %%mm0 \n\t"\
324 "paddd %%mm2, %%mm7 \n\t"\
325 "paddd %%mm0, %%mm6 \n\t"\
326 " jnz 2b \n\t"\
327 "psrad $16, %%mm1 \n\t"\
328 "psrad $16, %%mm5 \n\t"\
329 "psrad $16, %%mm7 \n\t"\
330 "psrad $16, %%mm6 \n\t"\
331 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
332 "packssdw %%mm5, %%mm1 \n\t"\
333 "packssdw %%mm6, %%mm7 \n\t"\
334 "paddw %%mm0, %%mm1 \n\t"\
335 "paddw %%mm0, %%mm7 \n\t"\
336 "movq "U_TEMP"(%0), %%mm3 \n\t"\
337 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 338
df57ab14
CS
339#define YSCALEYUV2PACKEDX_ACCURATE \
340 YSCALEYUV2PACKEDX_ACCURATE_UV \
341 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
342
8422aa88 343#define YSCALEYUV2RGBX \
2da0d70d
DB
344 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
345 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
346 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
347 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
348 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
349 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
c255994b 350 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
2da0d70d
DB
351 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
352 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
353 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
354 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
355 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
356 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
c255994b 357 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
2da0d70d
DB
358 "paddw %%mm3, %%mm4 \n\t"\
359 "movq %%mm2, %%mm0 \n\t"\
360 "movq %%mm5, %%mm6 \n\t"\
361 "movq %%mm4, %%mm3 \n\t"\
362 "punpcklwd %%mm2, %%mm2 \n\t"\
363 "punpcklwd %%mm5, %%mm5 \n\t"\
364 "punpcklwd %%mm4, %%mm4 \n\t"\
365 "paddw %%mm1, %%mm2 \n\t"\
366 "paddw %%mm1, %%mm5 \n\t"\
367 "paddw %%mm1, %%mm4 \n\t"\
368 "punpckhwd %%mm0, %%mm0 \n\t"\
369 "punpckhwd %%mm6, %%mm6 \n\t"\
370 "punpckhwd %%mm3, %%mm3 \n\t"\
371 "paddw %%mm7, %%mm0 \n\t"\
372 "paddw %%mm7, %%mm6 \n\t"\
373 "paddw %%mm7, %%mm3 \n\t"\
374 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
375 "packuswb %%mm0, %%mm2 \n\t"\
376 "packuswb %%mm6, %%mm5 \n\t"\
377 "packuswb %%mm3, %%mm4 \n\t"\
d604bab9 378
6e1c66bc 379#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
380 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
381 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
382 "psraw $3, %%mm0 \n\t"\
383 "psraw $3, %%mm1 \n\t"\
384 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
385 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
386 "xor "#index", "#index" \n\t"\
387 ASMALIGN(4)\
388 "1: \n\t"\
389 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
390 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
391 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
392 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
393 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
394 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
396 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
397 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
399 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
400 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
401 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
402 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
403 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
404 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
405 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
406 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
407 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
408 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
409 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
410 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
411 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
412 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
413 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 414
6e1c66bc 415#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 416
df57ab14 417#define REAL_YSCALEYUV2RGB_UV(index, c) \
2da0d70d
DB
418 "xor "#index", "#index" \n\t"\
419 ASMALIGN(4)\
420 "1: \n\t"\
421 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
422 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
423 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
424 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
425 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
426 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
427 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
428 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
429 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
430 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
431 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
432 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
433 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
434 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
435 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
436 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
437 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
438 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
439 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
440 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
df57ab14 441
786dcfef
CS
442#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
443 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
444 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
445 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
446 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
2da0d70d
DB
447 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
448 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
449 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
df57ab14
CS
455
456#define REAL_YSCALEYUV2RGB_COEFF(c) \
2da0d70d
DB
457 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
458 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
459 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
460 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
461 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
462 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
463 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
464 "paddw %%mm3, %%mm4 \n\t"\
465 "movq %%mm2, %%mm0 \n\t"\
466 "movq %%mm5, %%mm6 \n\t"\
467 "movq %%mm4, %%mm3 \n\t"\
468 "punpcklwd %%mm2, %%mm2 \n\t"\
469 "punpcklwd %%mm5, %%mm5 \n\t"\
470 "punpcklwd %%mm4, %%mm4 \n\t"\
471 "paddw %%mm1, %%mm2 \n\t"\
472 "paddw %%mm1, %%mm5 \n\t"\
473 "paddw %%mm1, %%mm4 \n\t"\
474 "punpckhwd %%mm0, %%mm0 \n\t"\
475 "punpckhwd %%mm6, %%mm6 \n\t"\
476 "punpckhwd %%mm3, %%mm3 \n\t"\
477 "paddw %%mm7, %%mm0 \n\t"\
478 "paddw %%mm7, %%mm6 \n\t"\
479 "paddw %%mm7, %%mm3 \n\t"\
480 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
481 "packuswb %%mm0, %%mm2 \n\t"\
482 "packuswb %%mm6, %%mm5 \n\t"\
483 "packuswb %%mm3, %%mm4 \n\t"\
40494418 484
786dcfef 485#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
df57ab14
CS
486
487#define YSCALEYUV2RGB(index, c) \
488 REAL_YSCALEYUV2RGB_UV(index, c) \
786dcfef 489 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
df57ab14 490 REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab 491
6e1c66bc 492#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
493 "xor "#index", "#index" \n\t"\
494 ASMALIGN(4)\
495 "1: \n\t"\
496 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 497 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
498 "psraw $7, %%mm3 \n\t" \
499 "psraw $7, %%mm4 \n\t" \
500 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
501 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
502 "psraw $7, %%mm1 \n\t" \
503 "psraw $7, %%mm7 \n\t" \
6a4970ab 504
6e1c66bc 505#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 506
6e1c66bc 507#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
508 "xor "#index", "#index" \n\t"\
509 ASMALIGN(4)\
510 "1: \n\t"\
511 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 512 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
513 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
516 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
517 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
518 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
519 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
520 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
521 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
523 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
524 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
527 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
528 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
529 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
530 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
531 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
532 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533 "paddw %%mm3, %%mm4 \n\t"\
534 "movq %%mm2, %%mm0 \n\t"\
535 "movq %%mm5, %%mm6 \n\t"\
536 "movq %%mm4, %%mm3 \n\t"\
537 "punpcklwd %%mm2, %%mm2 \n\t"\
538 "punpcklwd %%mm5, %%mm5 \n\t"\
539 "punpcklwd %%mm4, %%mm4 \n\t"\
540 "paddw %%mm1, %%mm2 \n\t"\
541 "paddw %%mm1, %%mm5 \n\t"\
542 "paddw %%mm1, %%mm4 \n\t"\
543 "punpckhwd %%mm0, %%mm0 \n\t"\
544 "punpckhwd %%mm6, %%mm6 \n\t"\
545 "punpckhwd %%mm3, %%mm3 \n\t"\
546 "paddw %%mm7, %%mm0 \n\t"\
547 "paddw %%mm7, %%mm6 \n\t"\
548 "paddw %%mm7, %%mm3 \n\t"\
549 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550 "packuswb %%mm0, %%mm2 \n\t"\
551 "packuswb %%mm6, %%mm5 \n\t"\
552 "packuswb %%mm3, %%mm4 \n\t"\
40494418 553
6e1c66bc 554#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 555
6e1c66bc 556#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
557 "xor "#index", "#index" \n\t"\
558 ASMALIGN(4)\
559 "1: \n\t"\
560 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
561 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
562 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
563 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
564 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
565 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
566 "psrlw $8, %%mm3 \n\t" \
567 "psrlw $8, %%mm4 \n\t" \
568 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
569 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
570 "psraw $7, %%mm1 \n\t" \
571 "psraw $7, %%mm7 \n\t"
6e1c66bc 572#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 573
497d4f99 574// do vertical chrominance interpolation
6e1c66bc 575#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
586 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
587 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
588 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
589 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
590 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
591 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
592 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
593 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
594 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
595 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
596 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
597 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
598 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
599 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
600 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
601 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
602 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
603 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
604 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
605 "paddw %%mm3, %%mm4 \n\t"\
606 "movq %%mm2, %%mm0 \n\t"\
607 "movq %%mm5, %%mm6 \n\t"\
608 "movq %%mm4, %%mm3 \n\t"\
609 "punpcklwd %%mm2, %%mm2 \n\t"\
610 "punpcklwd %%mm5, %%mm5 \n\t"\
611 "punpcklwd %%mm4, %%mm4 \n\t"\
612 "paddw %%mm1, %%mm2 \n\t"\
613 "paddw %%mm1, %%mm5 \n\t"\
614 "paddw %%mm1, %%mm4 \n\t"\
615 "punpckhwd %%mm0, %%mm0 \n\t"\
616 "punpckhwd %%mm6, %%mm6 \n\t"\
617 "punpckhwd %%mm3, %%mm3 \n\t"\
618 "paddw %%mm7, %%mm0 \n\t"\
619 "paddw %%mm7, %%mm6 \n\t"\
620 "paddw %%mm7, %%mm3 \n\t"\
621 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
622 "packuswb %%mm0, %%mm2 \n\t"\
623 "packuswb %%mm6, %%mm5 \n\t"\
624 "packuswb %%mm3, %%mm4 \n\t"\
40494418 625
6e1c66bc 626#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 627
6858492e
CS
628#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
629 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
630 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
631 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
632 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
633 "packuswb %%mm1, %%mm7 \n\t"
634#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
635
9c77b26b
CS
636#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
637 "movq "#b", "#q2" \n\t" /* B */\
638 "movq "#r", "#t" \n\t" /* R */\
639 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
640 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
641 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
642 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
643 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
644 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
645 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
646 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
647 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
648 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 649\
9c77b26b
CS
650 MOVNTQ( q0, (dst, index, 4))\
651 MOVNTQ( b, 8(dst, index, 4))\
652 MOVNTQ( q2, 16(dst, index, 4))\
653 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 654\
2da0d70d
DB
655 "add $8, "#index" \n\t"\
656 "cmp "#dstw", "#index" \n\t"\
657 " jb 1b \n\t"
9c77b26b 658#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 659
27a90b04 660#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
661 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
662 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
663 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
664 "psrlq $3, %%mm2 \n\t"\
d604bab9 665\
2da0d70d
DB
666 "movq %%mm2, %%mm1 \n\t"\
667 "movq %%mm4, %%mm3 \n\t"\
d604bab9 668\
2da0d70d
DB
669 "punpcklbw %%mm7, %%mm3 \n\t"\
670 "punpcklbw %%mm5, %%mm2 \n\t"\
671 "punpckhbw %%mm7, %%mm4 \n\t"\
672 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 673\
2da0d70d
DB
674 "psllq $3, %%mm3 \n\t"\
675 "psllq $3, %%mm4 \n\t"\
d604bab9 676\
2da0d70d
DB
677 "por %%mm3, %%mm2 \n\t"\
678 "por %%mm4, %%mm1 \n\t"\
d604bab9 679\
2da0d70d
DB
680 MOVNTQ(%%mm2, (dst, index, 2))\
681 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 682\
2da0d70d
DB
683 "add $8, "#index" \n\t"\
684 "cmp "#dstw", "#index" \n\t"\
685 " jb 1b \n\t"
27a90b04 686#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 687
27a90b04 688#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
690 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
692 "psrlq $3, %%mm2 \n\t"\
693 "psrlq $1, %%mm5 \n\t"\
d604bab9 694\
2da0d70d
DB
695 "movq %%mm2, %%mm1 \n\t"\
696 "movq %%mm4, %%mm3 \n\t"\
d604bab9 697\
2da0d70d
DB
698 "punpcklbw %%mm7, %%mm3 \n\t"\
699 "punpcklbw %%mm5, %%mm2 \n\t"\
700 "punpckhbw %%mm7, %%mm4 \n\t"\
701 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 702\
2da0d70d
DB
703 "psllq $2, %%mm3 \n\t"\
704 "psllq $2, %%mm4 \n\t"\
d604bab9 705\
2da0d70d
DB
706 "por %%mm3, %%mm2 \n\t"\
707 "por %%mm4, %%mm1 \n\t"\
d604bab9 708\
2da0d70d
DB
709 MOVNTQ(%%mm2, (dst, index, 2))\
710 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 711\
2da0d70d
DB
712 "add $8, "#index" \n\t"\
713 "cmp "#dstw", "#index" \n\t"\
714 " jb 1b \n\t"
27a90b04 715#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 716
6542b44e 717#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
718 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
719 "movq %%mm2, %%mm1 \n\t" /* B */\
720 "movq %%mm5, %%mm6 \n\t" /* R */\
721 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
722 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
723 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
724 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
725 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
726 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
727 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
728 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
729 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
730 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 731\
2da0d70d
DB
732 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
733 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
734 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
735 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
736 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
737 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
738 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
739 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 740\
2da0d70d
DB
741 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
742 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
743 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
744 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
745 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
746 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
747 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
748 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
749 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
750 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
751 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
752 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
753 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 754\
2da0d70d
DB
755 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
756 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
757 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
758 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
759 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
760 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
761 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
762 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 763\
2da0d70d
DB
764 MOVNTQ(%%mm0, (dst))\
765 MOVNTQ(%%mm2, 8(dst))\
766 MOVNTQ(%%mm3, 16(dst))\
767 "add $24, "#dst" \n\t"\
d604bab9 768\
2da0d70d
DB
769 "add $8, "#index" \n\t"\
770 "cmp "#dstw", "#index" \n\t"\
771 " jb 1b \n\t"
d604bab9 772
/* Store 8 pixels as 24 bytes of packed RGB (BGR24 layout in memory).
 * Inputs: %%mm2 = 8 B bytes, %%mm4 = 8 G bytes, %%mm5 = 8 R bytes, %%mm7 = 0.
 * Plain-MMX variant: interleaves to 0RGB dwords, then shifts/ORs three
 * quadwords together so the 32->24 bpp repack needs no pshufw (MMX2-only).
 * Advances dst by 24 and index by 8, then loops back to label "1".
 * Clobbers all mm registers. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Store 8 pixels as 24 bytes of packed RGB, MMX2 variant.
 * Inputs: %%mm2 = B bytes, %%mm4 = G bytes, %%mm5 = R bytes, %%mm7 = 0.
 * Uses pshufw (an MMX2 instruction) plus the ff_M24A/B/C byte masks to
 * scatter the three channels directly into 24 bpp position, which needs
 * fewer shift/or steps than the plain-MMX version above.
 * Advances dst by 24 and index by 8, then loops back to label "1".
 * Clobbers %%mm0..%%mm7 (including the %%mm7 zero the caller set up). */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3 \n\t" /*       G4        G3          */\
    "pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2    */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4     G3 B3    */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Select the 24 bpp store routine for this template instantiation:
 * the pshufw-based version when compiling the MMX2 template, the plain
 * MMX version otherwise. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/* Pack Y (%%mm3/%%mm4 halves), U/V (%%mm1/%%mm7) words into 16 bytes of
 * YUYV422 and store them at dst + 2*index.  Advances index by 8 and loops
 * back to label "1".  The REAL_/plain pair exists so that macro arguments
 * are expanded before stringification. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

/**
 * Vertically scale multiple source lines into one planar YV12 output line
 * (plus optional alpha plane), applying the given vertical filters.
 * Dispatches to MMX asm when allowed (and not in bitexact mode), to the
 * AltiVec implementation when that template is compiled, and to the plain
 * C version otherwise.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            /* accurate-rounding MMX path; VOF offsets select the V plane
               inside the shared chroma buffer */
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            /* fast MMX path */
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* NOTE: the AltiVec path does not take alpSrc/aDest — alpha output is
       not produced here. */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

/**
 * Vertically scale into an NV12/NV21 (interleaved-chroma) output line.
 * No SIMD variant exists for this path; it always forwards to the C
 * implementation.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * Unscaled vertical pass: convert one line of 16-bit intermediate samples
 * (lum/chr/alp planes) to 8-bit planar output, i.e. (x+64)>>7 with
 * clipping to [0,255].  The V samples live at offset VOFW inside chrSrc.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* process the four planes (A, Y, U, V) with the same loop body;
           pointers are biased by the width so the asm can count a negative
           index up to zero */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* branch only when out of range: bit 8 set means <0 or >255 */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
 * vertical scale YV12 to RGB
 *
 * Applies the vertical filters and writes one packed output line
 * (RGB32/BGR24/RGB555/RGB565/YUYV422).  MMX fast paths are taken unless
 * SWS_BITEXACT is set; SWS_ACCURATE_RND selects the accurate-rounding
 * macro variants.  Falls back to AltiVec (for the formats it supports)
 * or the plain C implementation.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* alpha present: stash B/G/R in the context scratch
                       slots while the alpha filter pass runs */
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0)  \n\t"
                    "movq %%mm4, "V_TEMP"(%0)  \n\t"
                    "movq %%mm5, "Y_TEMP"(%0)  \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5  \n\t"
                    "psraw $3, %%mm1           \n\t"
                    "psraw $3, %%mm7           \n\t"
                    "packuswb %%mm7, %%mm1     \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha = 0xFF */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c"                        \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)


                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1           \n\t"
                    "psraw $3, %%mm7           \n\t"
                    "packuswb %%mm7, %%mm1     \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha = 0xFF */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c"                        \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 *
 * Blends two luma lines (yalpha) and two chroma lines (uvalpha) and
 * writes one packed output line.  The MMX paths save/restore REG_b and
 * REG_BP around the asm so those registers can be used freely inside.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                /* enough registers on x86-64: pass abuf0/abuf1 directly
                   and use r8 as the loop counter */
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                /* x86-32: pass the alpha buffers through context scratch
                   slots since all asm-usable registers are taken */
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push                   %0              \n\t"
                    "push                   %1              \n\t"
                    "mov          "U_TEMP"(%5), %0          \n\t"
                    "mov          "V_TEMP"(%5), %1          \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    "pop                    %1              \n\t"
                    "pop                    %0              \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha */
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov %4, %%"REG_b"                      \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 *
 * Writes one packed output line from a single luma line; chroma either
 * comes from uvbuf0 alone (uvalpha < 2048, fast but shifts chroma by
 * half a pixel) or from the average of uvbuf0/uvbuf1 (the "1b" variants).
 * Full-chroma interpolation requests are delegated to yuv2packed2.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: average the two chroma lines ("1b" macros) */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

/* Extract the luma bytes (even bytes) from a YUYV422 line into dst.
 * The MMX loop masks with bm01010101 and counts a negative index up to 0. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

/* Extract the U and V bytes from a packed YUYV422 line into separate
 * planes.  Only src1 is read; src2 must alias it (asserted below). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* V bytes */
        "pand                %%mm4, %%mm1           \n\t" /* U bytes */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}

de1275d5
MN
1658static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1659{
94daf2e9 1660#if COMPILE_TEMPLATE_MMX
de1275d5 1661 __asm__ volatile(
c255994b
RP
1662 "mov %0, %%"REG_a" \n\t"
1663 "1: \n\t"
1664 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1665 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1666 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1667 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1668 "psrlw $8, %%mm0 \n\t"
1669 "psrlw $8, %%mm1 \n\t"
1670 "psrlw $8, %%mm2 \n\t"
1671 "psrlw $8, %%mm3 \n\t"
1672 "packuswb %%mm1, %%mm0 \n\t"
1673 "packuswb %%mm3, %%mm2 \n\t"
1674 "movq %%mm0, (%3, %%"REG_a") \n\t"
1675 "movq %%mm2, (%4, %%"REG_a") \n\t"
1676 "add $8, %%"REG_a" \n\t"
1677 " js 1b \n\t"
1678 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1679 : "%"REG_a
de1275d5
MN
1680 );
1681#else
1682 int i;
dd68318c 1683 for (i=0; i<width; i++) {
de1275d5
MN
1684 dstU[i]= src1[2*i + 1];
1685 dstV[i]= src2[2*i + 1];
1686 }
1687#endif
1688}
1689
4cf16bbe
DB
1690/* This is almost identical to the previous, end exists only because
1691 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
7ac40327 1692static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
7322a67c 1693{
94daf2e9 1694#if COMPILE_TEMPLATE_MMX
7ad6469e 1695 __asm__ volatile(
c255994b
RP
1696 "mov %0, %%"REG_a" \n\t"
1697 "1: \n\t"
1698 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1699 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1700 "psrlw $8, %%mm0 \n\t"
1701 "psrlw $8, %%mm1 \n\t"
1702 "packuswb %%mm1, %%mm0 \n\t"
1703 "movq %%mm0, (%2, %%"REG_a") \n\t"
1704 "add $8, %%"REG_a" \n\t"
1705 " js 1b \n\t"
1706 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1707 : "%"REG_a
2da0d70d 1708 );
7322a67c 1709#else
2da0d70d
DB
1710 int i;
1711 for (i=0; i<width; i++)
1712 dst[i]= src[2*i+1];
7322a67c
MN
1713#endif
1714}
1715
7ac40327 1716static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
7322a67c 1717{
94daf2e9 1718#if COMPILE_TEMPLATE_MMX
7ad6469e 1719 __asm__ volatile(
c255994b
RP
1720 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1721 "mov %0, %%"REG_a" \n\t"
1722 "1: \n\t"
1723 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1724 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1725 "pand %%mm4, %%mm0 \n\t"
1726 "pand %%mm4, %%mm1 \n\t"
1727 "packuswb %%mm1, %%mm0 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "psrlw $8, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm0, %%mm0 \n\t"
1732 "packuswb %%mm1, %%mm1 \n\t"
1733 "movd %%mm0, (%3, %%"REG_a") \n\t"
1734 "movd %%mm1, (%2, %%"REG_a") \n\t"
1735 "add $4, %%"REG_a" \n\t"
1736 " js 1b \n\t"
1737 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1738 : "%"REG_a
2da0d70d 1739 );
7322a67c 1740#else
2da0d70d 1741 int i;
dd68318c 1742 for (i=0; i<width; i++) {
2da0d70d
DB
1743 dstU[i]= src1[4*i + 0];
1744 dstV[i]= src1[4*i + 2];
1745 }
1746#endif
1747 assert(src1 == src2);
7322a67c
MN
1748}
1749
de1275d5
MN
1750static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1751{
94daf2e9 1752#if COMPILE_TEMPLATE_MMX
de1275d5 1753 __asm__ volatile(
c255994b
RP
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1756 "1: \n\t"
1757 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1759 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1760 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1761 "pand %%mm4, %%mm0 \n\t"
1762 "pand %%mm4, %%mm1 \n\t"
1763 "pand %%mm4, %%mm2 \n\t"
1764 "pand %%mm4, %%mm3 \n\t"
1765 "packuswb %%mm1, %%mm0 \n\t"
1766 "packuswb %%mm3, %%mm2 \n\t"
1767 "movq %%mm0, (%3, %%"REG_a") \n\t"
1768 "movq %%mm2, (%4, %%"REG_a") \n\t"
1769 "add $8, %%"REG_a" \n\t"
1770 " js 1b \n\t"
1771 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1772 : "%"REG_a
de1275d5
MN
1773 );
1774#else
1775 int i;
dd68318c 1776 for (i=0; i<width; i++) {
de1275d5
MN
1777 dstU[i]= src1[2*i];
1778 dstV[i]= src2[2*i];
1779 }
1780#endif
1781}
1782
f415be68
RP
1783static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1784 const uint8_t *src, long width)
1785{
1786#if COMPILE_TEMPLATE_MMX
1787 __asm__ volatile(
1788 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1789 "mov %0, %%"REG_a" \n\t"
1790 "1: \n\t"
1791 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1792 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1793 "movq %%mm0, %%mm2 \n\t"
1794 "movq %%mm1, %%mm3 \n\t"
1795 "pand %%mm4, %%mm0 \n\t"
1796 "pand %%mm4, %%mm1 \n\t"
1797 "psrlw $8, %%mm2 \n\t"
1798 "psrlw $8, %%mm3 \n\t"
1799 "packuswb %%mm1, %%mm0 \n\t"
1800 "packuswb %%mm3, %%mm2 \n\t"
1801 "movq %%mm0, (%2, %%"REG_a") \n\t"
1802 "movq %%mm2, (%3, %%"REG_a") \n\t"
1803 "add $8, %%"REG_a" \n\t"
1804 " js 1b \n\t"
1805 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1806 : "%"REG_a
1807 );
1808#else
1809 int i;
1810 for (i = 0; i < width; i++) {
1811 dst1[i] = src[2*i+0];
1812 dst2[i] = src[2*i+1];
1813 }
1814#endif
1815}
1816
e470691a
RP
1817static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1818 const uint8_t *src1, const uint8_t *src2,
1819 long width, uint32_t *unused)
f415be68
RP
1820{
1821 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1822}
1823
e470691a
RP
1824static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1825 const uint8_t *src1, const uint8_t *src2,
1826 long width, uint32_t *unused)
f415be68
RP
1827{
1828 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1829}
1830
94daf2e9 1831#if COMPILE_TEMPLATE_MMX
b411dfff 1832static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
dfb09bd1
MN
1833{
1834
dd68318c 1835 if(srcFormat == PIX_FMT_BGR24) {
7ad6469e 1836 __asm__ volatile(
ff9a056d
MN
1837 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1838 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1839 :
dfb09bd1 1840 );
dd68318c 1841 } else {
7ad6469e 1842 __asm__ volatile(
ff9a056d
MN
1843 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1844 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1845 :
dfb09bd1
MN
1846 );
1847 }
1848
7ad6469e 1849 __asm__ volatile(
dfb09bd1
MN
1850 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1851 "mov %2, %%"REG_a" \n\t"
1852 "pxor %%mm7, %%mm7 \n\t"
1853 "1: \n\t"
1854 PREFETCH" 64(%0) \n\t"
1855 "movd (%0), %%mm0 \n\t"
1856 "movd 2(%0), %%mm1 \n\t"
1857 "movd 6(%0), %%mm2 \n\t"
1858 "movd 8(%0), %%mm3 \n\t"
1859 "add $12, %0 \n\t"
1860 "punpcklbw %%mm7, %%mm0 \n\t"
1861 "punpcklbw %%mm7, %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm2 \n\t"
1863 "punpcklbw %%mm7, %%mm3 \n\t"
1864 "pmaddwd %%mm5, %%mm0 \n\t"
1865 "pmaddwd %%mm6, %%mm1 \n\t"
1866 "pmaddwd %%mm5, %%mm2 \n\t"
1867 "pmaddwd %%mm6, %%mm3 \n\t"
1868 "paddd %%mm1, %%mm0 \n\t"
1869 "paddd %%mm3, %%mm2 \n\t"
1870 "paddd %%mm4, %%mm0 \n\t"
1871 "paddd %%mm4, %%mm2 \n\t"
1872 "psrad $15, %%mm0 \n\t"
1873 "psrad $15, %%mm2 \n\t"
1874 "packssdw %%mm2, %%mm0 \n\t"
1875 "packuswb %%mm0, %%mm0 \n\t"
1876 "movd %%mm0, (%1, %%"REG_a") \n\t"
1877 "add $4, %%"REG_a" \n\t"
1878 " js 1b \n\t"
1879 : "+r" (src)
d0ce212a 1880 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1881 : "%"REG_a
2da0d70d 1882 );
dfb09bd1
MN
1883}
1884
b411dfff 1885static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
dfb09bd1 1886{
7ad6469e 1887 __asm__ volatile(
dfb09bd1
MN
1888 "movq 24+%4, %%mm6 \n\t"
1889 "mov %3, %%"REG_a" \n\t"
1890 "pxor %%mm7, %%mm7 \n\t"
1891 "1: \n\t"
1892 PREFETCH" 64(%0) \n\t"
1893 "movd (%0), %%mm0 \n\t"
1894 "movd 2(%0), %%mm1 \n\t"
1895 "punpcklbw %%mm7, %%mm0 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "movq %%mm0, %%mm2 \n\t"
1898 "movq %%mm1, %%mm3 \n\t"
1899 "pmaddwd %4, %%mm0 \n\t"
1900 "pmaddwd 8+%4, %%mm1 \n\t"
1901 "pmaddwd 16+%4, %%mm2 \n\t"
1902 "pmaddwd %%mm6, %%mm3 \n\t"
1903 "paddd %%mm1, %%mm0 \n\t"
1904 "paddd %%mm3, %%mm2 \n\t"
1905
1906 "movd 6(%0), %%mm1 \n\t"
1907 "movd 8(%0), %%mm3 \n\t"
1908 "add $12, %0 \n\t"
1909 "punpcklbw %%mm7, %%mm1 \n\t"
1910 "punpcklbw %%mm7, %%mm3 \n\t"
1911 "movq %%mm1, %%mm4 \n\t"
1912 "movq %%mm3, %%mm5 \n\t"
1913 "pmaddwd %4, %%mm1 \n\t"
1914 "pmaddwd 8+%4, %%mm3 \n\t"
1915 "pmaddwd 16+%4, %%mm4 \n\t"
1916 "pmaddwd %%mm6, %%mm5 \n\t"
1917 "paddd %%mm3, %%mm1 \n\t"
1918 "paddd %%mm5, %%mm4 \n\t"
1919
1920 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1921 "paddd %%mm3, %%mm0 \n\t"
1922 "paddd %%mm3, %%mm2 \n\t"
1923 "paddd %%mm3, %%mm1 \n\t"
1924 "paddd %%mm3, %%mm4 \n\t"
1925 "psrad $15, %%mm0 \n\t"
1926 "psrad $15, %%mm2 \n\t"
1927 "psrad $15, %%mm1 \n\t"
1928 "psrad $15, %%mm4 \n\t"
1929 "packssdw %%mm1, %%mm0 \n\t"
1930 "packssdw %%mm4, %%mm2 \n\t"
1931 "packuswb %%mm0, %%mm0 \n\t"
1932 "packuswb %%mm2, %%mm2 \n\t"
1933 "movd %%mm0, (%1, %%"REG_a") \n\t"
1934 "movd %%mm2, (%2, %%"REG_a") \n\t"
1935 "add $4, %%"REG_a" \n\t"
1936 " js 1b \n\t"
1937 : "+r" (src)
d0ce212a 1938 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1939 : "%"REG_a
1940 );
1941}
1942#endif
1943
7ac40327 1944static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1945{
94daf2e9 1946#if COMPILE_TEMPLATE_MMX
a35acd7f 1947 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1948#else
2da0d70d 1949 int i;
dd68318c 1950 for (i=0; i<width; i++) {
2da0d70d
DB
1951 int b= src[i*3+0];
1952 int g= src[i*3+1];
1953 int r= src[i*3+2];
1e621b18 1954
e5091488 1955 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1956 }
94daf2e9 1957#endif /* COMPILE_TEMPLATE_MMX */
1e621b18
MN
1958}
1959
7ac40327 1960static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1961{
94daf2e9 1962#if COMPILE_TEMPLATE_MMX
a35acd7f 1963 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1964#else
2da0d70d 1965 int i;
dd68318c 1966 for (i=0; i<width; i++) {
dfb09bd1
MN
1967 int b= src1[3*i + 0];
1968 int g= src1[3*i + 1];
1969 int r= src1[3*i + 2];
2da0d70d 1970
dfb09bd1
MN
1971 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1973 }
94daf2e9 1974#endif /* COMPILE_TEMPLATE_MMX */
2da0d70d 1975 assert(src1 == src2);
1e621b18
MN
1976}
1977
7ac40327 1978static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1979{
1980 int i;
dd68318c 1981 for (i=0; i<width; i++) {
2f60f629
MN
1982 int b= src1[6*i + 0] + src1[6*i + 3];
1983 int g= src1[6*i + 1] + src1[6*i + 4];
1984 int r= src1[6*i + 2] + src1[6*i + 5];
1985
1986 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1988 }
1989 assert(src1 == src2);
1990}
1991
7ac40327 1992static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
a861d4d7 1993{
94daf2e9 1994#if COMPILE_TEMPLATE_MMX
a35acd7f 1995 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1996#else
2da0d70d 1997 int i;
dd68318c 1998 for (i=0; i<width; i++) {
2da0d70d
DB
1999 int r= src[i*3+0];
2000 int g= src[i*3+1];
2001 int b= src[i*3+2];
2002
e5091488 2003 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2004 }
dfb09bd1 2005#endif
a861d4d7
MN
2006}
2007
7ac40327 2008static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
a861d4d7 2009{
94daf2e9 2010#if COMPILE_TEMPLATE_MMX
5155b839 2011 assert(src1==src2);
a35acd7f 2012 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 2013#else
5155b839
DB
2014 int i;
2015 assert(src1==src2);
dd68318c 2016 for (i=0; i<width; i++) {
dfb09bd1
MN
2017 int r= src1[3*i + 0];
2018 int g= src1[3*i + 1];
2019 int b= src1[3*i + 2];
2da0d70d 2020
dfb09bd1
MN
2021 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2023 }
dfb09bd1 2024#endif
a861d4d7
MN
2025}
2026
7ac40327 2027static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2028{
2029 int i;
2030 assert(src1==src2);
dd68318c 2031 for (i=0; i<width; i++) {
e09d7eef
MN
2032 int r= src1[6*i + 0] + src1[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2035
2036 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2038 }
2039}
2040
1e621b18 2041
8a322796 2042// bilinear / bicubic scaling
7ac40327
RP
2043static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2044 const int16_t *filter, const int16_t *filterPos, long filterSize)
2ff198c1 2045{
94daf2e9 2046#if COMPILE_TEMPLATE_MMX
2da0d70d 2047 assert(filterSize % 4 == 0 && filterSize>0);
dd68318c 2048 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
d0ce212a 2049 x86_reg counter= -2*dstW;
2da0d70d
DB
2050 filter-= counter*2;
2051 filterPos-= counter/2;
2052 dst-= counter/2;
7ad6469e 2053 __asm__ volatile(
83c89c78 2054#if defined(PIC)
c255994b 2055 "push %%"REG_b" \n\t"
2da0d70d 2056#endif
c255994b
RP
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2060 ASMALIGN(4)
2061 "1: \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2065 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2072 "movq %%mm0, %%mm4 \n\t"
2073 "punpckldq %%mm3, %%mm0 \n\t"
2074 "punpckhdq %%mm3, %%mm4 \n\t"
2075 "paddd %%mm4, %%mm0 \n\t"
2076 "psrad $7, %%mm0 \n\t"
2077 "packssdw %%mm0, %%mm0 \n\t"
2078 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2079 "add $4, %%"REG_BP" \n\t"
2080 " jnc 1b \n\t"
2081
2082 "pop %%"REG_BP" \n\t"
83c89c78 2083#if defined(PIC)
c255994b 2084 "pop %%"REG_b" \n\t"
83c89c78 2085#endif
c255994b
RP
2086 : "+a" (counter)
2087 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2088#if !defined(PIC)
c255994b 2089 : "%"REG_b
2da0d70d
DB
2090#endif
2091 );
dd68318c 2092 } else if (filterSize==8) {
d0ce212a 2093 x86_reg counter= -2*dstW;
2da0d70d
DB
2094 filter-= counter*4;
2095 filterPos-= counter/2;
2096 dst-= counter/2;
7ad6469e 2097 __asm__ volatile(
83c89c78 2098#if defined(PIC)
c255994b 2099 "push %%"REG_b" \n\t"
2da0d70d 2100#endif
c255994b
RP
2101 "pxor %%mm7, %%mm7 \n\t"
2102 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2103 "mov %%"REG_a", %%"REG_BP" \n\t"
2104 ASMALIGN(4)
2105 "1: \n\t"
2106 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2107 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2108 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2109 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2110 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2111 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2112 "punpcklbw %%mm7, %%mm0 \n\t"
2113 "punpcklbw %%mm7, %%mm2 \n\t"
2114 "pmaddwd %%mm1, %%mm0 \n\t"
2115 "pmaddwd %%mm2, %%mm3 \n\t"
2116
2117 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2118 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2119 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2120 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm4 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "pmaddwd %%mm1, %%mm4 \n\t"
2124 "pmaddwd %%mm2, %%mm5 \n\t"
2125 "paddd %%mm4, %%mm0 \n\t"
2126 "paddd %%mm5, %%mm3 \n\t"
2127 "movq %%mm0, %%mm4 \n\t"
2128 "punpckldq %%mm3, %%mm0 \n\t"
2129 "punpckhdq %%mm3, %%mm4 \n\t"
2130 "paddd %%mm4, %%mm0 \n\t"
2131 "psrad $7, %%mm0 \n\t"
2132 "packssdw %%mm0, %%mm0 \n\t"
2133 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2134 "add $4, %%"REG_BP" \n\t"
2135 " jnc 1b \n\t"
2136
2137 "pop %%"REG_BP" \n\t"
83c89c78 2138#if defined(PIC)
c255994b 2139 "pop %%"REG_b" \n\t"
83c89c78 2140#endif
c255994b
RP
2141 : "+a" (counter)
2142 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2143#if !defined(PIC)
c255994b 2144 : "%"REG_b
2da0d70d
DB
2145#endif
2146 );
dd68318c 2147 } else {
a959e247 2148 const uint8_t *offset = src+filterSize;
d0ce212a 2149 x86_reg counter= -2*dstW;
2da0d70d
DB
2150 //filter-= counter*filterSize/2;
2151 filterPos-= counter/2;
2152 dst-= counter/2;
7ad6469e 2153 __asm__ volatile(
c255994b
RP
2154 "pxor %%mm7, %%mm7 \n\t"
2155 ASMALIGN(4)
2156 "1: \n\t"
2157 "mov %2, %%"REG_c" \n\t"
2158 "movzwl (%%"REG_c", %0), %%eax \n\t"
2159 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2160 "mov %5, %%"REG_c" \n\t"
2161 "pxor %%mm4, %%mm4 \n\t"
2162 "pxor %%mm5, %%mm5 \n\t"
2163 "2: \n\t"
2164 "movq (%1), %%mm1 \n\t"
2165 "movq (%1, %6), %%mm3 \n\t"
2166 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2167 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2168 "punpcklbw %%mm7, %%mm0 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "pmaddwd %%mm1, %%mm0 \n\t"
2171 "pmaddwd %%mm2, %%mm3 \n\t"
2172 "paddd %%mm3, %%mm5 \n\t"
2173 "paddd %%mm0, %%mm4 \n\t"
2174 "add $8, %1 \n\t"
2175 "add $4, %%"REG_c" \n\t"
2176 "cmp %4, %%"REG_c" \n\t"
2177 " jb 2b \n\t"
2178 "add %6, %1 \n\t"
2179 "movq %%mm4, %%mm0 \n\t"
2180 "punpckldq %%mm5, %%mm4 \n\t"
2181 "punpckhdq %%mm5, %%mm0 \n\t"
2182 "paddd %%mm0, %%mm4 \n\t"
2183 "psrad $7, %%mm4 \n\t"
2184 "packssdw %%mm4, %%mm4 \n\t"
2185 "mov %3, %%"REG_a" \n\t"
2186 "movd %%mm4, (%%"REG_a", %0) \n\t"
2187 "add $4, %0 \n\t"
2188 " jnc 1b \n\t"
2189
2190 : "+r" (counter), "+r" (filter)
2191 : "m" (filterPos), "m" (dst), "m"(offset),
2192 "m" (src), "r" ((x86_reg)filterSize*2)
2193 : "%"REG_a, "%"REG_c, "%"REG_d
2da0d70d
DB
2194 );
2195 }
077ea8a7 2196#else
94daf2e9 2197#if COMPILE_TEMPLATE_ALTIVEC
2da0d70d 2198 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2199#else
2da0d70d 2200 int i;
dd68318c 2201 for (i=0; i<dstW; i++) {
2da0d70d
DB
2202 int j;
2203 int srcPos= filterPos[i];
2204 int val=0;
2205 //printf("filterPos: %d\n", filterPos[i]);
dd68318c 2206 for (j=0; j<filterSize; j++) {
2da0d70d
DB
2207 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2208 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2209 }
2210 //filter += hFilterSize;
881c4294 2211 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2212 //dst[i] = val>>7;
2213 }
94daf2e9
RP
2214#endif /* COMPILE_ALTIVEC */
2215#endif /* COMPILE_MMX */
077ea8a7 2216}
392b6567 2217
bae76dc3
RP
 2218//FIXME all pal and rgb srcFormats could do this conversion as well
 2219//FIXME all scalers more complex than bilinear could do half of this transform
2220static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2221{
2222 int i;
2223 for (i = 0; i < width; i++) {
2224 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2225 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2226 }
2227}
2228static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2229{
2230 int i;
2231 for (i = 0; i < width; i++) {
2232 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2233 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2234 }
2235}
2236static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2237{
2238 int i;
2239 for (i = 0; i < width; i++)
2240 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2241}
2242static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2243{
2244 int i;
2245 for (i = 0; i < width; i++)
2246 dst[i] = (dst[i]*14071 + 33561947)>>14;
2247}
2248
18c61752
RP
/* Scalar x86 core of the fast-bilinear interpolation: on entry %%edi holds
 * src[xx], %%esi holds src[xx+1] and %%ecx the 16-bit xalpha; leaves the
 * 7.9-ish fixed-point result in %%esi and the destination base in REG_D. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /* src[xx+1] - src[xx] */ \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
    "shll      $16, %%edi    \n\t" \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
    "mov        %1, %%"REG_D"\n\t" \
    "shrl       $9, %%esi    \n\t" \

392b6567 2257static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
a1f4b4bb 2258 long dstWidth, const uint8_t *src, int srcW,
392b6567
RP
2259 int xInc)
2260{
a1f4b4bb
RP
2261#if ARCH_X86 && CONFIG_GPL
2262#if COMPILE_TEMPLATE_MMX2
0cc854e3
RP
2263 int32_t *filterPos = c->hLumFilterPos;
2264 int16_t *filter = c->hLumFilter;
a1f4b4bb
RP
2265 int canMMX2BeUsed = c->canMMX2BeUsed;
2266 void *mmx2FilterCode= c->lumMmx2FilterCode;
2267 int i;
2268#if defined(PIC)
2269 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2270#endif
2271 if (canMMX2BeUsed) {
2272 __asm__ volatile(
2273#if defined(PIC)
2274 "mov %%"REG_b", %5 \n\t"
2275#endif
2276 "pxor %%mm7, %%mm7 \n\t"
2277 "mov %0, %%"REG_c" \n\t"
2278 "mov %1, %%"REG_D" \n\t"
2279 "mov %2, %%"REG_d" \n\t"
2280 "mov %3, %%"REG_b" \n\t"
2281 "xor %%"REG_a", %%"REG_a" \n\t" // i
2282 PREFETCH" (%%"REG_c") \n\t"
2283 PREFETCH" 32(%%"REG_c") \n\t"
2284 PREFETCH" 64(%%"REG_c") \n\t"
2285
2286#if ARCH_X86_64
2287
2288#define CALL_MMX2_FILTER_CODE \
2289 "movl (%%"REG_b"), %%esi \n\t"\
2290 "call *%4 \n\t"\
2291 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2292 "add %%"REG_S", %%"REG_c" \n\t"\
2293 "add %%"REG_a", %%"REG_D" \n\t"\
2294 "xor %%"REG_a", %%"REG_a" \n\t"\
2295
2296#else
2297
2298#define CALL_MMX2_FILTER_CODE \
2299 "movl (%%"REG_b"), %%esi \n\t"\
2300 "call *%4 \n\t"\
2301 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2302 "add %%"REG_a", %%"REG_D" \n\t"\
2303 "xor %%"REG_a", %%"REG_a" \n\t"\
2304
2305#endif /* ARCH_X86_64 */
2306
2307 CALL_MMX2_FILTER_CODE
2308 CALL_MMX2_FILTER_CODE
2309 CALL_MMX2_FILTER_CODE
2310 CALL_MMX2_FILTER_CODE
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2313 CALL_MMX2_FILTER_CODE
2314 CALL_MMX2_FILTER_CODE
2315
2316#if defined(PIC)
2317 "mov %5, %%"REG_b" \n\t"
2318#endif
0cc854e3 2319 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
a1f4b4bb
RP
2320 "m" (mmx2FilterCode)
2321#if defined(PIC)
2322 ,"m" (ebxsave)
2323#endif
2324 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2325#if !defined(PIC)
2326 ,"%"REG_b
2327#endif
2328 );
2329 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2330 } else {
2331#endif /* COMPILE_TEMPLATE_MMX2 */
2332 x86_reg xInc_shr16 = xInc >> 16;
2333 uint16_t xInc_mask = xInc & 0xffff;
2334 //NO MMX just normal asm ...
2335 __asm__ volatile(
2336 "xor %%"REG_a", %%"REG_a" \n\t" // i
2337 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2338 "xorl %%ecx, %%ecx \n\t" // xalpha
2339 ASMALIGN(4)
2340 "1: \n\t"
2341 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2342 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2343 FAST_BILINEAR_X86
2344 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2345 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2346 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2347
2348 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2349 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2350 FAST_BILINEAR_X86
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2352 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2353 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2354
2355
2356 "add $2, %%"REG_a" \n\t"
2357 "cmp %2, %%"REG_a" \n\t"
2358 " jb 1b \n\t"
2359
2360
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2363 );
2364#if COMPILE_TEMPLATE_MMX2
2365 } //if MMX2 can't be used
2366#endif
2367#else
392b6567
RP
2368 int i;
2369 unsigned int xpos=0;
dd68318c 2370 for (i=0;i<dstWidth;i++) {
392b6567
RP
2371 register unsigned int xx=xpos>>16;
2372 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2373 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2374 xpos+=xInc;
2375 }
a1f4b4bb 2376#endif /* ARCH_X86 */
392b6567
RP
2377}
2378
2ff198c1 2379 // *** horizontal scale Y line to temp buffer
7ac40327 2380static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
aaba7e6c 2381 const int16_t *hLumFilter,
7ac40327 2382 const int16_t *hLumFilterPos, int hLumFilterSize,
aaba7e6c 2383 uint8_t *formatConvBuffer,
95b5770b 2384 uint32_t *pal, int isAlpha)
077ea8a7 2385{
bb53e1d1 2386 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
bae76dc3 2387 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
95b5770b 2388
84011f10 2389 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
e8417235 2390
bb53e1d1
RP
2391 if (toYV12) {
2392 toYV12(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2393 src= formatConvBuffer;
2394 }
1e621b18 2395
996de2fe 2396 if (!c->hyscale_fast) {
40fa5140 2397 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
dd68318c 2398 } else { // fast bilinear upscale / crap downscale
a1f4b4bb
RP
2399 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2400 }
2401
2402 if (convertRange)
2403 convertRange(dst, dstWidth);
2404}
2405
2406static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2407 long dstWidth, const uint8_t *src1,
2408 const uint8_t *src2, int srcW, int xInc)
2409{
57f9a560 2410#if ARCH_X86 && CONFIG_GPL
94daf2e9 2411#if COMPILE_TEMPLATE_MMX2
0cc854e3
RP
2412 int32_t *filterPos = c->hChrFilterPos;
2413 int16_t *filter = c->hChrFilter;
a1f4b4bb
RP
2414 int canMMX2BeUsed = c->canMMX2BeUsed;
2415 void *mmx2FilterCode= c->chrMmx2FilterCode;
2416 int i;
83c89c78 2417#if defined(PIC)
a1f4b4bb 2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
83c89c78 2419#endif
a1f4b4bb
RP
2420 if (canMMX2BeUsed) {
2421 __asm__ volatile(
83c89c78 2422#if defined(PIC)
a1f4b4bb 2423 "mov %%"REG_b", %6 \n\t"
2da0d70d 2424#endif
a1f4b4bb
RP
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2434
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2446
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2ff198c1 2451
83c89c78 2452#if defined(PIC)
a1f4b4bb 2453 "mov %6, %%"REG_b" \n\t"
83c89c78 2454#endif
0cc854e3 2455 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
a1f4b4bb 2456 "m" (mmx2FilterCode), "m" (src2)
83c89c78 2457#if defined(PIC)
a1f4b4bb 2458 ,"m" (ebxsave)
83c89c78 2459#endif
a1f4b4bb 2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2461#if !defined(PIC)
a1f4b4bb 2462 ,"%"REG_b
2da0d70d 2463#endif
a1f4b4bb
RP
2464 );
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2469 }
2470 } else {
94daf2e9 2471#endif /* COMPILE_TEMPLATE_MMX2 */
a1f4b4bb 2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2473 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2474 __asm__ volatile(
a1f4b4bb
RP
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
c255994b 2478 ASMALIGN(4)
a1f4b4bb
RP
2479 "1: \n\t"
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
c255994b
RP
2483 FAST_BILINEAR_X86
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
c255994b 2485
a1f4b4bb
RP
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
c255994b 2488 FAST_BILINEAR_X86
a1f4b4bb 2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
c255994b 2490
a1f4b4bb
RP
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2495 " jb 1b \n\t"
c255994b 2496
a1f4b4bb
RP
2497/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498which is needed to support GCC 4.0. */
2499#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2501#else
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2503#endif
2504 "r" (src2)
c255994b 2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2da0d70d 2506 );
94daf2e9 2507#if COMPILE_TEMPLATE_MMX2
a1f4b4bb 2508 } //if MMX2 can't be used
2ff198c1
MN
2509#endif
2510#else
392b6567
RP
2511 int i;
2512 unsigned int xpos=0;
dd68318c 2513 for (i=0;i<dstWidth;i++) {
392b6567
RP
2514 register unsigned int xx=xpos>>16;
2515 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2516 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2517 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2518 /* slower
2519 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2520 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2521 */
2522 xpos+=xInc;
2523 }
a1f4b4bb 2524#endif /* ARCH_X86 */
392b6567
RP
2525}
2526
7ac40327 2527inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
aaba7e6c 2528 int srcW, int xInc, const int16_t *hChrFilter,
7ac40327 2529 const int16_t *hChrFilterPos, int hChrFilterSize,
aaba7e6c 2530 uint8_t *formatConvBuffer,
95b5770b 2531 uint32_t *pal)
2ff198c1 2532{
95b5770b 2533
84011f10
RP
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
e8417235 2536
bb53e1d1
RP
2537 if (c->chrToYV12) {
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2539 src1= formatConvBuffer;
8b2fce0d 2540 src2= formatConvBuffer+VOFW;
e28630fc 2541 }
1e621b18 2542
996de2fe 2543 if (!c->hcscale_fast) {
40fa5140
RP
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
dd68318c 2546 } else { // fast bilinear upscale / crap downscale
40fa5140 2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2da0d70d 2548 }
bae76dc3
RP
2549
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
077ea8a7
MN
2552}
2553
c866c7d0
RP
/* Set to 1 to trace buffer pointers and strides through swScale(). */
#define DEBUG_SWSCALE_BUFFERS 0
/* do { } while (0) wrapper makes the macro a single statement, so it is
 * safe inside unbraced if/else (the bare 'if' form had a dangling-else
 * hazard and could silently bind a following 'else'). */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
2556
/**
 * Template main scaling loop: horizontally scale the incoming slice of
 * srcSliceH source lines (starting at srcSliceY) into the luma/chroma/alpha
 * ring buffers, then vertically scale + output every destination line that
 * can now be produced.
 *
 * @param c          scaling context; carries filters, ring buffers and the
 *                   resume state (dstY, *BufIndex, lastIn*Buf) between slices
 * @param src        source plane pointers (replicated for packed formats below)
 * @param srcStride  per-plane source strides; chroma strides are shifted by vChrDrop
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides (8-byte alignment preferred, see warning)
 * @return number of destination lines output for this slice (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    /* vertical/horizontal filter coefficient tables and per-line source offsets */
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    /* per-line (pointer, coefficient) packets consumed by the MMX vertical scalers */
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    /* ring buffers of horizontally-scaled lines awaiting vertical scaling */
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;   // NULL when no alpha plane is processed
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* chroma-plane view of this slice (vertically subsampled, rounded up for H) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;   // palette for PAL8-style sources

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;   // last source line already in the luma ring buffer
    int lastInChrBuf= c->lastInChrBuf;   // last source line already in the chroma ring buffer

    if (isPacked(c->srcFormat)) {
        /* packed input: all "planes" alias plane 0 with its stride, so the
           unpack helpers can index any of the four slots uniformly */
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* vChrDrop skips chroma source lines by widening the stride */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);

    /* misaligned output still works, just slower — warn once per process */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* first slice of a frame: reset all ring-buffer resume state */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;   // remember where we started so we can report lines produced

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* first luma input line of the LAST output row in this chroma group —
           ensures the whole group's luma is buffered before we commit to output */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input yet: just buffer everything this slice provides
               and bail out of the loop after the horizontal pass below */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            /* scale one luma line into the ring buffer (and the matching
               alpha line when an alpha plane is being processed) */
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            /* needs_hcscale is 0 when horizontal chroma scaling is a no-op
               for this configuration, so the line can be skipped entirely */
            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* alternate dither tables per output row; 555 formats get the wider
           green dither since green has the same 5-bit depth as red/blue there */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* normal path: translate ring-buffer indices into an array of
               input-line pointers for the vertical scaler */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* pre-pack (line pointer, coefficient) pairs in the layout the
               MMX vertical scalers expect; ACCURATE_RND uses the APCK layout
               with two taps per packet, otherwise 4 dwords per tap */
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4  ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                        alpMmxFilter[s*i+APCK_COEF/4  ]=
                        alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4  ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    /* split each 64-bit line pointer across two dwords;
                       coefficient is replicated into both 16-bit halves */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* last two output rows: same dispatch, but restricted to the plain
               C vertical scalers (the *inC variants) */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else {
                    yuv2yuvXinC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* destination wants an alpha plane but the source provided none:
       fill the produced rows of dst[3] with opaque (255) */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
40fa5140
RP
2922
2923static void RENAME(sws_init_swScale)(SwsContext *c)
2924{
2925 enum PixelFormat srcFormat = c->srcFormat;
2926
2927 c->yuv2nv12X = RENAME(yuv2nv12X );
2928 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2929 c->yuv2yuvX = RENAME(yuv2yuvX );
2930 c->yuv2packed1 = RENAME(yuv2packed1 );
2931 c->yuv2packed2 = RENAME(yuv2packed2 );
2932 c->yuv2packedX = RENAME(yuv2packedX );
2933
2934 c->hScale = RENAME(hScale );
2935
b501a1f5
RP
2936#if COMPILE_TEMPLATE_MMX
2937 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2938 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2939#else
2940 if (c->flags & SWS_FAST_BILINEAR)
2941#endif
2942 {
e77ddfa2
RP
2943 c->hyscale_fast = RENAME(hyscale_fast);
2944 c->hcscale_fast = RENAME(hcscale_fast);
b501a1f5 2945 }
40fa5140 2946
bb53e1d1 2947 c->chrToYV12 = NULL;
40fa5140 2948 switch(srcFormat) {
bb53e1d1
RP
2949 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2950 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2951 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2952 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
40fa5140
RP
2953 case PIX_FMT_RGB8 :
2954 case PIX_FMT_BGR8 :
2955 case PIX_FMT_PAL8 :
2956 case PIX_FMT_BGR4_BYTE:
bb53e1d1 2957 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
991e579c
LT
2958 case PIX_FMT_YUV420P16BE:
2959 case PIX_FMT_YUV422P16BE:
bb53e1d1 2960 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
991e579c
LT
2961 case PIX_FMT_YUV420P16LE:
2962 case PIX_FMT_YUV422P16LE:
bb53e1d1 2963 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
40fa5140
RP
2964 }
2965 if (c->chrSrcHSubSample) {
2966 switch(srcFormat) {
e8417235 2967 case PIX_FMT_RGB48BE:
bb53e1d1 2968 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
40fa5140 2969 case PIX_FMT_RGB32 :
bb53e1d1
RP
2970 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2971 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2972 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2973 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
40fa5140 2974 case PIX_FMT_BGR32 :
bb53e1d1
RP
2975 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2976 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2977 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2978 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
40fa5140
RP
2979 }
2980 } else {
2981 switch(srcFormat) {
e8417235 2982 case PIX_FMT_RGB48BE:
bb53e1d1 2983 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
40fa5140 2984 case PIX_FMT_RGB32 :
bb53e1d1
RP
2985 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2986 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2987 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2988 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
40fa5140 2989 case PIX_FMT_BGR32 :
bb53e1d1
RP
2990 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2991 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2992 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2993 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
40fa5140
RP
2994 }
2995 }
2996
bb53e1d1
RP
2997 c->lumToYV12 = NULL;
2998 c->alpToYV12 = NULL;
40fa5140
RP
2999 switch (srcFormat) {
3000 case PIX_FMT_YUYV422 :
991e579c
LT
3001 case PIX_FMT_YUV420P16BE:
3002 case PIX_FMT_YUV422P16BE:
3003 case PIX_FMT_YUV444P16BE:
bb53e1d1 3004 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
40fa5140 3005 case PIX_FMT_UYVY422 :
991e579c
LT
3006 case PIX_FMT_YUV420P16LE:
3007 case PIX_FMT_YUV422P16LE:
3008 case PIX_FMT_YUV444P16LE:
bb53e1d1
RP
3009 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3010 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3011 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3012 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3013 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3014 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3015 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
40fa5140
RP
3016 case PIX_FMT_RGB8 :
3017 case PIX_FMT_BGR8 :
3018 case PIX_FMT_PAL8 :
3019 case PIX_FMT_BGR4_BYTE:
bb53e1d1
RP
3020 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3021 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3022 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
39e5f87b 3023 case PIX_FMT_RGB32 :
bb53e1d1 3024 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
39e5f87b 3025 case PIX_FMT_BGR32 :
bb53e1d1 3026 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
e8417235 3027 case PIX_FMT_RGB48BE:
bb53e1d1 3028 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
40fa5140
RP
3029 }
3030 if (c->alpPixBuf) {
3031 switch (srcFormat) {
3032 case PIX_FMT_RGB32 :
3033 case PIX_FMT_RGB32_1:
3034 case PIX_FMT_BGR32 :
bb53e1d1 3035 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
40fa5140
RP
3036 }
3037 }
84011f10
RP
3038
3039 switch (srcFormat) {
3040 case PIX_FMT_RGB32 :
3041 case PIX_FMT_BGR32 :
3042 c->alpSrcOffset = 3;
3043 break;
3044 case PIX_FMT_RGB32_1:
3045 case PIX_FMT_BGR32_1:
3046 c->lumSrcOffset = ALT32_CORR;
3047 c->chrSrcOffset = ALT32_CORR;
3048 break;
3049 case PIX_FMT_RGB48LE:
3050 c->lumSrcOffset = 1;
3051 c->chrSrcOffset = 1;
3052 c->alpSrcOffset = 1;
3053 break;
3054 }
bae76dc3 3055
60222557