[libav.git] / libswscale / swscale_template.c
fe8054c0 1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
d026b45e 22 */
783e9cc9 23
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec 27#undef PREFETCH
48a05cec 28
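/* Pick the data-prefetch instruction for the target CPU: 3DNow! has "prefetch",
   MMX2/SSE builds use the non-temporal "prefetchnta", and plain MMX falls back
   to a no-op so the asm strings below stay valid. */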
94daf2e9 29#if COMPILE_TEMPLATE_AMD3DNOW
48a05cec 30#define PREFETCH "prefetch"
94daf2e9 31#elif COMPILE_TEMPLATE_MMX2
48a05cec 32#define PREFETCH "prefetchnta"
48a05cec 33#else
d904b5fc 34#define PREFETCH " # nop"
35#endif
36
94daf2e9 37#if COMPILE_TEMPLATE_MMX2
d604bab9 38#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
94daf2e9 39#elif COMPILE_TEMPLATE_AMD3DNOW
40#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41#endif
d3f41512 42
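/* MOVNTQ: with MMX2 the output is written with the non-temporal movntq store,
   which bypasses the cache for the write-only destination; otherwise it
   degrades to a plain movq. */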
94daf2e9 43#if COMPILE_TEMPLATE_MMX2
6e1c66bc 44#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 45#else
6e1c66bc 46#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 47#endif
6e1c66bc 48#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 49
94daf2e9 50#if COMPILE_TEMPLATE_ALTIVEC
009d2d74 51#include "ppc/swscale_altivec_template.c"
52#endif
53
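/* Vertical FIR filter for one plane: walks the (source pointer, coefficient)
   pairs stored at 'offset' inside SwsContext, accumulates the pmulhw products
   on top of the rounder and stores 8 clipped bytes per iteration.  Roughly the
   scalar form (illustrative names, not the actual variables):

       for (i = 0; i < width; i++) {
           int acc = rounder;                          // VROUNDER_OFFSET
           for (j = 0; j < filterSize; j++)
               acc += (src[j][i] * coeff[j]) >> 16;    // pmulhw
           dest[i] = av_clip_uint8(acc >> 3);          // psraw $3 + packuswb
       }
*/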
bca11e75 54#define YSCALEYUV2YV12X(x, offset, dest, width) \
7ad6469e 55 __asm__ volatile(\
56 "xor %%"REG_a", %%"REG_a" \n\t"\
57 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
58 "movq %%mm3, %%mm4 \n\t"\
59 "lea " offset "(%0), %%"REG_d" \n\t"\
60 "mov (%%"REG_d"), %%"REG_S" \n\t"\
61 ASMALIGN(4) /* FIXME Unroll? */\
62 "1: \n\t"\
63 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
64 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
65 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
66 "add $16, %%"REG_d" \n\t"\
67 "mov (%%"REG_d"), %%"REG_S" \n\t"\
68 "test %%"REG_S", %%"REG_S" \n\t"\
69 "pmulhw %%mm0, %%mm2 \n\t"\
70 "pmulhw %%mm0, %%mm5 \n\t"\
71 "paddw %%mm2, %%mm3 \n\t"\
72 "paddw %%mm5, %%mm4 \n\t"\
73 " jnz 1b \n\t"\
74 "psraw $3, %%mm3 \n\t"\
75 "psraw $3, %%mm4 \n\t"\
76 "packuswb %%mm4, %%mm3 \n\t"\
77 MOVNTQ(%%mm3, (%1, %%REGa))\
78 "add $8, %%"REG_a" \n\t"\
79 "cmp %2, %%"REG_a" \n\t"\
80 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
81 "movq %%mm3, %%mm4 \n\t"\
82 "lea " offset "(%0), %%"REG_d" \n\t"\
83 "mov (%%"REG_d"), %%"REG_S" \n\t"\
84 "jb 1b \n\t"\
85 :: "r" (&c->redDither),\
86 "r" (dest), "g" (width)\
87 : "%"REG_a, "%"REG_d, "%"REG_S\
2da0d70d 88 );
89
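/* Same vertical filter, but the _ACCURATE variant keeps each tap in 32-bit
   precision: pairs of samples are multiplied and summed with pmaddwd, shifted
   down by 16, packed back to words, and only then rounded and shifted by 3.
   Slower; selected when SWS_ACCURATE_RND is set. */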
90#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
7ad6469e 91 __asm__ volatile(\
92 "lea " offset "(%0), %%"REG_d" \n\t"\
93 "xor %%"REG_a", %%"REG_a" \n\t"\
94 "pxor %%mm4, %%mm4 \n\t"\
95 "pxor %%mm5, %%mm5 \n\t"\
96 "pxor %%mm6, %%mm6 \n\t"\
97 "pxor %%mm7, %%mm7 \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 ASMALIGN(4) \
100 "1: \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
102 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
103 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
104 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
105 "movq %%mm0, %%mm3 \n\t"\
106 "punpcklwd %%mm1, %%mm0 \n\t"\
107 "punpckhwd %%mm1, %%mm3 \n\t"\
108 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
109 "pmaddwd %%mm1, %%mm0 \n\t"\
110 "pmaddwd %%mm1, %%mm3 \n\t"\
111 "paddd %%mm0, %%mm4 \n\t"\
112 "paddd %%mm3, %%mm5 \n\t"\
113 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
114 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
115 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
116 "test %%"REG_S", %%"REG_S" \n\t"\
117 "movq %%mm2, %%mm0 \n\t"\
118 "punpcklwd %%mm3, %%mm2 \n\t"\
119 "punpckhwd %%mm3, %%mm0 \n\t"\
120 "pmaddwd %%mm1, %%mm2 \n\t"\
121 "pmaddwd %%mm1, %%mm0 \n\t"\
122 "paddd %%mm2, %%mm6 \n\t"\
123 "paddd %%mm0, %%mm7 \n\t"\
124 " jnz 1b \n\t"\
125 "psrad $16, %%mm4 \n\t"\
126 "psrad $16, %%mm5 \n\t"\
127 "psrad $16, %%mm6 \n\t"\
128 "psrad $16, %%mm7 \n\t"\
129 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
130 "packssdw %%mm5, %%mm4 \n\t"\
131 "packssdw %%mm7, %%mm6 \n\t"\
132 "paddw %%mm0, %%mm4 \n\t"\
133 "paddw %%mm0, %%mm6 \n\t"\
134 "psraw $3, %%mm4 \n\t"\
135 "psraw $3, %%mm6 \n\t"\
136 "packuswb %%mm6, %%mm4 \n\t"\
137 MOVNTQ(%%mm4, (%1, %%REGa))\
138 "add $8, %%"REG_a" \n\t"\
139 "cmp %2, %%"REG_a" \n\t"\
140 "lea " offset "(%0), %%"REG_d" \n\t"\
141 "pxor %%mm4, %%mm4 \n\t"\
142 "pxor %%mm5, %%mm5 \n\t"\
143 "pxor %%mm6, %%mm6 \n\t"\
144 "pxor %%mm7, %%mm7 \n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "jb 1b \n\t"\
147 :: "r" (&c->redDither),\
148 "r" (dest), "g" (width)\
149 : "%"REG_a, "%"REG_d, "%"REG_S\
2da0d70d 150 );
151
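/* 1-tap ("unscaled") vertical path: the 16-bit intermediate samples are simply
   shifted down by 7 and saturated to bytes, 8 pixels per iteration.  This fast
   version truncates instead of rounding. */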
152#define YSCALEYUV2YV121 \
153 "mov %2, %%"REG_a" \n\t"\
154 ASMALIGN(4) /* FIXME Unroll? */\
155 "1: \n\t"\
156 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
157 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
158 "psraw $7, %%mm0 \n\t"\
159 "psraw $7, %%mm1 \n\t"\
160 "packuswb %%mm1, %%mm0 \n\t"\
161 MOVNTQ(%%mm0, (%1, %%REGa))\
162 "add $8, %%"REG_a" \n\t"\
163 "jnc 1b \n\t"
c1b0bfb4 164
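/* Accurate 1-tap variant: pcmpeqw/psrlw/psllw build the constant 0x0040 in
   every word of %mm7, which is added before the >>7 so the result is rounded
   to nearest, matching the C fallback's (x+64)>>7 below. */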
165#define YSCALEYUV2YV121_ACCURATE \
166 "mov %2, %%"REG_a" \n\t"\
167 "pcmpeqw %%mm7, %%mm7 \n\t"\
168 "psrlw $15, %%mm7 \n\t"\
169 "psllw $6, %%mm7 \n\t"\
170 ASMALIGN(4) /* FIXME Unroll? */\
171 "1: \n\t"\
172 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
173 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
174 "paddsw %%mm7, %%mm0 \n\t"\
175 "paddsw %%mm7, %%mm1 \n\t"\
176 "psraw $7, %%mm0 \n\t"\
177 "psraw $7, %%mm1 \n\t"\
178 "packuswb %%mm1, %%mm0 \n\t"\
179 MOVNTQ(%%mm0, (%1, %%REGa))\
180 "add $8, %%"REG_a" \n\t"\
181 "jnc 1b \n\t"
182
c1b0bfb4 183/*
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 189*/
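/* The PACKEDX helpers below perform the same vertical filtering as
   YSCALEYUV2YV12X but keep Y, U and V in registers so they can feed the
   YUV->RGB conversion directly: _UV filters the chroma pair (the V samples
   sit VOF bytes after the U samples in the same buffer), _YA filters two
   groups of four luma samples into dst1/dst2. */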
df57ab14 190#define YSCALEYUV2PACKEDX_UV \
7ad6469e 191 __asm__ volatile(\
192 "xor %%"REG_a", %%"REG_a" \n\t"\
193 ASMALIGN(4)\
194 "nop \n\t"\
195 "1: \n\t"\
196 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
197 "mov (%%"REG_d"), %%"REG_S" \n\t"\
198 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
199 "movq %%mm3, %%mm4 \n\t"\
200 ASMALIGN(4)\
201 "2: \n\t"\
202 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
203 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
204 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
205 "add $16, %%"REG_d" \n\t"\
206 "mov (%%"REG_d"), %%"REG_S" \n\t"\
207 "pmulhw %%mm0, %%mm2 \n\t"\
208 "pmulhw %%mm0, %%mm5 \n\t"\
209 "paddw %%mm2, %%mm3 \n\t"\
210 "paddw %%mm5, %%mm4 \n\t"\
211 "test %%"REG_S", %%"REG_S" \n\t"\
212 " jnz 2b \n\t"\
df57ab14 213
fe91924d 214#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
df57ab14 215 "lea "offset"(%0), %%"REG_d" \n\t"\
2da0d70d 216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
218 "movq "#dst1", "#dst2" \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw "#coeff", "#src1" \n\t"\
227 "pmulhw "#coeff", "#src2" \n\t"\
228 "paddw "#src1", "#dst1" \n\t"\
229 "paddw "#src2", "#dst2" \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
233#define YSCALEYUV2PACKEDX \
234 YSCALEYUV2PACKEDX_UV \
fe91924d 235 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
df57ab14 236
237#define YSCALEYUV2PACKEDX_END \
238 :: "r" (&c->redDither), \
239 "m" (dummy), "m" (dummy), "m" (dummy),\
240 "r" (dest), "m" (dstW) \
241 : "%"REG_a, "%"REG_d, "%"REG_S \
2da0d70d 242 );
8422aa88 243
df57ab14 244#define YSCALEYUV2PACKEDX_ACCURATE_UV \
7ad6469e 245 __asm__ volatile(\
246 "xor %%"REG_a", %%"REG_a" \n\t"\
247 ASMALIGN(4)\
248 "nop \n\t"\
249 "1: \n\t"\
250 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
251 "mov (%%"REG_d"), %%"REG_S" \n\t"\
252 "pxor %%mm4, %%mm4 \n\t"\
253 "pxor %%mm5, %%mm5 \n\t"\
254 "pxor %%mm6, %%mm6 \n\t"\
255 "pxor %%mm7, %%mm7 \n\t"\
256 ASMALIGN(4)\
257 "2: \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
259 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
260 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
261 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
262 "movq %%mm0, %%mm3 \n\t"\
263 "punpcklwd %%mm1, %%mm0 \n\t"\
264 "punpckhwd %%mm1, %%mm3 \n\t"\
265 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
266 "pmaddwd %%mm1, %%mm0 \n\t"\
267 "pmaddwd %%mm1, %%mm3 \n\t"\
268 "paddd %%mm0, %%mm4 \n\t"\
269 "paddd %%mm3, %%mm5 \n\t"\
270 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
271 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
272 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
273 "test %%"REG_S", %%"REG_S" \n\t"\
274 "movq %%mm2, %%mm0 \n\t"\
275 "punpcklwd %%mm3, %%mm2 \n\t"\
276 "punpckhwd %%mm3, %%mm0 \n\t"\
277 "pmaddwd %%mm1, %%mm2 \n\t"\
278 "pmaddwd %%mm1, %%mm0 \n\t"\
279 "paddd %%mm2, %%mm6 \n\t"\
280 "paddd %%mm0, %%mm7 \n\t"\
281 " jnz 2b \n\t"\
282 "psrad $16, %%mm4 \n\t"\
283 "psrad $16, %%mm5 \n\t"\
284 "psrad $16, %%mm6 \n\t"\
285 "psrad $16, %%mm7 \n\t"\
286 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
287 "packssdw %%mm5, %%mm4 \n\t"\
288 "packssdw %%mm7, %%mm6 \n\t"\
289 "paddw %%mm0, %%mm4 \n\t"\
290 "paddw %%mm0, %%mm6 \n\t"\
291 "movq %%mm4, "U_TEMP"(%0) \n\t"\
292 "movq %%mm6, "V_TEMP"(%0) \n\t"\
293
294#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
295 "lea "offset"(%0), %%"REG_d" \n\t"\
296 "mov (%%"REG_d"), %%"REG_S" \n\t"\
297 "pxor %%mm1, %%mm1 \n\t"\
298 "pxor %%mm5, %%mm5 \n\t"\
299 "pxor %%mm7, %%mm7 \n\t"\
300 "pxor %%mm6, %%mm6 \n\t"\
301 ASMALIGN(4)\
302 "2: \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
304 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 305 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
306 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
307 "movq %%mm0, %%mm3 \n\t"\
308 "punpcklwd %%mm4, %%mm0 \n\t"\
309 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 310 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
311 "pmaddwd %%mm4, %%mm0 \n\t"\
312 "pmaddwd %%mm4, %%mm3 \n\t"\
313 "paddd %%mm0, %%mm1 \n\t"\
314 "paddd %%mm3, %%mm5 \n\t"\
315 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
316 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
317 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
318 "test %%"REG_S", %%"REG_S" \n\t"\
319 "movq %%mm2, %%mm0 \n\t"\
320 "punpcklwd %%mm3, %%mm2 \n\t"\
321 "punpckhwd %%mm3, %%mm0 \n\t"\
322 "pmaddwd %%mm4, %%mm2 \n\t"\
323 "pmaddwd %%mm4, %%mm0 \n\t"\
324 "paddd %%mm2, %%mm7 \n\t"\
325 "paddd %%mm0, %%mm6 \n\t"\
326 " jnz 2b \n\t"\
327 "psrad $16, %%mm1 \n\t"\
328 "psrad $16, %%mm5 \n\t"\
329 "psrad $16, %%mm7 \n\t"\
330 "psrad $16, %%mm6 \n\t"\
331 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
332 "packssdw %%mm5, %%mm1 \n\t"\
333 "packssdw %%mm6, %%mm7 \n\t"\
334 "paddw %%mm0, %%mm1 \n\t"\
335 "paddw %%mm0, %%mm7 \n\t"\
336 "movq "U_TEMP"(%0), %%mm3 \n\t"\
337 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 338
339#define YSCALEYUV2PACKEDX_ACCURATE \
340 YSCALEYUV2PACKEDX_ACCURATE_UV \
341 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
342
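/* YSCALEYUV2RGBX turns the filtered Y/U/V into RGB with the fixed-point
   coefficients held in the context; in effect, per pixel:
       B = y_coeff*(Y - y_off) + ub_coeff*(U - 128)
       G = y_coeff*(Y - y_off) + ug_coeff*(U - 128) + vg_coeff*(V - 128)
       R = y_coeff*(Y - y_off) + vr_coeff*(V - 128)
   computed for two groups of four pixels (Y1 in %mm1, Y2 in %mm7) and packed
   so that %mm2/%mm4/%mm5 hold the B/G/R bytes for the WRITE* macros below. */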
8422aa88 343#define YSCALEYUV2RGBX \
344 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
345 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
346 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
347 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
348 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
349 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
c255994b 350 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
351 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
352 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
353 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
354 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
355 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
356 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
c255994b 357 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
358 "paddw %%mm3, %%mm4 \n\t"\
359 "movq %%mm2, %%mm0 \n\t"\
360 "movq %%mm5, %%mm6 \n\t"\
361 "movq %%mm4, %%mm3 \n\t"\
362 "punpcklwd %%mm2, %%mm2 \n\t"\
363 "punpcklwd %%mm5, %%mm5 \n\t"\
364 "punpcklwd %%mm4, %%mm4 \n\t"\
365 "paddw %%mm1, %%mm2 \n\t"\
366 "paddw %%mm1, %%mm5 \n\t"\
367 "paddw %%mm1, %%mm4 \n\t"\
368 "punpckhwd %%mm0, %%mm0 \n\t"\
369 "punpckhwd %%mm6, %%mm6 \n\t"\
370 "punpckhwd %%mm3, %%mm3 \n\t"\
371 "paddw %%mm7, %%mm0 \n\t"\
372 "paddw %%mm7, %%mm6 \n\t"\
373 "paddw %%mm7, %%mm3 \n\t"\
374 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
375 "packuswb %%mm0, %%mm2 \n\t"\
376 "packuswb %%mm6, %%mm5 \n\t"\
377 "packuswb %%mm3, %%mm4 \n\t"\
d604bab9 378
6e1c66bc 379#define REAL_YSCALEYUV2PACKED(index, c) \
380 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
381 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
382 "psraw $3, %%mm0 \n\t"\
383 "psraw $3, %%mm1 \n\t"\
384 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
385 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
386 "xor "#index", "#index" \n\t"\
387 ASMALIGN(4)\
388 "1: \n\t"\
389 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
390 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
391 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
392 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
393 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
394 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
396 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
397 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
399 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
400 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
401 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
402 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
403 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
404 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
405 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
406 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
407 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
408 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
409 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
410 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
411 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
412 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
413 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 414
6e1c66bc 415#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 416
df57ab14 417#define REAL_YSCALEYUV2RGB_UV(index, c) \
418 "xor "#index", "#index" \n\t"\
419 ASMALIGN(4)\
420 "1: \n\t"\
421 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
422 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
423 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
424 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
425 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
426 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
427 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
428 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
429 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
430 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
431 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
432 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
433 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
434 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
435 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
436 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
437 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
438 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
439 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
440 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
df57ab14 441
442#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
443 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
444 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
445 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
446 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
447 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
448 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
449 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
455
456#define REAL_YSCALEYUV2RGB_COEFF(c) \
457 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
458 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
459 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
460 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
461 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
462 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
463 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
464 "paddw %%mm3, %%mm4 \n\t"\
465 "movq %%mm2, %%mm0 \n\t"\
466 "movq %%mm5, %%mm6 \n\t"\
467 "movq %%mm4, %%mm3 \n\t"\
468 "punpcklwd %%mm2, %%mm2 \n\t"\
469 "punpcklwd %%mm5, %%mm5 \n\t"\
470 "punpcklwd %%mm4, %%mm4 \n\t"\
471 "paddw %%mm1, %%mm2 \n\t"\
472 "paddw %%mm1, %%mm5 \n\t"\
473 "paddw %%mm1, %%mm4 \n\t"\
474 "punpckhwd %%mm0, %%mm0 \n\t"\
475 "punpckhwd %%mm6, %%mm6 \n\t"\
476 "punpckhwd %%mm3, %%mm3 \n\t"\
477 "paddw %%mm7, %%mm0 \n\t"\
478 "paddw %%mm7, %%mm6 \n\t"\
479 "paddw %%mm7, %%mm3 \n\t"\
480 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
481 "packuswb %%mm0, %%mm2 \n\t"\
482 "packuswb %%mm6, %%mm5 \n\t"\
483 "packuswb %%mm3, %%mm4 \n\t"\
40494418 484
786dcfef 485#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
486
487#define YSCALEYUV2RGB(index, c) \
488 REAL_YSCALEYUV2RGB_UV(index, c) \
786dcfef 489 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
df57ab14 490 REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab 491
6e1c66bc 492#define REAL_YSCALEYUV2PACKED1(index, c) \
493 "xor "#index", "#index" \n\t"\
494 ASMALIGN(4)\
495 "1: \n\t"\
496 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 497 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
498 "psraw $7, %%mm3 \n\t" \
499 "psraw $7, %%mm4 \n\t" \
500 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
501 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
502 "psraw $7, %%mm1 \n\t" \
503 "psraw $7, %%mm7 \n\t" \
6a4970ab 504
6e1c66bc 505#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 506
6e1c66bc 507#define REAL_YSCALEYUV2RGB1(index, c) \
508 "xor "#index", "#index" \n\t"\
509 ASMALIGN(4)\
510 "1: \n\t"\
511 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 512 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
513 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
516 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
517 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
518 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
519 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
520 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
521 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
523 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
524 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
527 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
528 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
529 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
530 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
531 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
532 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533 "paddw %%mm3, %%mm4 \n\t"\
534 "movq %%mm2, %%mm0 \n\t"\
535 "movq %%mm5, %%mm6 \n\t"\
536 "movq %%mm4, %%mm3 \n\t"\
537 "punpcklwd %%mm2, %%mm2 \n\t"\
538 "punpcklwd %%mm5, %%mm5 \n\t"\
539 "punpcklwd %%mm4, %%mm4 \n\t"\
540 "paddw %%mm1, %%mm2 \n\t"\
541 "paddw %%mm1, %%mm5 \n\t"\
542 "paddw %%mm1, %%mm4 \n\t"\
543 "punpckhwd %%mm0, %%mm0 \n\t"\
544 "punpckhwd %%mm6, %%mm6 \n\t"\
545 "punpckhwd %%mm3, %%mm3 \n\t"\
546 "paddw %%mm7, %%mm0 \n\t"\
547 "paddw %%mm7, %%mm6 \n\t"\
548 "paddw %%mm7, %%mm3 \n\t"\
549 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550 "packuswb %%mm0, %%mm2 \n\t"\
551 "packuswb %%mm6, %%mm5 \n\t"\
552 "packuswb %%mm3, %%mm4 \n\t"\
40494418 553
6e1c66bc 554#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 555
6e1c66bc 556#define REAL_YSCALEYUV2PACKED1b(index, c) \
557 "xor "#index", "#index" \n\t"\
558 ASMALIGN(4)\
559 "1: \n\t"\
560 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
561 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
562 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
563 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
564 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
565 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
566 "psrlw $8, %%mm3 \n\t" \
567 "psrlw $8, %%mm4 \n\t" \
568 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
569 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
570 "psraw $7, %%mm1 \n\t" \
571 "psraw $7, %%mm7 \n\t"
6e1c66bc 572#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 573
497d4f99 574// do vertical chrominance interpolation
6e1c66bc 575#define REAL_YSCALEYUV2RGB1b(index, c) \
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
586 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
587 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
588 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
589 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
590 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
591 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
592 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
593 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
594 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
595 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
596 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
597 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
598 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
599 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
600 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
601 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
602 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
603 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
604 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
605 "paddw %%mm3, %%mm4 \n\t"\
606 "movq %%mm2, %%mm0 \n\t"\
607 "movq %%mm5, %%mm6 \n\t"\
608 "movq %%mm4, %%mm3 \n\t"\
609 "punpcklwd %%mm2, %%mm2 \n\t"\
610 "punpcklwd %%mm5, %%mm5 \n\t"\
611 "punpcklwd %%mm4, %%mm4 \n\t"\
612 "paddw %%mm1, %%mm2 \n\t"\
613 "paddw %%mm1, %%mm5 \n\t"\
614 "paddw %%mm1, %%mm4 \n\t"\
615 "punpckhwd %%mm0, %%mm0 \n\t"\
616 "punpckhwd %%mm6, %%mm6 \n\t"\
617 "punpckhwd %%mm3, %%mm3 \n\t"\
618 "paddw %%mm7, %%mm0 \n\t"\
619 "paddw %%mm7, %%mm6 \n\t"\
620 "paddw %%mm7, %%mm3 \n\t"\
621 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
622 "packuswb %%mm0, %%mm2 \n\t"\
623 "packuswb %%mm6, %%mm5 \n\t"\
624 "packuswb %%mm3, %%mm4 \n\t"\
40494418 625
6e1c66bc 626#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 627
628#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
629 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
630 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
631 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
632 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
633 "packuswb %%mm1, %%mm7 \n\t"
634#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
635
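/* WRITEBGR32: interleaves the b, g, r, a byte vectors into four quadwords of
   ARGB-ordered pixels and stores 8 output pixels (32 bytes) per iteration,
   then advances the index by 8 and loops until dstw is reached. */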
636#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
637 "movq "#b", "#q2" \n\t" /* B */\
638 "movq "#r", "#t" \n\t" /* R */\
639 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
640 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
641 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
642 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
643 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
644 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
645 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
646 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
647 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
648 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
d604bab9 649\
650 MOVNTQ( q0, (dst, index, 4))\
651 MOVNTQ( b, 8(dst, index, 4))\
652 MOVNTQ( q2, 16(dst, index, 4))\
653 MOVNTQ( q3, 24(dst, index, 4))\
d604bab9 654\
655 "add $8, "#index" \n\t"\
656 "cmp "#dstw", "#index" \n\t"\
657 " jb 1b \n\t"
9c77b26b 658#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 659
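/* WRITERGB16/WRITERGB15: mask the B/G/R byte vectors down to 5-6-5 (or 5-5-5)
   bit fields with bF8/bFC, merge them into 16-bit pixels and store 8 pixels
   per iteration.  The callers apply the optional ordered dither (paddusb with
   the *_DITHER tables) before invoking these macros when DITHER1XBPP is set. */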
27a90b04 660#define REAL_WRITERGB16(dst, dstw, index) \
661 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
662 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
663 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
664 "psrlq $3, %%mm2 \n\t"\
d604bab9 665\
666 "movq %%mm2, %%mm1 \n\t"\
667 "movq %%mm4, %%mm3 \n\t"\
d604bab9 668\
669 "punpcklbw %%mm7, %%mm3 \n\t"\
670 "punpcklbw %%mm5, %%mm2 \n\t"\
671 "punpckhbw %%mm7, %%mm4 \n\t"\
672 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 673\
674 "psllq $3, %%mm3 \n\t"\
675 "psllq $3, %%mm4 \n\t"\
d604bab9 676\
677 "por %%mm3, %%mm2 \n\t"\
678 "por %%mm4, %%mm1 \n\t"\
d604bab9 679\
680 MOVNTQ(%%mm2, (dst, index, 2))\
681 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 682\
683 "add $8, "#index" \n\t"\
684 "cmp "#dstw", "#index" \n\t"\
685 " jb 1b \n\t"
27a90b04 686#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 687
27a90b04 688#define REAL_WRITERGB15(dst, dstw, index) \
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
690 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
692 "psrlq $3, %%mm2 \n\t"\
693 "psrlq $1, %%mm5 \n\t"\
d604bab9 694\
695 "movq %%mm2, %%mm1 \n\t"\
696 "movq %%mm4, %%mm3 \n\t"\
d604bab9 697\
698 "punpcklbw %%mm7, %%mm3 \n\t"\
699 "punpcklbw %%mm5, %%mm2 \n\t"\
700 "punpckhbw %%mm7, %%mm4 \n\t"\
701 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 702\
703 "psllq $2, %%mm3 \n\t"\
704 "psllq $2, %%mm4 \n\t"\
d604bab9 705\
706 "por %%mm3, %%mm2 \n\t"\
707 "por %%mm4, %%mm1 \n\t"\
d604bab9 708\
709 MOVNTQ(%%mm2, (dst, index, 2))\
710 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 711\
712 "add $8, "#index" \n\t"\
713 "cmp "#dstw", "#index" \n\t"\
714 " jb 1b \n\t"
27a90b04 715#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 716
6542b44e 717#define WRITEBGR24OLD(dst, dstw, index) \
718 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
719 "movq %%mm2, %%mm1 \n\t" /* B */\
720 "movq %%mm5, %%mm6 \n\t" /* R */\
721 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
722 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
723 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
724 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
725 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
726 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
727 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
728 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
729 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
730 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 731\
732 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
733 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
734 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
735 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
736 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
737 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
738 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
739 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 740\
741 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
742 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
743 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
744 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
745 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
746 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
747 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
748 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
749 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
750 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
751 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
752 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
753 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 754\
755 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
756 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
757 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
758 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
759 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
760 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
761 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
762 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 763\
764 MOVNTQ(%%mm0, (dst))\
765 MOVNTQ(%%mm2, 8(dst))\
766 MOVNTQ(%%mm3, 16(dst))\
767 "add $24, "#dst" \n\t"\
d604bab9 768\
769 "add $8, "#index" \n\t"\
770 "cmp "#dstw", "#index" \n\t"\
771 " jb 1b \n\t"
d604bab9 772
6542b44e 773#define WRITEBGR24MMX(dst, dstw, index) \
774 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
775 "movq %%mm2, %%mm1 \n\t" /* B */\
776 "movq %%mm5, %%mm6 \n\t" /* R */\
777 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
778 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
779 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
780 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
781 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
782 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
783 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
784 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
785 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
786 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 787\
788 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
789 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
790 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
791 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 792\
793 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
794 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
795 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
796 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 797\
798 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
799 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
800 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
801 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 802\
803 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
804 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
805 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
806 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
807 MOVNTQ(%%mm0, (dst))\
99d2cb72 808\
809 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
810 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
811 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
812 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
813 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 814\
815 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
816 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
817 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
818 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 819\
2da0d70d 820 "add $24, "#dst" \n\t"\
99d2cb72 821\
822 "add $8, "#index" \n\t"\
823 "cmp "#dstw", "#index" \n\t"\
824 " jb 1b \n\t"
99d2cb72 825
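/* WRITEBGR24MMX2: MMX2 flavour of the 24-bit writer; pshufw plus the
   ff_M24A/ff_M24B/ff_M24C masks shuffle B, G and R straight into the packed
   3-bytes-per-pixel layout, emitting 24 bytes (8 pixels) per iteration.
   The #if below picks it over the plain MMX version when
   COMPILE_TEMPLATE_MMX2 is set. */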
6542b44e 826#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 827 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
828 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
829 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
830 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
831 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
832 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 833\
834 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
835 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
836 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 837\
838 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
839 "por %%mm1, %%mm6 \n\t"\
840 "por %%mm3, %%mm6 \n\t"\
841 MOVNTQ(%%mm6, (dst))\
99d2cb72 842\
843 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
844 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
845 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
846 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 847\
5802683a 848 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
849 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
850 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 851\
852 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
853 "por %%mm3, %%mm6 \n\t"\
854 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 855\
856 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
857 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
858 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 859\
860 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
861 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 862 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 863\
864 "por %%mm1, %%mm3 \n\t"\
865 "por %%mm3, %%mm6 \n\t"\
866 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 867\
2da0d70d 868 "add $24, "#dst" \n\t"\
99d2cb72 869\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
872 " jb 1b \n\t"
99d2cb72 873
94daf2e9 874#if COMPILE_TEMPLATE_MMX2
7630f2e0 875#undef WRITEBGR24
6e1c66bc 876#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 877#else
7630f2e0 878#undef WRITEBGR24
6e1c66bc 879#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
880#endif
881
6e1c66bc 882#define REAL_WRITEYUY2(dst, dstw, index) \
883 "packuswb %%mm3, %%mm3 \n\t"\
884 "packuswb %%mm4, %%mm4 \n\t"\
885 "packuswb %%mm7, %%mm1 \n\t"\
886 "punpcklbw %%mm4, %%mm3 \n\t"\
887 "movq %%mm1, %%mm7 \n\t"\
888 "punpcklbw %%mm3, %%mm1 \n\t"\
889 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 890\
891 MOVNTQ(%%mm1, (dst, index, 2))\
892 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 893\
894 "add $8, "#index" \n\t"\
895 "cmp "#dstw", "#index" \n\t"\
896 " jb 1b \n\t"
6e1c66bc 897#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
898
899
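/* Vertical scaling pass producing planar output (Y, U, V and optionally A).
   On MMX it uses the fast or SWS_ACCURATE_RND macros above unless SWS_BITEXACT
   forces the generic path; otherwise it falls back to AltiVec or yuv2yuvXinC().
   A rough sketch of what the C fallback computes per plane (illustrative, not
   the exact libswscale code):
       for (i = 0; i < dstW; i++) {
           int val = 1 << 18;                      // rounding for the >>19 below
           for (j = 0; j < lumFilterSize; j++)
               val += lumSrc[j][i] * lumFilter[j];
           dest[i] = av_clip_uint8(val >> 19);
       }
*/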
900static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
901 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
6858492e 902 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
38858470 903{
94daf2e9 904#if COMPILE_TEMPLATE_MMX
905 if(!(c->flags & SWS_BITEXACT)) {
906 if (c->flags & SWS_ACCURATE_RND) {
907 if (uDest) {
908 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
909 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
910 }
dd68318c 911 if (CONFIG_SWSCALE_ALPHA && aDest) {
912 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
913 }
bca11e75 914
14014d47 915 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
916 } else {
917 if (uDest) {
918 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
919 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
920 }
dd68318c 921 if (CONFIG_SWSCALE_ALPHA && aDest) {
922 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
923 }
2da0d70d 924
925 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
926 }
927 return;
928 }
929#endif
94daf2e9 930#if COMPILE_TEMPLATE_ALTIVEC
931 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
932 chrFilter, chrSrc, chrFilterSize,
933 dest, uDest, vDest, dstW, chrDstW);
94daf2e9 934#else //COMPILE_TEMPLATE_ALTIVEC
935 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
94daf2e9 938#endif //!COMPILE_TEMPLATE_ALTIVEC
c1b0bfb4 939}
2add307d 940
941static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
942 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
b411dfff 943 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
6118e52e 944{
945 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
946 chrFilter, chrSrc, chrFilterSize,
947 dest, uDest, dstW, chrDstW, dstFormat);
948}
949
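/* Unscaled (1-tap) vertical pass: converts the 16-bit intermediate samples
   straight back to 8-bit planes.  The MMX branch runs
   YSCALEYUV2YV121(_ACCURATE) over each of the up to four planes; the scalar
   fallback below performs the equivalent (x+64)>>7 with clipping. */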
7ac40327 950static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
6858492e 951 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
c1b0bfb4 952{
f433c8ab 953 int i;
94daf2e9 954#if COMPILE_TEMPLATE_MMX
dd68318c 955 if(!(c->flags & SWS_BITEXACT)) {
6858492e 956 long p= 4;
a959e247 957 const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
958 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
959 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
2da0d70d 960
961 if (c->flags & SWS_ACCURATE_RND) {
962 while(p--) {
963 if (dst[p]) {
964 __asm__ volatile(
965 YSCALEYUV2YV121_ACCURATE
966 :: "r" (src[p]), "r" (dst[p] + counter[p]),
967 "g" (-counter[p])
968 : "%"REG_a
969 );
970 }
6858492e 971 }
972 } else {
973 while(p--) {
974 if (dst[p]) {
975 __asm__ volatile(
976 YSCALEYUV2YV121
977 :: "r" (src[p]), "r" (dst[p] + counter[p]),
978 "g" (-counter[p])
979 : "%"REG_a
980 );
981 }
6858492e 982 }
d78c1ea1 983 }
984 return;
985 }
986#endif
dd68318c 987 for (i=0; i<dstW; i++) {
a1f3ffa3 988 int val= (lumSrc[i]+64)>>7;
2da0d70d 989
dd68318c 990 if (val&256) {
991 if (val<0) val=0;
992 else val=255;
993 }
994
995 dest[i]= val;
996 }
997
1b0a4572 998 if (uDest)
dd68318c 999 for (i=0; i<chrDstW; i++) {
1000 int u=(chrSrc[i ]+64)>>7;
1001 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d 1002
dd68318c 1003 if ((u|v)&256) {
1004 if (u<0) u=0;
1005 else if (u>255) u=255;
1006 if (v<0) v=0;
1007 else if (v>255) v=255;
1008 }
1009
1010 uDest[i]= u;
1011 vDest[i]= v;
1012 }
1013
1014 if (CONFIG_SWSCALE_ALPHA && aDest)
dd68318c 1015 for (i=0; i<dstW; i++) {
1016 int val= (alpSrc[i]+64)>>7;
1017 aDest[i]= av_clip_uint8(val);
1018 }
1019}
1020
c1b0bfb4 1021
1022/**
1023 * vertical scale YV12 to RGB
1024 */
1025static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1026 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1027 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1028{
94daf2e9 1029#if COMPILE_TEMPLATE_MMX
d0ce212a 1030 x86_reg dummy=0;
1031 if(!(c->flags & SWS_BITEXACT)) {
1032 if (c->flags & SWS_ACCURATE_RND) {
1033 switch(c->dstFormat) {
14014d47 1034 case PIX_FMT_RGB32:
dd68318c 1035 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1036 YSCALEYUV2PACKEDX_ACCURATE
1037 YSCALEYUV2RGBX
1038 "movq %%mm2, "U_TEMP"(%0) \n\t"
1039 "movq %%mm4, "V_TEMP"(%0) \n\t"
1040 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1041 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1042 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1043 "psraw $3, %%mm1 \n\t"
1044 "psraw $3, %%mm7 \n\t"
1045 "packuswb %%mm7, %%mm1 \n\t"
1046 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1047
1048 YSCALEYUV2PACKEDX_END
dd68318c 1049 } else {
1050 YSCALEYUV2PACKEDX_ACCURATE
1051 YSCALEYUV2RGBX
1052 "pcmpeqd %%mm7, %%mm7 \n\t"
1053 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d 1054
3164d25e 1055 YSCALEYUV2PACKEDX_END
6858492e 1056 }
1057 return;
1058 case PIX_FMT_BGR24:
1059 YSCALEYUV2PACKEDX_ACCURATE
1060 YSCALEYUV2RGBX
40494418 1061 "pxor %%mm7, %%mm7 \n\t"
1062 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1063 "add %4, %%"REG_c" \n\t"
1064 WRITEBGR24(%%REGc, %5, %%REGa)
1065
1066
1067 :: "r" (&c->redDither),
1068 "m" (dummy), "m" (dummy), "m" (dummy),
1069 "r" (dest), "m" (dstW)
1070 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1071 );
1072 return;
1073 case PIX_FMT_RGB555:
1074 YSCALEYUV2PACKEDX_ACCURATE
1075 YSCALEYUV2RGBX
40494418 1076 "pxor %%mm7, %%mm7 \n\t"
14014d47 1077 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1078#ifdef DITHER1XBPP
1079 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1080 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1081 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1082#endif
1083
1084 WRITERGB15(%4, %5, %%REGa)
1085 YSCALEYUV2PACKEDX_END
1086 return;
1087 case PIX_FMT_RGB565:
1088 YSCALEYUV2PACKEDX_ACCURATE
1089 YSCALEYUV2RGBX
40494418 1090 "pxor %%mm7, %%mm7 \n\t"
14014d47 1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1092#ifdef DITHER1XBPP
1093 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1094 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1095 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1096#endif
1097
1098 WRITERGB16(%4, %5, %%REGa)
1099 YSCALEYUV2PACKEDX_END
1100 return;
1101 case PIX_FMT_YUYV422:
1102 YSCALEYUV2PACKEDX_ACCURATE
1103 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1104
1105 "psraw $3, %%mm3 \n\t"
1106 "psraw $3, %%mm4 \n\t"
1107 "psraw $3, %%mm1 \n\t"
1108 "psraw $3, %%mm7 \n\t"
1109 WRITEYUY2(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1111 return;
1112 }
1113 } else {
1114 switch(c->dstFormat) {
14014d47 1115 case PIX_FMT_RGB32:
dd68318c 1116 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1117 YSCALEYUV2PACKEDX
1118 YSCALEYUV2RGBX
1119 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1120 "psraw $3, %%mm1 \n\t"
1121 "psraw $3, %%mm7 \n\t"
1122 "packuswb %%mm7, %%mm1 \n\t"
1123 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1124 YSCALEYUV2PACKEDX_END
dd68318c 1125 } else {
1126 YSCALEYUV2PACKEDX
1127 YSCALEYUV2RGBX
1128 "pcmpeqd %%mm7, %%mm7 \n\t"
1129 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1130 YSCALEYUV2PACKEDX_END
6858492e 1131 }
1132 return;
1133 case PIX_FMT_BGR24:
1134 YSCALEYUV2PACKEDX
1135 YSCALEYUV2RGBX
40494418 1136 "pxor %%mm7, %%mm7 \n\t"
1137 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1138 "add %4, %%"REG_c" \n\t"
1139 WRITEBGR24(%%REGc, %5, %%REGa)
1140
1141 :: "r" (&c->redDither),
1142 "m" (dummy), "m" (dummy), "m" (dummy),
1143 "r" (dest), "m" (dstW)
1144 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 );
1146 return;
1147 case PIX_FMT_RGB555:
1148 YSCALEYUV2PACKEDX
1149 YSCALEYUV2RGBX
40494418 1150 "pxor %%mm7, %%mm7 \n\t"
14014d47 1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1152#ifdef DITHER1XBPP
1153 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1154 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1155 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1156#endif
1157
1158 WRITERGB15(%4, %5, %%REGa)
1159 YSCALEYUV2PACKEDX_END
1160 return;
1161 case PIX_FMT_RGB565:
1162 YSCALEYUV2PACKEDX
1163 YSCALEYUV2RGBX
40494418 1164 "pxor %%mm7, %%mm7 \n\t"
14014d47 1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1166#ifdef DITHER1XBPP
1167 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1168 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1169 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1170#endif
1171
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1174 return;
1175 case PIX_FMT_YUYV422:
1176 YSCALEYUV2PACKEDX
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1178
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1185 return;
1186 }
1187 }
1188 }
1189#endif /* COMPILE_TEMPLATE_MMX */
1190#if COMPILE_TEMPLATE_ALTIVEC
2da0d70d 1191 /* The following list of supported dstFormat values should
780daf2b 1192 match what's found in the body of ff_yuv2packedX_altivec() */
d55ef636 1193 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1194 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1195 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1196 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1197 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1198 chrFilter, chrSrc, chrFilterSize,
1199 dest, dstW, dstY);
1200 else
1201#endif
1202 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1203 chrFilter, chrSrc, chrFilterSize,
6858492e 1204 alpSrc, dest, dstW, dstY);
1205}
1206
1207/**
1208 * vertical bilinear scale YV12 to RGB
1209 */
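/* buf0/buf1 (and uvbuf0/uvbuf1, abuf0/abuf1) are two adjacent intermediate
   lines; yalpha and uvalpha are 12-bit blend weights, so the luma blend is in
   effect buf0*yalpha1 + buf1*yalpha with yalpha1 = 4095 - yalpha (likewise for
   chroma with uvalpha) before conversion to the packed destination format. */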
1210static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1211 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1212{
1213 int yalpha1=4095- yalpha;
1214 int uvalpha1=4095-uvalpha;
2da0d70d 1215 int i;
d604bab9 1216
94daf2e9 1217#if COMPILE_TEMPLATE_MMX
1218 if(!(c->flags & SWS_BITEXACT)) {
1219 switch(c->dstFormat) {
1220 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1221 case PIX_FMT_RGB32:
1222 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
6858492e 1223#if ARCH_X86_64
c255994b 1224 __asm__ volatile(
1225 YSCALEYUV2RGB(%%r8, %5)
1226 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1227 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1228 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1229 "packuswb %%mm7, %%mm1 \n\t"
f514b4f9 1230 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
6858492e 1231
04ef1d3f 1232 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1233 "a" (&c->redDither)
1234 ,"r" (abuf0), "r" (abuf1)
f514b4f9 1235 : "%r8"
c255994b 1236 );
6858492e 1237#else
1238 *(const uint16_t **)(&c->u_temp)=abuf0;
1239 *(const uint16_t **)(&c->v_temp)=abuf1;
c255994b 1240 __asm__ volatile(
1241 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1242 "mov %4, %%"REG_b" \n\t"
1243 "push %%"REG_BP" \n\t"
1244 YSCALEYUV2RGB(%%REGBP, %5)
1245 "push %0 \n\t"
1246 "push %1 \n\t"
1247 "mov "U_TEMP"(%5), %0 \n\t"
1248 "mov "V_TEMP"(%5), %1 \n\t"
1249 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1250 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1251 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252 "packuswb %%mm7, %%mm1 \n\t"
1253 "pop %1 \n\t"
1254 "pop %0 \n\t"
1255 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1256 "pop %%"REG_BP" \n\t"
1257 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1258
1259 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1260 "a" (&c->redDither)
c255994b 1261 );
6858492e 1262#endif
1263 } else {
1264 __asm__ volatile(
1265 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1266 "mov %4, %%"REG_b" \n\t"
1267 "push %%"REG_BP" \n\t"
1268 YSCALEYUV2RGB(%%REGBP, %5)
1269 "pcmpeqd %%mm7, %%mm7 \n\t"
1270 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1271 "pop %%"REG_BP" \n\t"
1272 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1273
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 "a" (&c->redDither)
1276 );
1277 }
1278 return;
1279 case PIX_FMT_BGR24:
1280 __asm__ volatile(
1281 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1282 "mov %4, %%"REG_b" \n\t"
1283 "push %%"REG_BP" \n\t"
1284 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1285 "pxor %%mm7, %%mm7 \n\t"
1286 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1287 "pop %%"REG_BP" \n\t"
1288 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1289 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1290 "a" (&c->redDither)
1291 );
1292 return;
1293 case PIX_FMT_RGB555:
1294 __asm__ volatile(
1295 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1296 "mov %4, %%"REG_b" \n\t"
1297 "push %%"REG_BP" \n\t"
1298 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1299 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1300 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1301#ifdef DITHER1XBPP
1302 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1303 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1304 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1305#endif
1306
27a90b04 1307 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1308 "pop %%"REG_BP" \n\t"
1309 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1310
1311 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1312 "a" (&c->redDither)
1313 );
1314 return;
1315 case PIX_FMT_RGB565:
1316 __asm__ volatile(
1317 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1318 "mov %4, %%"REG_b" \n\t"
1319 "push %%"REG_BP" \n\t"
1320 YSCALEYUV2RGB(%%REGBP, %5)
40494418 1321 "pxor %%mm7, %%mm7 \n\t"
2da0d70d 1322 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1323#ifdef DITHER1XBPP
1324 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1325 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1326 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1327#endif
1328
27a90b04 1329 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1330 "pop %%"REG_BP" \n\t"
1331 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1332 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1333 "a" (&c->redDither)
1334 );
1335 return;
1336 case PIX_FMT_YUYV422:
1337 __asm__ volatile(
1338 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1339 "mov %4, %%"REG_b" \n\t"
1340 "push %%"REG_BP" \n\t"
1341 YSCALEYUV2PACKED(%%REGBP, %5)
1342 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1343 "pop %%"REG_BP" \n\t"
1344 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1345 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1346 "a" (&c->redDither)
1347 );
1348 return;
1349 default: break;
2da0d70d 1350 }
f433c8ab 1351 }
94daf2e9 1352#endif //COMPILE_TEMPLATE_MMX
9b734d44 1353 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1354}
1355
1356/**
1357 * YV12 to RGB without scaling or interpolating
1358 */
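/* uvalpha below 2048 means the nearest chroma line is used directly (the *1
   macros; as the note below says, this shifts chroma by half a pixel but is
   faster), otherwise the two chroma lines are averaged (the *1b macros).
   Luma always comes from the single line buf0. */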
7ac40327 1359static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
b411dfff 1360 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
d604bab9 1361{
1362 const int yalpha1=0;
1363 int i;
6a4970ab 1364
7ac40327 1365 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1366 const int yalpha= 4096; //FIXME ...
96034638 1367
dd68318c 1368 if (flags&SWS_FULL_CHR_H_INT) {
40fa5140 1369 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1370 return;
1371 }
397c035e 1372
94daf2e9 1373#if COMPILE_TEMPLATE_MMX
1374 if(!(flags & SWS_BITEXACT)) {
1375 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1376 switch(dstFormat) {
14014d47 1377 case PIX_FMT_RGB32:
dd68318c 1378 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
6858492e 1379 __asm__ volatile(
1380 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1381 "mov %4, %%"REG_b" \n\t"
1382 "push %%"REG_BP" \n\t"
1383 YSCALEYUV2RGB1(%%REGBP, %5)
1384 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1385 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1386 "pop %%"REG_BP" \n\t"
1387 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1388
1389 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1390 "a" (&c->redDither)
6858492e 1391 );
dd68318c 1392 } else {
3164d25e 1393 __asm__ volatile(
1394 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_b" \n\t"
1396 "push %%"REG_BP" \n\t"
1397 YSCALEYUV2RGB1(%%REGBP, %5)
1398 "pcmpeqd %%mm7, %%mm7 \n\t"
1399 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1400 "pop %%"REG_BP" \n\t"
1401 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1402
1403 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1404 "a" (&c->redDither)
1405 );
1406 }
1407 return;
1408 case PIX_FMT_BGR24:
1409 __asm__ volatile(
1410 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1411 "mov %4, %%"REG_b" \n\t"
1412 "push %%"REG_BP" \n\t"
1413 YSCALEYUV2RGB1(%%REGBP, %5)
1414 "pxor %%mm7, %%mm7 \n\t"
1415 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1416 "pop %%"REG_BP" \n\t"
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1418
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1420 "a" (&c->redDither)
1421 );
1422 return;
1423 case PIX_FMT_RGB555:
7ad6469e 1424 __asm__ volatile(
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1426 "mov %4, %%"REG_b" \n\t"
1427 "push %%"REG_BP" \n\t"
1428 YSCALEYUV2RGB1(%%REGBP, %5)
1429 "pxor %%mm7, %%mm7 \n\t"
1430 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1431#ifdef DITHER1XBPP
1432 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1433 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1434 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1435#endif
1436 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1439
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441 "a" (&c->redDither)
1442 );
1443 return;
1444 case PIX_FMT_RGB565:
7ad6469e 1445 __asm__ volatile(
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB1(%%REGBP, %5)
1450 "pxor %%mm7, %%mm7 \n\t"
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1452#ifdef DITHER1XBPP
c255994b
RP
1453 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1454 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1455 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1456#endif
1457
c255994b
RP
1458 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1459 "pop %%"REG_BP" \n\t"
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1461
c255994b
RP
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463 "a" (&c->redDither)
14014d47
MN
1464 );
1465 return;
1466 case PIX_FMT_YUYV422:
7ad6469e 1467 __asm__ volatile(
c255994b
RP
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_b" \n\t"
1470 "push %%"REG_BP" \n\t"
1471 YSCALEYUV2PACKED1(%%REGBP, %5)
1472 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1473 "pop %%"REG_BP" \n\t"
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1475
c255994b
RP
1476 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1477 "a" (&c->redDither)
14014d47
MN
1478 );
1479 return;
1480 }
dd68318c
RP
1481 } else {
1482 switch(dstFormat) {
14014d47 1483 case PIX_FMT_RGB32:
dd68318c 1484 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
6858492e 1485 __asm__ volatile(
c255994b
RP
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB1b(%%REGBP, %5)
1490 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1491 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494
1495 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1496 "a" (&c->redDither)
6858492e 1497 );
dd68318c 1498 } else {
3164d25e 1499 __asm__ volatile(
c255994b
RP
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 "pcmpeqd %%mm7, %%mm7 \n\t"
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1508
1509 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510 "a" (&c->redDither)
1511 );
1512 }
1513 return;
1514 case PIX_FMT_BGR24:
1515 __asm__ volatile(
3164d25e
CS
1516 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1517 "mov %4, %%"REG_b" \n\t"
1518 "push %%"REG_BP" \n\t"
1519 YSCALEYUV2RGB1b(%%REGBP, %5)
c255994b
RP
1520 "pxor %%mm7, %%mm7 \n\t"
1521 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
3164d25e
CS
1522 "pop %%"REG_BP" \n\t"
1523 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1524
3164d25e
CS
1525 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1526 "a" (&c->redDither)
14014d47
MN
1527 );
1528 return;
1529 case PIX_FMT_RGB555:
7ad6469e 1530 __asm__ volatile(
c255994b
RP
1531 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1532 "mov %4, %%"REG_b" \n\t"
1533 "push %%"REG_BP" \n\t"
1534 YSCALEYUV2RGB1b(%%REGBP, %5)
1535 "pxor %%mm7, %%mm7 \n\t"
1536 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1537#ifdef DITHER1XBPP
c255994b
RP
1538 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1539 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1540 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1541#endif
c255994b
RP
1542 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1543 "pop %%"REG_BP" \n\t"
1544 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1545
c255994b
RP
1546 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1547 "a" (&c->redDither)
14014d47
MN
1548 );
1549 return;
1550 case PIX_FMT_RGB565:
7ad6469e 1551 __asm__ volatile(
c255994b
RP
1552 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1553 "mov %4, %%"REG_b" \n\t"
1554 "push %%"REG_BP" \n\t"
1555 YSCALEYUV2RGB1b(%%REGBP, %5)
1556 "pxor %%mm7, %%mm7 \n\t"
1557 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1558#ifdef DITHER1XBPP
c255994b
RP
1559 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1560 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1561 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1562#endif
1563
c255994b
RP
1564 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1565 "pop %%"REG_BP" \n\t"
1566 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1567
c255994b
RP
1568 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1569 "a" (&c->redDither)
14014d47
MN
1570 );
1571 return;
1572 case PIX_FMT_YUYV422:
7ad6469e 1573 __asm__ volatile(
c255994b
RP
1574 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1575 "mov %4, %%"REG_b" \n\t"
1576 "push %%"REG_BP" \n\t"
1577 YSCALEYUV2PACKED1b(%%REGBP, %5)
1578 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
14014d47 1581
c255994b
RP
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583 "a" (&c->redDither)
14014d47
MN
1584 );
1585 return;
1586 }
2da0d70d
DB
1587 }
1588 }
94daf2e9 1589#endif /* COMPILE_TEMPLATE_MMX */
dd68318c 1590 if (uvalpha < 2048) {
6858492e 1591 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
dd68318c 1592 } else {
6858492e 1593 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1594 }
d604bab9
MN
1595}
1596
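/* yuv2packed1() above handles the non-interpolated case: a single horizontally
 * scaled source line is converted straight to the packed output format.
 * uvalpha < 2048 uses only the nearer chroma line (YSCALEYUV2RGB1 / *_1_C),
 * which shifts chrominance by half a pixel but is a bit faster; otherwise the
 * two chroma lines are averaged (YSCALEYUV2RGB1b / *_1B_C).  With SWS_BITEXACT
 * or a dstFormat not covered by the switch, the MMX paths fall through to the
 * C macros, and SWS_FULL_CHR_H_INT is delegated to yuv2packed2(). */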
8a322796 1597//FIXME yuy2* can read up to 7 samples too many
6ff0ad6b 1598
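/* The *ToY/*ToUV helpers below unpack packed 4:2:2 and high-bit-depth planar
 * input into plain 8-bit Y, U and V lines for the horizontal scaler.  The C
 * fallbacks show the byte layout:
 *   yuy2To* (YUYV): Y = src[2*i],   U = src[4*i+1], V = src[4*i+3]
 *   uyvyTo* (UYVY): Y = src[2*i+1], U = src[4*i+0], V = src[4*i+2]
 * LEToUV/BEToUV pick the high byte of 16-bit little-/big-endian planar
 * samples; the MMX versions do the same splitting with pand (bm01010101 mask)
 * or psrlw $8, followed by packuswb. */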
7ac40327 1599static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1e621b18 1600{
94daf2e9 1601#if COMPILE_TEMPLATE_MMX
7ad6469e 1602 __asm__ volatile(
c255994b
RP
1603 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1604 "mov %0, %%"REG_a" \n\t"
1605 "1: \n\t"
1606 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1607 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1608 "pand %%mm2, %%mm0 \n\t"
1609 "pand %%mm2, %%mm1 \n\t"
1610 "packuswb %%mm1, %%mm0 \n\t"
1611 "movq %%mm0, (%2, %%"REG_a") \n\t"
1612 "add $8, %%"REG_a" \n\t"
1613 " js 1b \n\t"
1614 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1615 : "%"REG_a
2da0d70d 1616 );
1e621b18 1617#else
2da0d70d
DB
1618 int i;
1619 for (i=0; i<width; i++)
1620 dst[i]= src[2*i];
1e621b18
MN
1621#endif
1622}
1623
7ac40327 1624static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1625{
94daf2e9 1626#if COMPILE_TEMPLATE_MMX
7ad6469e 1627 __asm__ volatile(
c255994b
RP
1628 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1629 "mov %0, %%"REG_a" \n\t"
1630 "1: \n\t"
1631 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1632 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1633 "psrlw $8, %%mm0 \n\t"
1634 "psrlw $8, %%mm1 \n\t"
1635 "packuswb %%mm1, %%mm0 \n\t"
1636 "movq %%mm0, %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "pand %%mm4, %%mm1 \n\t"
1639 "packuswb %%mm0, %%mm0 \n\t"
1640 "packuswb %%mm1, %%mm1 \n\t"
1641 "movd %%mm0, (%3, %%"REG_a") \n\t"
1642 "movd %%mm1, (%2, %%"REG_a") \n\t"
1643 "add $4, %%"REG_a" \n\t"
1644 " js 1b \n\t"
1645 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1646 : "%"REG_a
2da0d70d 1647 );
1e621b18 1648#else
2da0d70d 1649 int i;
dd68318c 1650 for (i=0; i<width; i++) {
2da0d70d
DB
1651 dstU[i]= src1[4*i + 1];
1652 dstV[i]= src1[4*i + 3];
1653 }
1654#endif
1655 assert(src1 == src2);
1e621b18
MN
1656}
1657
de1275d5
MN
1658static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1659{
94daf2e9 1660#if COMPILE_TEMPLATE_MMX
de1275d5 1661 __asm__ volatile(
c255994b
RP
1662 "mov %0, %%"REG_a" \n\t"
1663 "1: \n\t"
1664 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1665 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1666 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1667 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1668 "psrlw $8, %%mm0 \n\t"
1669 "psrlw $8, %%mm1 \n\t"
1670 "psrlw $8, %%mm2 \n\t"
1671 "psrlw $8, %%mm3 \n\t"
1672 "packuswb %%mm1, %%mm0 \n\t"
1673 "packuswb %%mm3, %%mm2 \n\t"
1674 "movq %%mm0, (%3, %%"REG_a") \n\t"
1675 "movq %%mm2, (%4, %%"REG_a") \n\t"
1676 "add $8, %%"REG_a" \n\t"
1677 " js 1b \n\t"
1678 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1679 : "%"REG_a
de1275d5
MN
1680 );
1681#else
1682 int i;
dd68318c 1683 for (i=0; i<width; i++) {
de1275d5
MN
1684 dstU[i]= src1[2*i + 1];
1685 dstV[i]= src2[2*i + 1];
1686 }
1687#endif
1688}
1689
4cf16bbe
DB
1690/* This is almost identical to the previous function, and exists only because
1691 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
7ac40327 1692static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
7322a67c 1693{
94daf2e9 1694#if COMPILE_TEMPLATE_MMX
7ad6469e 1695 __asm__ volatile(
c255994b
RP
1696 "mov %0, %%"REG_a" \n\t"
1697 "1: \n\t"
1698 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1699 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1700 "psrlw $8, %%mm0 \n\t"
1701 "psrlw $8, %%mm1 \n\t"
1702 "packuswb %%mm1, %%mm0 \n\t"
1703 "movq %%mm0, (%2, %%"REG_a") \n\t"
1704 "add $8, %%"REG_a" \n\t"
1705 " js 1b \n\t"
1706 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1707 : "%"REG_a
2da0d70d 1708 );
7322a67c 1709#else
2da0d70d
DB
1710 int i;
1711 for (i=0; i<width; i++)
1712 dst[i]= src[2*i+1];
7322a67c
MN
1713#endif
1714}
1715
7ac40327 1716static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
7322a67c 1717{
94daf2e9 1718#if COMPILE_TEMPLATE_MMX
7ad6469e 1719 __asm__ volatile(
c255994b
RP
1720 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1721 "mov %0, %%"REG_a" \n\t"
1722 "1: \n\t"
1723 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1724 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1725 "pand %%mm4, %%mm0 \n\t"
1726 "pand %%mm4, %%mm1 \n\t"
1727 "packuswb %%mm1, %%mm0 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "psrlw $8, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm0, %%mm0 \n\t"
1732 "packuswb %%mm1, %%mm1 \n\t"
1733 "movd %%mm0, (%3, %%"REG_a") \n\t"
1734 "movd %%mm1, (%2, %%"REG_a") \n\t"
1735 "add $4, %%"REG_a" \n\t"
1736 " js 1b \n\t"
1737 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1738 : "%"REG_a
2da0d70d 1739 );
7322a67c 1740#else
2da0d70d 1741 int i;
dd68318c 1742 for (i=0; i<width; i++) {
2da0d70d
DB
1743 dstU[i]= src1[4*i + 0];
1744 dstV[i]= src1[4*i + 2];
1745 }
1746#endif
1747 assert(src1 == src2);
7322a67c
MN
1748}
1749
de1275d5
MN
1750static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1751{
94daf2e9 1752#if COMPILE_TEMPLATE_MMX
de1275d5 1753 __asm__ volatile(
c255994b
RP
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1756 "1: \n\t"
1757 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1759 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1760 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1761 "pand %%mm4, %%mm0 \n\t"
1762 "pand %%mm4, %%mm1 \n\t"
1763 "pand %%mm4, %%mm2 \n\t"
1764 "pand %%mm4, %%mm3 \n\t"
1765 "packuswb %%mm1, %%mm0 \n\t"
1766 "packuswb %%mm3, %%mm2 \n\t"
1767 "movq %%mm0, (%3, %%"REG_a") \n\t"
1768 "movq %%mm2, (%4, %%"REG_a") \n\t"
1769 "add $8, %%"REG_a" \n\t"
1770 " js 1b \n\t"
1771 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1772 : "%"REG_a
de1275d5
MN
1773 );
1774#else
1775 int i;
dd68318c 1776 for (i=0; i<width; i++) {
de1275d5
MN
1777 dstU[i]= src1[2*i];
1778 dstV[i]= src2[2*i];
1779 }
1780#endif
1781}
1782
f415be68
RP
1783static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1784 const uint8_t *src, long width)
1785{
1786#if COMPILE_TEMPLATE_MMX
1787 __asm__ volatile(
1788 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1789 "mov %0, %%"REG_a" \n\t"
1790 "1: \n\t"
1791 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1792 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1793 "movq %%mm0, %%mm2 \n\t"
1794 "movq %%mm1, %%mm3 \n\t"
1795 "pand %%mm4, %%mm0 \n\t"
1796 "pand %%mm4, %%mm1 \n\t"
1797 "psrlw $8, %%mm2 \n\t"
1798 "psrlw $8, %%mm3 \n\t"
1799 "packuswb %%mm1, %%mm0 \n\t"
1800 "packuswb %%mm3, %%mm2 \n\t"
1801 "movq %%mm0, (%2, %%"REG_a") \n\t"
1802 "movq %%mm2, (%3, %%"REG_a") \n\t"
1803 "add $8, %%"REG_a" \n\t"
1804 " js 1b \n\t"
1805 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1806 : "%"REG_a
1807 );
1808#else
1809 int i;
1810 for (i = 0; i < width; i++) {
1811 dst1[i] = src[2*i+0];
1812 dst2[i] = src[2*i+1];
1813 }
1814#endif
1815}
1816
e470691a
RP
1817static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1818 const uint8_t *src1, const uint8_t *src2,
1819 long width, uint32_t *unused)
f415be68
RP
1820{
1821 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1822}
1823
e470691a
RP
1824static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1825 const uint8_t *src1, const uint8_t *src2,
1826 long width, uint32_t *unused)
f415be68
RP
1827{
1828 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1829}
1830
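/* NV12 stores chroma as interleaved U,V byte pairs and NV21 as V,U pairs.
 * nvXXtoUV() splits even bytes into dst1 and odd bytes into dst2, so
 * nv12ToUV() and nv21ToUV() differ only in which destination receives U
 * and which receives V. */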
94daf2e9 1831#if COMPILE_TEMPLATE_MMX
b411dfff 1832static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
dfb09bd1
MN
1833{
1834
dd68318c 1835 if(srcFormat == PIX_FMT_BGR24) {
7ad6469e 1836 __asm__ volatile(
ff9a056d
MN
1837 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1838 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1839 :
dfb09bd1 1840 );
dd68318c 1841 } else {
7ad6469e 1842 __asm__ volatile(
ff9a056d
MN
1843 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1844 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1845 :
dfb09bd1
MN
1846 );
1847 }
1848
7ad6469e 1849 __asm__ volatile(
dfb09bd1
MN
1850 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1851 "mov %2, %%"REG_a" \n\t"
1852 "pxor %%mm7, %%mm7 \n\t"
1853 "1: \n\t"
1854 PREFETCH" 64(%0) \n\t"
1855 "movd (%0), %%mm0 \n\t"
1856 "movd 2(%0), %%mm1 \n\t"
1857 "movd 6(%0), %%mm2 \n\t"
1858 "movd 8(%0), %%mm3 \n\t"
1859 "add $12, %0 \n\t"
1860 "punpcklbw %%mm7, %%mm0 \n\t"
1861 "punpcklbw %%mm7, %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm2 \n\t"
1863 "punpcklbw %%mm7, %%mm3 \n\t"
1864 "pmaddwd %%mm5, %%mm0 \n\t"
1865 "pmaddwd %%mm6, %%mm1 \n\t"
1866 "pmaddwd %%mm5, %%mm2 \n\t"
1867 "pmaddwd %%mm6, %%mm3 \n\t"
1868 "paddd %%mm1, %%mm0 \n\t"
1869 "paddd %%mm3, %%mm2 \n\t"
1870 "paddd %%mm4, %%mm0 \n\t"
1871 "paddd %%mm4, %%mm2 \n\t"
1872 "psrad $15, %%mm0 \n\t"
1873 "psrad $15, %%mm2 \n\t"
1874 "packssdw %%mm2, %%mm0 \n\t"
1875 "packuswb %%mm0, %%mm0 \n\t"
1876 "movd %%mm0, (%1, %%"REG_a") \n\t"
1877 "add $4, %%"REG_a" \n\t"
1878 " js 1b \n\t"
1879 : "+r" (src)
d0ce212a 1880 : "r" (dst+width), "g" ((x86_reg)-width)
dfb09bd1 1881 : "%"REG_a
2da0d70d 1882 );
dfb09bd1
MN
1883}
1884
b411dfff 1885static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
dfb09bd1 1886{
7ad6469e 1887 __asm__ volatile(
dfb09bd1
MN
1888 "movq 24+%4, %%mm6 \n\t"
1889 "mov %3, %%"REG_a" \n\t"
1890 "pxor %%mm7, %%mm7 \n\t"
1891 "1: \n\t"
1892 PREFETCH" 64(%0) \n\t"
1893 "movd (%0), %%mm0 \n\t"
1894 "movd 2(%0), %%mm1 \n\t"
1895 "punpcklbw %%mm7, %%mm0 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "movq %%mm0, %%mm2 \n\t"
1898 "movq %%mm1, %%mm3 \n\t"
1899 "pmaddwd %4, %%mm0 \n\t"
1900 "pmaddwd 8+%4, %%mm1 \n\t"
1901 "pmaddwd 16+%4, %%mm2 \n\t"
1902 "pmaddwd %%mm6, %%mm3 \n\t"
1903 "paddd %%mm1, %%mm0 \n\t"
1904 "paddd %%mm3, %%mm2 \n\t"
1905
1906 "movd 6(%0), %%mm1 \n\t"
1907 "movd 8(%0), %%mm3 \n\t"
1908 "add $12, %0 \n\t"
1909 "punpcklbw %%mm7, %%mm1 \n\t"
1910 "punpcklbw %%mm7, %%mm3 \n\t"
1911 "movq %%mm1, %%mm4 \n\t"
1912 "movq %%mm3, %%mm5 \n\t"
1913 "pmaddwd %4, %%mm1 \n\t"
1914 "pmaddwd 8+%4, %%mm3 \n\t"
1915 "pmaddwd 16+%4, %%mm4 \n\t"
1916 "pmaddwd %%mm6, %%mm5 \n\t"
1917 "paddd %%mm3, %%mm1 \n\t"
1918 "paddd %%mm5, %%mm4 \n\t"
1919
1920 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1921 "paddd %%mm3, %%mm0 \n\t"
1922 "paddd %%mm3, %%mm2 \n\t"
1923 "paddd %%mm3, %%mm1 \n\t"
1924 "paddd %%mm3, %%mm4 \n\t"
1925 "psrad $15, %%mm0 \n\t"
1926 "psrad $15, %%mm2 \n\t"
1927 "psrad $15, %%mm1 \n\t"
1928 "psrad $15, %%mm4 \n\t"
1929 "packssdw %%mm1, %%mm0 \n\t"
1930 "packssdw %%mm4, %%mm2 \n\t"
1931 "packuswb %%mm0, %%mm0 \n\t"
1932 "packuswb %%mm2, %%mm2 \n\t"
1933 "movd %%mm0, (%1, %%"REG_a") \n\t"
1934 "movd %%mm2, (%2, %%"REG_a") \n\t"
1935 "add $4, %%"REG_a" \n\t"
1936 " js 1b \n\t"
1937 : "+r" (src)
d0ce212a 1938 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
dfb09bd1
MN
1939 : "%"REG_a
1940 );
1941}
1942#endif
1943
7ac40327 1944static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1945{
94daf2e9 1946#if COMPILE_TEMPLATE_MMX
a35acd7f 1947 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1948#else
2da0d70d 1949 int i;
dd68318c 1950 for (i=0; i<width; i++) {
2da0d70d
DB
1951 int b= src[i*3+0];
1952 int g= src[i*3+1];
1953 int r= src[i*3+2];
1e621b18 1954
e5091488 1955 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1956 }
94daf2e9 1957#endif /* COMPILE_TEMPLATE_MMX */
1e621b18
MN
1958}
1959
7ac40327 1960static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1e621b18 1961{
94daf2e9 1962#if COMPILE_TEMPLATE_MMX
a35acd7f 1963 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1964#else
2da0d70d 1965 int i;
dd68318c 1966 for (i=0; i<width; i++) {
dfb09bd1
MN
1967 int b= src1[3*i + 0];
1968 int g= src1[3*i + 1];
1969 int r= src1[3*i + 2];
2da0d70d 1970
dfb09bd1
MN
1971 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1973 }
94daf2e9 1974#endif /* COMPILE_TEMPLATE_MMX */
2da0d70d 1975 assert(src1 == src2);
1e621b18
MN
1976}
1977
7ac40327 1978static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1979{
1980 int i;
dd68318c 1981 for (i=0; i<width; i++) {
2f60f629
MN
1982 int b= src1[6*i + 0] + src1[6*i + 3];
1983 int g= src1[6*i + 1] + src1[6*i + 4];
1984 int r= src1[6*i + 2] + src1[6*i + 5];
1985
1986 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1988 }
1989 assert(src1 == src2);
1990}
1991
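/* About the rounding constants in the plain C RGB->YUV conversions here:
 * 33 << (RGB2YUV_SHIFT-1) equals 16.5 << RGB2YUV_SHIFT, i.e. the +16 luma
 * offset plus 0.5 for rounding, and 257 << (RGB2YUV_SHIFT-1) equals
 * 128.5 << RGB2YUV_SHIFT, i.e. the +128 chroma offset plus rounding.
 * The *ToUV_half variants sum two horizontally adjacent pixels first, so the
 * offset term doubles (257 << RGB2YUV_SHIFT) and the final shift is one bit
 * larger. */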
7ac40327 1992static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
a861d4d7 1993{
94daf2e9 1994#if COMPILE_TEMPLATE_MMX
a35acd7f 1995 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1996#else
2da0d70d 1997 int i;
dd68318c 1998 for (i=0; i<width; i++) {
2da0d70d
DB
1999 int r= src[i*3+0];
2000 int g= src[i*3+1];
2001 int b= src[i*3+2];
2002
e5091488 2003 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2004 }
dfb09bd1 2005#endif
a861d4d7
MN
2006}
2007
7ac40327 2008static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
a861d4d7 2009{
94daf2e9 2010#if COMPILE_TEMPLATE_MMX
5155b839 2011 assert(src1==src2);
a35acd7f 2012 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 2013#else
5155b839
DB
2014 int i;
2015 assert(src1==src2);
dd68318c 2016 for (i=0; i<width; i++) {
dfb09bd1
MN
2017 int r= src1[3*i + 0];
2018 int g= src1[3*i + 1];
2019 int b= src1[3*i + 2];
2da0d70d 2020
dfb09bd1
MN
2021 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2023 }
dfb09bd1 2024#endif
a861d4d7
MN
2025}
2026
7ac40327 2027static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2028{
2029 int i;
2030 assert(src1==src2);
dd68318c 2031 for (i=0; i<width; i++) {
e09d7eef
MN
2032 int r= src1[6*i + 0] + src1[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2035
2036 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2038 }
2039}
2040
1e621b18 2041
8a322796 2042// bilinear / bicubic scaling
7ac40327
RP
2043static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2044 const int16_t *filter, const int16_t *filterPos, long filterSize)
2ff198c1 2045{
94daf2e9 2046#if COMPILE_TEMPLATE_MMX
2da0d70d 2047 assert(filterSize % 4 == 0 && filterSize>0);
dd68318c 2048 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
d0ce212a 2049 x86_reg counter= -2*dstW;
2da0d70d
DB
2050 filter-= counter*2;
2051 filterPos-= counter/2;
2052 dst-= counter/2;
7ad6469e 2053 __asm__ volatile(
83c89c78 2054#if defined(PIC)
c255994b 2055 "push %%"REG_b" \n\t"
2da0d70d 2056#endif
c255994b
RP
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2060 ASMALIGN(4)
2061 "1: \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2065 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2072 "movq %%mm0, %%mm4 \n\t"
2073 "punpckldq %%mm3, %%mm0 \n\t"
2074 "punpckhdq %%mm3, %%mm4 \n\t"
2075 "paddd %%mm4, %%mm0 \n\t"
2076 "psrad $7, %%mm0 \n\t"
2077 "packssdw %%mm0, %%mm0 \n\t"
2078 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2079 "add $4, %%"REG_BP" \n\t"
2080 " jnc 1b \n\t"
2081
2082 "pop %%"REG_BP" \n\t"
83c89c78 2083#if defined(PIC)
c255994b 2084 "pop %%"REG_b" \n\t"
83c89c78 2085#endif
c255994b
RP
2086 : "+a" (counter)
2087 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2088#if !defined(PIC)
c255994b 2089 : "%"REG_b
2da0d70d
DB
2090#endif
2091 );
dd68318c 2092 } else if (filterSize==8) {
d0ce212a 2093 x86_reg counter= -2*dstW;
2da0d70d
DB
2094 filter-= counter*4;
2095 filterPos-= counter/2;
2096 dst-= counter/2;
7ad6469e 2097 __asm__ volatile(
83c89c78 2098#if defined(PIC)
c255994b 2099 "push %%"REG_b" \n\t"
2da0d70d 2100#endif
c255994b
RP
2101 "pxor %%mm7, %%mm7 \n\t"
2102 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2103 "mov %%"REG_a", %%"REG_BP" \n\t"
2104 ASMALIGN(4)
2105 "1: \n\t"
2106 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2107 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2108 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2109 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2110 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2111 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2112 "punpcklbw %%mm7, %%mm0 \n\t"
2113 "punpcklbw %%mm7, %%mm2 \n\t"
2114 "pmaddwd %%mm1, %%mm0 \n\t"
2115 "pmaddwd %%mm2, %%mm3 \n\t"
2116
2117 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2118 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2119 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2120 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm4 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "pmaddwd %%mm1, %%mm4 \n\t"
2124 "pmaddwd %%mm2, %%mm5 \n\t"
2125 "paddd %%mm4, %%mm0 \n\t"
2126 "paddd %%mm5, %%mm3 \n\t"
2127 "movq %%mm0, %%mm4 \n\t"
2128 "punpckldq %%mm3, %%mm0 \n\t"
2129 "punpckhdq %%mm3, %%mm4 \n\t"
2130 "paddd %%mm4, %%mm0 \n\t"
2131 "psrad $7, %%mm0 \n\t"
2132 "packssdw %%mm0, %%mm0 \n\t"
2133 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2134 "add $4, %%"REG_BP" \n\t"
2135 " jnc 1b \n\t"
2136
2137 "pop %%"REG_BP" \n\t"
83c89c78 2138#if defined(PIC)
c255994b 2139 "pop %%"REG_b" \n\t"
83c89c78 2140#endif
c255994b
RP
2141 : "+a" (counter)
2142 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2143#if !defined(PIC)
c255994b 2144 : "%"REG_b
2da0d70d
DB
2145#endif
2146 );
dd68318c 2147 } else {
a959e247 2148 const uint8_t *offset = src+filterSize;
d0ce212a 2149 x86_reg counter= -2*dstW;
2da0d70d
DB
2150 //filter-= counter*filterSize/2;
2151 filterPos-= counter/2;
2152 dst-= counter/2;
7ad6469e 2153 __asm__ volatile(
c255994b
RP
2154 "pxor %%mm7, %%mm7 \n\t"
2155 ASMALIGN(4)
2156 "1: \n\t"
2157 "mov %2, %%"REG_c" \n\t"
2158 "movzwl (%%"REG_c", %0), %%eax \n\t"
2159 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2160 "mov %5, %%"REG_c" \n\t"
2161 "pxor %%mm4, %%mm4 \n\t"
2162 "pxor %%mm5, %%mm5 \n\t"
2163 "2: \n\t"
2164 "movq (%1), %%mm1 \n\t"
2165 "movq (%1, %6), %%mm3 \n\t"
2166 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2167 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2168 "punpcklbw %%mm7, %%mm0 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "pmaddwd %%mm1, %%mm0 \n\t"
2171 "pmaddwd %%mm2, %%mm3 \n\t"
2172 "paddd %%mm3, %%mm5 \n\t"
2173 "paddd %%mm0, %%mm4 \n\t"
2174 "add $8, %1 \n\t"
2175 "add $4, %%"REG_c" \n\t"
2176 "cmp %4, %%"REG_c" \n\t"
2177 " jb 2b \n\t"
2178 "add %6, %1 \n\t"
2179 "movq %%mm4, %%mm0 \n\t"
2180 "punpckldq %%mm5, %%mm4 \n\t"
2181 "punpckhdq %%mm5, %%mm0 \n\t"
2182 "paddd %%mm0, %%mm4 \n\t"
2183 "psrad $7, %%mm4 \n\t"
2184 "packssdw %%mm4, %%mm4 \n\t"
2185 "mov %3, %%"REG_a" \n\t"
2186 "movd %%mm4, (%%"REG_a", %0) \n\t"
2187 "add $4, %0 \n\t"
2188 " jnc 1b \n\t"
2189
2190 : "+r" (counter), "+r" (filter)
2191 : "m" (filterPos), "m" (dst), "m"(offset),
2192 "m" (src), "r" ((x86_reg)filterSize*2)
2193 : "%"REG_a, "%"REG_c, "%"REG_d
2da0d70d
DB
2194 );
2195 }
077ea8a7 2196#else
94daf2e9 2197#if COMPILE_TEMPLATE_ALTIVEC
2da0d70d 2198 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2199#else
2da0d70d 2200 int i;
dd68318c 2201 for (i=0; i<dstW; i++) {
2da0d70d
DB
2202 int j;
2203 int srcPos= filterPos[i];
2204 int val=0;
2205 //printf("filterPos: %d\n", filterPos[i]);
dd68318c 2206 for (j=0; j<filterSize; j++) {
2da0d70d
DB
2207 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2208 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2209 }
2210 //filter += hFilterSize;
881c4294 2211 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2212 //dst[i] = val>>7;
2213 }
94daf2e9
RP
2214#endif /* COMPILE_ALTIVEC */
2215#endif /* COMPILE_MMX */
077ea8a7 2216}
392b6567 2217
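/* hScale() is the generic horizontal scaler; as the C fallback shows, it
 * computes dst[i] = min(sum_j src[filterPos[i]+j]*filter[filterSize*i+j] >> 7,
 * 32767).  Assuming the usual filter normalization set up in swscale.c
 * (coefficients summing to 1<<14), each output sample is the 8-bit input
 * scaled by 128, i.e. 15-bit intermediate precision.  The MMX code
 * special-cases filterSize 4 and 8 and uses a generic inner loop otherwise;
 * AltiVec has its own implementation in hScale_altivec_real(). */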
bae76dc3
RP
2218//FIXME all pal and rgb srcFormats could do this conversion as well
2219//FIXME all scalers more complex than bilinear could do half of this transform
2220static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2221{
2222 int i;
2223 for (i = 0; i < width; i++) {
2224 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2225 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2226 }
2227}
2228static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2229{
2230 int i;
2231 for (i = 0; i < width; i++) {
2232 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2233 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2234 }
2235}
2236static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2237{
2238 int i;
2239 for (i = 0; i < width; i++)
2240 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2241}
2242static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2243{
2244 int i;
2245 for (i = 0; i < width; i++)
2246 dst[i] = (dst[i]*14071 + 33561947)>>14;
2247}
2248
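/* The *RangeToJpeg/*RangeFromJpeg helpers convert the 15-bit intermediate
 * samples between limited range (16..235 luma, 16..240 chroma) and full
 * "JPEG" range (0..255); the FFMIN() clamps keep the expanding direction from
 * overflowing the 16-bit buffers.  They are called through
 * c->lumConvertRange / c->chrConvertRange right after horizontal scaling in
 * hyscale()/hcscale() below. */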
18c61752
RP
2249#define FAST_BILINEAR_X86 \
2250 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2251 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2252 "shll $16, %%edi \n\t" \
2253 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2254 "mov %1, %%"REG_D"\n\t" \
2255 "shrl $9, %%esi \n\t" \
2256
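/* FAST_BILINEAR_X86 expects src[xx] in %edi, src[xx+1] in %esi and the 16-bit
 * fractional x position in %ecx; it leaves
 *     ((src[xx] << 16) + (src[xx+1] - src[xx]) * xalpha) >> 9
 * in %esi, which is the same "src[xx]*128 plus 7-bit blended difference"
 * result as the C fallback at the end of hyscale_fast()/hcscale_fast(). */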
392b6567 2257static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
a1f4b4bb 2258 long dstWidth, const uint8_t *src, int srcW,
392b6567
RP
2259 int xInc)
2260{
a1f4b4bb
RP
2261#if ARCH_X86 && CONFIG_GPL
2262#if COMPILE_TEMPLATE_MMX2
0cc854e3
RP
2263 int32_t *filterPos = c->hLumFilterPos;
2264 int16_t *filter = c->hLumFilter;
a1f4b4bb
RP
2265 int canMMX2BeUsed = c->canMMX2BeUsed;
2266 void *mmx2FilterCode= c->lumMmx2FilterCode;
2267 int i;
2268#if defined(PIC)
2269 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2270#endif
2271 if (canMMX2BeUsed) {
2272 __asm__ volatile(
2273#if defined(PIC)
2274 "mov %%"REG_b", %5 \n\t"
2275#endif
2276 "pxor %%mm7, %%mm7 \n\t"
2277 "mov %0, %%"REG_c" \n\t"
2278 "mov %1, %%"REG_D" \n\t"
2279 "mov %2, %%"REG_d" \n\t"
2280 "mov %3, %%"REG_b" \n\t"
2281 "xor %%"REG_a", %%"REG_a" \n\t" // i
2282 PREFETCH" (%%"REG_c") \n\t"
2283 PREFETCH" 32(%%"REG_c") \n\t"
2284 PREFETCH" 64(%%"REG_c") \n\t"
2285
2286#if ARCH_X86_64
2287
2288#define CALL_MMX2_FILTER_CODE \
2289 "movl (%%"REG_b"), %%esi \n\t"\
2290 "call *%4 \n\t"\
2291 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2292 "add %%"REG_S", %%"REG_c" \n\t"\
2293 "add %%"REG_a", %%"REG_D" \n\t"\
2294 "xor %%"REG_a", %%"REG_a" \n\t"\
2295
2296#else
2297
2298#define CALL_MMX2_FILTER_CODE \
2299 "movl (%%"REG_b"), %%esi \n\t"\
2300 "call *%4 \n\t"\
2301 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2302 "add %%"REG_a", %%"REG_D" \n\t"\
2303 "xor %%"REG_a", %%"REG_a" \n\t"\
2304
2305#endif /* ARCH_X86_64 */
2306
2307 CALL_MMX2_FILTER_CODE
2308 CALL_MMX2_FILTER_CODE
2309 CALL_MMX2_FILTER_CODE
2310 CALL_MMX2_FILTER_CODE
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2313 CALL_MMX2_FILTER_CODE
2314 CALL_MMX2_FILTER_CODE
2315
2316#if defined(PIC)
2317 "mov %5, %%"REG_b" \n\t"
2318#endif
0cc854e3 2319 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
a1f4b4bb
RP
2320 "m" (mmx2FilterCode)
2321#if defined(PIC)
2322 ,"m" (ebxsave)
2323#endif
2324 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2325#if !defined(PIC)
2326 ,"%"REG_b
2327#endif
2328 );
2329 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2330 } else {
2331#endif /* COMPILE_TEMPLATE_MMX2 */
2332 x86_reg xInc_shr16 = xInc >> 16;
2333 uint16_t xInc_mask = xInc & 0xffff;
2334 //NO MMX just normal asm ...
2335 __asm__ volatile(
2336 "xor %%"REG_a", %%"REG_a" \n\t" // i
2337 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2338 "xorl %%ecx, %%ecx \n\t" // xalpha
2339 ASMALIGN(4)
2340 "1: \n\t"
2341 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2342 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2343 FAST_BILINEAR_X86
2344 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2345 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2346 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2347
2348 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2349 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2350 FAST_BILINEAR_X86
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2352 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2353 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2354
2355
2356 "add $2, %%"REG_a" \n\t"
2357 "cmp %2, %%"REG_a" \n\t"
2358 " jb 1b \n\t"
2359
2360
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2363 );
2364#if COMPILE_TEMPLATE_MMX2
2365 } //if MMX2 can't be used
2366#endif
2367#else
392b6567
RP
2368 int i;
2369 unsigned int xpos=0;
dd68318c 2370 for (i=0;i<dstWidth;i++) {
392b6567
RP
2371 register unsigned int xx=xpos>>16;
2372 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2373 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2374 xpos+=xInc;
2375 }
a1f4b4bb 2376#endif /* ARCH_X86 */
392b6567
RP
2377}
2378
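/* hyscale_fast() implements fast-bilinear horizontal scaling for luma: the
 * source position advances in 16.16 fixed point (xpos += xInc) and each
 * output sample is a 7-bit blend of two neighbouring input samples, scaled
 * by 128.  The MMX2 path jumps into run-time generated code
 * (c->lumMmx2FilterCode) via CALL_MMX2_FILTER_CODE and then fixes up the last
 * few samples where the source index would pass srcW-1
 * (dst[i] = src[srcW-1]*128). */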
2ff198c1 2379 // *** horizontal scale Y line to temp buffer
7ac40327 2380static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
aaba7e6c 2381 const int16_t *hLumFilter,
7ac40327 2382 const int16_t *hLumFilterPos, int hLumFilterSize,
aaba7e6c 2383 uint8_t *formatConvBuffer,
95b5770b 2384 uint32_t *pal, int isAlpha)
077ea8a7 2385{
bb53e1d1 2386 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
bae76dc3 2387 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
95b5770b 2388
84011f10 2389 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
e8417235 2390
bb53e1d1
RP
2391 if (toYV12) {
2392 toYV12(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2393 src= formatConvBuffer;
2394 }
1e621b18 2395
996de2fe 2396 if (!c->hyscale_fast) {
40fa5140 2397 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
dd68318c 2398 } else { // fast bilinear upscale / crap downscale
a1f4b4bb
RP
2399 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2400 }
2401
2402 if (convertRange)
2403 convertRange(dst, dstWidth);
2404}
2405
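/* hyscale() produces one horizontally scaled luma (or alpha) line: the input
 * is first converted to a plain 8-bit line via c->lumToYV12/c->alpToYV12 into
 * formatConvBuffer if such a converter is set, then scaled with c->hScale()
 * or, when fast bilinear scaling is enabled, c->hyscale_fast(), and finally
 * passed through c->lumConvertRange (never for alpha). */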
2406static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2407 long dstWidth, const uint8_t *src1,
2408 const uint8_t *src2, int srcW, int xInc)
2409{
57f9a560 2410#if ARCH_X86 && CONFIG_GPL
94daf2e9 2411#if COMPILE_TEMPLATE_MMX2
0cc854e3
RP
2412 int32_t *filterPos = c->hChrFilterPos;
2413 int16_t *filter = c->hChrFilter;
a1f4b4bb
RP
2414 int canMMX2BeUsed = c->canMMX2BeUsed;
2415 void *mmx2FilterCode= c->chrMmx2FilterCode;
2416 int i;
83c89c78 2417#if defined(PIC)
a1f4b4bb 2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
83c89c78 2419#endif
a1f4b4bb
RP
2420 if (canMMX2BeUsed) {
2421 __asm__ volatile(
83c89c78 2422#if defined(PIC)
a1f4b4bb 2423 "mov %%"REG_b", %6 \n\t"
2da0d70d 2424#endif
a1f4b4bb
RP
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2434
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2446
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2ff198c1 2451
83c89c78 2452#if defined(PIC)
a1f4b4bb 2453 "mov %6, %%"REG_b" \n\t"
83c89c78 2454#endif
0cc854e3 2455 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
a1f4b4bb 2456 "m" (mmx2FilterCode), "m" (src2)
83c89c78 2457#if defined(PIC)
a1f4b4bb 2458 ,"m" (ebxsave)
83c89c78 2459#endif
a1f4b4bb 2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2461#if !defined(PIC)
a1f4b4bb 2462 ,"%"REG_b
2da0d70d 2463#endif
a1f4b4bb
RP
2464 );
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2469 }
2470 } else {
94daf2e9 2471#endif /* COMPILE_TEMPLATE_MMX2 */
a1f4b4bb 2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2da0d70d 2473 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2474 __asm__ volatile(
a1f4b4bb
RP
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
c255994b 2478 ASMALIGN(4)
a1f4b4bb
RP
2479 "1: \n\t"
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
c255994b
RP
2483 FAST_BILINEAR_X86
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
c255994b 2485
a1f4b4bb
RP
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
c255994b 2488 FAST_BILINEAR_X86
a1f4b4bb 2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
c255994b 2490
a1f4b4bb
RP
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2495 " jb 1b \n\t"
c255994b 2496
a1f4b4bb
RP
2497/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498which is needed to support GCC 4.0. */
2499#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2501#else
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2503#endif
2504 "r" (src2)
c255994b 2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2da0d70d 2506 );
94daf2e9 2507#if COMPILE_TEMPLATE_MMX2
a1f4b4bb 2508 } //if MMX2 can't be used
2ff198c1
MN
2509#endif
2510#else
392b6567
RP
2511 int i;
2512 unsigned int xpos=0;
dd68318c 2513 for (i=0;i<dstWidth;i++) {
392b6567
RP
2514 register unsigned int xx=xpos>>16;
2515 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2516 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2517 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2518 /* slower
2519 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2520 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2521 */
2522 xpos+=xInc;
2523 }
a1f4b4bb 2524#endif /* ARCH_X86 */
392b6567
RP
2525}
2526
7ac40327 2527inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
aaba7e6c 2528 int srcW, int xInc, const int16_t *hChrFilter,
7ac40327 2529 const int16_t *hChrFilterPos, int hChrFilterSize,
aaba7e6c 2530 uint8_t *formatConvBuffer,
95b5770b 2531 uint32_t *pal)
2ff198c1 2532{
95b5770b 2533
84011f10
RP
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
e8417235 2536
bb53e1d1
RP
2537 if (c->chrToYV12) {
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2539 src1= formatConvBuffer;
8b2fce0d 2540 src2= formatConvBuffer+VOFW;
e28630fc 2541 }
1e621b18 2542
996de2fe 2543 if (!c->hcscale_fast) {
40fa5140
RP
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
dd68318c 2546 } else { // fast bilinear upscale / crap downscale
40fa5140 2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2da0d70d 2548 }
bae76dc3
RP
2549
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
077ea8a7
MN
2552}
2553
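/* hcscale() is the chroma counterpart of hyscale(): both chroma planes are
 * converted (c->chrToYV12) and scaled in one call, with the U result stored
 * at dst[0..] and the V result at dst[VOFW..].  swScale() skips it entirely
 * when c->needs_hcscale is 0. */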
c866c7d0
RP
2554#define DEBUG_SWSCALE_BUFFERS 0
2555#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2556
a959e247 2557static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
dd68318c
RP
2558 int srcSliceH, uint8_t* dst[], int dstStride[])
2559{
2da0d70d
DB
2560 /* load a few things into local vars to make the code more readable and faster */
2561 const int srcW= c->srcW;
2562 const int dstW= c->dstW;
2563 const int dstH= c->dstH;
2564 const int chrDstW= c->chrDstW;
2565 const int chrSrcW= c->chrSrcW;
2566 const int lumXInc= c->lumXInc;
2567 const int chrXInc= c->chrXInc;
b411dfff 2568 const enum PixelFormat dstFormat= c->dstFormat;
2da0d70d 2569 const int flags= c->flags;
2da0d70d
DB
2570 int16_t *vLumFilterPos= c->vLumFilterPos;
2571 int16_t *vChrFilterPos= c->vChrFilterPos;
2572 int16_t *hLumFilterPos= c->hLumFilterPos;
2573 int16_t *hChrFilterPos= c->hChrFilterPos;
2574 int16_t *vLumFilter= c->vLumFilter;
2575 int16_t *vChrFilter= c->vChrFilter;
2576 int16_t *hLumFilter= c->hLumFilter;
2577 int16_t *hChrFilter= c->hChrFilter;
2578 int32_t *lumMmxFilter= c->lumMmxFilter;
2579 int32_t *chrMmxFilter= c->chrMmxFilter;
bc45751f 2580 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2da0d70d
DB
2581 const int vLumFilterSize= c->vLumFilterSize;
2582 const int vChrFilterSize= c->vChrFilterSize;
2583 const int hLumFilterSize= c->hLumFilterSize;
2584 const int hChrFilterSize= c->hChrFilterSize;
2585 int16_t **lumPixBuf= c->lumPixBuf;
2586 int16_t **chrPixBuf= c->chrPixBuf;
6858492e 2587 int16_t **alpPixBuf= c->alpPixBuf;
2da0d70d
DB
2588 const int vLumBufSize= c->vLumBufSize;
2589 const int vChrBufSize= c->vChrBufSize;
2da0d70d
DB
2590 uint8_t *formatConvBuffer= c->formatConvBuffer;
2591 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2592 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2593 int lastDstY;
e150ef8d 2594 uint32_t *pal=c->pal_yuv;
2da0d70d 2595
8a322796 2596 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2597 int dstY= c->dstY;
2598 int lumBufIndex= c->lumBufIndex;
2599 int chrBufIndex= c->chrBufIndex;
2600 int lastInLumBuf= c->lastInLumBuf;
2601 int lastInChrBuf= c->lastInChrBuf;
2602
dd68318c 2603 if (isPacked(c->srcFormat)) {
2da0d70d
DB
2604 src[0]=
2605 src[1]=
6858492e
CS
2606 src[2]=
2607 src[3]= src[0];
2da0d70d
DB
2608 srcStride[0]=
2609 srcStride[1]=
6858492e
CS
2610 srcStride[2]=
2611 srcStride[3]= srcStride[0];
2da0d70d
DB
2612 }
2613 srcStride[1]<<= c->vChrDrop;
2614 srcStride[2]<<= c->vChrDrop;
2615
c866c7d0
RP
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2618 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY, srcSliceH, dstY, dstH);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2da0d70d 2623
dd68318c 2624 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
6683a37f 2625 static int warnedAlready=0; //FIXME move this into the context perhaps
dd68318c 2626 if (flags & SWS_PRINT_INFO && !warnedAlready) {
4b0c30b7 2627 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2628 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2629 warnedAlready=1;
2da0d70d
DB
2630 }
2631 }
2632
8a322796
DB
2633 /* Note: the user might start scaling in the middle of the picture, so this
2634 will not get executed. This is not really intended, but it currently works,
2635 so people might do it. */
dd68318c 2636 if (srcSliceY ==0) {
75084e47
RP
2637 lumBufIndex=-1;
2638 chrBufIndex=-1;
2da0d70d
DB
2639 dstY=0;
2640 lastInLumBuf= -1;
2641 lastInChrBuf= -1;
2642 }
2643
2644 lastDstY= dstY;
2645
dd68318c 2646 for (;dstY < dstH; dstY++) {
2da0d70d
DB
2647 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2648 const int chrDstY= dstY>>c->chrDstVSubSample;
2649 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2650 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
6858492e 2651 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2da0d70d
DB
2652
2653 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2654 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
fb91df39
RP
2655 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2656 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2657 int enough_lines;
2da0d70d 2658
2da0d70d
DB
2659 //handle holes (FAST_BILINEAR & weird filters)
2660 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2661 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
fcc402b1
LB
2662 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2663 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2664
2665 // Do we have enough lines in this slice to output the dstY line
fb91df39
RP
2666 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2667 if (!enough_lines) {
2668 lastLumSrcY = srcSliceY + srcSliceH - 1;
2669 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2670 }
2671
c866c7d0
RP
2672 DEBUG_BUFFERS("dstY: %d\n", dstY);
2673 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2674 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2675 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2676 firstChrSrcY, lastChrSrcY, lastInChrBuf);
fb91df39 2677
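        /* lumPixBuf/chrPixBuf/alpPixBuf are ring buffers of horizontally
           scaled lines; the pointer arrays hold 2*vLumBufSize resp.
           2*vChrBufSize entries (see the asserts below) so a vertical filter
           window can be addressed without wrapping.  lastInLumBuf/lastInChrBuf
           are the last source lines already scaled, lumBufIndex/chrBufIndex
           the current write positions; the loops below scale exactly the new
           source lines needed for output line dstY. */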
cbdc08d5 2678 //Do horizontal scaling
dd68318c 2679 while(lastInLumBuf < lastLumSrcY) {
a959e247
ZM
2680 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2681 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
cbdc08d5 2682 lumBufIndex++;
c866c7d0
RP
2683 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2684 lumBufIndex, lastInLumBuf);
cbdc08d5
RP
2685 assert(lumBufIndex < 2*vLumBufSize);
2686 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2687 assert(lastInLumBuf + 1 - srcSliceY >= 0);
cbdc08d5 2688 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
aaba7e6c
RP
2689 hLumFilter, hLumFilterPos, hLumFilterSize,
2690 formatConvBuffer,
cbdc08d5
RP
2691 pal, 0);
2692 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2693 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
aaba7e6c
RP
2694 hLumFilter, hLumFilterPos, hLumFilterSize,
2695 formatConvBuffer,
cbdc08d5
RP
2696 pal, 1);
2697 lastInLumBuf++;
2698 }
dd68318c 2699 while(lastInChrBuf < lastChrSrcY) {
a959e247
ZM
2700 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2701 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
cbdc08d5 2702 chrBufIndex++;
c866c7d0
RP
2703 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2704 chrBufIndex, lastInChrBuf);
cbdc08d5
RP
2705 assert(chrBufIndex < 2*vChrBufSize);
2706 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2707 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2709 //FIXME pass these parameters through the context struct (at least some of them)
2709
e7a47515 2710 if (c->needs_hcscale)
cbdc08d5 2711 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
aaba7e6c
RP
2712 hChrFilter, hChrFilterPos, hChrFilterSize,
2713 formatConvBuffer,
cbdc08d5
RP
2714 pal);
2715 lastInChrBuf++;
2716 }
2717 //wrap buf index around to stay inside the ring buffer
2718 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2719 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2720 if (!enough_lines)
2da0d70d 2721 break; //we can't output a dstY line so let's try with the next slice
d3f41512 2722
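        /* Per-line ordered dither for the 15/16 bpp RGB outputs: the tables
           alternate between even and odd output lines (red in opposite phase
           to blue), and green gets the ff_dither4 table only for 565 output;
           555 has just 5 green bits and uses the same ff_dither8 table as red
           and blue. */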
94daf2e9 2723#if COMPILE_TEMPLATE_MMX
88e2a9ae 2724 c->blueDither= ff_dither8[dstY&1];
92c7b471 2725 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 2726 c->greenDither= ff_dither8[dstY&1];
92c7b471 2727 else
88e2a9ae
CEH
2728 c->greenDither= ff_dither4[dstY&1];
2729 c->redDither= ff_dither8[(dstY+1)&1];
2da0d70d 2730#endif
dd68318c 2731 if (dstY < dstH-2) {
7ac40327
RP
2732 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2733 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2734 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
94daf2e9 2735#if COMPILE_TEMPLATE_MMX
2da0d70d 2736 int i;
dd68318c 2737 if (flags & SWS_ACCURATE_RND) {
9b734d44 2738 int s= APCK_SIZE / 8;
dd68318c 2739 for (i=0; i<vLumFilterSize; i+=2) {
a959e247
ZM
2740 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2741 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
9b734d44
RP
2742 lumMmxFilter[s*i+APCK_COEF/4 ]=
2743 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2744 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
dd68318c 2745 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
a959e247
ZM
2746 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2747 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
9b734d44
RP
2748 alpMmxFilter[s*i+APCK_COEF/4 ]=
2749 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2750 }
6858492e 2751 }
dd68318c 2752 for (i=0; i<vChrFilterSize; i+=2) {
a959e247
ZM
2753 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2754 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
9b734d44
RP
2755 chrMmxFilter[s*i+APCK_COEF/4 ]=
2756 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2757 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2758 }
dd68318c
RP
2759 } else {
2760 for (i=0; i<vLumFilterSize; i++) {
9b734d44
RP
2761 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2762 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2763 lumMmxFilter[4*i+2]=
2764 lumMmxFilter[4*i+3]=
2765 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
dd68318c 2766 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
9b734d44
RP
2767 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2768 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2769 alpMmxFilter[4*i+2]=
2770 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2771 }
2772 }
dd68318c 2773 for (i=0; i<vChrFilterSize; i++) {
9b734d44
RP
2774 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2775 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2776 chrMmxFilter[4*i+2]=
2777 chrMmxFilter[4*i+3]=
2778 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
6858492e 2779 }
2da0d70d 2780 }
6542b44e 2781#endif
dd68318c 2782 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2da0d70d
DB
2783 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2784 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
40fa5140 2785 c->yuv2nv12X(c,
9b734d44
RP
2786 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2787 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2788 dest, uDest, dstW, chrDstW, dstFormat);
dd68318c 2789 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2da0d70d
DB
2790 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2791 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
dd68318c 2792 if (is16BPS(dstFormat)) {
52154148 2793 yuv2yuvX16inC(
9b734d44
RP
2794 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2795 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2796 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2797 dstFormat);
dd68318c 2798 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
a959e247
ZM
2799 const int16_t *lumBuf = lumSrcPtr[0];
2800 const int16_t *chrBuf= chrSrcPtr[0];
2801 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
40fa5140 2802 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
dd68318c 2803 } else { //General YV12
40fa5140 2804 c->yuv2yuvX(c,
9b734d44
RP
2805 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2806 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2807 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2da0d70d 2808 }
dd68318c 2809 } else {
fcc402b1
LB
2810 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2811 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
dd68318c 2812 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2da0d70d 2813 int chrAlpha= vChrFilter[2*dstY+1];
dd68318c 2814 if(flags & SWS_FULL_CHR_H_INT) {
f0faee4c 2815 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
9b734d44
RP
2816 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2817 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2818 alpSrcPtr, dest, dstW, dstY);
dd68318c 2819 } else {
40fa5140 2820 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
9b734d44
RP
2821 alpPixBuf ? *alpSrcPtr : NULL,
2822 dest, dstW, chrAlpha, dstFormat, flags, dstY);
f0faee4c 2823 }
dd68318c 2824 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2da0d70d
DB
2825 int lumAlpha= vLumFilter[2*dstY+1];
2826 int chrAlpha= vChrFilter[2*dstY+1];
2827 lumMmxFilter[2]=
2828 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2829 chrMmxFilter[2]=
2830 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
dd68318c 2831 if(flags & SWS_FULL_CHR_H_INT) {
f0faee4c 2832 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
9b734d44
RP
2833 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2834 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2835 alpSrcPtr, dest, dstW, dstY);
dd68318c 2836 } else {
40fa5140 2837 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
9b734d44
RP
2838 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2839 dest, dstW, lumAlpha, chrAlpha, dstY);
f0faee4c 2840 }
dd68318c
RP
2841 } else { //general RGB
2842 if(flags & SWS_FULL_CHR_H_INT) {
f0faee4c 2843 yuv2rgbXinC_full(c,
9b734d44
RP
2844 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2845 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2846 alpSrcPtr, dest, dstW, dstY);
dd68318c 2847 } else {
40fa5140 2848 c->yuv2packedX(c,
9b734d44
RP
2849 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2850 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2851 alpSrcPtr, dest, dstW, dstY);
f0faee4c 2852 }
2da0d70d
DB
2853 }
2854 }
dd68318c 2855 } else { // MMX cannot be used here without overwriting this array's tail
7ac40327
RP
2856 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2857 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2858 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
dd68318c 2859 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2da0d70d
DB
2860 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2861 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2862 yuv2nv12XinC(
9b734d44
RP
2863 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2864 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2865 dest, uDest, dstW, chrDstW, dstFormat);
dd68318c 2866 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2da0d70d
DB
2867 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2868 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
dd68318c 2869 if (is16BPS(dstFormat)) {
52154148 2870 yuv2yuvX16inC(
9b734d44
RP
2871 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2872 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2873 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2874 dstFormat);
dd68318c 2875 } else {
ebe5dec2 2876 yuv2yuvXinC(
9b734d44
RP
2877 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2878 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2879 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
52154148 2880 }
dd68318c 2881 } else {
fcc402b1
LB
2882 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2883 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
dd68318c 2884 if(flags & SWS_FULL_CHR_H_INT) {
f0faee4c 2885 yuv2rgbXinC_full(c,
9b734d44
RP
2886 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2887 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2888 alpSrcPtr, dest, dstW, dstY);
dd68318c 2889 } else {
14014d47 2890 yuv2packedXinC(c,
9b734d44
RP
2891 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2892 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 alpSrcPtr, dest, dstW, dstY);
f0faee4c 2894 }
2da0d70d
DB
2895 }
2896 }
2897 }
17f715fa 2898
6268f55b
CS
2899 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2900 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2901
94daf2e9 2902#if COMPILE_TEMPLATE_MMX
5b7c7dd3
RP
2903 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2904 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2905 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2906 else __asm__ volatile("emms" :::"memory");
17f715fa 2907#endif
2da0d70d
DB
2908 /* store changed local vars back in the context */
2909 c->dstY= dstY;
2910 c->lumBufIndex= lumBufIndex;
2911 c->chrBufIndex= chrBufIndex;
2912 c->lastInLumBuf= lastInLumBuf;
2913 c->lastInChrBuf= lastInChrBuf;
d4e24275 2914
2da0d70d 2915 return dstY - lastDstY;
627690b5 2916}
40fa5140
RP
2917
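/* sws_init_swScale() wires the function pointers of this template instance
 * (plain C, MMX, MMX2 or AltiVec, selected via RENAME()/COMPILE_TEMPLATE_*)
 * into the SwsContext: vertical scalers, hScale, the optional fast-bilinear
 * horizontal scalers and the per-format input unpackers.  Helpers without a
 * template-specific version (palToUV, rgb48ToUV, bgr16ToY, ...) are shared C
 * functions. */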
2918static void RENAME(sws_init_swScale)(SwsContext *c)
2919{
2920 enum PixelFormat srcFormat = c->srcFormat;
2921
2922 c->yuv2nv12X = RENAME(yuv2nv12X );
2923 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2924 c->yuv2yuvX = RENAME(yuv2yuvX );
2925 c->yuv2packed1 = RENAME(yuv2packed1 );
2926 c->yuv2packed2 = RENAME(yuv2packed2 );
2927 c->yuv2packedX = RENAME(yuv2packedX );
2928
2929 c->hScale = RENAME(hScale );
2930
b501a1f5
RP
2931#if COMPILE_TEMPLATE_MMX
2932 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2933 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2934#else
2935 if (c->flags & SWS_FAST_BILINEAR)
2936#endif
2937 {
2938 c->hyscale_fast = RENAME(hyscale_fast);
2939 c->hcscale_fast = RENAME(hcscale_fast);
b501a1f5 2940 }
40fa5140 2941
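    /* chrToYV12 converts the source chroma (or packed RGB) to the planar
       intermediate; it stays NULL when the input chroma is already planar. */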
bb53e1d1 2942 c->chrToYV12 = NULL;
40fa5140 2943 switch(srcFormat) {
2944 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2945 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2946 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2947 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2948 case PIX_FMT_RGB8 :
2949 case PIX_FMT_BGR8 :
2950 case PIX_FMT_PAL8 :
2951 case PIX_FMT_BGR4_BYTE:
bb53e1d1 2952 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2953 case PIX_FMT_YUV420P16BE:
2954 case PIX_FMT_YUV422P16BE:
bb53e1d1 2955 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2956 case PIX_FMT_YUV420P16LE:
2957 case PIX_FMT_YUV422P16LE:
bb53e1d1 2958 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2959 }
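    /* For horizontally subsampled chroma the *_half variants are used; they
       average two adjacent source pixels per output chroma sample. */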
2960 if (c->chrSrcHSubSample) {
2961 switch(srcFormat) {
e8417235 2962 case PIX_FMT_RGB48BE:
bb53e1d1 2963 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
40fa5140 2964 case PIX_FMT_RGB32 :
2965 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2966 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2967 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2968 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
40fa5140 2969 case PIX_FMT_BGR32 :
2970 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2971 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2972 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2973 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2974 }
2975 } else {
2976 switch(srcFormat) {
e8417235 2977 case PIX_FMT_RGB48BE:
bb53e1d1 2978 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
40fa5140 2979 case PIX_FMT_RGB32 :
2980 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2981 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2982 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2983 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
40fa5140 2984 case PIX_FMT_BGR32 :
2985 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2986 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2987 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2988 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2989 }
2990 }
2991
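    /* lumToYV12 extracts/converts the luma plane; alpToYV12 does the same for
       alpha when the source carries an alpha channel. */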
2992 c->lumToYV12 = NULL;
2993 c->alpToYV12 = NULL;
2994 switch (srcFormat) {
2995 case PIX_FMT_YUYV422 :
2996 case PIX_FMT_YUV420P16BE:
2997 case PIX_FMT_YUV422P16BE:
2998 case PIX_FMT_YUV444P16BE:
bb53e1d1 2999 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
40fa5140 3000 case PIX_FMT_UYVY422 :
3001 case PIX_FMT_YUV420P16LE:
3002 case PIX_FMT_YUV422P16LE:
3003 case PIX_FMT_YUV444P16LE:
3004 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3005 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3006 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3007 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3008 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3009 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3010 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3011 case PIX_FMT_RGB8 :
3012 case PIX_FMT_BGR8 :
3013 case PIX_FMT_PAL8 :
3014 case PIX_FMT_BGR4_BYTE:
3015 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3016 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3017 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
39e5f87b 3018 case PIX_FMT_RGB32 :
bb53e1d1 3019 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
39e5f87b 3020 case PIX_FMT_BGR32 :
bb53e1d1 3021 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
e8417235 3022 case PIX_FMT_RGB48BE:
bb53e1d1 3023 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3024 }
3025 if (c->alpPixBuf) {
3026 switch (srcFormat) {
3027 case PIX_FMT_RGB32 :
3028 case PIX_FMT_RGB32_1:
3029 case PIX_FMT_BGR32 :
bb53e1d1 3030 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3031 }
3032 }
3033
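    /* Byte offsets into packed source pixels: alpha sits at offset 3 for
       RGB32/BGR32, the *_1 variants are corrected via ALT32_CORR, and for
       RGB48LE an offset of 1 presumably selects the most significant byte of
       each little-endian 16-bit component. */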
3034 switch (srcFormat) {
3035 case PIX_FMT_RGB32 :
3036 case PIX_FMT_BGR32 :
3037 c->alpSrcOffset = 3;
3038 break;
3039 case PIX_FMT_RGB32_1:
3040 case PIX_FMT_BGR32_1:
3041 c->lumSrcOffset = ALT32_CORR;
3042 c->chrSrcOffset = ALT32_CORR;
3043 break;
3044 case PIX_FMT_RGB48LE:
3045 c->lumSrcOffset = 1;
3046 c->chrSrcOffset = 1;
3047 c->alpSrcOffset = 1;
3048 break;
3049 }
bae76dc3 3050
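    /* Set up luma/chroma range converters when source and destination ranges
       differ (JPEG/full vs. MPEG/limited); RGB destinations are skipped here,
       presumably because range is handled in the YUV->RGB tables. */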
60222557 3051 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3052 if (c->srcRange) {
3053 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3054 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3055 } else {
3056 c->lumConvertRange = RENAME(lumRangeToJpeg);
3057 c->chrConvertRange = RENAME(chrRangeToJpeg);
3058 }
3059 }
3060