/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

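/* Vertical scaling macros for planar (YV12) output: each iteration walks the
 * filter list, multiplies the 16-bit source lines by their coefficients and
 * accumulates, then rounds, shifts and packs 8 output bytes per loop.
 * YSCALEYUV2YV12X uses pmulhw (16-bit intermediates); the _ACCURATE variant
 * below uses pmaddwd with 32-bit accumulators for better rounding. */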
bca11e75 73#define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
1625216e 122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
8b2fce0d 123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
170
171#define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
c1b0bfb4 183
184#define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
201
c1b0bfb4 202/*
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 208*/
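/* YSCALEYUV2PACKEDX runs the vertical filter for packed output: the first
 * inner loop accumulates the chroma planes (U in mm3, V in mm4), the second
 * accumulates two blocks of luma (mm1 and mm7); one of the WRITE* macros
 * then converts and stores the pixels. */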
25593e29 209#define YSCALEYUV2PACKEDX \
210 asm volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
8b2fce0d 223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
c1b0bfb4 232\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
237 ASMALIGN(4)\
238 "2: \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
249 " jnz 2b \n\t"\
250
251#define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
256 );
8422aa88 257
bca11e75 258#define YSCALEYUV2PACKEDX_ACCURATE \
259 asm volatile(\
260 "xor %%"REG_a", %%"REG_a" \n\t"\
261 ASMALIGN(4)\
262 "nop \n\t"\
263 "1: \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
270 ASMALIGN(4)\
271 "2: \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
8b2fce0d 273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
1625216e 274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
295 " jnz 2b \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
bca11e75 307\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
314 ASMALIGN(4)\
315 "2: \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
339 " jnz 2b \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 351
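/* YSCALEYUV2RGBX converts the vertically scaled Y (mm1/mm7) and U/V (mm3/mm4)
 * to RGB using the per-context coefficients (Y_COEFF, UB_COEFF, UG_COEFF, ...)
 * and packs the result so that mm2=B, mm4=G, mm5=R and mm7=0, ready for the
 * WRITE* macros. */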
8422aa88 352#define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
77a49659 388#if 0
d604bab9 389#define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
398 ASMALIGN(4)\
399 "1: \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
8b2fce0d 409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
8b2fce0d 412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
418\
419\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
427\
428\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
d604bab9 435\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
d604bab9 439\
2da0d70d 440 "packuswb %%mm1, %%mm1 \n\t"
77a49659 441#endif
d604bab9 442
6e1c66bc 443#define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
451 ASMALIGN(4)\
452 "1: \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 478
6e1c66bc 479#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 480
6e1c66bc 481#define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
483 ASMALIGN(4)\
484 "1: \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 545#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
6a4970ab 546
6e1c66bc 547#define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
549 ASMALIGN(4)\
550 "1: \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
6a4970ab 559
6e1c66bc 560#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 561
6e1c66bc 562#define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
564 ASMALIGN(4)\
565 "1: \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 609#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 610
6e1c66bc 611#define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
613 ASMALIGN(4)\
614 "1: \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
6e1c66bc 627#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 628
497d4f99 629// do vertical chrominance interpolation
6e1c66bc 630#define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
632 ASMALIGN(4)\
633 "1: \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 681#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 682
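/* The RGB output writers below expect mm2=B, mm4=G, mm5=R as packed bytes and
 * mm7=0; each one stores 8 pixels per iteration with MOVNTQ, advances "index"
 * and loops back to label 1 until "dstw" is reached. */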
6e1c66bc 683#define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 697\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
d604bab9 702\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
705 " jb 1b \n\t"
6e1c66bc 706#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
d604bab9 707
27a90b04 708#define REAL_WRITERGB16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
d604bab9 713\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
d604bab9 716\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 721\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
d604bab9 724\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
d604bab9 727\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 730\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
733 " jb 1b \n\t"
27a90b04 734#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 735
27a90b04 736#define REAL_WRITERGB15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
d604bab9 742\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
d604bab9 745\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 750\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
d604bab9 753\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
d604bab9 756\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 759\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
762 " jb 1b \n\t"
27a90b04 763#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 764
6542b44e 765#define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 779\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 788\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 802\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 811\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
d604bab9 816\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
819 " jb 1b \n\t"
d604bab9 820
6542b44e 821#define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 835\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 840\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 845\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 850\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
99d2cb72 856\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 862\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 867\
2da0d70d 868 "add $24, "#dst" \n\t"\
99d2cb72 869\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
872 " jb 1b \n\t"
99d2cb72 873
6542b44e 874#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 881\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 885\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
99d2cb72 890\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 895\
5802683a 896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 899\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 903\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 907\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 911\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 915\
2da0d70d 916 "add $24, "#dst" \n\t"\
99d2cb72 917\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
920 " jb 1b \n\t"
921
922#ifdef HAVE_MMX2
7630f2e0 923#undef WRITEBGR24
6e1c66bc 924#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 925#else
7630f2e0 926#undef WRITEBGR24
6e1c66bc 927#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
928#endif
929
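/* WRITEYUY2 interleaves the packed luma (mm1/mm7) with chroma (mm3=U, mm4=V)
 * into Y U Y V byte order and stores 8 pixels (16 bytes) per iteration. */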
6e1c66bc 930#define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 938\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 941\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
944 " jb 1b \n\t"
6e1c66bc 945#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
946
947
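/**
 * vertical scale to planar YV12 output; uses the MMX code above when
 * available and not in bitexact mode, otherwise falls back to AltiVec or
 * plain C (yuv2yuvXinC).
 */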
77a49659 948static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 951{
c1b0bfb4 952#ifdef HAVE_MMX
f433c8ab 953 if(!(c->flags & SWS_BITEXACT)){
954 if (c->flags & SWS_ACCURATE_RND){
955 if (uDest){
956 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
957 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
2da0d70d 958 }
bca11e75 959
8b2fce0d 960 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
961 }else{
962 if (uDest){
963 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
964 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
bca11e75 965 }
2da0d70d 966
8b2fce0d 967 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
2da0d70d 968 }
969 return;
970 }
971#endif
972#ifdef HAVE_ALTIVEC
973yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
974 chrFilter, chrSrc, chrFilterSize,
975 dest, uDest, vDest, dstW, chrDstW);
a2faa401 976#else //HAVE_ALTIVEC
5859233b 977yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
979 dest, uDest, vDest, dstW, chrDstW);
a2faa401 980#endif //!HAVE_ALTIVEC
c1b0bfb4 981}
2add307d 982
6118e52e 983static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
984 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
985 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
986{
987yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
988 chrFilter, chrSrc, chrFilterSize,
989 dest, uDest, dstW, chrDstW, dstFormat);
990}
991
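/**
 * unscaled vertical pass (1-tap filter): round, shift (>>7) and clip the
 * intermediate 16-bit samples straight into the destination planes.
 */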
bf2bdde6 992static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
2da0d70d 993 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4 994{
f433c8ab 995 int i;
c1b0bfb4 996#ifdef HAVE_MMX
f433c8ab 997 if(!(c->flags & SWS_BITEXACT)){
998 long p= uDest ? 3 : 1;
999 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
1000 uint8_t *dst[3]= {dest, uDest, vDest};
1001 long counter[3] = {dstW, chrDstW, chrDstW};
2da0d70d 1002
1003 if (c->flags & SWS_ACCURATE_RND){
1004 while(p--){
1005 asm volatile(
1006 YSCALEYUV2YV121_ACCURATE
1007 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1008 "g" (-counter[p])
1009 : "%"REG_a
1010 );
1011 }
1012 }else{
1013 while(p--){
1014 asm volatile(
1015 YSCALEYUV2YV121
1016 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1017 "g" (-counter[p])
1018 : "%"REG_a
1019 );
1020 }
bf2bdde6 1021 }
1022 return;
1023 }
1024#endif
1025 for (i=0; i<dstW; i++)
1026 {
a1f3ffa3 1027 int val= (lumSrc[i]+64)>>7;
1028
1029 if (val&256){
1030 if (val<0) val=0;
1031 else val=255;
1032 }
1033
1034 dest[i]= val;
1035 }
1036
1b0a4572 1037 if (uDest)
1038 for (i=0; i<chrDstW; i++)
1039 {
1040 int u=(chrSrc[i ]+64)>>7;
1041 int v=(chrSrc[i + VOFW]+64)>>7;
1042
1043 if ((u|v)&256){
1044 if (u<0) u=0;
1045 else if (u>255) u=255;
1046 if (v<0) v=0;
1047 else if (v>255) v=255;
1048 }
1049
1050 uDest[i]= u;
1051 vDest[i]= v;
1052 }
1053}
1054
c1b0bfb4 1055
1056/**
1057 * vertical scale YV12 to RGB
1058 */
25593e29 1059static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1060 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1061 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1062{
bca11e75 1063#ifdef HAVE_MMX
f8d61128 1064 long dummy=0;
f433c8ab 1065 if(!(c->flags & SWS_BITEXACT)){
1066 if (c->flags & SWS_ACCURATE_RND){
1067 switch(c->dstFormat){
1068 case PIX_FMT_RGB32:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
1071 WRITEBGR32(%4, %5, %%REGa)
1072
1073 YSCALEYUV2PACKEDX_END
1074 return;
1075 case PIX_FMT_BGR24:
1076 YSCALEYUV2PACKEDX_ACCURATE
1077 YSCALEYUV2RGBX
1078 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1079 "add %4, %%"REG_c" \n\t"
1080 WRITEBGR24(%%REGc, %5, %%REGa)
1081
1082
1083 :: "r" (&c->redDither),
1084 "m" (dummy), "m" (dummy), "m" (dummy),
1085 "r" (dest), "m" (dstW)
1086 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1087 );
1088 return;
27a90b04 1089 case PIX_FMT_RGB555:
1090 YSCALEYUV2PACKEDX_ACCURATE
1091 YSCALEYUV2RGBX
1092 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1093#ifdef DITHER1XBPP
1094 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1095 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1096 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1097#endif
1098
27a90b04 1099 WRITERGB15(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1101 return;
27a90b04 1102 case PIX_FMT_RGB565:
1103 YSCALEYUV2PACKEDX_ACCURATE
1104 YSCALEYUV2RGBX
1105 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1106#ifdef DITHER1XBPP
1107 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1108 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1109 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1110#endif
1111
27a90b04 1112 WRITERGB16(%4, %5, %%REGa)
1113 YSCALEYUV2PACKEDX_END
1114 return;
1115 case PIX_FMT_YUYV422:
1116 YSCALEYUV2PACKEDX_ACCURATE
1117 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1118
1119 "psraw $3, %%mm3 \n\t"
1120 "psraw $3, %%mm4 \n\t"
1121 "psraw $3, %%mm1 \n\t"
1122 "psraw $3, %%mm7 \n\t"
1123 WRITEYUY2(%4, %5, %%REGa)
1124 YSCALEYUV2PACKEDX_END
1125 return;
1126 }
bca11e75 1127 }else{
1128 switch(c->dstFormat)
1129 {
1130 case PIX_FMT_RGB32:
1131 YSCALEYUV2PACKEDX
1132 YSCALEYUV2RGBX
1133 WRITEBGR32(%4, %5, %%REGa)
1134 YSCALEYUV2PACKEDX_END
1135 return;
1136 case PIX_FMT_BGR24:
1137 YSCALEYUV2PACKEDX
1138 YSCALEYUV2RGBX
1139 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1140 "add %4, %%"REG_c" \n\t"
1141 WRITEBGR24(%%REGc, %5, %%REGa)
1142
1143 :: "r" (&c->redDither),
1144 "m" (dummy), "m" (dummy), "m" (dummy),
1145 "r" (dest), "m" (dstW)
1146 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1147 );
1148 return;
27a90b04 1149 case PIX_FMT_RGB555:
1150 YSCALEYUV2PACKEDX
1151 YSCALEYUV2RGBX
1152 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1153#ifdef DITHER1XBPP
1154 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1155 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1156 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1157#endif
1158
27a90b04 1159 WRITERGB15(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1161 return;
27a90b04 1162 case PIX_FMT_RGB565:
1163 YSCALEYUV2PACKEDX
1164 YSCALEYUV2RGBX
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1166#ifdef DITHER1XBPP
1167 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1168 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1169 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1170#endif
1171
27a90b04 1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1174 return;
1175 case PIX_FMT_YUYV422:
1176 YSCALEYUV2PACKEDX
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1178
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1185 return;
1186 }
1187 }
f433c8ab 1188 }
bc279024 1189#endif /* HAVE_MMX */
a31de956 1190#ifdef HAVE_ALTIVEC
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of altivec_yuv2packedX() */
1193 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1194 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1195 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1196 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1198 dest, dstW, dstY);
1199 else
1200#endif
1201 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1203 dest, dstW, dstY);
1204}
1205
1206/**
1207 * vertical bilinear scale YV12 to RGB
1208 */
25593e29 1209static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1210 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1211{
1212 int yalpha1=4095- yalpha;
1213 int uvalpha1=4095-uvalpha;
2da0d70d 1214 int i;
d604bab9 1215
77a416e8 1216#if 0 //isn't used
1217 if (flags&SWS_FULL_CHR_H_INT)
1218 {
1219 switch(dstFormat)
1220 {
cf7d1c1a 1221#ifdef HAVE_MMX
1222 case PIX_FMT_RGB32:
1223 asm volatile(
1224
1225
1226FULL_YSCALEYUV2RGB
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1229
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1233
1234 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1235 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
d604bab9 1236
1237 "add $4, %%"REG_a" \n\t"
1238 "cmp %5, %%"REG_a" \n\t"
1239 " jb 1b \n\t"
d604bab9 1240
1241 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242 "m" (yalpha1), "m" (uvalpha1)
1243 : "%"REG_a
1244 );
1245 break;
1246 case PIX_FMT_BGR24:
1247 asm volatile(
1248
1249FULL_YSCALEYUV2RGB
1250
1251 // lsb ... msb
1252 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1253 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1254
1255 "movq %%mm3, %%mm1 \n\t"
1256 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1257 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1258
1259 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1260 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1261 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1262 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1263 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1264 "movq %%mm1, %%mm2 \n\t"
1265 "psllq $48, %%mm1 \n\t" // 000000BG
1266 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
d604bab9 1267
1268 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1269 "psrld $16, %%mm2 \n\t" // R000R000
1270 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1271 "por %%mm2, %%mm1 \n\t" // RBGRR000
d604bab9 1272
1273 "mov %4, %%"REG_b" \n\t"
1274 "add %%"REG_a", %%"REG_b" \n\t"
1275
1276#ifdef HAVE_MMX2
1277 //FIXME Alignment
1278 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1280#else
1281 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1282 "psrlq $32, %%mm3 \n\t"
1283 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1284 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1285#endif
1286 "add $4, %%"REG_a" \n\t"
1287 "cmp %5, %%"REG_a" \n\t"
1288 " jb 1b \n\t"
1289
1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291 "m" (yalpha1), "m" (uvalpha1)
1292 : "%"REG_a, "%"REG_b
1293 );
1294 break;
1295 case PIX_FMT_BGR555:
1296 asm volatile(
1297
1298FULL_YSCALEYUV2RGB
1299#ifdef DITHER1XBPP
1300 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1301 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1302 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1303#endif
1304 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1305 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1306 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1307
1308 "psrlw $3, %%mm3 \n\t"
1309 "psllw $2, %%mm1 \n\t"
1310 "psllw $7, %%mm0 \n\t"
1311 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1312 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9 1313
1314 "por %%mm3, %%mm1 \n\t"
1315 "por %%mm1, %%mm0 \n\t"
d604bab9 1316
2da0d70d 1317 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1318
1319 "add $4, %%"REG_a" \n\t"
1320 "cmp %5, %%"REG_a" \n\t"
1321 " jb 1b \n\t"
d604bab9 1322
1323 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324 "m" (yalpha1), "m" (uvalpha1)
1325 : "%"REG_a
1326 );
1327 break;
1328 case PIX_FMT_BGR565:
1329 asm volatile(
1330
1331FULL_YSCALEYUV2RGB
1332#ifdef DITHER1XBPP
1333 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1334 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1335 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1336#endif
1337 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1338 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1339 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1340
1341 "psrlw $3, %%mm3 \n\t"
1342 "psllw $3, %%mm1 \n\t"
1343 "psllw $8, %%mm0 \n\t"
1344 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1345 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9 1346
1347 "por %%mm3, %%mm1 \n\t"
1348 "por %%mm1, %%mm0 \n\t"
d604bab9 1349
2da0d70d 1350 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1351
1352 "add $4, %%"REG_a" \n\t"
1353 "cmp %5, %%"REG_a" \n\t"
1354 " jb 1b \n\t"
d604bab9 1355
1356 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357 "m" (yalpha1), "m" (uvalpha1)
1358 : "%"REG_a
1359 );
1360 break;
bc279024 1361#endif /* HAVE_MMX */
2da0d70d 1362 case PIX_FMT_BGR32:
cf7d1c1a 1363#ifndef HAVE_MMX
2da0d70d 1364 case PIX_FMT_RGB32:
cf7d1c1a 1365#endif
1366 if (dstFormat==PIX_FMT_RGB32)
1367 {
1368 int i;
df3c183a 1369#ifdef WORDS_BIGENDIAN
1370 dest++;
1371#endif
1372 for (i=0;i<dstW;i++){
1373 // vertical linear interpolation && yuv2rgb in a single step:
1374 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1376 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380 dest+= 4;
1381 }
1382 }
1383 else if (dstFormat==PIX_FMT_BGR24)
1384 {
1385 int i;
1386 for (i=0;i<dstW;i++){
1387 // vertical linear interpolation && yuv2rgb in a single step:
1388 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1390 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394 dest+= 3;
1395 }
1396 }
1397 else if (dstFormat==PIX_FMT_BGR565)
1398 {
1399 int i;
1400 for (i=0;i<dstW;i++){
1401 // vertical linear interpolation && yuv2rgb in a single step:
1402 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1404 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1405
1406 ((uint16_t*)dest)[i] =
1407 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410 }
1411 }
1412 else if (dstFormat==PIX_FMT_BGR555)
1413 {
1414 int i;
1415 for (i=0;i<dstW;i++){
1416 // vertical linear interpolation && yuv2rgb in a single step:
1417 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1419 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1420
1421 ((uint16_t*)dest)[i] =
1422 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425 }
1426 }
1427 }//FULL_UV_IPOL
1428 else
1429 {
cf7d1c1a 1430#endif // if 0
d604bab9 1431#ifdef HAVE_MMX
f433c8ab 1432 if(!(c->flags & SWS_BITEXACT)){
1433 switch(c->dstFormat)
1434 {
1435 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436 case PIX_FMT_RGB32:
1437 asm volatile(
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1439 "mov %4, %%"REG_b" \n\t"
1440 "push %%"REG_BP" \n\t"
1441 YSCALEYUV2RGB(%%REGBP, %5)
1442 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443 "pop %%"REG_BP" \n\t"
1444 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445
1446 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447 "a" (&c->redDither)
1448 );
1449 return;
1450 case PIX_FMT_BGR24:
1451 asm volatile(
1452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1453 "mov %4, %%"REG_b" \n\t"
1454 "push %%"REG_BP" \n\t"
1455 YSCALEYUV2RGB(%%REGBP, %5)
1456 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457 "pop %%"REG_BP" \n\t"
1458 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460 "a" (&c->redDither)
1461 );
1462 return;
27a90b04 1463 case PIX_FMT_RGB555:
1464 asm volatile(
1465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1466 "mov %4, %%"REG_b" \n\t"
1467 "push %%"REG_BP" \n\t"
1468 YSCALEYUV2RGB(%%REGBP, %5)
1469 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1470#ifdef DITHER1XBPP
1471 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1472 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1473 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1474#endif
1475
27a90b04 1476 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1479
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481 "a" (&c->redDither)
1482 );
1483 return;
27a90b04 1484 case PIX_FMT_RGB565:
1485 asm volatile(
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB(%%REGBP, %5)
1490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1491#ifdef DITHER1XBPP
1492 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1493 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1494 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1495#endif
1496
27a90b04 1497 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1498 "pop %%"REG_BP" \n\t"
1499 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1500 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501 "a" (&c->redDither)
1502 );
1503 return;
1504 case PIX_FMT_YUYV422:
1505 asm volatile(
1506 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1507 "mov %4, %%"REG_b" \n\t"
1508 "push %%"REG_BP" \n\t"
1509 YSCALEYUV2PACKED(%%REGBP, %5)
1510 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511 "pop %%"REG_BP" \n\t"
1512 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "a" (&c->redDither)
1515 );
1516 return;
1517 default: break;
1518 }
f433c8ab 1519 }
cf7d1c1a 1520#endif //HAVE_MMX
ec1bca2a 1521YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1522}
1523
1524/**
1525 * YV12 to RGB without scaling or interpolating
1526 */
25593e29 1527static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1528 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1529{
2da0d70d
DB
1530 const int yalpha1=0;
1531 int i;
6a4970ab 1532
8a322796 1533 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1534 const int yalpha= 4096; //FIXME ...
96034638 1535
2da0d70d
DB
1536 if (flags&SWS_FULL_CHR_H_INT)
1537 {
1538 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1539 return;
1540 }
397c035e
MN
1541
1542#ifdef HAVE_MMX
f433c8ab 1543 if(!(flags & SWS_BITEXACT)){
e5091488 1544 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d
DB
1545 {
1546 switch(dstFormat)
1547 {
1548 case PIX_FMT_RGB32:
1549 asm volatile(
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555 "pop %%"REG_BP" \n\t"
1556 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1557
1558 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1559 "a" (&c->redDither)
1560 );
1561 return;
1562 case PIX_FMT_BGR24:
1563 asm volatile(
1564 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1565 "mov %4, %%"REG_b" \n\t"
1566 "push %%"REG_BP" \n\t"
1567 YSCALEYUV2RGB1(%%REGBP, %5)
1568 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1571
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573 "a" (&c->redDither)
1574 );
1575 return;
27a90b04 1576 case PIX_FMT_RGB555:
2da0d70d
DB
1577 asm volatile(
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2RGB1(%%REGBP, %5)
1582 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1583#ifdef DITHER1XBPP
2da0d70d
DB
1584 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1585 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1586 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1587#endif
27a90b04 1588 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1589 "pop %%"REG_BP" \n\t"
1590 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1591
1592 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 "a" (&c->redDither)
1594 );
1595 return;
27a90b04 1596 case PIX_FMT_RGB565:
2da0d70d
DB
1597 asm volatile(
1598 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1599 "mov %4, %%"REG_b" \n\t"
1600 "push %%"REG_BP" \n\t"
1601 YSCALEYUV2RGB1(%%REGBP, %5)
1602 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1603#ifdef DITHER1XBPP
2da0d70d
DB
1604 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1605 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1606 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1607#endif
1608
27a90b04 1609 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1610 "pop %%"REG_BP" \n\t"
1611 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1612
1613 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1614 "a" (&c->redDither)
1615 );
1616 return;
1617 case PIX_FMT_YUYV422:
1618 asm volatile(
1619 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1620 "mov %4, %%"REG_b" \n\t"
1621 "push %%"REG_BP" \n\t"
1622 YSCALEYUV2PACKED1(%%REGBP, %5)
1623 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624 "pop %%"REG_BP" \n\t"
1625 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1626
1627 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1628 "a" (&c->redDither)
1629 );
1630 return;
1631 }
1632 }
1633 else
1634 {
1635 switch(dstFormat)
1636 {
1637 case PIX_FMT_RGB32:
1638 asm volatile(
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644 "pop %%"REG_BP" \n\t"
1645 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1646
1647 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1648 "a" (&c->redDither)
1649 );
1650 return;
1651 case PIX_FMT_BGR24:
1652 asm volatile(
1653 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1654 "mov %4, %%"REG_b" \n\t"
1655 "push %%"REG_BP" \n\t"
1656 YSCALEYUV2RGB1b(%%REGBP, %5)
1657 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658 "pop %%"REG_BP" \n\t"
1659 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1660
1661 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1662 "a" (&c->redDither)
1663 );
1664 return;
27a90b04 1665 case PIX_FMT_RGB555:
2da0d70d
DB
1666 asm volatile(
1667 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1668 "mov %4, %%"REG_b" \n\t"
1669 "push %%"REG_BP" \n\t"
1670 YSCALEYUV2RGB1b(%%REGBP, %5)
1671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1672#ifdef DITHER1XBPP
2da0d70d
DB
1673 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1674 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1675 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1676#endif
27a90b04 1677 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1678 "pop %%"REG_BP" \n\t"
1679 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1680
1681 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1682 "a" (&c->redDither)
1683 );
1684 return;
27a90b04 1685 case PIX_FMT_RGB565:
2da0d70d
DB
1686 asm volatile(
1687 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1688 "mov %4, %%"REG_b" \n\t"
1689 "push %%"REG_BP" \n\t"
1690 YSCALEYUV2RGB1b(%%REGBP, %5)
1691 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1692#ifdef DITHER1XBPP
2da0d70d
DB
1693 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1694 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1695 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1696#endif
1697
27a90b04 1698 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1699 "pop %%"REG_BP" \n\t"
1700 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1701
1702 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1703 "a" (&c->redDither)
1704 );
1705 return;
1706 case PIX_FMT_YUYV422:
1707 asm volatile(
1708 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1709 "mov %4, %%"REG_b" \n\t"
1710 "push %%"REG_BP" \n\t"
1711 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713 "pop %%"REG_BP" \n\t"
1714 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1715
1716 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1717 "a" (&c->redDither)
1718 );
1719 return;
1720 }
1721 }
f433c8ab 1722 }
bc279024 1723#endif /* HAVE_MMX */
e5091488 1724 if (uvalpha < 2048)
2da0d70d 1725 {
ec1bca2a 1726 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1727 }else{
ec1bca2a 1728 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1729 }
d604bab9
MN
1730}
1731
8a322796 1732//FIXME yuy2* can read up to 7 samples too many
6ff0ad6b 1733
7f526efd 1734static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1e621b18 1735{
6ff0ad6b 1736#ifdef HAVE_MMX
2da0d70d
DB
1737 asm volatile(
1738 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1739 "mov %0, %%"REG_a" \n\t"
1740 "1: \n\t"
1741 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1742 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1743 "pand %%mm2, %%mm0 \n\t"
1744 "pand %%mm2, %%mm1 \n\t"
1745 "packuswb %%mm1, %%mm0 \n\t"
1746 "movq %%mm0, (%2, %%"REG_a") \n\t"
1747 "add $8, %%"REG_a" \n\t"
1748 " js 1b \n\t"
1749 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1750 : "%"REG_a
1751 );
1e621b18 1752#else
2da0d70d
DB
1753 int i;
1754 for (i=0; i<width; i++)
1755 dst[i]= src[2*i];
1e621b18
MN
1756#endif
1757}
1758
7f526efd 1759static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 1760{
c2271987 1761#ifdef HAVE_MMX
2da0d70d
DB
1762 asm volatile(
1763 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1764 "mov %0, %%"REG_a" \n\t"
1765 "1: \n\t"
1766 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1767 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1768 "psrlw $8, %%mm0 \n\t"
1769 "psrlw $8, %%mm1 \n\t"
1770 "packuswb %%mm1, %%mm0 \n\t"
1771 "movq %%mm0, %%mm1 \n\t"
1772 "psrlw $8, %%mm0 \n\t"
1773 "pand %%mm4, %%mm1 \n\t"
1774 "packuswb %%mm0, %%mm0 \n\t"
1775 "packuswb %%mm1, %%mm1 \n\t"
1776 "movd %%mm0, (%3, %%"REG_a") \n\t"
1777 "movd %%mm1, (%2, %%"REG_a") \n\t"
1778 "add $4, %%"REG_a" \n\t"
1779 " js 1b \n\t"
1780 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1781 : "%"REG_a
1782 );
1e621b18 1783#else
2da0d70d
DB
1784 int i;
1785 for (i=0; i<width; i++)
1786 {
1787 dstU[i]= src1[4*i + 1];
1788 dstV[i]= src1[4*i + 3];
1789 }
1790#endif
1791 assert(src1 == src2);
1e621b18
MN
1792}
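/* Purely illustrative sketch (not used by the code): YUYV422 stores pixels as
   Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., which is why the C fallbacks above take luma
   from every even byte and chroma from offsets 1 and 3 of each 4-byte group.
   A minimal standalone version of that unpacking, width assumed even: */
#if 0
#include <stdint.h>
static void yuyv_unpack_sketch(uint8_t *dstY, uint8_t *dstU, uint8_t *dstV,
                               const uint8_t *src, long width)
{
    long i;
    for (i = 0; i < width; i += 2) {   /* one U/V pair per two luma samples */
        dstY[i  ] = src[2*i + 0];      /* Y0 */
        dstU[i/2] = src[2*i + 1];      /* U, shared by the pixel pair */
        dstY[i+1] = src[2*i + 2];      /* Y1 */
        dstV[i/2] = src[2*i + 3];      /* V, shared by the pixel pair */
    }
}
#endif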
1793
4cf16bbe
DB
1794/* This is almost identical to the previous one, and exists only because
1795 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
7f526efd 1796static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
7322a67c
MN
1797{
1798#ifdef HAVE_MMX
2da0d70d
DB
1799 asm volatile(
1800 "mov %0, %%"REG_a" \n\t"
1801 "1: \n\t"
1802 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1803 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "psrlw $8, %%mm1 \n\t"
1806 "packuswb %%mm1, %%mm0 \n\t"
1807 "movq %%mm0, (%2, %%"REG_a") \n\t"
1808 "add $8, %%"REG_a" \n\t"
1809 " js 1b \n\t"
1810 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1811 : "%"REG_a
1812 );
7322a67c 1813#else
2da0d70d
DB
1814 int i;
1815 for (i=0; i<width; i++)
1816 dst[i]= src[2*i+1];
7322a67c
MN
1817#endif
1818}
1819
7f526efd 1820static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
7322a67c 1821{
c2271987 1822#ifdef HAVE_MMX
2da0d70d
DB
1823 asm volatile(
1824 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1825 "mov %0, %%"REG_a" \n\t"
1826 "1: \n\t"
1827 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1828 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1829 "pand %%mm4, %%mm0 \n\t"
1830 "pand %%mm4, %%mm1 \n\t"
1831 "packuswb %%mm1, %%mm0 \n\t"
1832 "movq %%mm0, %%mm1 \n\t"
1833 "psrlw $8, %%mm0 \n\t"
1834 "pand %%mm4, %%mm1 \n\t"
1835 "packuswb %%mm0, %%mm0 \n\t"
1836 "packuswb %%mm1, %%mm1 \n\t"
1837 "movd %%mm0, (%3, %%"REG_a") \n\t"
1838 "movd %%mm1, (%2, %%"REG_a") \n\t"
1839 "add $4, %%"REG_a" \n\t"
1840 " js 1b \n\t"
1841 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1842 : "%"REG_a
1843 );
7322a67c 1844#else
2da0d70d
DB
1845 int i;
1846 for (i=0; i<width; i++)
1847 {
1848 dstU[i]= src1[4*i + 0];
1849 dstV[i]= src1[4*i + 2];
1850 }
1851#endif
1852 assert(src1 == src2);
7322a67c
MN
1853}
1854
214892ee
MN
1855#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1857{\
1858 int i;\
1859 for (i=0; i<width; i++)\
1860 {\
1861 int b= (((type*)src)[i]>>shb)&maskb;\
1862 int g= (((type*)src)[i]>>shg)&maskg;\
1863 int r= (((type*)src)[i]>>shr)&maskr;\
1864\
1865 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1866 }\
1e621b18
MN
1867}
1868
214892ee
MN
1869BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1870BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1871BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1872BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1873BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
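/* Purely illustrative hand expansion (not used by the code): the pre-shifted
   coefficients above put each product on a common fixed-point scale, so the
   5/6-bit fields never have to be widened to 8 bits first; the extra +8 in
   the final shift then removes that scale again. bgr16ToY (BGR565 input)
   expands to roughly this (function name is made up): */
#if 0
static inline void bgr16ToY_expanded_sketch(uint8_t *dst, uint8_t *src, long width)
{
    int i;
    for (i = 0; i < width; i++)
    {
        int b = ((uint16_t*)src)[i] & 0xF800; /* blue field left in place (value<<11) */
        int g = ((uint16_t*)src)[i] & 0x07E0; /* green field left in place (value<<5) */
        int r = ((uint16_t*)src)[i] & 0x001F; /* red field, already at bit 0 */

        dst[i] = (((RY<<11)*r + (GY<<5)*g + BY*b + (33<<(RGB2YUV_SHIFT+8-1)))
                  >> (RGB2YUV_SHIFT+8));
    }
}
#endif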
1875
a0baa07a
MN
1876#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1878{\
1879 int i;\
1880 for (i=0; i<width; i++)\
1881 {\
ba83d862
MN
1882 int b= (((type*)src)[i]&maskb)>>shb;\
1883 int g= (((type*)src)[i]&maskg)>>shg;\
1884 int r= (((type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1885\
1886 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1888 }\
ba83d862
MN
1889}\
1890static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1891{\
1892 int i;\
1893 for (i=0; i<width; i++)\
1894 {\
1895 int pix0= ((type*)src)[2*i+0];\
1896 int pix1= ((type*)src)[2*i+1];\
1897 int g= (pix0&maskg)+(pix1&maskg);\
1898 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1900\
1901 g>>=shg;\
1902\
6b79dbce
MN
1903 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1905 }\
2f60f629
MN
1906}
1907
ba83d862
MN
1908BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1909BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
a0baa07a
MN
1910BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1911BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1912BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
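/* Purely illustrative sketch (not used by the code): the *_half variants above
   sum two neighbouring pixels before applying the matrix. Adding the packed
   words directly would let the red sum carry into the green field and the
   green sum into the blue field, so the exact green sum is computed first and
   subtracted; the remaining red and blue sums are then extracted with masks
   widened by one bit (mask | 2*mask). The same step, written out for 15-bit BGR: */
#if 0
#include <stdint.h>
static void bgr15_pair_sums_sketch(uint16_t p0, uint16_t p1,
                                   int *rsum, int *gsum, int *bsum)
{
    int g = (p0 & 0x03E0) + (p1 & 0x03E0); /* exact green sum, kept separate   */
    int t = p0 + p1 - g;                   /* red + blue sums, no cross-carries */
    *bsum = (t & (0x7C00 | 2*0x7C00)) >> 10;
    *rsum =  t & (0x001F | 2*0x001F);
    *gsum =  g >> 5;
}
#endif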
1914
ac6a2e45 1915#ifdef HAVE_MMX
dfb09bd1
MN
1916static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1917{
1918
1919 if(srcFormat == PIX_FMT_BGR24){
1920 asm volatile(
ff9a056d
MN
1921 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1922 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1923 :
dfb09bd1
MN
1924 );
1925 }else{
1926 asm volatile(
ff9a056d
MN
1927 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1928 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1929 :
dfb09bd1
MN
1930 );
1931 }
1932
2da0d70d 1933 asm volatile(
dfb09bd1
MN
1934 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1935 "mov %2, %%"REG_a" \n\t"
1936 "pxor %%mm7, %%mm7 \n\t"
1937 "1: \n\t"
1938 PREFETCH" 64(%0) \n\t"
1939 "movd (%0), %%mm0 \n\t"
1940 "movd 2(%0), %%mm1 \n\t"
1941 "movd 6(%0), %%mm2 \n\t"
1942 "movd 8(%0), %%mm3 \n\t"
1943 "add $12, %0 \n\t"
1944 "punpcklbw %%mm7, %%mm0 \n\t"
1945 "punpcklbw %%mm7, %%mm1 \n\t"
1946 "punpcklbw %%mm7, %%mm2 \n\t"
1947 "punpcklbw %%mm7, %%mm3 \n\t"
1948 "pmaddwd %%mm5, %%mm0 \n\t"
1949 "pmaddwd %%mm6, %%mm1 \n\t"
1950 "pmaddwd %%mm5, %%mm2 \n\t"
1951 "pmaddwd %%mm6, %%mm3 \n\t"
1952 "paddd %%mm1, %%mm0 \n\t"
1953 "paddd %%mm3, %%mm2 \n\t"
1954 "paddd %%mm4, %%mm0 \n\t"
1955 "paddd %%mm4, %%mm2 \n\t"
1956 "psrad $15, %%mm0 \n\t"
1957 "psrad $15, %%mm2 \n\t"
1958 "packssdw %%mm2, %%mm0 \n\t"
1959 "packuswb %%mm0, %%mm0 \n\t"
1960 "movd %%mm0, (%1, %%"REG_a") \n\t"
1961 "add $4, %%"REG_a" \n\t"
1962 " js 1b \n\t"
1963 : "+r" (src)
1964 : "r" (dst+width), "g" (-width)
1965 : "%"REG_a
2da0d70d 1966 );
dfb09bd1
MN
1967}
1968
1969static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1970{
1971 asm volatile(
1972 "movq 24+%4, %%mm6 \n\t"
1973 "mov %3, %%"REG_a" \n\t"
1974 "pxor %%mm7, %%mm7 \n\t"
1975 "1: \n\t"
1976 PREFETCH" 64(%0) \n\t"
1977 "movd (%0), %%mm0 \n\t"
1978 "movd 2(%0), %%mm1 \n\t"
1979 "punpcklbw %%mm7, %%mm0 \n\t"
1980 "punpcklbw %%mm7, %%mm1 \n\t"
1981 "movq %%mm0, %%mm2 \n\t"
1982 "movq %%mm1, %%mm3 \n\t"
1983 "pmaddwd %4, %%mm0 \n\t"
1984 "pmaddwd 8+%4, %%mm1 \n\t"
1985 "pmaddwd 16+%4, %%mm2 \n\t"
1986 "pmaddwd %%mm6, %%mm3 \n\t"
1987 "paddd %%mm1, %%mm0 \n\t"
1988 "paddd %%mm3, %%mm2 \n\t"
1989
1990 "movd 6(%0), %%mm1 \n\t"
1991 "movd 8(%0), %%mm3 \n\t"
1992 "add $12, %0 \n\t"
1993 "punpcklbw %%mm7, %%mm1 \n\t"
1994 "punpcklbw %%mm7, %%mm3 \n\t"
1995 "movq %%mm1, %%mm4 \n\t"
1996 "movq %%mm3, %%mm5 \n\t"
1997 "pmaddwd %4, %%mm1 \n\t"
1998 "pmaddwd 8+%4, %%mm3 \n\t"
1999 "pmaddwd 16+%4, %%mm4 \n\t"
2000 "pmaddwd %%mm6, %%mm5 \n\t"
2001 "paddd %%mm3, %%mm1 \n\t"
2002 "paddd %%mm5, %%mm4 \n\t"
2003
2004 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
2005 "paddd %%mm3, %%mm0 \n\t"
2006 "paddd %%mm3, %%mm2 \n\t"
2007 "paddd %%mm3, %%mm1 \n\t"
2008 "paddd %%mm3, %%mm4 \n\t"
2009 "psrad $15, %%mm0 \n\t"
2010 "psrad $15, %%mm2 \n\t"
2011 "psrad $15, %%mm1 \n\t"
2012 "psrad $15, %%mm4 \n\t"
2013 "packssdw %%mm1, %%mm0 \n\t"
2014 "packssdw %%mm4, %%mm2 \n\t"
2015 "packuswb %%mm0, %%mm0 \n\t"
2016 "packuswb %%mm2, %%mm2 \n\t"
2017 "movd %%mm0, (%1, %%"REG_a") \n\t"
2018 "movd %%mm2, (%2, %%"REG_a") \n\t"
2019 "add $4, %%"REG_a" \n\t"
2020 " js 1b \n\t"
2021 : "+r" (src)
2022 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2023 : "%"REG_a
2024 );
2025}
2026#endif
2027
2028static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2029{
2030#ifdef HAVE_MMX
2031 bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
1e621b18 2032#else
2da0d70d
DB
2033 int i;
2034 for (i=0; i<width; i++)
2035 {
2036 int b= src[i*3+0];
2037 int g= src[i*3+1];
2038 int r= src[i*3+2];
1e621b18 2039
e5091488 2040 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2041 }
bc279024 2042#endif /* HAVE_MMX */
1e621b18
MN
2043}
2044
7f526efd 2045static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 2046{
4342fc14 2047#ifdef HAVE_MMX
dfb09bd1 2048 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 2049#else
2da0d70d
DB
2050 int i;
2051 for (i=0; i<width; i++)
2052 {
dfb09bd1
MN
2053 int b= src1[3*i + 0];
2054 int g= src1[3*i + 1];
2055 int r= src1[3*i + 2];
2da0d70d 2056
dfb09bd1
MN
2057 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2059 }
bc279024 2060#endif /* HAVE_MMX */
2da0d70d 2061 assert(src1 == src2);
1e621b18
MN
2062}
2063
2f60f629
MN
2064static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2065{
2066 int i;
2067 for (i=0; i<width; i++)
2068 {
2069 int b= src1[6*i + 0] + src1[6*i + 3];
2070 int g= src1[6*i + 1] + src1[6*i + 4];
2071 int r= src1[6*i + 2] + src1[6*i + 5];
2072
2073 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2075 }
2076 assert(src1 == src2);
2077}
2078
97b93389 2079static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
a861d4d7 2080{
dfb09bd1
MN
2081#ifdef HAVE_MMX
2082 bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2083#else
2da0d70d
DB
2084 int i;
2085 for (i=0; i<width; i++)
2086 {
2087 int r= src[i*3+0];
2088 int g= src[i*3+1];
2089 int b= src[i*3+2];
2090
e5091488 2091 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2092 }
dfb09bd1 2093#endif
a861d4d7
MN
2094}
2095
97b93389 2096static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
a861d4d7 2097{
2da0d70d
DB
2098 int i;
2099 assert(src1==src2);
dfb09bd1
MN
2100#ifdef HAVE_MMX
2101 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2102#else
2da0d70d
DB
2103 for (i=0; i<width; i++)
2104 {
dfb09bd1
MN
2105 int r= src1[3*i + 0];
2106 int g= src1[3*i + 1];
2107 int b= src1[3*i + 2];
2da0d70d 2108
dfb09bd1
MN
2109 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2111 }
dfb09bd1 2112#endif
a861d4d7
MN
2113}
2114
2f60f629
MN
2115static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2116{
2117 int i;
2118 assert(src1==src2);
2119 for (i=0; i<width; i++)
2120 {
e09d7eef
MN
2121 int r= src1[6*i + 0] + src1[6*i + 3];
2122 int g= src1[6*i + 1] + src1[6*i + 4];
2123 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2124
2125 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2127 }
2128}
2129
1e621b18 2130
97b93389 2131static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
e28630fc 2132{
2da0d70d
DB
2133 int i;
2134 for (i=0; i<width; i++)
2135 {
2136 int d= src[i];
e28630fc 2137
2da0d70d
DB
2138 dst[i]= pal[d] & 0xFF;
2139 }
e28630fc
MN
2140}
2141
97b93389 2142static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
e28630fc 2143{
2da0d70d
DB
2144 int i;
2145 assert(src1 == src2);
2146 for (i=0; i<width; i++)
2147 {
2148 int p= pal[src1[i]];
2149
2150 dstU[i]= p>>8;
2151 dstV[i]= p>>16;
2152 }
e28630fc
MN
2153}
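/* Purely illustrative sketch (not used by the code): palToY/palToUV above rely
   on each pal[] entry already holding packed YUV, with Y in bits 0-7, U in
   bits 8-15 and V in bits 16-23. Building such an entry (helper name is made up): */
#if 0
#include <stdint.h>
static uint32_t pack_pal_entry_sketch(uint8_t y, uint8_t u, uint8_t v)
{
    return (uint32_t)y | ((uint32_t)u << 8) | ((uint32_t)v << 16);
}
#endif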
2154
3d05e078
MN
2155static inline void RENAME(mono2Y)(uint8_t *dst, uint8_t *src, long width, int format)
2156{
2157 int i, j;
2158 for (i=0; i<width/8; i++){
2159 int d= format == PIX_FMT_MONOBLACK ? src[i] : ~src[i];
78454dfc
MN
2160 for(j=0; j<8; j++)
2161 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
2162 }
2163}
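/* Purely illustrative example (not used by the code): mono2Y expands every
   input byte into 8 luma samples, most significant bit first, a set bit
   becoming 255 (MONOWHITE is inverted beforehand). Worked example for one byte: */
#if 0
#include <assert.h>
#include <stdint.h>
static void mono2Y_example(void)
{
    uint8_t d = 0xB1;                      /* 1011 0001 */
    uint8_t out[8];
    int j;
    for (j = 0; j < 8; j++)
        out[j] = ((d >> (7-j)) & 1) * 255; /* same expression as above */
    assert(out[0] == 255 && out[1] == 0 && out[2] == 255 && out[3] == 255);
    assert(out[4] == 0   && out[5] == 0 && out[6] == 0   && out[7] == 255);
}
#endif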
2164
8a322796 2165// bilinear / bicubic scaling
077ea8a7 2166static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 2167 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 2168{
077ea8a7 2169#ifdef HAVE_MMX
2da0d70d
DB
2170 assert(filterSize % 4 == 0 && filterSize>0);
2171 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2172 {
2173 long counter= -2*dstW;
2174 filter-= counter*2;
2175 filterPos-= counter/2;
2176 dst-= counter/2;
2177 asm volatile(
83c89c78 2178#if defined(PIC)
2da0d70d
DB
2179 "push %%"REG_b" \n\t"
2180#endif
2181 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2182 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2183 "mov %%"REG_a", %%"REG_BP" \n\t"
2184 ASMALIGN(4)
2185 "1: \n\t"
2186 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2187 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2188 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2189 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2190 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2191 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2192 "punpcklbw %%mm7, %%mm0 \n\t"
2193 "punpcklbw %%mm7, %%mm2 \n\t"
2194 "pmaddwd %%mm1, %%mm0 \n\t"
2195 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
2196 "movq %%mm0, %%mm4 \n\t"
2197 "punpckldq %%mm3, %%mm0 \n\t"
2198 "punpckhdq %%mm3, %%mm4 \n\t"
2199 "paddd %%mm4, %%mm0 \n\t"
2200 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2201 "packssdw %%mm0, %%mm0 \n\t"
2202 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2203 "add $4, %%"REG_BP" \n\t"
2204 " jnc 1b \n\t"
2205
2206 "pop %%"REG_BP" \n\t"
83c89c78 2207#if defined(PIC)
2da0d70d 2208 "pop %%"REG_b" \n\t"
83c89c78 2209#endif
2da0d70d
DB
2210 : "+a" (counter)
2211 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2212#if !defined(PIC)
2da0d70d
DB
2213 : "%"REG_b
2214#endif
2215 );
2216 }
2217 else if (filterSize==8)
2218 {
2219 long counter= -2*dstW;
2220 filter-= counter*4;
2221 filterPos-= counter/2;
2222 dst-= counter/2;
2223 asm volatile(
83c89c78 2224#if defined(PIC)
2da0d70d
DB
2225 "push %%"REG_b" \n\t"
2226#endif
2227 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2228 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2229 "mov %%"REG_a", %%"REG_BP" \n\t"
2230 ASMALIGN(4)
2231 "1: \n\t"
2232 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2233 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2234 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2235 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2236 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2237 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2238 "punpcklbw %%mm7, %%mm0 \n\t"
2239 "punpcklbw %%mm7, %%mm2 \n\t"
2240 "pmaddwd %%mm1, %%mm0 \n\t"
2241 "pmaddwd %%mm2, %%mm3 \n\t"
2242
2243 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2244 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2245 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2246 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2247 "punpcklbw %%mm7, %%mm4 \n\t"
2248 "punpcklbw %%mm7, %%mm2 \n\t"
2249 "pmaddwd %%mm1, %%mm4 \n\t"
2250 "pmaddwd %%mm2, %%mm5 \n\t"
2251 "paddd %%mm4, %%mm0 \n\t"
2252 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2253 "movq %%mm0, %%mm4 \n\t"
2254 "punpckldq %%mm3, %%mm0 \n\t"
2255 "punpckhdq %%mm3, %%mm4 \n\t"
2256 "paddd %%mm4, %%mm0 \n\t"
2257 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2258 "packssdw %%mm0, %%mm0 \n\t"
2259 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2260 "add $4, %%"REG_BP" \n\t"
2261 " jnc 1b \n\t"
2262
2263 "pop %%"REG_BP" \n\t"
83c89c78 2264#if defined(PIC)
2da0d70d 2265 "pop %%"REG_b" \n\t"
83c89c78 2266#endif
2da0d70d
DB
2267 : "+a" (counter)
2268 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2269#if !defined(PIC)
2da0d70d
DB
2270 : "%"REG_b
2271#endif
2272 );
2273 }
2274 else
2275 {
2276 uint8_t *offset = src+filterSize;
2277 long counter= -2*dstW;
2278 //filter-= counter*filterSize/2;
2279 filterPos-= counter/2;
2280 dst-= counter/2;
2281 asm volatile(
2282 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2283 ASMALIGN(4)
2284 "1: \n\t"
2285 "mov %2, %%"REG_c" \n\t"
2286 "movzwl (%%"REG_c", %0), %%eax \n\t"
2287 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2288 "mov %5, %%"REG_c" \n\t"
2289 "pxor %%mm4, %%mm4 \n\t"
2290 "pxor %%mm5, %%mm5 \n\t"
2291 "2: \n\t"
2292 "movq (%1), %%mm1 \n\t"
2293 "movq (%1, %6), %%mm3 \n\t"
2294 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2295 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2296 "punpcklbw %%mm7, %%mm0 \n\t"
2297 "punpcklbw %%mm7, %%mm2 \n\t"
2298 "pmaddwd %%mm1, %%mm0 \n\t"
2299 "pmaddwd %%mm2, %%mm3 \n\t"
2300 "paddd %%mm3, %%mm5 \n\t"
2301 "paddd %%mm0, %%mm4 \n\t"
2302 "add $8, %1 \n\t"
2303 "add $4, %%"REG_c" \n\t"
2304 "cmp %4, %%"REG_c" \n\t"
2305 " jb 2b \n\t"
2306 "add %6, %1 \n\t"
ef423a66
MN
2307 "movq %%mm4, %%mm0 \n\t"
2308 "punpckldq %%mm5, %%mm4 \n\t"
2309 "punpckhdq %%mm5, %%mm0 \n\t"
2310 "paddd %%mm0, %%mm4 \n\t"
2311 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2312 "packssdw %%mm4, %%mm4 \n\t"
2313 "mov %3, %%"REG_a" \n\t"
2314 "movd %%mm4, (%%"REG_a", %0) \n\t"
2315 "add $4, %0 \n\t"
2316 " jnc 1b \n\t"
2317
2318 : "+r" (counter), "+r" (filter)
2319 : "m" (filterPos), "m" (dst), "m"(offset),
2320 "m" (src), "r" (filterSize*2)
2321 : "%"REG_a, "%"REG_c, "%"REG_d
2322 );
2323 }
077ea8a7 2324#else
8c266f0c 2325#ifdef HAVE_ALTIVEC
2da0d70d 2326 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2327#else
2da0d70d
DB
2328 int i;
2329 for (i=0; i<dstW; i++)
2330 {
2331 int j;
2332 int srcPos= filterPos[i];
2333 int val=0;
2334 //printf("filterPos: %d\n", filterPos[i]);
2335 for (j=0; j<filterSize; j++)
2336 {
2337 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2338 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2339 }
2340 //filter += hFilterSize;
881c4294 2341 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2342 //dst[i] = val>>7;
2343 }
bc279024
DB
2344#endif /* HAVE_ALTIVEC */
2345#endif /* HAVE_MMX */
077ea8a7 2346}
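/* Purely illustrative sketch (not used by the code): hScale is a plain FIR
   resampler. For output sample i, filterPos[i] gives the first source pixel
   and filter[filterSize*i .. filterSize*i + filterSize-1] its taps; the
   products are summed and narrowed by >>7 into the 15-bit intermediate
   format, exactly as in the C fallback above. One output sample with made-up
   4-tap values (taps summing to 16384 act as unity gain at this scale): */
#if 0
#include <stdint.h>
static int hScale_one_sample_sketch(const uint8_t *src /* src + filterPos[i] */)
{
    static const int16_t taps[4] = { 2048, 6144, 6144, 2048 };
    int j, val = 0;
    for (j = 0; j < 4; j++)
        val += src[j] * taps[j];
    return val >> 7; /* same narrowing as dst[i] above */
}
#endif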
2ff198c1 2347 // *** horizontal scale Y line to temp buffer
6bc0c792 2348static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2349 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2350 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2351 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2352 int32_t *mmx2FilterPos, uint8_t *pal)
077ea8a7 2353{
2da0d70d 2354 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2355 {
2da0d70d
DB
2356 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2357 src= formatConvBuffer;
1e621b18 2358 }
2da0d70d 2359 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2360 {
2da0d70d
DB
2361 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2362 src= formatConvBuffer;
7322a67c 2363 }
2da0d70d 2364 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2365 {
2da0d70d
DB
2366 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2367 src= formatConvBuffer;
1e621b18 2368 }
9990e426
MN
2369 else if (srcFormat==PIX_FMT_RGB32_1)
2370 {
2371 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2372 src= formatConvBuffer;
2373 }
2da0d70d 2374 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2375 {
2da0d70d
DB
2376 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2377 src= formatConvBuffer;
1e621b18 2378 }
2da0d70d 2379 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2380 {
2da0d70d
DB
2381 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2382 src= formatConvBuffer;
6af250ea 2383 }
2da0d70d 2384 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2385 {
2da0d70d
DB
2386 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2387 src= formatConvBuffer;
b72034dd 2388 }
2da0d70d 2389 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2390 {
2da0d70d
DB
2391 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2392 src= formatConvBuffer;
a861d4d7 2393 }
9990e426
MN
2394 else if (srcFormat==PIX_FMT_BGR32_1)
2395 {
2396 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2397 src= formatConvBuffer;
2398 }
2da0d70d 2399 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2400 {
2da0d70d
DB
2401 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2402 src= formatConvBuffer;
a861d4d7 2403 }
2da0d70d 2404 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2405 {
2da0d70d
DB
2406 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2407 src= formatConvBuffer;
a43fb6b3 2408 }
2da0d70d 2409 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2410 {
2da0d70d
DB
2411 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2412 src= formatConvBuffer;
a43fb6b3 2413 }
2da0d70d 2414 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2415 {
87cf861c 2416 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2da0d70d 2417 src= formatConvBuffer;
e28630fc 2418 }
3d05e078
MN
2419 else if (srcFormat==PIX_FMT_MONOBLACK ||srcFormat==PIX_FMT_MONOWHITE)
2420 {
2421 RENAME(mono2Y)(formatConvBuffer, src, srcW, srcFormat);
2422 src= formatConvBuffer;
2423 }
1e621b18 2424
e3d2500f 2425#ifdef HAVE_MMX
8a322796 2426 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2427 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2428#else
2da0d70d 2429 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2430#endif
077ea8a7 2431 {
2da0d70d 2432 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2433 }
8a322796 2434 else // fast bilinear upscale / crap downscale
077ea8a7 2435 {
3d6a30d9 2436#if defined(ARCH_X86)
2ff198c1 2437#ifdef HAVE_MMX2
2da0d70d 2438 int i;
83c89c78 2439#if defined(PIC)
2da0d70d 2440 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2441#endif
2da0d70d
DB
2442 if (canMMX2BeUsed)
2443 {
2444 asm volatile(
83c89c78 2445#if defined(PIC)
2da0d70d
DB
2446 "mov %%"REG_b", %5 \n\t"
2447#endif
2448 "pxor %%mm7, %%mm7 \n\t"
2449 "mov %0, %%"REG_c" \n\t"
2450 "mov %1, %%"REG_D" \n\t"
2451 "mov %2, %%"REG_d" \n\t"
2452 "mov %3, %%"REG_b" \n\t"
2453 "xor %%"REG_a", %%"REG_a" \n\t" // i
2454 PREFETCH" (%%"REG_c") \n\t"
2455 PREFETCH" 32(%%"REG_c") \n\t"
2456 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2457
6d606c4f
AJ
2458#ifdef ARCH_X86_64
2459
2460#define FUNNY_Y_CODE \
2da0d70d
DB
2461 "movl (%%"REG_b"), %%esi \n\t"\
2462 "call *%4 \n\t"\
2463 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2464 "add %%"REG_S", %%"REG_c" \n\t"\
2465 "add %%"REG_a", %%"REG_D" \n\t"\
2466 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2467
2468#else
2469
2ff198c1 2470#define FUNNY_Y_CODE \
2da0d70d
DB
2471 "movl (%%"REG_b"), %%esi \n\t"\
2472 "call *%4 \n\t"\
2473 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2474 "add %%"REG_a", %%"REG_D" \n\t"\
2475 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2476
bc279024 2477#endif /* ARCH_X86_64 */
6d606c4f 2478
2ff198c1
MN
2479FUNNY_Y_CODE
2480FUNNY_Y_CODE
2481FUNNY_Y_CODE
2482FUNNY_Y_CODE
2483FUNNY_Y_CODE
2484FUNNY_Y_CODE
2485FUNNY_Y_CODE
2486FUNNY_Y_CODE
2487
83c89c78 2488#if defined(PIC)
2da0d70d 2489 "mov %5, %%"REG_b" \n\t"
83c89c78 2490#endif
2da0d70d
DB
2491 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2492 "m" (funnyYCode)
83c89c78 2493#if defined(PIC)
2da0d70d 2494 ,"m" (ebxsave)
83c89c78 2495#endif
2da0d70d 2496 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2497#if !defined(PIC)
2da0d70d
DB
2498 ,"%"REG_b
2499#endif
2500 );
2501 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2502 }
2503 else
2504 {
bc279024 2505#endif /* HAVE_MMX2 */
2da0d70d
DB
2506 long xInc_shr16 = xInc >> 16;
2507 uint16_t xInc_mask = xInc & 0xffff;
2508 //NO MMX just normal asm ...
2509 asm volatile(
2510 "xor %%"REG_a", %%"REG_a" \n\t" // i
2511 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2512 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2513 ASMALIGN(4)
2514 "1: \n\t"
2515 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2516 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2517 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2518 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2519 "shll $16, %%edi \n\t"
2520 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2521 "mov %1, %%"REG_D" \n\t"
2522 "shrl $9, %%esi \n\t"
2523 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2524 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2525 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2526
2527 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2528 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2529 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2530 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2531 "shll $16, %%edi \n\t"
2532 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2533 "mov %1, %%"REG_D" \n\t"
2534 "shrl $9, %%esi \n\t"
2535 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2536 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2537 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2538
2539
2540 "add $2, %%"REG_a" \n\t"
2541 "cmp %2, %%"REG_a" \n\t"
2542 " jb 1b \n\t"
2543
2544
2545 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2546 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2547 );
2ff198c1 2548#ifdef HAVE_MMX2
2da0d70d 2549 } //if MMX2 can't be used
2ff198c1
MN
2550#endif
2551#else
2da0d70d
DB
2552 int i;
2553 unsigned int xpos=0;
2554 for (i=0;i<dstWidth;i++)
2555 {
2556 register unsigned int xx=xpos>>16;
2557 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2558 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2559 xpos+=xInc;
2560 }
bc279024 2561#endif /* defined(ARCH_X86) */
077ea8a7 2562 }
6bc0c792
MN
2563
2564 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2565 int i;
2566 //FIXME all pal and rgb srcFormats could do this conversion as well
2567 //FIXME all scalers more complex than bilinear could do half of this transform
2568 if(c->srcRange){
2569 for (i=0; i<dstWidth; i++)
2570 dst[i]= (dst[i]*14071 + 33561947)>>14;
2571 }else{
2572 for (i=0; i<dstWidth; i++)
aa13b0fc 2573 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2574 }
2575 }
2ff198c1
MN
2576}
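/* Purely illustrative note: the luma values at the end of hyscale are 8-bit
   samples scaled by 128 (<<7), so the range conversion constants above read
   as ordinary video-level scaling in fixed point:
     full -> limited: 14071/16384 ~= 219/255,  33561947>>14 ~= 16*128
     limited -> full: 19077/16384 ~= 255/219,  39057361>>14 ~= 16*128*255/219
   i.e. roughly dst = src*219/255 + 16 and dst = (src-16)*255/219 in 8-bit
   terms, with the FFMIN() keeping the expanding direction from overflowing.
   Equivalent floating-point form, for reference only: */
#if 0
static int luma_full_to_limited_sketch(int v /* 8-bit sample * 128 */)
{
    return (int)(v * (219.0/255.0) + 16*128 + 0.5);
}
#endif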
2577
6bc0c792 2578inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2579 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2580 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2581 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2582 int32_t *mmx2FilterPos, uint8_t *pal)
2ff198c1 2583{
2da0d70d 2584 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2585 {
8b2fce0d 2586 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2587 src1= formatConvBuffer;
8b2fce0d 2588 src2= formatConvBuffer+VOFW;
1e621b18 2589 }
2da0d70d 2590 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2591 {
8b2fce0d 2592 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2593 src1= formatConvBuffer;
8b2fce0d 2594 src2= formatConvBuffer+VOFW;
7322a67c 2595 }
2da0d70d 2596 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2597 {
2f60f629
MN
2598 if(c->chrSrcHSubSample)
2599 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2600 else
2601 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2602 src1= formatConvBuffer;
8b2fce0d 2603 src2= formatConvBuffer+VOFW;
1e621b18 2604 }
9990e426
MN
2605 else if (srcFormat==PIX_FMT_RGB32_1)
2606 {
2f60f629
MN
2607 if(c->chrSrcHSubSample)
2608 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2609 else
2610 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
9990e426
MN
2611 src1= formatConvBuffer;
2612 src2= formatConvBuffer+VOFW;
2613 }
2da0d70d 2614 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2615 {
2f60f629
MN
2616 if(c->chrSrcHSubSample)
2617 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2618 else
2619 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2620 src1= formatConvBuffer;
8b2fce0d 2621 src2= formatConvBuffer+VOFW;
1e621b18 2622 }
2da0d70d 2623 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2624 {
2f60f629
MN
2625 if(c->chrSrcHSubSample)
2626 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2627 else
2628 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2629 src1= formatConvBuffer;
8b2fce0d 2630 src2= formatConvBuffer+VOFW;
6af250ea 2631 }
2da0d70d 2632 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2633 {
2f60f629
MN
2634 if(c->chrSrcHSubSample)
2635 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2636 else
2637 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2638 src1= formatConvBuffer;
8b2fce0d 2639 src2= formatConvBuffer+VOFW;
b72034dd 2640 }
2da0d70d 2641 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2642 {
2f60f629
MN
2643 if(c->chrSrcHSubSample)
2644 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2645 else
2646 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2647 src1= formatConvBuffer;
8b2fce0d 2648 src2= formatConvBuffer+VOFW;
a861d4d7 2649 }
9990e426
MN
2650 else if (srcFormat==PIX_FMT_BGR32_1)
2651 {
2f60f629
MN
2652 if(c->chrSrcHSubSample)
2653 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2654 else
2655 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
9990e426
MN
2656 src1= formatConvBuffer;
2657 src2= formatConvBuffer+VOFW;
2658 }
2da0d70d 2659 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2660 {
2f60f629
MN
2661 if(c->chrSrcHSubSample)
2662 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2663 else
2664 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2665 src1= formatConvBuffer;
8b2fce0d 2666 src2= formatConvBuffer+VOFW;
a861d4d7 2667 }
2da0d70d 2668 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2669 {
2f60f629
MN
2670 if(c->chrSrcHSubSample)
2671 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2672 else
2673 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2674 src1= formatConvBuffer;
8b2fce0d 2675 src2= formatConvBuffer+VOFW;
a43fb6b3 2676 }
2da0d70d 2677 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2678 {
2f60f629
MN
2679 if(c->chrSrcHSubSample)
2680 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2681 else
2682 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2683 src1= formatConvBuffer;
8b2fce0d 2684 src2= formatConvBuffer+VOFW;
a43fb6b3 2685 }
4bb9adcf 2686 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2687 {
2da0d70d 2688 return;
6ff0ad6b 2689 }
2da0d70d 2690 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2691 {
87cf861c 2692 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2da0d70d 2693 src1= formatConvBuffer;
8b2fce0d 2694 src2= formatConvBuffer+VOFW;
e28630fc 2695 }
1e621b18 2696
e3d2500f 2697#ifdef HAVE_MMX
8a322796 2698 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2699 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2700#else
2da0d70d 2701 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2702#endif
077ea8a7 2703 {
2da0d70d 2704 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2705 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2706 }
8a322796 2707 else // fast bilinear upscale / crap downscale
077ea8a7 2708 {
3d6a30d9 2709#if defined(ARCH_X86)
2ff198c1 2710#ifdef HAVE_MMX2
2da0d70d 2711 int i;
83c89c78 2712#if defined(PIC)
2da0d70d 2713 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2714#endif
2da0d70d
DB
2715 if (canMMX2BeUsed)
2716 {
2717 asm volatile(
83c89c78 2718#if defined(PIC)
2da0d70d
DB
2719 "mov %%"REG_b", %6 \n\t"
2720#endif
2721 "pxor %%mm7, %%mm7 \n\t"
2722 "mov %0, %%"REG_c" \n\t"
2723 "mov %1, %%"REG_D" \n\t"
2724 "mov %2, %%"REG_d" \n\t"
2725 "mov %3, %%"REG_b" \n\t"
2726 "xor %%"REG_a", %%"REG_a" \n\t" // i
2727 PREFETCH" (%%"REG_c") \n\t"
2728 PREFETCH" 32(%%"REG_c") \n\t"
2729 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2730
6d606c4f
AJ
2731#ifdef ARCH_X86_64
2732
2733#define FUNNY_UV_CODE \
2da0d70d
DB
2734 "movl (%%"REG_b"), %%esi \n\t"\
2735 "call *%4 \n\t"\
2736 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2737 "add %%"REG_S", %%"REG_c" \n\t"\
2738 "add %%"REG_a", %%"REG_D" \n\t"\
2739 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2740
2741#else
2742
b7dc6f66 2743#define FUNNY_UV_CODE \
2da0d70d
DB
2744 "movl (%%"REG_b"), %%esi \n\t"\
2745 "call *%4 \n\t"\
2746 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2747 "add %%"REG_a", %%"REG_D" \n\t"\
2748 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2749
bc279024 2750#endif /* ARCH_X86_64 */
6d606c4f 2751
b7dc6f66
MN
2752FUNNY_UV_CODE
2753FUNNY_UV_CODE
2754FUNNY_UV_CODE
2755FUNNY_UV_CODE
2da0d70d
DB
2756 "xor %%"REG_a", %%"REG_a" \n\t" // i
2757 "mov %5, %%"REG_c" \n\t" // src
2758 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2759 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2760 PREFETCH" (%%"REG_c") \n\t"
2761 PREFETCH" 32(%%"REG_c") \n\t"
2762 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2763
2764FUNNY_UV_CODE
2765FUNNY_UV_CODE
2766FUNNY_UV_CODE
2767FUNNY_UV_CODE
2768
83c89c78 2769#if defined(PIC)
2da0d70d 2770 "mov %6, %%"REG_b" \n\t"
83c89c78 2771#endif
2da0d70d
DB
2772 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2773 "m" (funnyUVCode), "m" (src2)
83c89c78 2774#if defined(PIC)
2da0d70d 2775 ,"m" (ebxsave)
83c89c78 2776#endif
2da0d70d 2777 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2778#if !defined(PIC)
2da0d70d
DB
2779 ,"%"REG_b
2780#endif
2781 );
2782 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2783 {
2784 //printf("%d %d %d\n", dstWidth, i, srcW);
2785 dst[i] = src1[srcW-1]*128;
8b2fce0d 2786 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2787 }
2788 }
2789 else
2790 {
bc279024 2791#endif /* HAVE_MMX2 */
2da0d70d
DB
2792 long xInc_shr16 = (long) (xInc >> 16);
2793 uint16_t xInc_mask = xInc & 0xffff;
2794 asm volatile(
2795 "xor %%"REG_a", %%"REG_a" \n\t" // i
2796 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2797 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2798 ASMALIGN(4)
2799 "1: \n\t"
2800 "mov %0, %%"REG_S" \n\t"
2801 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2802 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2803 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2804 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2805 "shll $16, %%edi \n\t"
2806 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2807 "mov %1, %%"REG_D" \n\t"
2808 "shrl $9, %%esi \n\t"
2809 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2810
2811 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2812 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2813 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2814 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2815 "shll $16, %%edi \n\t"
2816 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2817 "mov %1, %%"REG_D" \n\t"
2818 "shrl $9, %%esi \n\t"
8b2fce0d 2819 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2820
2821 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2822 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2823 "add $1, %%"REG_a" \n\t"
2824 "cmp %2, %%"REG_a" \n\t"
2825 " jb 1b \n\t"
2ff198c1 2826
8a322796
DB
2827/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2828 which is needed to support GCC 4.0. */
e5091488 2829#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2da0d70d 2830 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2831#else
2da0d70d 2832 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2833#endif
2da0d70d
DB
2834 "r" (src2)
2835 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2836 );
2ff198c1 2837#ifdef HAVE_MMX2
2da0d70d 2838 } //if MMX2 can't be used
2ff198c1
MN
2839#endif
2840#else
2da0d70d
DB
2841 int i;
2842 unsigned int xpos=0;
2843 for (i=0;i<dstWidth;i++)
2844 {
2845 register unsigned int xx=xpos>>16;
2846 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2847 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2848 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2849 /* slower
2850 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2851 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2852 */
2853 xpos+=xInc;
2854 }
bc279024 2855#endif /* defined(ARCH_X86) */
2da0d70d 2856 }
6bc0c792
MN
2857 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2858 int i;
2859 //FIXME all pal and rgb srcFormats could do this conversion as well
2860 //FIXME all scalers more complex than bilinear could do half of this transform
2861 if(c->srcRange){
2862 for (i=0; i<dstWidth; i++){
2863 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2864 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2865 }
2866 }else{
2867 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2868 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2869 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2870 }
2871 }
2872 }
077ea8a7
MN
2873}
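/* Purely illustrative note: as in hyscale, the chroma samples here are 8-bit
   values scaled by 128, so the constants above read as the usual chroma
   excursion scaling around the 128 midpoint:
     full -> limited: 1799/2048 ~= 224/255,  4081085>>11 ~= (128 - 128*224/255)*128
     limited -> full: 4663/4096 ~= 255/224,  9289992>>12 ~= (128*255/224 - 128)*128
   i.e. roughly dst = (src-128)*224/255 + 128 and its inverse, with the
   FFMIN() clamp sitting just above 240*128 before the expanding direction. */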
2874
3e499f53 2875static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2876 int srcSliceH, uint8_t* dst[], int dstStride[]){
2877
2878 /* load a few things into local vars to make the code more readable and faster */
2879 const int srcW= c->srcW;
2880 const int dstW= c->dstW;
2881 const int dstH= c->dstH;
2882 const int chrDstW= c->chrDstW;
2883 const int chrSrcW= c->chrSrcW;
2884 const int lumXInc= c->lumXInc;
2885 const int chrXInc= c->chrXInc;
2886 const int dstFormat= c->dstFormat;
2887 const int srcFormat= c->srcFormat;
2888 const int flags= c->flags;
2889 const int canMMX2BeUsed= c->canMMX2BeUsed;
2890 int16_t *vLumFilterPos= c->vLumFilterPos;
2891 int16_t *vChrFilterPos= c->vChrFilterPos;
2892 int16_t *hLumFilterPos= c->hLumFilterPos;
2893 int16_t *hChrFilterPos= c->hChrFilterPos;
2894 int16_t *vLumFilter= c->vLumFilter;
2895 int16_t *vChrFilter= c->vChrFilter;
2896 int16_t *hLumFilter= c->hLumFilter;
2897 int16_t *hChrFilter= c->hChrFilter;
2898 int32_t *lumMmxFilter= c->lumMmxFilter;
2899 int32_t *chrMmxFilter= c->chrMmxFilter;
2900 const int vLumFilterSize= c->vLumFilterSize;
2901 const int vChrFilterSize= c->vChrFilterSize;
2902 const int hLumFilterSize= c->hLumFilterSize;
2903 const int hChrFilterSize= c->hChrFilterSize;
2904 int16_t **lumPixBuf= c->lumPixBuf;
2905 int16_t **chrPixBuf= c->chrPixBuf;
2906 const int vLumBufSize= c->vLumBufSize;
2907 const int vChrBufSize= c->vChrBufSize;
2908 uint8_t *funnyYCode= c->funnyYCode;
2909 uint8_t *funnyUVCode= c->funnyUVCode;
2910 uint8_t *formatConvBuffer= c->formatConvBuffer;
2911 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2912 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2913 int lastDstY;
2914 uint8_t *pal=NULL;
2915
8a322796 2916 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2917 int dstY= c->dstY;
2918 int lumBufIndex= c->lumBufIndex;
2919 int chrBufIndex= c->chrBufIndex;
2920 int lastInLumBuf= c->lastInLumBuf;
2921 int lastInChrBuf= c->lastInChrBuf;
2922
2923 if (isPacked(c->srcFormat)){
2924 pal= src[1];
2925 src[0]=
2926 src[1]=
2927 src[2]= src[0];
2928 srcStride[0]=
2929 srcStride[1]=
2930 srcStride[2]= srcStride[0];
2931 }
2932 srcStride[1]<<= c->vChrDrop;
2933 srcStride[2]<<= c->vChrDrop;
2934
2935 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2936 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2937
2938#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2939 {
2940 static volatile int i=0;
2941 i++;
2942 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2943 selfTest(src, srcStride, c->srcW, c->srcH);
2944 i--;
2945 }
c7a810cc 2946#endif
37079906 2947
2da0d70d
DB
2948 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2949 //dstStride[0],dstStride[1],dstStride[2]);
2950
2951 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2952 {
2953 static int firstTime=1; //FIXME move this into the context perhaps
2954 if (flags & SWS_PRINT_INFO && firstTime)
2955 {
4b0c30b7 2956 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2957 " ->cannot do aligned memory accesses anymore\n");
2da0d70d
DB
2958 firstTime=0;
2959 }
2960 }
2961
8a322796
DB
2962 /* Note the user might start scaling the picture in the middle so this
2963 will not get executed. This is not really intended but works
2964 currently, so people might do it. */
2da0d70d
DB
2965 if (srcSliceY ==0){
2966 lumBufIndex=0;
2967 chrBufIndex=0;
2968 dstY=0;
2969 lastInLumBuf= -1;
2970 lastInChrBuf= -1;
2971 }
2972
2973 lastDstY= dstY;
2974
2975 for (;dstY < dstH; dstY++){
2976 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2977 const int chrDstY= dstY>>c->chrDstVSubSample;
2978 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2979 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2980
2981 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2982 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2983 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2984 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2985
2986 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2987 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2988 //handle holes (FAST_BILINEAR & weird filters)
2989 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2990 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2991 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2992 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2993 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2994
2995 // Do we have enough lines in this slice to output the dstY line?
2996 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2997 {
2998 //Do horizontal scaling
2999 while(lastInLumBuf < lastLumSrcY)
3000 {
3001 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3002 lumBufIndex++;
3003 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
3004 assert(lumBufIndex < 2*vLumBufSize);
3005 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3006 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 3007 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6bc0c792 3008 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
3009 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3010 funnyYCode, c->srcFormat, formatConvBuffer,
3011 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3012 lastInLumBuf++;
3013 }
3014 while(lastInChrBuf < lastChrSrcY)
3015 {
3016 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3017 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3018 chrBufIndex++;
fcc402b1
LB
3019 assert(chrBufIndex < 2*vChrBufSize);
3020 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3021 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3022 //FIXME replace parameters through context struct (some at least)
3023
3024 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3025 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
3026 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3027 funnyUVCode, c->srcFormat, formatConvBuffer,
3028 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3029 lastInChrBuf++;
3030 }
3031 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3032 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3033 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3034 }
3035 else // not enough lines left in this slice -> load the rest in the buffer
3036 {
3037 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3038 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3039 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3040 vChrBufSize, vLumBufSize);*/
3041
3042 //Do horizontal scaling
3043 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3044 {
3045 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3046 lumBufIndex++;
fcc402b1
LB
3047 assert(lumBufIndex < 2*vLumBufSize);
3048 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3049 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6bc0c792 3050 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
3051 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3052 funnyYCode, c->srcFormat, formatConvBuffer,
3053 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3054 lastInLumBuf++;
3055 }
3056 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3057 {
3058 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3059 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3060 chrBufIndex++;
fcc402b1
LB
3061 assert(chrBufIndex < 2*vChrBufSize);
3062 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3063 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3064
3065 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3066 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
3067 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3068 funnyUVCode, c->srcFormat, formatConvBuffer,
3069 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3070 lastInChrBuf++;
3071 }
3072 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3073 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3074 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3075 break; //we can't output a dstY line so let's try with the next slice
3076 }

#ifdef HAVE_MMX
        b5Dither= ff_dither8[dstY&1];
        g6Dither= ff_dither4[dstY&1];
        g5Dither= ff_dither8[dstY&1];
        r5Dither= ff_dither8[(dstY+1)&1];
#endif
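        /* Select the per-line dither rows used by the MMX 15/16 bpp packed RGB
         * output code: the 5-bit channels use ff_dither8 and the 6-bit green
         * channel of RGB565 uses ff_dither4, alternating between two patterns
         * on even and odd output lines; red uses the opposite phase, presumably
         * to decorrelate the dither between channels. */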
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
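            /* Pack the vertical filter for the MMX vertical scalers: for each
             * tap the pointer to the buffered source line and its 16-bit
             * coefficient are stored interleaved in lumMmxFilter/chrMmxFilter,
             * so the asm can walk the whole filter with a single pointer.
             * Roughly: with SWS_ACCURATE_RND two taps are packed into each
             * APCK_SIZE-byte entry, otherwise each line pointer is stored as
             * two 32-bit halves (the high half being zero on 32-bit builds)
             * and the coefficient is replicated across the 16-bit lanes by the
             * *0x10001 multiply. */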
            if (flags & SWS_ACCURATE_RND){
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                                  + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                }
                for (i=0; i<vChrFilterSize; i+=2){
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                                  + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                for (i=0; i<vLumFilterSize; i++)
                {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split the functions into luma / chroma parts
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12-like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split the functions into luma / chroma parts
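                /* With vertically subsampled chroma only every
                 * (1<<chrDstVSubSample)-th output line carries chroma samples;
                 * on the lines in between (and for grayscale output) uDest and
                 * vDest are NULLed so the output functions skip the chroma
                 * stores. */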
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
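                /* The 1-tap branch above essentially just converts the single
                 * buffered line back to 8 bits; the general case below runs
                 * the full multi-tap vertical filter over the buffered lines. */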
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
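                /* Packed (RGB etc.) output: the 1-tap luma / 2-tap chroma case
                 * and the 2-tap / 2-tap case below get dedicated yuv2packed1 /
                 * yuv2packed2 implementations, except with SWS_FULL_CHR_H_INT,
                 * which still falls back to the generic yuv2rgbXinC_full (see
                 * the FIXMEs). */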
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
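                /* 2-tap luma and chroma: a single bilinear blend between two
                 * buffered lines; the blend weights are passed as lumAlpha /
                 * chrAlpha and are also prepacked into lumMmxFilter /
                 * chrMmxFilter for the MMX implementation. */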
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, lumAlpha, chrAl