add raw demuxer for Chinese AVS elementary streams
[libav.git] / libswscale / swscale_template.c
CommitLineData
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

/* Remove any earlier definitions of the CPU-dependent helper macros so that
 * the variants selected below can be defined without redefinition conflicts
 * (presumably because this template is included more than once -- confirm
 * against the including file). */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

/* EMMS: mnemonic (as a string for inline asm) used to clear the MMX state.
 * On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#if HAVE_AMD3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* PREFETCH / PREFETCHW: prefetch mnemonics as strings for inline asm.
 * 3DNow! provides prefetch/prefetchw, MMX2 provides prefetchnta/prefetcht0;
 * without either, both expand to an assembler comment (" # nop") so the
 * operand that follows in the asm template is ignored. */
#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

/* SFENCE: store-fence mnemonic as a string for inline asm; expands to an
 * assembler comment (" # nop") when MMX2 is not available. */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* PAVGB(a,b): byte-average instruction as an asm string, "pavgb" with MMX2 or
 * "pavgusb" with 3DNow!. The operands are stringified as written.
 * NOTE: left undefined when neither extension is available. */
#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* MOVNTQ(a,b): 64-bit store as an asm string. Uses the non-temporal "movntq"
 * when MMX2 is available and a plain "movq" otherwise.
 * The REAL_MOVNTQ indirection makes the preprocessor macro-expand the
 * arguments (e.g. register-name macros) before they are stringified. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

b63f641e 69#if HAVE_ALTIVEC
a2faa401
RD
70#include "swscale_altivec_template.c"
71#endif
72
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete inline-asm statement: vertical filter producing 8-bit output.
 *
 *   x      - string literal, byte displacement added to every source address
 *   offset - string literal, displacement from %0 of the filter list
 *   dest   - output pointer (operand %1)
 *   width  - loop bound compared against the index register (operand %2)
 *
 * Operand %0 is &c->redDither, so a variable `c` must be in scope at the
 * point of use. The filter list consists of 16-byte entries (source pointer
 * at +0, coefficients at +8) and is terminated by a zero source pointer.
 * For each group of 8 outputs the accumulators start at the rounder stored at
 * VROUNDER_OFFSET(%0), add pmulhw(src, coeff) for every entry, are shifted
 * right by 3, packed with unsigned saturation and stored through MOVNTQ.
 * Clobbers REG_a, REG_d and REG_S; MMX registers are not listed as clobbers.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t" /* next filter entry until the source pointer is 0 */\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    /* re-initialise accumulators and filter pointer for the next 8 outputs */\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Higher-precision variant of the vertical filter, also a complete inline-asm
 * statement with the same parameters and operands (%0 = &c->redDither,
 * %1 = dest, %2 = width; `c` must be in scope).
 *
 * Two source lines are processed per filter entry: their samples are
 * interleaved (punpcklwd/punpckhwd) and multiplied-and-added with pmaddwd
 * into 32-bit accumulators mm4..mm7. An entry is APCK_SIZE bytes, with the
 * second source pointer at APCK_PTR2 and the coefficients at APCK_COEF; the
 * list ends at a zero source pointer. After the loop the sums are shifted
 * right by 16, packed to words, the rounder at VROUNDER_OFFSET(%0) is added,
 * and the result is shifted right by 3, packed to bytes and stored.
 * Clobbers REG_a, REG_d and REG_S; MMX registers are not listed as clobbers.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t" /* next filter entry until the source pointer is 0 */\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    /* re-initialise accumulators and filter pointer for the next 8 outputs */\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV121
 *
 * Asm template fragment (not a full statement): converts 16-bit samples to
 * bytes without filtering. Operand %0 is the 16-bit source, %1 the byte
 * destination and %2 the initial index loaded into REG_a. The loop advances
 * the index by 8 and repeats until the addition carries ("jnc"), so the
 * caller is expected to pass a negative index with the pointers biased
 * accordingly -- confirm at the call site. Each sample is shifted right by 7
 * (no rounding) and packed with unsigned saturation.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Same asm template fragment as YSCALEYUV2YV121 (operands %0 = 16-bit source,
 * %1 = byte destination, %2 = initial index) but with rounding: mm7 is set to
 * 64 in every word (all-ones >> 15 << 6) and added with signed saturation
 * before the right shift by 7.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t" /* mm7 = 64 per word: rounding constant */\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an `__asm__ volatile(` statement and emits the vertical filter for
 * chroma and luma. The statement is deliberately left open: the user appends
 * further template strings and closes it with YSCALEYUV2PACKEDX_END, which
 * supplies the operands (%0 is the context base used for all *_OFFSET
 * displacements).
 *
 * Label "1:" starts the per-8-pixel outer loop (REG_a is the index, zeroed
 * here); the code that jumps back to it is not part of this macro. The two
 * "2:" loops walk the chroma and luma filter lists (16-byte entries, source
 * pointer at +0, coefficients at +8, zero pointer terminates).
 *
 * On exit: mm3/mm4 hold the filtered U/V words, mm1/mm7 the filtered Y1/Y2
 * words, each including the rounder from VROUNDER_OFFSET(%0); no final shift
 * is applied here.
 */
#define YSCALEYUV2PACKEDX \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"

/*
 * YSCALEYUV2PACKEDX_END
 *
 * Closes the asm statement opened by YSCALEYUV2PACKEDX or
 * YSCALEYUV2PACKEDX_ACCURATE. Input operands:
 *   %0      = &c->redDither (base for the *_OFFSET displacements)
 *   %1..%3  = `dummy` memory operands (placeholders; presumably they keep
 *             the numbering of %4/%5 in line with other users of the output
 *             macros -- confirm)
 *   %4      = dest
 *   %5      = dstW
 * Requires `c`, `dummy`, `dest` and `dstW` in scope.
 * Clobbers REG_a, REG_d and REG_S.
 */
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX: opens an
 * `__asm__ volatile(` statement (to be closed with YSCALEYUV2PACKEDX_END) and
 * filters chroma and luma vertically with pmaddwd into 32-bit accumulators,
 * two source lines per filter entry (entry size APCK_SIZE, second pointer at
 * APCK_PTR2, coefficients at APCK_COEF, zero pointer terminates).
 *
 * The chroma results are parked in U_TEMP(%0)/V_TEMP(%0) while the luma pass
 * reuses the registers, then reloaded.
 *
 * On exit: mm3/mm4 hold U/V words, mm1/mm7 hold Y1/Y2 words, each with the
 * rounder from VROUNDER_OFFSET(%0) added; no final shift is applied here.
 * Label "1:" is the per-8-pixel outer loop head (REG_a is the index).
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    /* park U/V in the context while the luma pass uses the registers */\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"

/*
 * YSCALEYUV2RGBX
 *
 * Asm template fragment: YUV -> RGB conversion of 8 pixels using the offsets
 * and coefficients stored relative to operand %0.
 *
 * In:  mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (16-bit words, as left by
 *      YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE).
 * Out: mm2 = B, mm4 = G, mm5 = R as packed unsigned bytes, mm7 = 0.
 * mm0, mm3 and mm6 are used as scratch.
 */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"

/*
 * YSCALEYUV2PACKED(index, c) / REAL_YSCALEYUV2PACKED(index, c)
 *
 * Asm template fragment: blends two input lines for packed YUV output.
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - base register of the context used for the *_OFFSET displacements
 * Asm operands: %0 = buf0, %1 = buf1 (luma), %2 = uvbuf0, %3 = uvbuf1.
 *
 * NOTE: the prologue shifts the coefficients at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 right by 3 and writes them back, i.e. it modifies
 * the context memory every time the fragment runs.
 *
 * Per iteration (label "1:"; the loop is closed by the code that follows):
 *   mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2, each computed as
 *   ((line0 - line1) * alpha >> 16) + (line1 >> 7).
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+VOF] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF]uvalpha1 + uvbuf1[eax+VOF](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

/*
 * YSCALEYUV2RGB(index, c) / REAL_YSCALEYUV2RGB(index, c)
 *
 * Asm template fragment: blends two input lines and converts 8 pixels from
 * YUV to RGB.
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - base register of the context holding offsets/coefficients
 * Asm operands: %0 = buf0, %1 = buf1 (luma), %2 = uvbuf0, %3 = uvbuf1.
 *
 * Blend: ((line0 - line1) * alpha >> 16) + (line1 >> 4), followed by the same
 * colour-space arithmetic as YSCALEYUV2RGBX.
 * Out: mm2 = B, mm4 = G, mm5 = R as packed unsigned bytes, mm7 = 0.
 * Label "1:" is the loop head; the loop is closed by the code that follows.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+VOF] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF]uvalpha1 + uvbuf1[eax+VOF](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

/*
 * YSCALEYUV2PACKED1(index, c) / REAL_YSCALEYUV2PACKED1(index, c)
 *
 * Asm template fragment for packed YUV output from a single input line (no
 * vertical blending).
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - accepted for symmetry with the other variants; not used
 * Asm operands: %0 = buf0 (luma), %2 = uvbuf0.
 *
 * Per iteration (label "1:"; loop closed by the code that follows):
 *   mm3 = U >> 7, mm4 = V >> 7, mm1 = Y1 >> 7, mm7 = Y2 >> 7.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

/*
 * YSCALEYUV2RGB1(index, c) / REAL_YSCALEYUV2RGB1(index, c)
 *
 * Asm template fragment: YUV -> RGB conversion of 8 pixels from a single
 * input line (no vertical blending).
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - base register of the context holding offsets/coefficients
 * Asm operands: %0 = buf0 (luma), %2 = uvbuf0.
 *
 * Inputs are shifted right by 4 before the colour-space arithmetic.
 * Out: mm2 = B, mm4 = G, mm5 = R as packed unsigned bytes, mm7 = 0.
 * Label "1:" is the loop head; the loop is closed by the code that follows.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+VOF] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

/*
 * YSCALEYUV2PACKED1b(index, c) / REAL_YSCALEYUV2PACKED1b(index, c)
 *
 * Asm template fragment for packed YUV output: luma from a single line,
 * chroma as the sum of two lines.
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - accepted for symmetry with the other variants; not used
 * Asm operands: %0 = buf0 (luma), %2 = uvbuf0, %3 = uvbuf1.
 *
 * Per iteration (label "1:"; loop closed by the code that follows):
 *   mm3 = (U0 + U1) >> 8, mm4 = (V0 + V1) >> 8 (logical shift),
 *   mm1 = Y1 >> 7, mm7 = Y2 >> 7.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

/*
 * YSCALEYUV2RGB1b(index, c) / REAL_YSCALEYUV2RGB1b(index, c)
 *
 * Asm template fragment: like YSCALEYUV2RGB1, but does vertical chrominance
 * interpolation: chroma is the sum of two lines (uvbuf0 + uvbuf1) shifted
 * right by 5, while luma comes from a single line shifted right by 4.
 *   index - register used as loop index (stringified, zeroed here)
 *   c     - base register of the context holding offsets/coefficients
 * Asm operands: %0 = buf0 (luma), %2 = uvbuf0, %3 = uvbuf1.
 *
 * Out: mm2 = B, mm4 = G, mm5 = R as packed unsigned bytes, mm7 = 0.
 * Label "1:" is the loop head; the loop is closed by the code that follows.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

/*
 * WRITEBGR32(dst, dstw, index) / REAL_WRITEBGR32(dst, dstw, index)
 *
 * Asm template fragment: interleaves 8 pixels into 32 bits per pixel (bytes
 * B, G, R, 0 in memory order) and stores them at dst + index*4, then
 * advances index by 8 and jumps back to label "1:" while index is below
 * dstw (unsigned compare).
 *
 * In: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * mm0, mm1, mm3 and mm6 are used as scratch.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)

/*
 * WRITERGB16(dst, dstw, index) / REAL_WRITERGB16(dst, dstw, index)
 *
 * Asm template fragment: packs 8 pixels into 16 bits per pixel (5 bits B in
 * the low bits, 6 bits G, 5 bits R in the high bits) and stores them at
 * dst + index*2, then advances index by 8 and jumps back to label "1:"
 * while index is below dstw (unsigned compare).
 *
 * In: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * Uses the memory constants bF8 and bFC (via MANGLE) as channel masks;
 * mm1 and mm3 are used as scratch.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

/*
 * WRITERGB15(dst, dstw, index) / REAL_WRITERGB15(dst, dstw, index)
 *
 * Asm template fragment: packs 8 pixels into 15-bit colour (5 bits each of
 * B, G, R; B in the low bits) stored as 16-bit words at dst + index*2, then
 * advances index by 8 and jumps back to label "1:" while index is below
 * dstw (unsigned compare).
 *
 * In: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * Uses the memory constant bF8 (via MANGLE) as the mask for all three
 * channels; mm1 and mm3 are used as scratch.
 *
 * The wrapper macro-expands the arguments before REAL_* stringifies them.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * Asm template fragment: packs 8 pixels into 24 bits per pixel (bytes B, G, R
 * in memory order) using shift/mask sequences and stores 24 bytes at dst.
 * Unlike the 16/32-bit writers, dst itself is advanced by 24 per iteration
 * and is not indexed; index is advanced by 8 and the code jumps back to
 * label "1:" while index is below dstw (unsigned compare).
 *
 * In: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * Uses the memory masks bm00000111, bm11111000 and bm00001111 (via MANGLE).
 * All of mm0..mm6 are overwritten.
 *
 * NOTE: there is no REAL_* indirection here, so #dst, #dstw and #index are
 * stringified exactly as written by the caller.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * WRITEBGR24MMX(dst, dstw, index)
 *
 * Asm template fragment: packs 8 pixels into 24 bits per pixel (bytes B, G, R
 * in memory order) using only shifts, unpacks and ORs (no memory masks) and
 * stores 24 bytes at dst. dst itself is advanced by 24 per iteration and is
 * not indexed; index is advanced by 8 and the code jumps back to label "1:"
 * while index is below dstw (unsigned compare).
 *
 * In: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * All of mm0..mm7 are overwritten (mm7 is no longer zero afterwards).
 *
 * NOTE: there is no REAL_* indirection here, so #dst, #dstw and #index are
 * stringified exactly as written by the caller.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * WRITEBGR24MMX2: pshufw-based store of 8 packed 24-bit pixels.
 * Inputs are mm2=B, mm4=G, mm5=R. Each of the three output quadwords is
 * built by shuffling the B, G and R registers with pshufw, masking the
 * copies with ff_M24A/ff_M24B/ff_M24C and ORing them together.
 * mm0 and mm7 are loaded with ff_M24A and ff_M24C (so the incoming mm7 is
 * lost) and mm4 is shifted right by 8 bits part-way through.
 * The 24 bytes go out through MOVNTQ to (dst), 8(dst) and 16(dst); then dst
 * advances by 24, index by 8, and the code branches back to label 1 while
 * index is below dstw (unsigned "jb").
 */
6542b44e 820#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 821 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
822 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
823 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
824 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
825 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
826 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 827\
2da0d70d
DB
828 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
829 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
830 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 831\
2da0d70d
DB
832 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
833 "por %%mm1, %%mm6 \n\t"\
834 "por %%mm3, %%mm6 \n\t"\
835 MOVNTQ(%%mm6, (dst))\
99d2cb72 836\
2da0d70d
DB
837 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
838 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
839 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
840 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 841\
5802683a 842 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
843 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
844 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 845\
2da0d70d
DB
846 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
847 "por %%mm3, %%mm6 \n\t"\
848 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 849\
2da0d70d
DB
850 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
851 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
852 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 853\
2da0d70d
DB
854 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
855 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 856 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 857\
2da0d70d
DB
858 "por %%mm1, %%mm3 \n\t"\
859 "por %%mm3, %%mm6 \n\t"\
860 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 861\
2da0d70d 862 "add $24, "#dst" \n\t"\
99d2cb72 863\
2da0d70d
DB
864 "add $8, "#index" \n\t"\
865 "cmp "#dstw", "#index" \n\t"\
866 " jb 1b \n\t"
99d2cb72 867
/* Pick the BGR24 store macro: the pshufw-based WRITEBGR24MMX2 when HAVE_MMX2
 * is set, the plain-MMX WRITEBGR24MMX otherwise. Any earlier definition of
 * WRITEBGR24 is dropped first. */
b63f641e 868#if HAVE_MMX2
7630f2e0 869#undef WRITEBGR24
6e1c66bc 870#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 871#else
7630f2e0 872#undef WRITEBGR24
6e1c66bc 873#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
99d2cb72
MN
874#endif
875
/*
 * REAL_WRITEYUY2: pack and interleave four word registers into 16 bytes.
 * mm3 and mm4 are each packed to bytes and interleaved byte-wise
 * (mm3 byte, mm4 byte, ...); mm1 and mm7 are packed together into 8 bytes.
 * Those 8 bytes are then interleaved with the mm3/mm4 pairs and the two
 * resulting quadwords are stored with MOVNTQ at dst + 2*index and
 * dst + 2*index + 8. index then advances by 8 and the code branches back to
 * label 1 while index is below dstw (unsigned "jb").
 * NOTE(review): presumably mm1/mm7 carry luma and mm3/mm4 carry U/V, which
 * would give Y U Y V byte order -- confirm against the YSCALEYUV2PACKED*
 * macros that feed this one.
 * WRITEYUY2 is a forwarding wrapper so that macro arguments are expanded
 * before REAL_WRITEYUY2 stringifies them.
 */
6e1c66bc 876#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
877 "packuswb %%mm3, %%mm3 \n\t"\
878 "packuswb %%mm4, %%mm4 \n\t"\
879 "packuswb %%mm7, %%mm1 \n\t"\
880 "punpcklbw %%mm4, %%mm3 \n\t"\
881 "movq %%mm1, %%mm7 \n\t"\
882 "punpcklbw %%mm3, %%mm1 \n\t"\
883 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 884\
2da0d70d
DB
885 MOVNTQ(%%mm1, (dst, index, 2))\
886 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 887\
2da0d70d
DB
888 "add $8, "#index" \n\t"\
889 "cmp "#dstw", "#index" \n\t"\
890 " jb 1b \n\t"
6e1c66bc 891#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
892
893
/*
 * Vertical scaling step for planar output.
 * MMX builds without SWS_BITEXACT run YSCALEYUV2YV12X (or the _ACCURATE
 * variant when SWS_ACCURATE_RND is set) once per plane and return:
 *   - U with source offset "0" and V with source offset VOF, both using
 *     CHR_MMX_FILTER_OFFSET and chrDstW, and only when uDest is non-NULL;
 *   - luma with LUM_MMX_FILTER_OFFSET and dstW.
 * In every other case the arguments are forwarded unchanged to
 * yuv2yuvX_altivec_real() (HAVE_ALTIVEC) or yuv2yuvXinC().
 */
77a49659 894static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
895 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
896 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 897{
b63f641e 898#if HAVE_MMX
f433c8ab 899 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
900 if (c->flags & SWS_ACCURATE_RND){
/* chroma planes are optional: a NULL uDest skips both U and V */
901 if (uDest){
902 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
903 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
904 }
bca11e75 905
14014d47
MN
906 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
907 }else{
908 if (uDest){
909 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
910 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
911 }
2da0d70d 912
14014d47
MN
913 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
914 }
f433c8ab
MN
915 return;
916 }
917#endif
/* no MMX, or bit-exact output requested */
b63f641e 918#if HAVE_ALTIVEC
a2faa401 919yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
920 chrFilter, chrSrc, chrFilterSize,
921 dest, uDest, vDest, dstW, chrDstW);
a2faa401 922#else //HAVE_ALTIVEC
5859233b 923yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
924 chrFilter, chrSrc, chrFilterSize,
925 dest, uDest, vDest, dstW, chrDstW);
a2faa401 926#endif //!HAVE_ALTIVEC
c1b0bfb4 927}
2add307d 928
/*
 * Template entry point for the NV12-style vertical scaler.
 * There is no accelerated path here: every argument except the context `c`
 * (unused) is forwarded unchanged to yuv2nv12XinC().
 */
6118e52e 929static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
930 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
931 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
932{
933yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
934 chrFilter, chrSrc, chrFilterSize,
935 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
936}
937
bf2bdde6 938static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
2da0d70d 939 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4 940{
f433c8ab 941 int i;
b63f641e 942#if HAVE_MMX
f433c8ab 943 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
944 long p= uDest ? 3 : 1;
945 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
946 uint8_t *dst[3]= {dest, uDest, vDest};
947 long counter[3] = {dstW, chrDstW, chrDstW};
2da0d70d 948
14014d47
MN
949 if (c->flags & SWS_ACCURATE_RND){
950 while(p--){
7ad6469e 951 __asm__ volatile(
14014d47
MN
952 YSCALEYUV2YV121_ACCURATE
953 :: "r" (src[p]), "r" (dst[p] + counter[p]),
954 "g" (-counter[p])
955 : "%"REG_a
956 );
957 }
958 }else{
959 while(p--){
7ad6469e 960 __asm__ volatile(
14014d47
MN
961 YSCALEYUV2YV121
962 :: "r" (src[p]), "r" (dst[p] + counter[p]),
963 "g" (-counter[p])
964 : "%"REG_a
965 );
966 }
d78c1ea1 967 }
f433c8ab
MN
968 return;
969 }
970#endif
2da0d70d
DB
971 for (i=0; i<dstW; i++)
972 {
a1f3ffa3 973 int val= (lumSrc[i]+64)>>7;
2da0d70d
DB
974
975 if (val&256){
976 if (val<0) val=0;
977 else val=255;
978 }
979
980 dest[i]= val;
981 }
982
1b0a4572 983 if (uDest)
2da0d70d
DB
984 for (i=0; i<chrDstW; i++)
985 {
a1f3ffa3
MN
986 int u=(chrSrc[i ]+64)>>7;
987 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d
DB
988
989 if ((u|v)&256){
990 if (u<0) u=0;
991 else if (u>255) u=255;
992 if (v<0) v=0;
993 else if (v>255) v=255;
994 }
995
996 uDest[i]= u;
997 vDest[i]= v;
998 }
38858470
MN
999}
1000
c1b0bfb4 1001
d604bab9
MN
1002/**
1003 * vertical scale YV12 to RGB
 *
 * With MMX and without SWS_BITEXACT, the destination formats RGB32, BGR24,
 * RGB555, RGB565 and YUYV422 are written by inline asm and the function
 * returns; the *_ACCURATE scaler macro is used when SWS_ACCURATE_RND is set.
 * Every other case falls through to altivec_yuv2packedX() (AltiVec builds,
 * for the formats listed below, without SWS_BITEXACT) or to
 * yuv2packedXinC().
 *
 * The asm statement is opened inside YSCALEYUV2PACKEDX[_ACCURATE]; it is
 * closed either by YSCALEYUV2PACKEDX_END or, for BGR24, by the operand list
 * written out in the case itself.
1004 */
25593e29 1005static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
1006 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1007 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1008{
b63f641e 1009#if HAVE_MMX
/* placeholder for the three unused "m" operands (%1-%3) of the BGR24 asm */
f8d61128 1010 long dummy=0;
f433c8ab 1011 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1012 if (c->flags & SWS_ACCURATE_RND){
1013 switch(c->dstFormat){
1014 case PIX_FMT_RGB32:
1015 YSCALEYUV2PACKEDX_ACCURATE
1016 YSCALEYUV2RGBX
1017 WRITEBGR32(%4, %5, %%REGa)
2da0d70d 1018
14014d47
MN
1019 YSCALEYUV2PACKEDX_END
1020 return;
1021 case PIX_FMT_BGR24:
/* REG_c = dest + 3*REG_a is the running output pointer; because REG_c is
 * used, the operand and clobber lists are spelled out here instead of
 * coming from YSCALEYUV2PACKEDX_END. */
1022 YSCALEYUV2PACKEDX_ACCURATE
1023 YSCALEYUV2RGBX
1024 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1025 "add %4, %%"REG_c" \n\t"
1026 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1027
1028
14014d47
MN
1029 :: "r" (&c->redDither),
1030 "m" (dummy), "m" (dummy), "m" (dummy),
1031 "r" (dest), "m" (dstW)
1032 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1033 );
1034 return;
1035 case PIX_FMT_RGB555:
1036 YSCALEYUV2PACKEDX_ACCURATE
1037 YSCALEYUV2RGBX
1038 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1039#ifdef DITHER1XBPP
88e2a9ae
CEH
1040 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1041 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1042 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1043#endif
1044
14014d47
MN
1045 WRITERGB15(%4, %5, %%REGa)
1046 YSCALEYUV2PACKEDX_END
1047 return;
1048 case PIX_FMT_RGB565:
1049 YSCALEYUV2PACKEDX_ACCURATE
1050 YSCALEYUV2RGBX
1051 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1052#ifdef DITHER1XBPP
88e2a9ae
CEH
1053 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1054 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1055 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d
DB
1056#endif
1057
14014d47
MN
1058 WRITERGB16(%4, %5, %%REGa)
1059 YSCALEYUV2PACKEDX_END
1060 return;
1061 case PIX_FMT_YUYV422:
1062 YSCALEYUV2PACKEDX_ACCURATE
1063 /* NOTE(review): no RGB conversion in this case; mm3/mm4 and mm1/mm7 are shifted below and packed by WRITEYUY2 (presumably chroma and luma -- confirm) */
1064
1065 "psraw $3, %%mm3 \n\t"
1066 "psraw $3, %%mm4 \n\t"
1067 "psraw $3, %%mm1 \n\t"
1068 "psraw $3, %%mm7 \n\t"
1069 WRITEYUY2(%4, %5, %%REGa)
1070 YSCALEYUV2PACKEDX_END
1071 return;
1072 }
1073 }else{
1074 switch(c->dstFormat)
1075 {
1076 case PIX_FMT_RGB32:
1077 YSCALEYUV2PACKEDX
1078 YSCALEYUV2RGBX
1079 WRITEBGR32(%4, %5, %%REGa)
1080 YSCALEYUV2PACKEDX_END
1081 return;
1082 case PIX_FMT_BGR24:
/* same hand-written operand list as in the accurate-rounding BGR24 case */
1083 YSCALEYUV2PACKEDX
1084 YSCALEYUV2RGBX
1085 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1086 "add %4, %%"REG_c" \n\t"
1087 WRITEBGR24(%%REGc, %5, %%REGa)
1088
1089 :: "r" (&c->redDither),
1090 "m" (dummy), "m" (dummy), "m" (dummy),
1091 "r" (dest), "m" (dstW)
1092 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1093 );
1094 return;
1095 case PIX_FMT_RGB555:
1096 YSCALEYUV2PACKEDX
1097 YSCALEYUV2RGBX
1098 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1099#ifdef DITHER1XBPP
88e2a9ae
CEH
1100 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1101 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1102 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1103#endif
1104
14014d47
MN
1105 WRITERGB15(%4, %5, %%REGa)
1106 YSCALEYUV2PACKEDX_END
1107 return;
1108 case PIX_FMT_RGB565:
1109 YSCALEYUV2PACKEDX
1110 YSCALEYUV2RGBX
1111 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1112#ifdef DITHER1XBPP
88e2a9ae
CEH
1113 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1114 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1115 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d
DB
1116#endif
1117
14014d47
MN
1118 WRITERGB16(%4, %5, %%REGa)
1119 YSCALEYUV2PACKEDX_END
1120 return;
1121 case PIX_FMT_YUYV422:
1122 YSCALEYUV2PACKEDX
1123 /* NOTE(review): no RGB conversion in this case; mm3/mm4 and mm1/mm7 are shifted below and packed by WRITEYUY2 (presumably chroma and luma -- confirm) */
1124
1125 "psraw $3, %%mm3 \n\t"
1126 "psraw $3, %%mm4 \n\t"
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 WRITEYUY2(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1131 return;
1132 }
bca11e75
MN
1133 }
1134 }
bc279024 1135#endif /* HAVE_MMX */
b63f641e 1136#if HAVE_ALTIVEC
2da0d70d
DB
1137 /* The following list of supported dstFormat values should
1138 match what's found in the body of altivec_yuv2packedX() */
12794f73
KS
1139 if (!(c->flags & SWS_BITEXACT) &&
1140 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
2da0d70d 1141 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
12794f73 1142 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
2da0d70d
DB
1143 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1144 chrFilter, chrSrc, chrFilterSize,
1145 dest, dstW, dstY);
1146 else
1147#endif
/* generic C implementation; also reached from MMX builds for formats the switch above does not handle */
1148 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1149 chrFilter, chrSrc, chrFilterSize,
1150 dest, dstW, dstY);
c1b0bfb4
MN
1151}
1152
c1b0bfb4
MN
1153/**
1154 * vertical bilinear scale YV12 to RGB
 *
 * With MMX and without SWS_BITEXACT, RGB32, BGR24, RGB555, RGB565 and
 * YUYV422 are produced by inline asm and the function returns; any other
 * destination format (or a bit-exact request) ends up in the
 * YSCALE_YUV_2_ANYRGB_C fallback at the bottom.
 *
 * Every asm statement uses the same frame: REG_b is saved to
 * ESP_OFFSET(%5) inside the context and loaded with dest, REG_BP is pushed
 * and handed to the scale/write macros as the index register, and both are
 * restored afterwards. Operands: %0-%3 = buf0, buf1, uvbuf0, uvbuf1 in
 * c/d/S/D, %4 = dest (memory), %5 = &c->redDither in REG_a.
1155 */
25593e29 1156static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1157 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1158{
/* NOTE(review): yalpha1, uvalpha1 and i are not referenced by the asm paths; presumably the YSCALE_YUV_2_*_C macros at the end use them -- confirm */
ac0ad729
MN
1159 int yalpha1=4095- yalpha;
1160 int uvalpha1=4095-uvalpha;
2da0d70d 1161 int i;
d604bab9 1162
b63f641e 1163#if HAVE_MMX
f433c8ab 1164 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1165 switch(c->dstFormat)
1166 {
1167 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1168 case PIX_FMT_RGB32:
7ad6469e 1169 __asm__ volatile(
2da0d70d
DB
1170 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1171 "mov %4, %%"REG_b" \n\t"
1172 "push %%"REG_BP" \n\t"
1173 YSCALEYUV2RGB(%%REGBP, %5)
1174 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1175 "pop %%"REG_BP" \n\t"
1176 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1177
1178 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1179 "a" (&c->redDither)
1180 );
1181 return;
1182 case PIX_FMT_BGR24:
7ad6469e 1183 __asm__ volatile(
2da0d70d
DB
1184 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1185 "mov %4, %%"REG_b" \n\t"
1186 "push %%"REG_BP" \n\t"
1187 YSCALEYUV2RGB(%%REGBP, %5)
1188 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1189 "pop %%"REG_BP" \n\t"
1190 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1191 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1192 "a" (&c->redDither)
1193 );
1194 return;
27a90b04 1195 case PIX_FMT_RGB555:
7ad6469e 1196 __asm__ volatile(
2da0d70d
DB
1197 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1198 "mov %4, %%"REG_b" \n\t"
1199 "push %%"REG_BP" \n\t"
1200 YSCALEYUV2RGB(%%REGBP, %5)
1201 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1202#ifdef DITHER1XBPP
88e2a9ae
CEH
1203 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1204 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1205 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1206#endif
1207
27a90b04 1208 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1209 "pop %%"REG_BP" \n\t"
1210 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1211
1212 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1213 "a" (&c->redDither)
1214 );
1215 return;
27a90b04 1216 case PIX_FMT_RGB565:
7ad6469e 1217 __asm__ volatile(
2da0d70d
DB
1218 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1219 "mov %4, %%"REG_b" \n\t"
1220 "push %%"REG_BP" \n\t"
1221 YSCALEYUV2RGB(%%REGBP, %5)
1222 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1223#ifdef DITHER1XBPP
88e2a9ae
CEH
1224 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1225 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1226 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1227#endif
1228
27a90b04 1229 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1230 "pop %%"REG_BP" \n\t"
1231 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1232 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1233 "a" (&c->redDither)
1234 );
1235 return;
1236 case PIX_FMT_YUYV422:
7ad6469e 1237 __asm__ volatile(
2da0d70d
DB
1238 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1239 "mov %4, %%"REG_b" \n\t"
1240 "push %%"REG_BP" \n\t"
1241 YSCALEYUV2PACKED(%%REGBP, %5)
1242 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1243 "pop %%"REG_BP" \n\t"
1244 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1245 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1246 "a" (&c->redDither)
1247 );
1248 return;
1249 default: break;
1250 }
f433c8ab 1251 }
cf7d1c1a 1252#endif //HAVE_MMX
/* C fallback for everything the asm above did not handle */
ec1bca2a 1253YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1254}
1255
1256/**
1257 * YV12 to RGB without scaling or interpolating
 *
 * With SWS_FULL_CHR_H_INT set in `flags` the call is forwarded to
 * yuv2packed2() with buf0 used for both luma lines and yalpha 0.
 * Otherwise, with MMX and without SWS_BITEXACT, RGB32, BGR24, RGB555,
 * RGB565 and YUYV422 are produced by inline asm: the YSCALEYUV2*1 macros
 * when uvalpha < 2048 and the YSCALEYUV2*1b macros otherwise. Anything left
 * over goes through the YSCALE_YUV_2_ANYRGB_C fallback, which makes the
 * same uvalpha < 2048 split.
 *
 * The asm statements use the same frame as yuv2packed2(): REG_b is saved to
 * ESP_OFFSET(%5) and loaded with dest, REG_BP is pushed and used as the
 * index register, and both are restored at the end.
1258 */
25593e29 1259static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1260 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1261{
2da0d70d
DB
1262 const int yalpha1=0;
1263 int i;
6a4970ab 1264
/* buf1 is also what the asm statements below pass as operand %1 */
8a322796 1265 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1266 const int yalpha= 4096; //FIXME ...
96034638 1267
2da0d70d
DB
1268 if (flags&SWS_FULL_CHR_H_INT)
1269 {
1270 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1271 return;
1272 }
397c035e 1273
b63f641e 1274#if HAVE_MMX
f433c8ab 1275 if(!(flags & SWS_BITEXACT)){
14014d47 1276 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1277 {
14014d47
MN
1278 switch(dstFormat)
1279 {
1280 case PIX_FMT_RGB32:
7ad6469e 1281 __asm__ volatile(
14014d47
MN
1282 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1283 "mov %4, %%"REG_b" \n\t"
1284 "push %%"REG_BP" \n\t"
1285 YSCALEYUV2RGB1(%%REGBP, %5)
1286 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1287 "pop %%"REG_BP" \n\t"
1288 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1289
1290 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1291 "a" (&c->redDither)
1292 );
1293 return;
1294 case PIX_FMT_BGR24:
7ad6469e 1295 __asm__ volatile(
14014d47
MN
1296 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1297 "mov %4, %%"REG_b" \n\t"
1298 "push %%"REG_BP" \n\t"
1299 YSCALEYUV2RGB1(%%REGBP, %5)
1300 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1301 "pop %%"REG_BP" \n\t"
1302 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1303
1304 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1305 "a" (&c->redDither)
1306 );
1307 return;
1308 case PIX_FMT_RGB555:
7ad6469e 1309 __asm__ volatile(
14014d47
MN
1310 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1311 "mov %4, %%"REG_b" \n\t"
1312 "push %%"REG_BP" \n\t"
1313 YSCALEYUV2RGB1(%%REGBP, %5)
1314 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1315#ifdef DITHER1XBPP
88e2a9ae
CEH
1316 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1317 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1318 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1319#endif
14014d47
MN
1320 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1321 "pop %%"REG_BP" \n\t"
1322 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1323
14014d47
MN
1324 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1325 "a" (&c->redDither)
1326 );
1327 return;
1328 case PIX_FMT_RGB565:
7ad6469e 1329 __asm__ volatile(
14014d47
MN
1330 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1331 "mov %4, %%"REG_b" \n\t"
1332 "push %%"REG_BP" \n\t"
1333 YSCALEYUV2RGB1(%%REGBP, %5)
1334 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1335#ifdef DITHER1XBPP
88e2a9ae
CEH
1336 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1337 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1338 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1339#endif
1340
14014d47
MN
1341 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1342 "pop %%"REG_BP" \n\t"
1343 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1344
14014d47
MN
1345 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1346 "a" (&c->redDither)
1347 );
1348 return;
1349 case PIX_FMT_YUYV422:
7ad6469e 1350 __asm__ volatile(
14014d47
MN
1351 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1352 "mov %4, %%"REG_b" \n\t"
1353 "push %%"REG_BP" \n\t"
1354 YSCALEYUV2PACKED1(%%REGBP, %5)
1355 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1356 "pop %%"REG_BP" \n\t"
1357 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1358
1359 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1360 "a" (&c->redDither)
1361 );
1362 return;
1363 }
2da0d70d 1364 }
14014d47 1365 else
2da0d70d 1366 {
/* uvalpha >= 2048: identical cases, but with the "1b" scaler macros (presumably these blend uvbuf0 and uvbuf1 -- confirm against their definitions) */
14014d47
MN
1367 switch(dstFormat)
1368 {
1369 case PIX_FMT_RGB32:
7ad6469e 1370 __asm__ volatile(
14014d47
MN
1371 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1372 "mov %4, %%"REG_b" \n\t"
1373 "push %%"REG_BP" \n\t"
1374 YSCALEYUV2RGB1b(%%REGBP, %5)
1375 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1376 "pop %%"REG_BP" \n\t"
1377 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1378
1379 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1380 "a" (&c->redDither)
1381 );
1382 return;
1383 case PIX_FMT_BGR24:
7ad6469e 1384 __asm__ volatile(
14014d47
MN
1385 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1386 "mov %4, %%"REG_b" \n\t"
1387 "push %%"REG_BP" \n\t"
1388 YSCALEYUV2RGB1b(%%REGBP, %5)
1389 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1390 "pop %%"REG_BP" \n\t"
1391 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1392
1393 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1394 "a" (&c->redDither)
1395 );
1396 return;
1397 case PIX_FMT_RGB555:
7ad6469e 1398 __asm__ volatile(
14014d47
MN
1399 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1400 "mov %4, %%"REG_b" \n\t"
1401 "push %%"REG_BP" \n\t"
1402 YSCALEYUV2RGB1b(%%REGBP, %5)
1403 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1404#ifdef DITHER1XBPP
88e2a9ae
CEH
1405 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1406 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1407 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d 1408#endif
14014d47
MN
1409 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1410 "pop %%"REG_BP" \n\t"
1411 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1412
14014d47
MN
1413 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "a" (&c->redDither)
1415 );
1416 return;
1417 case PIX_FMT_RGB565:
7ad6469e 1418 __asm__ volatile(
14014d47
MN
1419 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1420 "mov %4, %%"REG_b" \n\t"
1421 "push %%"REG_BP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP, %5)
1423 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1424#ifdef DITHER1XBPP
88e2a9ae
CEH
1425 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1426 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1427 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d
DB
1428#endif
1429
14014d47
MN
1430 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1431 "pop %%"REG_BP" \n\t"
1432 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1433
14014d47
MN
1434 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435 "a" (&c->redDither)
1436 );
1437 return;
1438 case PIX_FMT_YUYV422:
7ad6469e 1439 __asm__ volatile(
14014d47
MN
1440 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1441 "mov %4, %%"REG_b" \n\t"
1442 "push %%"REG_BP" \n\t"
1443 YSCALEYUV2PACKED1b(%%REGBP, %5)
1444 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1445 "pop %%"REG_BP" \n\t"
1446 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1447
1448 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1449 "a" (&c->redDither)
1450 );
1451 return;
1452 }
2da0d70d
DB
1453 }
1454 }
bc279024 1455#endif /* HAVE_MMX */
/* C fallback; NOTE(review): yalpha, yalpha1, buf1 and i are presumably consumed by these macros -- confirm */
e5091488 1456 if (uvalpha < 2048)
2da0d70d 1457 {
ec1bca2a 1458 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1459 }else{
ec1bca2a 1460 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1461 }
d604bab9
MN
1462}
1463
8a322796 1464//FIXME the MMX yuy2*/uyvy* readers below can read up to 7 samples past the end of the input
6ff0ad6b 1465
896a22b8 1466static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1e621b18 1467{
b63f641e 1468#if HAVE_MMX
7ad6469e 1469 __asm__ volatile(
2da0d70d
DB
1470 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1471 "mov %0, %%"REG_a" \n\t"
1472 "1: \n\t"
1473 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1474 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1475 "pand %%mm2, %%mm0 \n\t"
1476 "pand %%mm2, %%mm1 \n\t"
1477 "packuswb %%mm1, %%mm0 \n\t"
1478 "movq %%mm0, (%2, %%"REG_a") \n\t"
1479 "add $8, %%"REG_a" \n\t"
1480 " js 1b \n\t"
1481 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1482 : "%"REG_a
1483 );
1e621b18 1484#else
2da0d70d
DB
1485 int i;
1486 for (i=0; i<width; i++)
1487 dst[i]= src[2*i];
1e621b18
MN
1488#endif
1489}
1490
896a22b8 1491static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1492{
b63f641e 1493#if HAVE_MMX
7ad6469e 1494 __asm__ volatile(
2da0d70d
DB
1495 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1496 "mov %0, %%"REG_a" \n\t"
1497 "1: \n\t"
1498 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1499 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1500 "psrlw $8, %%mm0 \n\t"
1501 "psrlw $8, %%mm1 \n\t"
1502 "packuswb %%mm1, %%mm0 \n\t"
1503 "movq %%mm0, %%mm1 \n\t"
1504 "psrlw $8, %%mm0 \n\t"
1505 "pand %%mm4, %%mm1 \n\t"
1506 "packuswb %%mm0, %%mm0 \n\t"
1507 "packuswb %%mm1, %%mm1 \n\t"
1508 "movd %%mm0, (%3, %%"REG_a") \n\t"
1509 "movd %%mm1, (%2, %%"REG_a") \n\t"
1510 "add $4, %%"REG_a" \n\t"
1511 " js 1b \n\t"
1512 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1513 : "%"REG_a
1514 );
1e621b18 1515#else
2da0d70d
DB
1516 int i;
1517 for (i=0; i<width; i++)
1518 {
1519 dstU[i]= src1[4*i + 1];
1520 dstV[i]= src1[4*i + 3];
1521 }
1522#endif
1523 assert(src1 == src2);
1e621b18
MN
1524}
1525
4cf16bbe
DB
1526/* This is almost identical to the previous, and exists only because
1527 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
896a22b8 1528static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
7322a67c 1529{
b63f641e 1530#if HAVE_MMX
7ad6469e 1531 __asm__ volatile(
2da0d70d
DB
1532 "mov %0, %%"REG_a" \n\t"
1533 "1: \n\t"
1534 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1535 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1536 "psrlw $8, %%mm0 \n\t"
1537 "psrlw $8, %%mm1 \n\t"
1538 "packuswb %%mm1, %%mm0 \n\t"
1539 "movq %%mm0, (%2, %%"REG_a") \n\t"
1540 "add $8, %%"REG_a" \n\t"
1541 " js 1b \n\t"
1542 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1543 : "%"REG_a
1544 );
7322a67c 1545#else
2da0d70d
DB
1546 int i;
1547 for (i=0; i<width; i++)
1548 dst[i]= src[2*i+1];
7322a67c
MN
1549#endif
1550}
1551
896a22b8 1552static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
7322a67c 1553{
b63f641e 1554#if HAVE_MMX
7ad6469e 1555 __asm__ volatile(
2da0d70d
DB
1556 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1557 "mov %0, %%"REG_a" \n\t"
1558 "1: \n\t"
1559 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1560 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1561 "pand %%mm4, %%mm0 \n\t"
1562 "pand %%mm4, %%mm1 \n\t"
1563 "packuswb %%mm1, %%mm0 \n\t"
1564 "movq %%mm0, %%mm1 \n\t"
1565 "psrlw $8, %%mm0 \n\t"
1566 "pand %%mm4, %%mm1 \n\t"
1567 "packuswb %%mm0, %%mm0 \n\t"
1568 "packuswb %%mm1, %%mm1 \n\t"
1569 "movd %%mm0, (%3, %%"REG_a") \n\t"
1570 "movd %%mm1, (%2, %%"REG_a") \n\t"
1571 "add $4, %%"REG_a" \n\t"
1572 " js 1b \n\t"
1573 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1574 : "%"REG_a
1575 );
7322a67c 1576#else
2da0d70d
DB
1577 int i;
1578 for (i=0; i<width; i++)
1579 {
1580 dstU[i]= src1[4*i + 0];
1581 dstV[i]= src1[4*i + 2];
1582 }
1583#endif
1584 assert(src1 == src2);
7322a67c
MN
1585}
1586
214892ee 1587#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
896a22b8 1588static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1589{\
1590 int i;\
1591 for (i=0; i<width; i++)\
1592 {\
1593 int b= (((type*)src)[i]>>shb)&maskb;\
1594 int g= (((type*)src)[i]>>shg)&maskg;\
1595 int r= (((type*)src)[i]>>shr)&maskr;\
1596\
1597 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1598 }\
1e621b18
MN
1599}
1600
214892ee
MN
1601BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1602BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1603BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1604BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1605BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1606BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1607
a0baa07a 1608#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
896a22b8 1609static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1610{\
1611 int i;\
1612 for (i=0; i<width; i++)\
1613 {\
ba83d862
MN
1614 int b= (((type*)src)[i]&maskb)>>shb;\
1615 int g= (((type*)src)[i]&maskg)>>shg;\
1616 int r= (((type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1617\
1618 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1619 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1620 }\
ba83d862 1621}\
896a22b8 1622static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1623{\
1624 int i;\
1625 for (i=0; i<width; i++)\
1626 {\
1627 int pix0= ((type*)src)[2*i+0];\
1628 int pix1= ((type*)src)[2*i+1];\
1629 int g= (pix0&maskg)+(pix1&maskg);\
1630 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1631 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1632\
1633 g>>=shg;\
1634\
6b79dbce
MN
1635 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1636 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1637 }\
2f60f629
MN
1638}
1639
ba83d862
MN
1640BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1641BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
a0baa07a
MN
1642BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1643BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1644BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1645BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1646
b63f641e 1647#if HAVE_MMX
/*
 * MMX luma conversion for packed 24-bit input.
 * srcFormat selects the coefficient pair: ff_bgr24toY1Coeff/Y2Coeff for
 * PIX_FMT_BGR24, ff_rgb24toY1Coeff/Y2Coeff for anything else.
 * Each loop iteration consumes 12 source bytes and stores 4 output bytes;
 * the index in REG_a runs from -width up to zero, addressing dst relative
 * to dst + width, while src is advanced in place.
 * NOTE(review): mm5/mm6 are loaded in a separate asm statement without
 * being declared as outputs or clobbers, so the main loop relies on the
 * compiler leaving the MMX registers alone in between.
 */
a35acd7f 1648static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1649{
1650
1651 if(srcFormat == PIX_FMT_BGR24){
7ad6469e 1652 __asm__ volatile(
ff9a056d
MN
1653 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1654 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1655 :
dfb09bd1
MN
1656 );
1657 }else{
7ad6469e 1658 __asm__ volatile(
ff9a056d
MN
1659 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1660 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1661 :
dfb09bd1
MN
1662 );
1663 }
1664
7ad6469e 1665 __asm__ volatile(
dfb09bd1
MN
1666 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1667 "mov %2, %%"REG_a" \n\t"
1668 "pxor %%mm7, %%mm7 \n\t"
1669 "1: \n\t"
1670 PREFETCH" 64(%0) \n\t"
/* four overlapping 4-byte loads at byte offsets 0, 2, 6 and 8 */
1671 "movd (%0), %%mm0 \n\t"
1672 "movd 2(%0), %%mm1 \n\t"
1673 "movd 6(%0), %%mm2 \n\t"
1674 "movd 8(%0), %%mm3 \n\t"
1675 "add $12, %0 \n\t"
/* widen bytes to words (mm7 is zero) */
1676 "punpcklbw %%mm7, %%mm0 \n\t"
1677 "punpcklbw %%mm7, %%mm1 \n\t"
1678 "punpcklbw %%mm7, %%mm2 \n\t"
1679 "punpcklbw %%mm7, %%mm3 \n\t"
/* multiply-accumulate with the two coefficient quadwords, then sum */
1680 "pmaddwd %%mm5, %%mm0 \n\t"
1681 "pmaddwd %%mm6, %%mm1 \n\t"
1682 "pmaddwd %%mm5, %%mm2 \n\t"
1683 "pmaddwd %%mm6, %%mm3 \n\t"
1684 "paddd %%mm1, %%mm0 \n\t"
1685 "paddd %%mm3, %%mm2 \n\t"
/* add ff_bgr24toYOffset, scale down by 15 bits and pack to 4 bytes */
1686 "paddd %%mm4, %%mm0 \n\t"
1687 "paddd %%mm4, %%mm2 \n\t"
1688 "psrad $15, %%mm0 \n\t"
1689 "psrad $15, %%mm2 \n\t"
1690 "packssdw %%mm2, %%mm0 \n\t"
1691 "packuswb %%mm0, %%mm0 \n\t"
1692 "movd %%mm0, (%1, %%"REG_a") \n\t"
1693 "add $4, %%"REG_a" \n\t"
1694 " js 1b \n\t"
1695 : "+r" (src)
1696 : "r" (dst+width), "g" (-width)
1697 : "%"REG_a
2da0d70d 1698 );
dfb09bd1
MN
1699}
1700
/*
 * MMX chroma conversion for packed 24-bit input.
 * Operand %4 is the first entry of the ff_bgr24toUV row chosen by
 * (srcFormat == PIX_FMT_RGB24); the asm reads four consecutive coefficient
 * quadwords from it at byte offsets 0, 8, 16 and 24. Offsets 0/8 feed the
 * value stored through %1 (dstU), offsets 16/24 (the latter cached in mm6)
 * the value stored through %2 (dstV).
 * Each loop iteration consumes 12 source bytes and stores 4 bytes to each
 * plane; the index in REG_a runs from -width up to zero, addressing the
 * outputs relative to dstU + width and dstV + width, while src is advanced
 * in place.
 */
a35acd7f 1701static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
dfb09bd1 1702{
7ad6469e 1703 __asm__ volatile(
dfb09bd1
MN
1704 "movq 24+%4, %%mm6 \n\t"
1705 "mov %3, %%"REG_a" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1707 "1: \n\t"
1708 PREFETCH" 64(%0) \n\t"
/* first half: loads at byte offsets 0 and 2 */
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "punpcklbw %%mm7, %%mm0 \n\t"
1712 "punpcklbw %%mm7, %%mm1 \n\t"
1713 "movq %%mm0, %%mm2 \n\t"
1714 "movq %%mm1, %%mm3 \n\t"
1715 "pmaddwd %4, %%mm0 \n\t"
1716 "pmaddwd 8+%4, %%mm1 \n\t"
1717 "pmaddwd 16+%4, %%mm2 \n\t"
1718 "pmaddwd %%mm6, %%mm3 \n\t"
1719 "paddd %%mm1, %%mm0 \n\t"
1720 "paddd %%mm3, %%mm2 \n\t"
1721
/* second half: loads at byte offsets 6 and 8, same coefficients */
1722 "movd 6(%0), %%mm1 \n\t"
1723 "movd 8(%0), %%mm3 \n\t"
1724 "add $12, %0 \n\t"
1725 "punpcklbw %%mm7, %%mm1 \n\t"
1726 "punpcklbw %%mm7, %%mm3 \n\t"
1727 "movq %%mm1, %%mm4 \n\t"
1728 "movq %%mm3, %%mm5 \n\t"
1729 "pmaddwd %4, %%mm1 \n\t"
1730 "pmaddwd 8+%4, %%mm3 \n\t"
1731 "pmaddwd 16+%4, %%mm4 \n\t"
1732 "pmaddwd %%mm6, %%mm5 \n\t"
1733 "paddd %%mm3, %%mm1 \n\t"
1734 "paddd %%mm5, %%mm4 \n\t"
1735
/* add ff_bgr24toUVOffset, scale down by 15 bits and pack to bytes */
1736 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1737 "paddd %%mm3, %%mm0 \n\t"
1738 "paddd %%mm3, %%mm2 \n\t"
1739 "paddd %%mm3, %%mm1 \n\t"
1740 "paddd %%mm3, %%mm4 \n\t"
1741 "psrad $15, %%mm0 \n\t"
1742 "psrad $15, %%mm2 \n\t"
1743 "psrad $15, %%mm1 \n\t"
1744 "psrad $15, %%mm4 \n\t"
1745 "packssdw %%mm1, %%mm0 \n\t"
1746 "packssdw %%mm4, %%mm2 \n\t"
1747 "packuswb %%mm0, %%mm0 \n\t"
1748 "packuswb %%mm2, %%mm2 \n\t"
1749 "movd %%mm0, (%1, %%"REG_a") \n\t"
1750 "movd %%mm2, (%2, %%"REG_a") \n\t"
1751 "add $4, %%"REG_a" \n\t"
1752 " js 1b \n\t"
1753 : "+r" (src)
1754 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1755 : "%"REG_a
1756 );
1757}
1758#endif
1759
896a22b8 1760static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
dfb09bd1 1761{
b63f641e 1762#if HAVE_MMX
a35acd7f 1763 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 1764#else
2da0d70d
DB
1765 int i;
1766 for (i=0; i<width; i++)
1767 {
1768 int b= src[i*3+0];
1769 int g= src[i*3+1];
1770 int r= src[i*3+2];
1e621b18 1771
e5091488 1772 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1773 }
bc279024 1774#endif /* HAVE_MMX */
1e621b18
MN
1775}
1776
896a22b8 1777static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1778{
b63f641e 1779#if HAVE_MMX
a35acd7f 1780 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 1781#else
2da0d70d
DB
1782 int i;
1783 for (i=0; i<width; i++)
1784 {
dfb09bd1
MN
1785 int b= src1[3*i + 0];
1786 int g= src1[3*i + 1];
1787 int r= src1[3*i + 2];
2da0d70d 1788
dfb09bd1
MN
1789 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1790 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1791 }
bc279024 1792#endif /* HAVE_MMX */
2da0d70d 1793 assert(src1 == src2);
1e621b18
MN
1794}
1795
896a22b8 1796static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1797{
1798 int i;
1799 for (i=0; i<width; i++)
1800 {
1801 int b= src1[6*i + 0] + src1[6*i + 3];
1802 int g= src1[6*i + 1] + src1[6*i + 4];
1803 int r= src1[6*i + 2] + src1[6*i + 5];
1804
1805 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1806 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1807 }
1808 assert(src1 == src2);
1809}
1810
896a22b8 1811static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
a861d4d7 1812{
b63f641e 1813#if HAVE_MMX
a35acd7f 1814 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 1815#else
2da0d70d
DB
1816 int i;
1817 for (i=0; i<width; i++)
1818 {
1819 int r= src[i*3+0];
1820 int g= src[i*3+1];
1821 int b= src[i*3+2];
1822
e5091488 1823 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1824 }
dfb09bd1 1825#endif
a861d4d7
MN
1826}
1827
896a22b8 1828static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
a861d4d7 1829{
b63f641e 1830#if HAVE_MMX
5155b839 1831 assert(src1==src2);
a35acd7f 1832 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 1833#else
5155b839
DB
1834 int i;
1835 assert(src1==src2);
2da0d70d
DB
1836 for (i=0; i<width; i++)
1837 {
dfb09bd1
MN
1838 int r= src1[3*i + 0];
1839 int g= src1[3*i + 1];
1840 int b= src1[3*i + 2];
2da0d70d 1841
dfb09bd1
MN
1842 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1843 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 1844 }
dfb09bd1 1845#endif
a861d4d7
MN
1846}
1847
896a22b8 1848static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
1849{
1850 int i;
1851 assert(src1==src2);
1852 for (i=0; i<width; i++)
1853 {
e09d7eef
MN
1854 int r= src1[6*i + 0] + src1[6*i + 3];
1855 int g= src1[6*i + 1] + src1[6*i + 4];
1856 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
1857
1858 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1859 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1860 }
1861}
1862
1e621b18 1863
97b93389 1864static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
e28630fc 1865{
2da0d70d
DB
1866 int i;
1867 for (i=0; i<width; i++)
1868 {
1869 int d= src[i];
e28630fc 1870
2da0d70d
DB
1871 dst[i]= pal[d] & 0xFF;
1872 }
e28630fc
MN
1873}
1874
97b93389 1875static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
e28630fc 1876{
2da0d70d
DB
1877 int i;
1878 assert(src1 == src2);
1879 for (i=0; i<width; i++)
1880 {
1881 int p= pal[src1[i]];
1882
1883 dstU[i]= p>>8;
1884 dstV[i]= p>>16;
1885 }
e28630fc
MN
1886}
1887
896a22b8 1888static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3d05e078
MN
1889{
1890 int i, j;
1891 for (i=0; i<width/8; i++){
3a5ba0c3
LB
1892 int d= ~src[i];
1893 for(j=0; j<8; j++)
1894 dst[8*i+j]= ((d>>(7-j))&1)*255;
1895 }
1896}
1897
896a22b8 1898static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3a5ba0c3
LB
1899{
1900 int i, j;
1901 for (i=0; i<width/8; i++){
1902 int d= src[i];
78454dfc
MN
1903 for(j=0; j<8; j++)
1904 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
1905 }
1906}
1907
8a322796 1908// bilinear / bicubic scaling
/**
 * Horizontal FIR scaler: filters one line of 8-bit samples into 16-bit output.
 *
 * In the portable C path each output is
 *     dst[i] = FFMIN((sum over j of src[filterPos[i] + j] * filter[filterSize*i + j]) >> 7, 32767).
 * The MMX path asserts that filterSize is a positive multiple of 4 and has
 * dedicated loops for filterSize 4 and 8 plus a generic loop for other sizes.
 * Every MMX loop pass produces two outputs, shifts the sums right by 7 and
 * packs them to 16 bit with signed saturation.
 *
 * @param dst        output samples
 * @param dstW       number of output samples
 * @param src        input samples
 * @param srcW       only forwarded to the AltiVec implementation
 * @param xInc       only forwarded to the AltiVec implementation
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  index of the first source sample for each output sample
 * @param filterSize number of taps per output sample
 */
077ea8a7 1909static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 1910 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 1911{
b63f641e 1912#if HAVE_MMX
2da0d70d
DB
1913 assert(filterSize % 4 == 0 && filterSize>0);
1914 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
1915 {
/* counter is a negative byte offset into dst (2 bytes per output). filter,
 * filterPos and dst are advanced by the matching amounts so that
 * pointer+counter addresses element 0 at the start, and the loop ends when
 * "add $4" carries, i.e. when counter reaches zero. */
1916 long counter= -2*dstW;
1917 filter-= counter*2;
1918 filterPos-= counter/2;
1919 dst-= counter/2;
7ad6469e 1920 __asm__ volatile(
/* with PIC, REG_b is saved and restored by hand instead of being listed as a clobber */
83c89c78 1921#if defined(PIC)
2da0d70d
DB
1922 "push %%"REG_b" \n\t"
1923#endif
1924 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1925 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1926 "mov %%"REG_a", %%"REG_BP" \n\t"
1927 ASMALIGN(4)
1928 "1: \n\t"
/* eax/ebx = filterPos for the two outputs of this pass */
1929 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1930 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* mm1/mm3 = their 4 coefficients each, mm0/mm2 = their 4 source bytes each */
1931 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1932 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1933 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1934 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1935 "punpcklbw %%mm7, %%mm0 \n\t"
1936 "punpcklbw %%mm7, %%mm2 \n\t"
1937 "pmaddwd %%mm1, %%mm0 \n\t"
1938 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
/* add the two partial sums of each output, giving one dword per output in mm0 */
1939 "movq %%mm0, %%mm4 \n\t"
1940 "punpckldq %%mm3, %%mm0 \n\t"
1941 "punpckhdq %%mm3, %%mm4 \n\t"
1942 "paddd %%mm4, %%mm0 \n\t"
1943 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
1944 "packssdw %%mm0, %%mm0 \n\t"
1945 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1946 "add $4, %%"REG_BP" \n\t"
1947 " jnc 1b \n\t"
1948
1949 "pop %%"REG_BP" \n\t"
83c89c78 1950#if defined(PIC)
2da0d70d 1951 "pop %%"REG_b" \n\t"
83c89c78 1952#endif
2da0d70d
DB
1953 : "+a" (counter)
1954 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 1955#if !defined(PIC)
2da0d70d
DB
1956 : "%"REG_b
1957#endif
1958 );
1959 }
1960 else if (filterSize==8)
1961 {
/* same pointer biasing as the 4-tap case, with twice as many coefficients per output */
1962 long counter= -2*dstW;
1963 filter-= counter*4;
1964 filterPos-= counter/2;
1965 dst-= counter/2;
7ad6469e 1966 __asm__ volatile(
83c89c78 1967#if defined(PIC)
2da0d70d
DB
1968 "push %%"REG_b" \n\t"
1969#endif
1970 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
1971 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1972 "mov %%"REG_a", %%"REG_BP" \n\t"
1973 ASMALIGN(4)
1974 "1: \n\t"
1975 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1976 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* taps 0..3 of both outputs */
1977 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
1978 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
1979 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1980 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1981 "punpcklbw %%mm7, %%mm0 \n\t"
1982 "punpcklbw %%mm7, %%mm2 \n\t"
1983 "pmaddwd %%mm1, %%mm0 \n\t"
1984 "pmaddwd %%mm2, %%mm3 \n\t"
1985
/* taps 4..7 of both outputs, accumulated onto the first half */
1986 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
1987 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
1988 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
1989 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
1990 "punpcklbw %%mm7, %%mm4 \n\t"
1991 "punpcklbw %%mm7, %%mm2 \n\t"
1992 "pmaddwd %%mm1, %%mm4 \n\t"
1993 "pmaddwd %%mm2, %%mm5 \n\t"
1994 "paddd %%mm4, %%mm0 \n\t"
1995 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
1996 "movq %%mm0, %%mm4 \n\t"
1997 "punpckldq %%mm3, %%mm0 \n\t"
1998 "punpckhdq %%mm3, %%mm4 \n\t"
1999 "paddd %%mm4, %%mm0 \n\t"
2000 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2001 "packssdw %%mm0, %%mm0 \n\t"
2002 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2003 "add $4, %%"REG_BP" \n\t"
2004 " jnc 1b \n\t"
2005
2006 "pop %%"REG_BP" \n\t"
83c89c78 2007#if defined(PIC)
2da0d70d 2008 "pop %%"REG_b" \n\t"
83c89c78 2009#endif
2da0d70d
DB
2010 : "+a" (counter)
2011 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2012#if !defined(PIC)
2da0d70d
DB
2013 : "%"REG_b
2014#endif
2015 );
2016 }
2017 else
2018 {
/* Generic size. "offset" is the end marker of the inner loop, which walks a
 * pointer from src to src+filterSize in steps of 4 taps. The filter pointer
 * itself advances through the coefficient table (it is an in/out operand),
 * so it is not pre-biased here. Operand %6 is filterSize*2, the byte size of
 * one output's coefficient row. */
2019 uint8_t *offset = src+filterSize;
2020 long counter= -2*dstW;
2021 //filter-= counter*filterSize/2;
2022 filterPos-= counter/2;
2023 dst-= counter/2;
7ad6469e 2024 __asm__ volatile(
2da0d70d 2025 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2026 ASMALIGN(4)
2027 "1: \n\t"
/* eax/edx = filterPos for the two outputs of this pass; mm4/mm5 = their accumulators */
2028 "mov %2, %%"REG_c" \n\t"
2029 "movzwl (%%"REG_c", %0), %%eax \n\t"
2030 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2031 "mov %5, %%"REG_c" \n\t"
2032 "pxor %%mm4, %%mm4 \n\t"
2033 "pxor %%mm5, %%mm5 \n\t"
2034 "2: \n\t"
/* (%1) = coefficients of the first output, (%1, %6) = coefficients of the second */
2035 "movq (%1), %%mm1 \n\t"
2036 "movq (%1, %6), %%mm3 \n\t"
2037 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2038 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2039 "punpcklbw %%mm7, %%mm0 \n\t"
2040 "punpcklbw %%mm7, %%mm2 \n\t"
2041 "pmaddwd %%mm1, %%mm0 \n\t"
2042 "pmaddwd %%mm2, %%mm3 \n\t"
2043 "paddd %%mm3, %%mm5 \n\t"
2044 "paddd %%mm0, %%mm4 \n\t"
2045 "add $8, %1 \n\t"
2046 "add $4, %%"REG_c" \n\t"
2047 "cmp %4, %%"REG_c" \n\t"
2048 " jb 2b \n\t"
/* skip the second output's coefficient row, which was consumed through (%1, %6) */
2049 "add %6, %1 \n\t"
ef423a66
MN
2050 "movq %%mm4, %%mm0 \n\t"
2051 "punpckldq %%mm5, %%mm4 \n\t"
2052 "punpckhdq %%mm5, %%mm0 \n\t"
2053 "paddd %%mm0, %%mm4 \n\t"
2054 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2055 "packssdw %%mm4, %%mm4 \n\t"
2056 "mov %3, %%"REG_a" \n\t"
2057 "movd %%mm4, (%%"REG_a", %0) \n\t"
2058 "add $4, %0 \n\t"
2059 " jnc 1b \n\t"
2060
2061 : "+r" (counter), "+r" (filter)
2062 : "m" (filterPos), "m" (dst), "m"(offset),
2063 "m" (src), "r" (filterSize*2)
2064 : "%"REG_a, "%"REG_c, "%"REG_d
2065 );
2066 }
077ea8a7 2067#else
b63f641e 2068#if HAVE_ALTIVEC
2da0d70d 2069 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2070#else
2da0d70d
DB
/* portable reference implementation */
2071 int i;
2072 for (i=0; i<dstW; i++)
2073 {
2074 int j;
2075 int srcPos= filterPos[i];
2076 int val=0;
2077 //printf("filterPos: %d\n", filterPos[i]);
2078 for (j=0; j<filterSize; j++)
2079 {
2080 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2081 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2082 }
2083 //filter += hFilterSize;
881c4294 2084 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2085 //dst[i] = val>>7;
2086 }
bc279024
DB
2087#endif /* HAVE_ALTIVEC */
2088#endif /* HAVE_MMX */
077ea8a7 2089}
2ff198c1 2090 // *** horizontal scale Y line to temp buffer
/**
 * Horizontally scale one luma line into the 16-bit temp buffer dst.
 *
 * Stages:
 *  1. For the source formats listed in the if/else chain, the line is first
 *     converted to 8-bit luma in formatConvBuffer, which then replaces src.
 *     Any other format is scaled from src directly.
 *  2. The line is scaled with hScale(), unless SWS_FAST_BILINEAR is set in
 *     flags (and, in MMX builds, canMMX2BeUsed is non-zero). In that case a
 *     bilinear path is used: the code funnyYCode points to (MMX2), plain x86
 *     asm, or portable C, depending on the build.
 *  3. If c->srcRange differs from c->dstRange and the destination format is
 *     neither RGB nor BGR, a fixed-point linear remap is applied to dst.
 *
 * @param pal forwarded unchanged to every format converter
 */
6bc0c792 2091static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2092 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2093 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2094 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2095 int32_t *mmx2FilterPos, uint32_t *pal)
077ea8a7 2096{
/* Stage 1: input format conversion. GRAY16BE shares the YUYV422 extractor and
 * GRAY16LE the UYVY422 one. PIX_FMT_RGB32* is routed to bgr32ToY and
 * PIX_FMT_BGR32* to rgb32ToY, while the 24/16/15-bit formats go to the
 * converter of the same name. */
2da0d70d 2097 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2098 {
896a22b8 2099 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2100 src= formatConvBuffer;
1e621b18 2101 }
2da0d70d 2102 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2103 {
896a22b8 2104 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2105 src= formatConvBuffer;
7322a67c 2106 }
2da0d70d 2107 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2108 {
896a22b8 2109 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2110 src= formatConvBuffer;
1e621b18 2111 }
9990e426
MN
2112 else if (srcFormat==PIX_FMT_RGB32_1)
2113 {
896a22b8 2114 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2115 src= formatConvBuffer;
2116 }
2da0d70d 2117 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2118 {
896a22b8 2119 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2120 src= formatConvBuffer;
1e621b18 2121 }
2da0d70d 2122 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2123 {
896a22b8 2124 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2125 src= formatConvBuffer;
6af250ea 2126 }
2da0d70d 2127 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2128 {
896a22b8 2129 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2130 src= formatConvBuffer;
b72034dd 2131 }
2da0d70d 2132 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2133 {
896a22b8 2134 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2135 src= formatConvBuffer;
a861d4d7 2136 }
9990e426
MN
2137 else if (srcFormat==PIX_FMT_BGR32_1)
2138 {
896a22b8 2139 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2140 src= formatConvBuffer;
2141 }
2da0d70d 2142 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2143 {
896a22b8 2144 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2145 src= formatConvBuffer;
a861d4d7 2146 }
2da0d70d 2147 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2148 {
896a22b8 2149 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2150 src= formatConvBuffer;
a43fb6b3 2151 }
2da0d70d 2152 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2153 {
896a22b8 2154 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2155 src= formatConvBuffer;
a43fb6b3 2156 }
2da0d70d 2157 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2158 {
e48a79c9 2159 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2160 src= formatConvBuffer;
e28630fc 2161 }
3a5ba0c3
LB
2162 else if (srcFormat==PIX_FMT_MONOBLACK)
2163 {
896a22b8 2164 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
3a5ba0c3
LB
2165 src= formatConvBuffer;
2166 }
2167 else if (srcFormat==PIX_FMT_MONOWHITE)
3d05e078 2168 {
896a22b8 2169 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2170 src= formatConvBuffer;
2171 }
1e621b18 2172
/* Stage 2: choose the scaler */
b63f641e 2173#if HAVE_MMX
8a322796 2174 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2175 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2176#else
2da0d70d 2177 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2178#endif
077ea8a7 2179 {
2da0d70d 2180 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2181 }
8a322796 2182 else // fast bilinear upscale / crap downscale
077ea8a7 2183 {
b63f641e
AJ
2184#if ARCH_X86
2185#if HAVE_MMX2
2da0d70d 2186 int i;
/* with PIC, REG_b is saved to and restored from this slot instead of being listed as a clobber */
83c89c78 2187#if defined(PIC)
2da0d70d 2188 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2189#endif
2da0d70d
DB
2190 if (canMMX2BeUsed)
2191 {
/* Register setup for the called code: REG_c = src, REG_D = dst,
 * REG_d = mmx2Filter, REG_b = mmx2FilterPos, REG_a = 0, mm7 = 0.
 * The code behind funnyYCode is then called 8 times (FUNNY_Y_CODE), with
 * REG_c and REG_D advanced after each call. */
7ad6469e 2192 __asm__ volatile(
83c89c78 2193#if defined(PIC)
2da0d70d
DB
2194 "mov %%"REG_b", %5 \n\t"
2195#endif
2196 "pxor %%mm7, %%mm7 \n\t"
2197 "mov %0, %%"REG_c" \n\t"
2198 "mov %1, %%"REG_D" \n\t"
2199 "mov %2, %%"REG_d" \n\t"
2200 "mov %3, %%"REG_b" \n\t"
2201 "xor %%"REG_a", %%"REG_a" \n\t" // i
2202 PREFETCH" (%%"REG_c") \n\t"
2203 PREFETCH" 32(%%"REG_c") \n\t"
2204 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2205
b63f641e 2206#if ARCH_X86_64
6d606c4f
AJ
2207
2208#define FUNNY_Y_CODE \
2da0d70d
DB
2209 "movl (%%"REG_b"), %%esi \n\t"\
2210 "call *%4 \n\t"\
2211 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2212 "add %%"REG_S", %%"REG_c" \n\t"\
2213 "add %%"REG_a", %%"REG_D" \n\t"\
2214 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2215
2216#else
2217
2ff198c1 2218#define FUNNY_Y_CODE \
2da0d70d
DB
2219 "movl (%%"REG_b"), %%esi \n\t"\
2220 "call *%4 \n\t"\
2221 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2222 "add %%"REG_a", %%"REG_D" \n\t"\
2223 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2224
bc279024 2225#endif /* ARCH_X86_64 */
6d606c4f 2226
2ff198c1
MN
2227FUNNY_Y_CODE
2228FUNNY_Y_CODE
2229FUNNY_Y_CODE
2230FUNNY_Y_CODE
2231FUNNY_Y_CODE
2232FUNNY_Y_CODE
2233FUNNY_Y_CODE
2234FUNNY_Y_CODE
2235
83c89c78 2236#if defined(PIC)
2da0d70d 2237 "mov %5, %%"REG_b" \n\t"
83c89c78 2238#endif
2da0d70d
DB
2239 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2240 "m" (funnyYCode)
83c89c78 2241#if defined(PIC)
2da0d70d 2242 ,"m" (ebxsave)
83c89c78 2243#endif
2da0d70d 2244 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2245#if !defined(PIC)
2da0d70d
DB
2246 ,"%"REG_b
2247#endif
2248 );
/* Fix-up: walking back from the last output, every sample whose source
 * position (i*xInc)>>16 is at or beyond srcW-1 is overwritten with the last
 * source sample times 128. */
2249 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2250 }
2251 else
2252 {
bc279024 2253#endif /* HAVE_MMX2 */
2da0d70d
DB
/* xInc is split into its integer part (xInc >> 16) and its 16-bit
 * fraction (xInc & 0xffff). */
2254 long xInc_shr16 = xInc >> 16;
2255 uint16_t xInc_mask = xInc & 0xffff;
2256 //NO MMX just normal asm ...
/* Two outputs are produced per loop pass (REG_a advances by 2). cx holds
 * the running 16-bit fraction; its carry is folded into xx by the adc. */
7ad6469e 2257 __asm__ volatile(
2da0d70d
DB
2258 "xor %%"REG_a", %%"REG_a" \n\t" // i
2259 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2260 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2261 ASMALIGN(4)
2262 "1: \n\t"
2263 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2264 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2265 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2266 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2267 "shll $16, %%edi \n\t"
2268 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2269 "mov %1, %%"REG_D" \n\t"
2270 "shrl $9, %%esi \n\t"
2271 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2272 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF (xInc_mask)
2273 "adc %3, %%"REG_d" \n\t" //xx+= (xInc>>16) + carry (xInc_shr16)
2274
2275 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2276 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2277 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2278 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2279 "shll $16, %%edi \n\t"
2280 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2281 "mov %1, %%"REG_D" \n\t"
2282 "shrl $9, %%esi \n\t"
2283 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2284 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF (xInc_mask)
2285 "adc %3, %%"REG_d" \n\t" //xx+= (xInc>>16) + carry (xInc_shr16)
2286
2287
2288 "add $2, %%"REG_a" \n\t"
2289 "cmp %2, %%"REG_a" \n\t"
2290 " jb 1b \n\t"
2291
2292
2293 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2294 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2295 );
b63f641e 2296#if HAVE_MMX2
2da0d70d 2297 } //if MMX2 can't be used
2ff198c1
MN
2298#endif
2299#else
2da0d70d
DB
/* Portable bilinear: xpos is 16.16 fixed point, xalpha is the top 7 bits
 * of its fraction; the result is src[xx] scaled by 128 plus the weighted
 * difference to the next sample. */
2300 int i;
2301 unsigned int xpos=0;
2302 for (i=0;i<dstWidth;i++)
2303 {
2304 register unsigned int xx=xpos>>16;
2305 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2306 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2307 xpos+=xInc;
2308 }
b63f641e 2309#endif /* ARCH_X86 */
077ea8a7 2310 }
6bc0c792
MN
2311
/* Stage 3: range remap, skipped for RGB/BGR destinations.
 * NOTE(review): 14071/16384 is about 219/255 and 19077/16384 about 255/219, so
 * the first branch presumably compresses full range to limited range and the
 * second expands it -- confirm against the definition of srcRange/dstRange. */
2312 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2313 int i;
2314 //FIXME all pal and rgb srcFormats could do this conversion as well
2315 //FIXME all scalers more complex than bilinear could do half of this transform
2316 if(c->srcRange){
2317 for (i=0; i<dstWidth; i++)
2318 dst[i]= (dst[i]*14071 + 33561947)>>14;
2319 }else{
2320 for (i=0; i<dstWidth; i++)
aa13b0fc 2321 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2322 }
2323 }
2ff198c1
MN
2324}
2325
6bc0c792 2326inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2327 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2328 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2329 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2330 int32_t *mmx2FilterPos, uint32_t *pal)
2ff198c1 2331{
2da0d70d 2332 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2333 {
896a22b8 2334 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2335 src1= formatConvBuffer;
8b2fce0d 2336 src2= formatConvBuffer+VOFW;
1e621b18 2337 }
2da0d70d 2338 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2339 {
896a22b8 2340 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2341 src1= formatConvBuffer;
8b2fce0d 2342 src2= formatConvBuffer+VOFW;
7322a67c 2343 }
2da0d70d 2344 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2345 {
2f60f629 2346 if(c->chrSrcHSubSample)
896a22b8 2347 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2348 else
896a22b8 2349 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2350 src1= formatConvBuffer;
8b2fce0d 2351 src2= formatConvBuffer+VOFW;
1e621b18 2352 }
9990e426
MN
2353 else if (srcFormat==PIX_FMT_RGB32_1)
2354 {
2f60f629 2355 if(c->chrSrcHSubSample)
896a22b8 2356 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2357 else
896a22b8 2358 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2359 src1= formatConvBuffer;
2360 src2= formatConvBuffer+VOFW;
2361 }
2da0d70d 2362 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2363 {
2f60f629 2364 if(c->chrSrcHSubSample)
896a22b8 2365 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2366 else
896a22b8 2367 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2368 src1= formatConvBuffer;
8b2fce0d 2369 src2= formatConvBuffer+VOFW;
1e621b18 2370 }
2da0d70d 2371 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2372 {
2f60f629 2373 if(c->chrSrcHSubSample)
896a22b8 2374 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2375 else
896a22b8 2376 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2377 src1= formatConvBuffer;
8b2fce0d 2378 src2= formatConvBuffer+VOFW;
6af250ea 2379 }
2da0d70d 2380 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2381 {
2f60f629 2382 if(c->chrSrcHSubSample)
896a22b8 2383 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2384 else
896a22b8 2385 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2386 src1= formatConvBuffer;
8b2fce0d 2387 src2= formatConvBuffer+VOFW;
b72034dd 2388 }
2da0d70d 2389 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2390 {
2f60f629 2391 if(c->chrSrcHSubSample)
896a22b8 2392 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2393 else
896a22b8 2394 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2395 src1= formatConvBuffer;
8b2fce0d 2396 src2= formatConvBuffer+VOFW;
a861d4d7 2397 }
9990e426
MN
2398 else if (srcFormat==PIX_FMT_BGR32_1)
2399 {
2f60f629 2400 if(c->chrSrcHSubSample)
896a22b8 2401 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2402 else
896a22b8 2403 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2404 src1= formatConvBuffer;
2405 src2= formatConvBuffer+VOFW;
2406 }
2da0d70d 2407 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2408 {
2f60f629 2409 if(c->chrSrcHSubSample)
896a22b8 2410 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2411 else
896a22b8 2412 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2413 src1= formatConvBuffer;
8b2fce0d 2414 src2= formatConvBuffer+VOFW;
a861d4d7 2415 }
2da0d70d 2416 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2417 {
2f60f629 2418 if(c->chrSrcHSubSample)
896a22b8 2419 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2420 else
896a22b8 2421 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2422 src1= formatConvBuffer;
8b2fce0d 2423 src2= formatConvBuffer+VOFW;
a43fb6b3 2424 }
2da0d70d 2425 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2426 {
2f60f629 2427 if(c->chrSrcHSubSample)
896a22b8 2428 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2429 else
896a22b8 2430 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2431 src1= formatConvBuffer;
8b2fce0d 2432 src2= formatConvBuffer+VOFW;
a43fb6b3 2433 }
4bb9adcf 2434 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2435 {
2da0d70d 2436 return;
6ff0ad6b 2437 }
2da0d70d 2438 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2439 {
e48a79c9 2440 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2441 src1= formatConvBuffer;
8b2fce0d 2442 src2= formatConvBuffer+VOFW;
e28630fc 2443 }
1e621b18 2444
b63f641e 2445#if HAVE_MMX
8a322796 2446 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2447 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2448#else
2da0d70d 2449 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2450#endif
077ea8a7 2451 {
2da0d70d 2452 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2453 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2454 }
8a322796 2455 else // fast bilinear upscale / crap downscale
077ea8a7 2456 {
b63f641e
AJ
2457#if ARCH_X86
2458#if HAVE_MMX2
2da0d70d 2459 int i;
83c89c78 2460#if defined(PIC)
2da0d70d 2461 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2462#endif
2da0d70d
DB
2463 if (canMMX2BeUsed)
2464 {
7ad6469e 2465 __asm__ volatile(
83c89c78 2466#if defined(PIC)
2da0d70d
DB
2467 "mov %%"REG_b", %6 \n\t"
2468#endif
2469 "pxor %%mm7, %%mm7 \n\t"
2470 "mov %0, %%"REG_c" \n\t"
2471 "mov %1, %%"REG_D" \n\t"
2472 "mov %2, %%"REG_d" \n\t"
2473 "mov %3, %%"REG_b" \n\t"
2474 "xor %%"REG_a", %%"REG_a" \n\t" // i
2475 PREFETCH" (%%"REG_c") \n\t"
2476 PREFETCH" 32(%%"REG_c") \n\t"
2477 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2478
b63f641e 2479#if ARCH_X86_64
6d606c4f
AJ
2480
2481#define FUNNY_UV_CODE \
2da0d70d
DB
2482 "movl (%%"REG_b"), %%esi \n\t"\
2483 "call *%4 \n\t"\
2484 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2485 "add %%"REG_S", %%"REG_c" \n\t"\
2486 "add %%"REG_a", %%"REG_D" \n\t"\
2487 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2488
2489#else
2490
b7dc6f66 2491#define FUNNY_UV_CODE \
2da0d70d
DB
2492 "movl (%%"REG_b"), %%esi \n\t"\
2493 "call *%4 \n\t"\
2494 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2495 "add %%"REG_a", %%"REG_D" \n\t"\
2496 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2497
bc279024 2498#endif /* ARCH_X86_64 */
6d606c4f 2499
b7dc6f66
MN
2500FUNNY_UV_CODE
2501FUNNY_UV_CODE
2502FUNNY_UV_CODE
2503FUNNY_UV_CODE
2da0d70d
DB
2504 "xor %%"REG_a", %%"REG_a" \n\t" // i
2505 "mov %5, %%"REG_c" \n\t" // src
2506 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2507 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2508 PREFETCH" (%%"REG_c") \n\t"
2509 PREFETCH" 32(%%"REG_c") \n\t"
2510 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2511
2512FUNNY_UV_CODE
2513FUNNY_UV_CODE
2514FUNNY_UV_CODE
2515FUNNY_UV_CODE
2516
83c89c78 2517#if defined(PIC)
2da0d70d 2518 "mov %6, %%"REG_b" \n\t"
83c89c78 2519#endif
2da0d70d
DB
2520 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2521 "m" (funnyUVCode), "m" (src2)
83c89c78 2522#if defined(PIC)
2da0d70d 2523 ,"m" (ebxsave)
83c89c78 2524#endif
2da0d70d 2525 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2526#if !defined(PIC)
2da0d70d
DB
2527 ,"%"REG_b
2528#endif
2529 );
2530 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2531 {
2532 //printf("%d %d %d\n", dstWidth, i, srcW);
2533 dst[i] = src1[srcW-1]*128;
8b2fce0d 2534 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2535 }
2536 }
2537 else
2538 {
bc279024 2539#endif /* HAVE_MMX2 */
2da0d70d
DB
2540 long xInc_shr16 = (long) (xInc >> 16);
2541 uint16_t xInc_mask = xInc & 0xffff;
7ad6469e 2542 __asm__ volatile(
2da0d70d
DB
2543 "xor %%"REG_a", %%"REG_a" \n\t" // i
2544 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2545 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2546 ASMALIGN(4)
2547 "1: \n\t"
2548 "mov %0, %%"REG_S" \n\t"
2549 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2550 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2551 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2552 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2553 "shll $16, %%edi \n\t"
2554 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2555 "mov %1, %%"REG_D" \n\t"
2556 "shrl $9, %%esi \n\t"
2557 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2558
2559 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2560 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2561 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2562 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2563 "shll $16, %%edi \n\t"
2564 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2565 "mov %1, %%"REG_D" \n\t"
2566 "shrl $9, %%esi \n\t"
8b2fce0d 2567 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2568
2569 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2570 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2571 "add $1, %%"REG_a" \n\t"
2572 "cmp %2, %%"REG_a" \n\t"
2573 " jb 1b \n\t"
2ff198c1 2574
8a322796
DB
2575/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2576 which is needed to support GCC 4.0. */
b63f641e 2577#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2da0d70d 2578 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2579#else
2da0d70d 2580 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2581#endif
2da0d70d
DB
2582 "r" (src2)
2583 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2584 );
b63f641e 2585#if HAVE_MMX2
2da0d70d 2586 } //if MMX2 can't be used
2ff198c1
MN
2587#endif
2588#else
2da0d70d
DB
2589 int i;
2590 unsigned int xpos=0;
2591 for (i=0;i<dstWidth;i++)
2592 {
2593 register unsigned int xx=xpos>>16;
2594 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2595 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2596 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2597 /* slower
2598 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2599 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2600 */
2601 xpos+=xInc;
2602 }
b63f641e 2603#endif /* ARCH_X86 */
2da0d70d 2604 }
6bc0c792
MN
2605 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2606 int i;
2607 //FIXME all pal and rgb srcFormats could do this conversion as well
2608 //FIXME all scalers more complex than bilinear could do half of this transform
2609 if(c->srcRange){
2610 for (i=0; i<dstWidth; i++){
2611 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2612 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2613 }
2614 }else{
2615 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2616 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2617 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2618 }
2619 }
2620 }
077ea8a7
MN
2621}
2622
3e499f53 2623static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2624 int srcSliceH, uint8_t* dst[], int dstStride[]){
2625
2626 /* load a few things into local vars to make the code more readable? and faster */
2627 const int srcW= c->srcW;
2628 const int dstW= c->dstW;
2629 const int dstH= c->dstH;
2630 const int chrDstW= c->chrDstW;
2631 const int chrSrcW= c->chrSrcW;
2632 const int lumXInc= c->lumXInc;
2633 const int chrXInc= c->chrXInc;
2634 const int dstFormat= c->dstFormat;
2635 const int srcFormat= c->srcFormat;
2636 const int flags= c->flags;
2637 const int canMMX2BeUsed= c->canMMX2BeUsed;
2638 int16_t *vLumFilterPos= c->vLumFilterPos;
2639 int16_t *vChrFilterPos= c->vChrFilterPos;
2640 int16_t *hLumFilterPos= c->hLumFilterPos;
2641 int16_t *hChrFilterPos= c->hChrFilterPos;
2642 int16_t *vLumFilter= c->vLumFilter;
2643 int16_t *vChrFilter= c->vChrFilter;
2644 int16_t *hLumFilter= c->hLumFilter;
2645 int16_t *hChrFilter= c->hChrFilter;
2646 int32_t *lumMmxFilter= c->lumMmxFilter;
2647 int32_t *chrMmxFilter= c->chrMmxFilter;
2648 const int vLumFilterSize= c->vLumFilterSize;
2649 const int vChrFilterSize= c->vChrFilterSize;
2650 const int hLumFilterSize= c->hLumFilterSize;
2651 const int hChrFilterSize= c->hChrFilterSize;
2652 int16_t **lumPixBuf= c->lumPixBuf;
2653 int16_t **chrPixBuf= c->chrPixBuf;
2654 const int vLumBufSize= c->vLumBufSize;
2655 const int vChrBufSize= c->vChrBufSize;
2656 uint8_t *funnyYCode= c->funnyYCode;
2657 uint8_t *funnyUVCode= c->funnyUVCode;
2658 uint8_t *formatConvBuffer= c->formatConvBuffer;
2659 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2660 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2661 int lastDstY;
e150ef8d 2662 uint32_t *pal=c->pal_yuv;
2da0d70d 2663
8a322796 2664 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2665 int dstY= c->dstY;
2666 int lumBufIndex= c->lumBufIndex;
2667 int chrBufIndex= c->chrBufIndex;
2668 int lastInLumBuf= c->lastInLumBuf;
2669 int lastInChrBuf= c->lastInChrBuf;
2670
2671 if (isPacked(c->srcFormat)){
2da0d70d
DB
2672 src[0]=
2673 src[1]=
2674 src[2]= src[0];
2675 srcStride[0]=
2676 srcStride[1]=
2677 srcStride[2]= srcStride[0];
2678 }
2679 srcStride[1]<<= c->vChrDrop;
2680 srcStride[2]<<= c->vChrDrop;
2681
2682 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2683 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2684
2685#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2686 {
2687 static volatile int i=0;
2688 i++;
2689 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2690 selfTest(src, srcStride, c->srcW, c->srcH);
2691 i--;
2692 }
c7a810cc 2693#endif
37079906 2694
2da0d70d
DB
2695 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2696 //dstStride[0],dstStride[1],dstStride[2]);
2697
2698 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2699 {
6683a37f
DP
2700 static int warnedAlready=0; //FIXME move this into the context perhaps
2701 if (flags & SWS_PRINT_INFO && !warnedAlready)
2da0d70d 2702 {
4b0c30b7 2703 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2704 " ->cannot do aligned memory accesses anymore\n");
6683a37f 2705 warnedAlready=1;
2da0d70d
DB
2706 }
2707 }
2708
8a322796
DB
2709 /* Note the user might start scaling the picture in the middle so this
2710 will not get executed. This is not really intended but works
2711 currently, so people might do it. */
2da0d70d
DB
2712 if (srcSliceY ==0){
2713 lumBufIndex=0;
2714 chrBufIndex=0;
2715 dstY=0;
2716 lastInLumBuf= -1;
2717 lastInChrBuf= -1;
2718 }
2719
2720 lastDstY= dstY;
2721
2722 for (;dstY < dstH; dstY++){
2723 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2724 const int chrDstY= dstY>>c->chrDstVSubSample;
2725 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2726 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2727
2728 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2729 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2730 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2731 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2732
2733 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2734 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2735 //handle holes (FAST_BILINEAR & weird filters)
2736 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2737 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2738 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
2739 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2740 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
2741
2742 // Do we have enough lines in this slice to output the dstY line
2743 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2744 {
2745 //Do horizontal scaling
2746 while(lastInLumBuf < lastLumSrcY)
2747 {
2748 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2749 lumBufIndex++;
2750 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
2751 assert(lumBufIndex < 2*vLumBufSize);
2752 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2753 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 2754 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6bc0c792 2755 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
2756 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2757 funnyYCode, c->srcFormat, formatConvBuffer,
2758 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2759 lastInLumBuf++;
2760 }
2761 while(lastInChrBuf < lastChrSrcY)
2762 {
2763 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2764 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2765 chrBufIndex++;
fcc402b1
LB
2766 assert(chrBufIndex < 2*vChrBufSize);
2767 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2768 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2769 //FIXME replace parameters through context struct (some at least)
2770
2771 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2772 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
2773 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2774 funnyUVCode, c->srcFormat, formatConvBuffer,
2775 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2776 lastInChrBuf++;
2777 }
2778 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2779 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2780 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2781 }
2782 else // not enough lines left in this slice -> load the rest in the buffer
2783 {
2784 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2785 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2786 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2787 vChrBufSize, vLumBufSize);*/
2788
2789 //Do horizontal scaling
2790 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2791 {
2792 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2793 lumBufIndex++;
fcc402b1
LB
2794 assert(lumBufIndex < 2*vLumBufSize);
2795 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2796 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6bc0c792 2797 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
2798 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2799 funnyYCode, c->srcFormat, formatConvBuffer,
2800 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2801 lastInLumBuf++;
2802 }
2803 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2804 {
2805 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2806 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2807 chrBufIndex++;
fcc402b1
LB
2808 assert(chrBufIndex < 2*vChrBufSize);
2809 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
2810 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
2811
2812 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 2813 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
2814 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2815 funnyUVCode, c->srcFormat, formatConvBuffer,
2816 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2817 lastInChrBuf++;
2818 }
2819 //wrap buf index around to stay inside the ring buffer
e5091488
BF
2820 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2821 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
2822 break; //we can't output a dstY line so let's try with the next slice
2823 }
d3f41512 2824
b63f641e 2825#if HAVE_MMX
88e2a9ae 2826 c->blueDither= ff_dither8[dstY&1];
92c7b471 2827 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
88e2a9ae 2828 c->greenDither= ff_dither8[dstY&1];
92c7b471 2829 else
88e2a9ae
CEH
2830 c->greenDither= ff_dither4[dstY&1];
2831 c->redDither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
2832#endif
2833 if (dstY < dstH-2)
2834 {
2835 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2836 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
b63f641e 2837#if HAVE_MMX
2da0d70d
DB
2838 int i;
2839 if (flags & SWS_ACCURATE_RND){
1625216e 2840 int s= APCK_SIZE / 8;
2da0d70d 2841 for (i=0; i<vLumFilterSize; i+=2){
1625216e
MN
2842 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2843 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2844 lumMmxFilter[s*i+APCK_COEF/4 ]=
2845 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2da0d70d
DB
2846 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2847 }
2848 for (i=0; i<vChrFilterSize; i+=2){
1625216e
MN
2849 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2850 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2851 chrMmxFilter[s*i+APCK_COEF/4 ]=
2852 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2da0d70d 2853 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
bca11e75 2854 }
2da0d70d
DB
2855 }else{
2856 for (i=0; i<vLumFilterSize; i++)
2857 {
2858 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2859 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2860 lumMmxFilter[4*i+2]=
2861 lumMmxFilter[4*i+3]=
2862 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2863 }
2864 for (i=0; i<vChrFilterSize; i++)
2865 {
2866 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2867 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2868 chrMmxFilter[4*i+2]=
2869 chrMmxFilter[4*i+3]=
2870 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2871 }
2872 }
6542b44e 2873#endif
2da0d70d
DB
2874 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2875 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2876 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2877 RENAME(yuv2nv12X)(c,
2878 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2879 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880 dest, uDest, dstW, chrDstW, dstFormat);
e3d2500f 2881 }
b0880d5d 2882 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2da0d70d
DB
2883 {
2884 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2885 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
8a322796 2886 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2da0d70d
DB
2887 {
2888 int16_t *lumBuf = lumPixBuf[0];
2889 int16_t *chrBuf= chrPixBuf[0];
bf2bdde6 2890 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2da0d70d
DB
2891 }
2892 else //General YV12
2893 {
2894 RENAME(yuv2yuvX)(c,
2895 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2896 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2897 dest, uDest, vDest, dstW, chrDstW);
2898 }
2899 }
2900 else
2901 {
fcc402b1
LB
2902 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2903 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
8a322796 2904 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2da0d70d
DB
2905 {
2906 int chrAlpha= vChrFilter[2*dstY+1];
f0faee4c
MN
2907 if(flags & SWS_FULL_CHR_H_INT){
2908 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2909 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2910 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2911 dest, dstW, dstY);
2912 }else{
14014d47
MN
2913 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2914 dest, dstW, chrAlpha, dstFormat, flags, dstY);
f0faee4c 2915 }
2da0d70d 2916 }
8a322796 2917 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2da0d70d
DB
2918 {
2919 int lumAlpha= vLumFilter[2*dstY+1];
2920 int chrAlpha= vChrFilter[2*dstY+1];
2921 lumMmxFilter[2]=
2922 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2923 chrMmxFilter[2]=
2924 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
f0faee4c
MN
2925 if(flags & SWS_FULL_CHR_H_INT){
2926 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2927 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2928 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2929 dest, dstW, dstY);
2930 }else{
14014d47
MN
2931 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2932 dest, dstW, lumAlpha, chrAlpha, dstY);
f0faee4c 2933 }
2da0d70d 2934 }
8a322796 2935 else //general RGB
2da0d70d 2936 {
f0faee4c
MN
2937 if(flags & SWS_FULL_CHR_H_INT){
2938 yuv2rgbXinC_full(c,
2939 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2940 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2941 dest, dstW, dstY);
2942 }else{
14014d47
MN
2943 RENAME(yuv2packedX)(c,
2944 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2945 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2946 dest, dstW, dstY);
f0faee4c 2947 }
2da0d70d
DB
2948 }
2949 }
2950 }
2951 else // hmm looks like we can't use MMX here without overwriting this array's tail
2952 {
2953 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2954 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2955 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2956 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2957 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2958 yuv2nv12XinC(
2959 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2960 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2961 dest, uDest, dstW, chrDstW, dstFormat);
2962 }
b0880d5d 2963 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
2da0d70d
DB
2964 {
2965 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2966 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2967 yuv2yuvXinC(
2968 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2969 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2970 dest, uDest, vDest, dstW, chrDstW);
2971 }
2972 else
2973 {
fcc402b1
LB
2974 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2975 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
f0faee4c
MN
2976 if(flags & SWS_FULL_CHR_H_INT){
2977 yuv2rgbXinC_full(c,
2978 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2979 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2980 dest, dstW, dstY);
2981 }else{
14014d47
MN
2982 yuv2packedXinC(c,
2983 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2984 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2985 dest, dstW, dstY);
f0faee4c 2986 }
2da0d70d
DB
2987 }
2988 }
2989 }
17f715fa 2990
b63f641e 2991#if HAVE_MMX
7ad6469e
DP
2992 __asm__ volatile(SFENCE:::"memory");
2993 __asm__ volatile(EMMS:::"memory");
17f715fa 2994#endif
2da0d70d
DB
2995 /* store changed local vars back in the context */
2996 c->dstY= dstY;
2997 c->lumBufIndex= lumBufIndex;
2998 c->chrBufIndex= chrBufIndex;
2999 c->lastInLumBuf= lastInLumBuf;
3000 c->lastInChrBuf= lastInChrBuf;
d4e24275 3001
2da0d70d 3002 return dstY - lastDstY;
627690b5 3003}