prevent going out of the buffer if the nal size does not fit in the buffer.
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
807e0c66
LA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
d026b45e 22 */
783e9cc9 23
/* Remove any earlier definitions of the CPU-specific helper macros before
 * they are redefined below.
 * NOTE(review): presumably needed because this template is included more
 * than once with different HAVE_* settings -- confirm against the includer. */
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec
MN
27#undef PREFETCH
28#undef PREFETCHW
29#undef EMMS
30#undef SFENCE
31
/* EMMS: mnemonic string used to leave MMX state; "femms" when 3DNow! is
 * available, plain "emms" otherwise. */
32#ifdef HAVE_3DNOW
33/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34#define EMMS "femms"
35#else
36#define EMMS "emms"
37#endif
38
/* PREFETCH / PREFETCHW: prefetch mnemonic strings.
 *   3DNow!  -> "prefetch" / "prefetchw"
 *   MMX2    -> "prefetchnta" / "prefetcht0"
 *   neither -> " # nop": the text starts with '#', which GNU as treats as a
 *              comment on x86, so whatever operand follows is ignored and no
 *              instruction is emitted. */
39#ifdef HAVE_3DNOW
40#define PREFETCH "prefetch"
41#define PREFETCHW "prefetchw"
42#elif defined ( HAVE_MMX2 )
43#define PREFETCH "prefetchnta"
44#define PREFETCHW "prefetcht0"
45#else
d904b5fc
NP
46#define PREFETCH " # nop"
47#define PREFETCHW " # nop"
48a05cec
MN
48#endif
49
/* SFENCE: "sfence" when MMX2 is available, otherwise an assembler comment
 * (" # nop") so that no instruction is emitted. */
50#ifdef HAVE_MMX2
51#define SFENCE "sfence"
52#else
d904b5fc 53#define SFENCE " # nop"
48a05cec 54#endif
d3f41512 55
d604bab9
MN
/* PAVGB(a,b): builds the asm text for a packed byte average, stringifying
 * both operands: "pavgb" with MMX2, "pavgusb" with 3DNow!.
 * There is no #else branch, so PAVGB stays undefined when neither feature
 * macro is set. */
56#ifdef HAVE_MMX2
57#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58#elif defined (HAVE_3DNOW)
59#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60#endif
d3f41512 61
/* MOVNTQ(a,b): 64-bit store of a to b. With MMX2 it expands to "movntq",
 * otherwise to a plain "movq".
 * MOVNTQ forwards to REAL_MOVNTQ so that macro arguments are fully expanded
 * before REAL_MOVNTQ stringifies them with '#'. */
d604bab9 62#ifdef HAVE_MMX2
6e1c66bc 63#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 64#else
6e1c66bc 65#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 66#endif
6e1c66bc 67#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 68
a2faa401
RD
69#ifdef HAVE_ALTIVEC
70#include "swscale_altivec_template.c"
71#endif
72
bca11e75
MN
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Expands to one complete asm volatile statement that filters several source
 * lines into one line of 8-bit output, 8 bytes per outer iteration.
 *
 * Macro arguments:
 *   x      - byte displacement added to every source address (stringified)
 *   offset - string literal: displacement from %0 to the first list entry
 *   dest   - output pointer (asm operand %1)
 *   width  - loop bound compared against the running index (asm operand %2)
 * Operand %0 is &c->redDither; `c` must be in scope at the expansion site.
 *
 * Per outer iteration:
 *   - mm3 and mm4 are seeded with the quadword at VROUNDER_OFFSET(%0);
 *   - the list at offset(%0) is walked in 16-byte steps: the word at +0 is a
 *     source pointer, the quadword at +8 is the coefficient; a zero pointer
 *     ends the list;
 *   - each entry contributes pmulhw(src, coeff) to the two accumulators;
 *   - the sums are shifted right by 3, packed with unsigned saturation and
 *     stored with MOVNTQ at dest + index; index += 8; repeat while
 *     index < width (unsigned compare, "jb").
 *
 * Label "1:" serves both loops: the inner "jnz 1b" continues the list walk,
 * and the final "jb 1b" re-enters after the accumulators and list pointer
 * have been reloaded.
 *
 * The clobber list names only REG_a, REG_d and REG_S; the MMX registers and
 * memory written through %1 are not declared.
 * NOTE(review): REGa inside MOVNTQ() is expected to be a macro defined
 * outside this chunk and expanded before stringification -- confirm.
 */
73#define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
6e1c66bc 75 "xor %%"REG_a", %%"REG_a" \n\t"\
379a2036
MN
76 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
6e1c66bc
AJ
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
4bff9ef9 80 ASMALIGN(4) /* FIXME Unroll? */\
c1b0bfb4 81 "1: \n\t"\
6e1c66bc
AJ
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4
MN
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
c1b0bfb4
MN
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
6e1c66bc
AJ
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
379a2036
MN
99 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
6e1c66bc
AJ
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
bca11e75
MN
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
e96da13b 105 "r" (dest), "g" (width)\
bca11e75
MN
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface and operands as YSCALEYUV2YV12X (%0 = &c->redDither,
 * %1 = dest, %2 = width), but the products are accumulated as 32-bit sums.
 *
 * Each inner iteration consumes two source lines: one pointer is read from
 * +0 of the current list entry and a second from +4; their words are
 * interleaved (punpcklwd/punpckhwd) and multiplied-and-added (pmaddwd)
 * against the quadword at +8. The next pointer is read from +16 and the
 * list pointer advances by 16; a zero pointer ends the walk.
 * The four dword accumulators (mm4..mm7) are shifted right by 16, packed to
 * words, offset by the quadword at VROUNDER_OFFSET(%0), shifted right by 3,
 * packed to unsigned bytes and stored with MOVNTQ at dest + index.
 *
 * Label "1:" is shared by the inner list walk and the outer loop, as in
 * YSCALEYUV2YV12X.
 *
 * NOTE(review): the second pointer is fetched from offset 4, which implies
 * 4-byte pointer slots in the list; confirm the table layout for builds
 * where REG_S is a 64-bit register.
 */
109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
4bff9ef9 118 ASMALIGN(4) \
bca11e75
MN
119 "1: \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
e96da13b 167 "r" (dest), "g" (width)\
bca11e75
MN
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
c1b0bfb4
MN
170
/*
 * YSCALEYUV2YV121
 *
 * Asm body only (no asm() wrapper, operands or clobbers): the user of the
 * macro supplies those. It converts one line of 16-bit samples to bytes
 * without filtering.
 *
 * Operands used: %0 = source base (indexed by REG_a * 2), %1 = destination
 * base (indexed by REG_a), %2 = initial value of the index REG_a.
 * Each iteration loads 8 words, shifts them right arithmetically by 7, packs
 * them to 8 unsigned-saturated bytes, stores them with MOVNTQ and adds 8 to
 * the index. The loop repeats while that add does not set the carry flag.
 *
 * NOTE(review): the carry-based exit suggests the caller passes a negative
 * index together with end-of-line pointers -- confirm at the call site.
 */
171#define YSCALEYUV2YV121 \
6e1c66bc 172 "mov %2, %%"REG_a" \n\t"\
4bff9ef9 173 ASMALIGN(4) /* FIXME Unroll? */\
c1b0bfb4 174 "1: \n\t"\
6e1c66bc
AJ
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
c1b0bfb4
MN
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
6e1c66bc
AJ
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
c1b0bfb4
MN
182 "jnc 1b \n\t"
183
184/*
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
190*/
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an asm volatile( statement and emits the vertical filter stage for
 * packed output. The statement is NOT closed here: the macro ends with a
 * line continuation so that conversion/store code can follow, and
 * YSCALEYUV2PACKEDX_END supplies the operands, clobbers and closing ");".
 *
 * Outer label "1:" starts one group of output pixels (index in REG_a). The
 * macro itself never jumps back to it; the WRITE* macros in this file end
 * with "jb 1b" for that purpose.
 *
 * Two inner loops (both labelled "2:") walk 16-byte list entries (source
 * pointer at +0, coefficient quadword at +8, zero pointer terminates):
 *   - chroma list at CHR_MMX_FILTER_OFFSET(%0): U read at (ptr + REG_a),
 *     V at 4096 bytes beyond it; results accumulate in mm3 (U) and mm4 (V);
 *   - luma list at LUM_MMX_FILTER_OFFSET(%0): two quadwords read at
 *     (ptr + REG_a*2) and +8; results accumulate in mm1 (Y1) and mm7 (Y2).
 * All four accumulators start from the quadword at VROUNDER_OFFSET(%0).
 */
25593e29 191#define YSCALEYUV2PACKEDX \
8422aa88 192 asm volatile(\
6e1c66bc 193 "xor %%"REG_a", %%"REG_a" \n\t"\
4bff9ef9 194 ASMALIGN(4)\
77a49659 195 "nop \n\t"\
c1b0bfb4 196 "1: \n\t"\
6e1c66bc
AJ
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
379a2036
MN
199 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
4bff9ef9 201 ASMALIGN(4)\
c1b0bfb4 202 "2: \n\t"\
6e1c66bc
AJ
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
6e1c66bc 212 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4
MN
213 " jnz 2b \n\t"\
214\
6e1c66bc
AJ
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
379a2036
MN
217 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
4bff9ef9 219 ASMALIGN(4)\
c1b0bfb4 220 "2: \n\t"\
6e1c66bc
AJ
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
6e1c66bc 230 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4 231 " jnz 2b \n\t"\
25593e29 232
8422aa88
MN
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Closes an asm statement opened by YSCALEYUV2PACKEDX or
 * YSCALEYUV2PACKEDX_ACCURATE. No outputs are declared. Inputs:
 *   %0 = &c->redDither (register)
 *   %1, %2, %3 = dummy (memory; they fill operand slots so that dest and
 *                dstW land at %4 and %5)
 *   %4 = dest (register), %5 = dstW (memory)
 * Clobbers REG_a, REG_d and REG_S. `c`, `dummy`, `dest` and `dstW` must be
 * in scope where the macro is expanded.
 */
233#define YSCALEYUV2PACKEDX_END\
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW)\
237 : "%"REG_a, "%"REG_d, "%"REG_S\
238 );
239
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX: it opens an
 * asm volatile( statement (closed by YSCALEYUV2PACKEDX_END) and leaves the
 * same register contract on exit: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V.
 *
 * Both inner loops (labelled "2:") consume two source lines per iteration:
 * pointers are read from +0 and +4 of the current list entry, the
 * coefficient quadword from +8, and the next pointer from +16; the samples
 * are interleaved and accumulated as dwords with pmaddwd/paddd.
 * After each loop the sums are shifted right by 16, packed to words and
 * offset by the quadword at VROUNDER_OFFSET(%0).
 * The chroma results are parked in memory at U_TEMP(%0) / V_TEMP(%0) while
 * the luma loop reuses the MMX registers, then reloaded into mm3 / mm4.
 *
 * NOTE(review): the "4(...)" offset for the second pointer implies 4-byte
 * pointer slots; confirm the table layout where REG_S is 64 bits wide.
 */
bca11e75 240#define YSCALEYUV2PACKEDX_ACCURATE \
8422aa88 241 asm volatile(\
bca11e75 242 "xor %%"REG_a", %%"REG_a" \n\t"\
4bff9ef9 243 ASMALIGN(4)\
bca11e75
MN
244 "nop \n\t"\
245 "1: \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
4bff9ef9 252 ASMALIGN(4)\
bca11e75
MN
253 "2: \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
277 " jnz 2b \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
289\
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
4bff9ef9 296 ASMALIGN(4)\
bca11e75
MN
297 "2: \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
321 " jnz 2b \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
333
/*
 * YSCALEYUV2RGBX
 *
 * Asm fragment (no asm() wrapper) that turns the Y/U/V words left in the MMX
 * registers into packed 8-bit B, G and R.
 *
 * Expects on entry: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V -- the registers
 * the YSCALEYUV2PACKEDX* macros finish with.
 * All offsets and coefficients are read relative to operand %0
 * (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR/Y_COEFF).
 * On exit: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes (packed with unsigned
 * saturation) and mm7 = 0, which is the input contract of the WRITEBGR*
 * macros in this file. mm0, mm3 and mm6 are used as scratch.
 */
8422aa88 334#define YSCALEYUV2RGBX \
77a49659
MN
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
c1b0bfb4
MN
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
77a49659
MN
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
c1b0bfb4 341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
77a49659
MN
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
c1b0bfb4
MN
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB -- compiled out by the surrounding "#if 0".
 *
 * Asm fragment that blends two luma lines (%0, %1) and two chroma lines
 * (%2, %3, with the second chroma plane 4096 bytes further on) using the
 * weights broadcast from operands %6 and %7, then converts to B (mm3),
 * R (mm0) and G (mm1) bytes. Unlike the other conversion macros in this
 * file it reads its constants through MANGLE()d global symbols rather than
 * through offsets from a context pointer. The fragment ends after packing;
 * the loop opened at "1:" is left for the user of the macro to close.
 */
77a49659 370#if 0
d604bab9
MN
371#define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
6e1c66bc 379 "xor %%"REG_a", %%"REG_a" \n\t"\
4bff9ef9 380 ASMALIGN(4)\
d604bab9 381 "1: \n\t"\
6e1c66bc
AJ
382 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
d604bab9
MN
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
6e1c66bc 391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
d604bab9
MN
392 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6e1c66bc 394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
d604bab9
MN
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428
FB
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
400\
401\
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
9b464428 404 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
d604bab9 405 "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
9b464428 406 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
d604bab9 407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428 408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
409\
410\
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
9b464428
FB
412 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
417\
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
421\
422 "packuswb %%mm1, %%mm1 \n\t"
77a49659 423#endif
d604bab9 424
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm fragment that blends two luma lines (%0, %1) and two chroma lines
 * (%2, %3; V lies 4096 bytes after U) for packed YUV output.
 *
 * Arguments (both stringified into the asm text):
 *   index - register used as running index; zeroed here
 *   c     - base register for the *_OFFSET table lookups
 *
 * Before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back, i.e.
 * the stored blend weights are modified in memory on every expansion run.
 * Inside the loop each sample is computed as
 *   (line1 >> 7) + pmulhw(line0 - line1, weight).
 * Leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2. The loop opened at "1:" is
 * closed by whatever store macro follows.
 *
 * YSCALEYUV2PACKED forwards to REAL_YSCALEYUV2PACKED so that its arguments
 * are macro-expanded before being stringified.
 */
6e1c66bc 425#define REAL_YSCALEYUV2PACKED(index, c) \
6542b44e
MN
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
6e1c66bc 432 "xor "#index", "#index" \n\t"\
4bff9ef9 433 ASMALIGN(4)\
25593e29 434 "1: \n\t"\
6542b44e
MN
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
25593e29
MN
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
6542b44e 441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
25593e29
MN
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
6542b44e
MN
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
25593e29
MN
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
6542b44e
MN
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
25593e29
MN
456 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
457 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 460
6e1c66bc 461#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 462
6e1c66bc
AJ
/*
 * REAL_YSCALEYUV2RGB(index, c) / YSCALEYUV2RGB(index, c)
 *
 * Asm fragment: blends two luma lines (%0, %1) and two chroma lines
 * (%2, %3; V lies 4096 bytes after U) and converts the result to packed
 * 8-bit B, G, R.
 *
 * Arguments (stringified): index = running index register, zeroed here;
 * c = base register for the *_OFFSET / *_COEFF lookups.
 * Blend per sample: (line1 >> 4) + pmulhw(line0 - line1, weight), with the
 * weights read from CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c).
 * On exit: mm2 = B, mm4 = G, mm5 = R (unsigned-saturated bytes), mm7 = 0.
 * The loop opened at "1:" is closed by the store macro that follows.
 *
 * YSCALEYUV2RGB forwards to REAL_YSCALEYUV2RGB so that its arguments are
 * macro-expanded before being stringified.
 */
463#define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
4bff9ef9 465 ASMALIGN(4)\
d604bab9 466 "1: \n\t"\
6542b44e
MN
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
470 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
d604bab9
MN
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
6542b44e 473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
d604bab9
MN
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
6542b44e
MN
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
6542b44e
MN
484 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
d604bab9 486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
6542b44e
MN
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
d604bab9
MN
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
6542b44e
MN
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
d604bab9
MN
495 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6542b44e
MN
499 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
d604bab9
MN
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 527#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
6a4970ab 528
6e1c66bc
AJ
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Asm fragment for packed YUV output from a single luma line (%0) and a
 * single chroma line (%2; V lies 4096 bytes after U). No blending is done:
 * every sample is just shifted right arithmetically by 7.
 * `index` is zeroed and used as the running index; `c` is accepted for
 * signature parity with the other macros but is not referenced in the body.
 * Leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2; the loop opened at "1:" is
 * closed by the store macro that follows.
 *
 * YSCALEYUV2PACKED1 forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
529#define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
4bff9ef9 531 ASMALIGN(4)\
25593e29 532 "1: \n\t"\
e54d94ba
MN
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
25593e29
MN
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
e54d94ba
MN
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
25593e29
MN
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
6a4970ab 541
6e1c66bc 542#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 543
6e1c66bc
AJ
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 *
 * Asm fragment: YUV -> packed 8-bit B, G, R from a single luma line (%0)
 * and a single chroma line (%2; V lies 4096 bytes after U), without any
 * vertical blending. Samples are shifted right by 4 and then run through
 * the same offset/coefficient arithmetic as YSCALEYUV2RGB, with all tables
 * addressed relative to register `c`.
 * `index` is zeroed and used as the running index.
 * On exit: mm2 = B, mm4 = G, mm5 = R, mm7 = 0. The loop opened at "1:" is
 * closed by the store macro that follows.
 *
 * YSCALEYUV2RGB1 forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
544#define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
4bff9ef9 546 ASMALIGN(4)\
d604bab9 547 "1: \n\t"\
e54d94ba
MN
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
d604bab9
MN
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
e54d94ba
MN
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
e54d94ba
MN
556 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
d604bab9 558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
e54d94ba
MN
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
497d4f99
MN
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
e54d94ba
MN
563 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
497d4f99
MN
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 591#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 592
6e1c66bc
AJ
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 *
 * Asm fragment for packed YUV output from a single luma line (%0) and the
 * sum of two chroma lines (%2 and %3; V lies 4096 bytes after U).
 * Chroma: the two lines are added and the sum is shifted right LOGICALLY
 * by 8 (psrlw). Luma: shifted right arithmetically by 7.
 * `index` is zeroed and used as the running index; `c` is not referenced in
 * the body.
 * Leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2; the loop opened at "1:" is
 * closed by the store macro that follows.
 *
 * YSCALEYUV2PACKED1b forwards to the REAL_ variant so that its arguments
 * are macro-expanded before being stringified.
 */
593#define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
4bff9ef9 595 ASMALIGN(4)\
25593e29 596 "1: \n\t"\
e54d94ba
MN
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
25593e29
MN
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
e54d94ba
MN
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
25593e29 607 "psraw $7, %%mm1 \n\t" \
6a4970ab 608 "psraw $7, %%mm7 \n\t"
6e1c66bc 609#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 610
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 *
 * Asm fragment: YUV -> packed 8-bit B, G, R from a single luma line (%0)
 * and the sum of two chroma lines (%2 and %3; V lies 4096 bytes after U).
 * The chroma sum is shifted right logically by 5 (flagged in the code as a
 * possible overflow); luma is shifted right arithmetically by 4. The rest
 * matches YSCALEYUV2RGB1, with tables addressed relative to register `c`.
 * `index` is zeroed and used as the running index.
 * On exit: mm2 = B, mm4 = G, mm5 = R, mm7 = 0. The loop opened at "1:" is
 * closed by the store macro that follows.
 *
 * YSCALEYUV2RGB1b forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
497d4f99 611// do vertical chrominance interpolation
6e1c66bc
AJ
612#define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
4bff9ef9 614 ASMALIGN(4)\
497d4f99 615 "1: \n\t"\
e54d94ba
MN
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397c035e
MN
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
c1b0bfb4
MN
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
e54d94ba
MN
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
497d4f99
MN
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
e54d94ba
MN
628 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
497d4f99 630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
e54d94ba
MN
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
d604bab9
MN
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
e54d94ba
MN
635 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
d604bab9
MN
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 663#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 664
/*
 * REAL_WRITEBGR32(dst, dstw, index) / WRITEBGR32(dst, dstw, index)
 *
 * Asm fragment that stores 8 pixels as 32-bit values and closes the pixel
 * loop opened at label "1:" by a preceding macro.
 *
 * Expects mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * The bytes are interleaved so that each pixel occupies 4 bytes in memory
 * order B, G, R, 0; the four resulting quadwords are stored with MOVNTQ at
 * dst + index*4 + {0, 8, 16, 24}.
 * Then index += 8 and the loop repeats while index < dstw (unsigned, "jb").
 * dst, dstw and index are stringified operand/register names.
 *
 * WRITEBGR32 forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
6e1c66bc 665#define REAL_WRITEBGR32(dst, dstw, index) \
d604bab9
MN
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
679\
6542b44e
MN
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
d604bab9 684\
6e1c66bc
AJ
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
d604bab9 687 " jb 1b \n\t"
6e1c66bc 688#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
d604bab9 689
/*
 * REAL_WRITEBGR16(dst, dstw, index) / WRITEBGR16(dst, dstw, index)
 *
 * Asm fragment that stores 8 pixels as 16-bit values and closes the pixel
 * loop opened at label "1:" by a preceding macro.
 *
 * Expects mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * Steps: mask B and R with bF8 and G with bFC; shift B right by 3;
 * interleave B (low byte) with R (high byte) into words; widen G to words
 * and shift it left by 3; OR the two together. The two result quadwords are
 * stored with MOVNTQ at dst + index*2 + {0, 8}.
 * Then index += 8 and the loop repeats while index < dstw (unsigned, "jb").
 * NOTE(review): bF8 / bFC are globals defined outside this chunk; assuming
 * they are byte-replicated 0xF8 / 0xFC masks, the result is a 5-6-5 layout
 * with R in the top bits -- confirm against their definitions.
 *
 * WRITEBGR16 forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
6e1c66bc 690#define REAL_WRITEBGR16(dst, dstw, index) \
9b464428
FB
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb 694 "psrlq $3, %%mm2 \n\t"\
d604bab9 695\
f62255fb
MN
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
d604bab9 698\
f62255fb
MN
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 703\
f62255fb
MN
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
d604bab9
MN
706\
707 "por %%mm3, %%mm2 \n\t"\
d604bab9 708 "por %%mm4, %%mm1 \n\t"\
d604bab9 709\
6542b44e
MN
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 712\
6e1c66bc
AJ
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
d604bab9 715 " jb 1b \n\t"
6e1c66bc 716#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
d604bab9 717
/*
 * REAL_WRITEBGR15(dst, dstw, index) / WRITEBGR15(dst, dstw, index)
 *
 * Asm fragment that stores 8 pixels as 16-bit values and closes the pixel
 * loop opened at label "1:" by a preceding macro.
 *
 * Expects mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * Differs from WRITEBGR16 in three places: G is masked with bF8 instead of
 * bFC, R is additionally shifted right by 1, and the widened G is shifted
 * left by 2 instead of 3. The two result quadwords are stored with MOVNTQ
 * at dst + index*2 + {0, 8}; then index += 8 and the loop repeats while
 * index < dstw (unsigned, "jb").
 * NOTE(review): assuming bF8 is a byte-replicated 0xF8 mask (defined outside
 * this chunk), the result is a 5-5-5 layout with the top bit clear --
 * confirm against its definition.
 *
 * WRITEBGR15 forwards to the REAL_ variant so that its arguments are
 * macro-expanded before being stringified.
 */
6e1c66bc 718#define REAL_WRITEBGR15(dst, dstw, index) \
9b464428
FB
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb
MN
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
d604bab9 724\
f62255fb
MN
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
d604bab9 727\
f62255fb
MN
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 732\
f62255fb
MN
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
d604bab9
MN
735\
736 "por %%mm3, %%mm2 \n\t"\
d604bab9 737 "por %%mm4, %%mm1 \n\t"\
d604bab9 738\
6542b44e
MN
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 741\
6e1c66bc
AJ
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
d604bab9 744 " jb 1b \n\t"
6e1c66bc 745#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
f62255fb 746
/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * Asm fragment that stores 8 pixels as 24-bit values (24 bytes) and closes
 * the pixel loop opened at label "1:" by a preceding macro.
 *
 * Expects mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * The channels are first interleaved into four quadwords of two 0RGB dwords
 * each (same as WRITEBGR32); the padding bytes are then squeezed out with
 * shifts and the bm00000111 / bm11111000 / bm00001111 masks, giving three
 * quadwords that are stored with MOVNTQ at dst + {0, 8, 16}.
 * Unlike the other WRITE* macros, the destination is not indexed: the dst
 * register itself is advanced by 24 on every iteration, so it is modified
 * by the asm. Then index += 8 and the loop repeats while index < dstw
 * (unsigned, "jb").
 * There is no REAL_ indirection here, so arguments are stringified without
 * prior macro expansion.
 */
6542b44e 747#define WRITEBGR24OLD(dst, dstw, index) \
d604bab9
MN
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
a525ce8d
MN
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9
MN
761\
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
9b464428
FB
764 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
d604bab9
MN
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
770\
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
9b464428 775 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
d604bab9
MN
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
9b464428
FB
778 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
d604bab9
MN
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
784\
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
9b464428
FB
788 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
d604bab9
MN
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
793\
6542b44e
MN
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
6e1c66bc 797 "add $24, "#dst" \n\t"\
d604bab9 798\
6e1c66bc
AJ
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
d604bab9
MN
801 " jb 1b \n\t"
802
/*
 * WRITEBGR24MMX(dst, dstw, index): 24-bit packed store using plain MMX
 * shifts and unpacks only (no pshufw).
 *
 * Input registers: mm2=B, mm4=G, mm5=R, mm7=0 (8 pixels); mm7 is reused as
 * scratch below. The per-line comments track the byte layout. 24 bytes are
 * written with three MOVNTQ stores at dst, dst+8 and dst+16; dst is then
 * advanced by 24, index by 8, and the loop continues at label "1" while
 * index is below dstw (unsigned compare).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
855
/*
 * WRITEBGR24MMX2(dst, dstw, index): 24-bit packed store using pshufw.
 *
 * Input registers: mm2=B, mm4=G, mm5=R (8 pixels). mm0 and mm7 are loaded
 * with the M24A and M24C masks, so any value mm7 held on entry is lost.
 * Each of the three output quadwords is built by shuffling the component
 * registers, masking with M24A/M24B/M24C and OR-ing the results, then
 * stored with MOVNTQ at dst, dst+8 and dst+16. dst is advanced by 24,
 * index by 8, and the loop continues at label "1" while index is below
 * dstw (unsigned compare).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
903
/*
 * WRITEBGR24 is the name call sites use. It maps to the pshufw-based
 * implementation when HAVE_MMX2 is defined and to the plain MMX one
 * otherwise. The #undef drops any definition left by an earlier pass
 * over this template.
 */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
911
/*
 * REAL_WRITEYUY2(dst, dstw, index): tail of a packed YUYV output asm loop.
 *
 * Saturates the 16-bit words in mm1/mm7, mm3 and mm4 to unsigned bytes,
 * interleaves mm3 with mm4 and then that pair with the mm1/mm7 bytes, and
 * stores the resulting 16 bytes at dst + 2*index via MOVNTQ. Given the YUYV
 * target, mm1/mm7 presumably carry luma and mm3/mm4 the two chroma planes;
 * confirm against the YSCALEYUV2PACKED* macros that feed this one.
 * index is then advanced by 8 and the loop continues at label "1" while
 * index is below dstw (unsigned compare).
 *
 * WRITEYUY2 is the name call sites use; the extra macro level is presumably
 * there so that arguments are macro-expanded before being stringified.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
928
929
77a49659 930static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
c1b0bfb4 931 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
7f526efd 932 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 933{
c1b0bfb4 934#ifdef HAVE_MMX
bca11e75
MN
935 if(c->flags & SWS_ACCURATE_RND){
936 if(uDest){
937 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
939 }
940
941 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
942 }else{
943 if(uDest){
944 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
946 }
947
948 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
949 }
c1b0bfb4 950#else
a2faa401
RD
951#ifdef HAVE_ALTIVEC
952yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
953 chrFilter, chrSrc, chrFilterSize,
954 dest, uDest, vDest, dstW, chrDstW);
955#else //HAVE_ALTIVEC
5859233b 956yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
e3d2500f 957 chrFilter, chrSrc, chrFilterSize,
5859233b 958 dest, uDest, vDest, dstW, chrDstW);
a2faa401 959#endif //!HAVE_ALTIVEC
7630f2e0 960#endif
c1b0bfb4 961}
2add307d 962
6118e52e
VS
963static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
964 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
965 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
966{
967yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
968 chrFilter, chrSrc, chrFilterSize,
969 dest, uDest, dstW, chrDstW, dstFormat);
970}
971
c1b0bfb4 972static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
7f526efd 973 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4
MN
974{
975#ifdef HAVE_MMX
976 if(uDest != NULL)
38858470 977 {
c1b0bfb4
MN
978 asm volatile(
979 YSCALEYUV2YV121
e616aa93 980 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
7f526efd 981 "g" (-chrDstW)
6e1c66bc 982 : "%"REG_a
c1b0bfb4
MN
983 );
984
985 asm volatile(
986 YSCALEYUV2YV121
e616aa93 987 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
7f526efd 988 "g" (-chrDstW)
6e1c66bc 989 : "%"REG_a
c1b0bfb4 990 );
38858470
MN
991 }
992
c1b0bfb4
MN
993 asm volatile(
994 YSCALEYUV2YV121
995 :: "r" (lumSrc + dstW), "r" (dest + dstW),
7f526efd 996 "g" (-dstW)
6e1c66bc 997 : "%"REG_a
c1b0bfb4
MN
998 );
999#else
c1b0bfb4
MN
1000 int i;
1001 for(i=0; i<dstW; i++)
38858470 1002 {
c1b0bfb4 1003 int val= lumSrc[i]>>7;
6a4970ab 1004
44c1035c
MN
1005 if(val&256){
1006 if(val<0) val=0;
1007 else val=255;
1008 }
c1b0bfb4 1009
44c1035c 1010 dest[i]= val;
c1b0bfb4
MN
1011 }
1012
1013 if(uDest != NULL)
e616aa93 1014 for(i=0; i<chrDstW; i++)
38858470 1015 {
c1b0bfb4
MN
1016 int u=chrSrc[i]>>7;
1017 int v=chrSrc[i + 2048]>>7;
1018
44c1035c
MN
1019 if((u|v)&256){
1020 if(u<0) u=0;
1021 else if (u>255) u=255;
1022 if(v<0) v=0;
1023 else if (v>255) v=255;
1024 }
1025
1026 uDest[i]= u;
1027 vDest[i]= v;
38858470 1028 }
c1b0bfb4 1029#endif
38858470
MN
1030}
1031
c1b0bfb4 1032
d604bab9
MN
1033/**
1034 * vertical scale YV12 to RGB
1035 */
25593e29 1036static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
c1b0bfb4 1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
065ee1ec 1038 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1039{
bca11e75 1040#ifdef HAVE_MMX
f8d61128 1041 long dummy=0;
bca11e75
MN
1042 if(c->flags & SWS_ACCURATE_RND){
1043 switch(c->dstFormat){
e9e12f0e 1044 case PIX_FMT_RGB32:
8422aa88
MN
1045 YSCALEYUV2PACKEDX_ACCURATE
1046 YSCALEYUV2RGBX
bca11e75
MN
1047 WRITEBGR32(%4, %5, %%REGa)
1048
8422aa88 1049 YSCALEYUV2PACKEDX_END
bca11e75 1050 return;
e9e12f0e 1051 case PIX_FMT_BGR24:
8422aa88
MN
1052 YSCALEYUV2PACKEDX_ACCURATE
1053 YSCALEYUV2RGBX
83c89c78
JT
1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c" \n\t"
1056 WRITEBGR24(%%REGc, %5, %%REGa)
bca11e75 1057
8422aa88 1058
6a4970ab 1059 :: "r" (&c->redDither),
bca11e75
MN
1060 "m" (dummy), "m" (dummy), "m" (dummy),
1061 "r" (dest), "m" (dstW)
83c89c78 1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
bca11e75
MN
1063 );
1064 return;
e9e12f0e 1065 case PIX_FMT_BGR555:
8422aa88
MN
1066 YSCALEYUV2PACKEDX_ACCURATE
1067 YSCALEYUV2RGBX
bca11e75
MN
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069#ifdef DITHER1XBPP
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073#endif
1074
1075 WRITEBGR15(%4, %5, %%REGa)
8422aa88 1076 YSCALEYUV2PACKEDX_END
bca11e75 1077 return;
e9e12f0e 1078 case PIX_FMT_BGR565:
8422aa88
MN
1079 YSCALEYUV2PACKEDX_ACCURATE
1080 YSCALEYUV2RGBX
bca11e75
MN
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082#ifdef DITHER1XBPP
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086#endif
1087
1088 WRITEBGR16(%4, %5, %%REGa)
8422aa88 1089 YSCALEYUV2PACKEDX_END
bca11e75 1090 return;
e9e12f0e 1091 case PIX_FMT_YUYV422:
bca11e75
MN
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa)
8422aa88 1100 YSCALEYUV2PACKEDX_END
bca11e75
MN
1101 return;
1102 }
1103 }else{
cf7d1c1a 1104 switch(c->dstFormat)
c1b0bfb4 1105 {
e9e12f0e 1106 case PIX_FMT_RGB32:
8422aa88
MN
1107 YSCALEYUV2PACKEDX
1108 YSCALEYUV2RGBX
6e1c66bc 1109 WRITEBGR32(%4, %5, %%REGa)
8422aa88 1110 YSCALEYUV2PACKEDX_END
bca11e75 1111 return;
e9e12f0e 1112 case PIX_FMT_BGR24:
8422aa88
MN
1113 YSCALEYUV2PACKEDX
1114 YSCALEYUV2RGBX
83c89c78
JT
1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1116 "add %4, %%"REG_c" \n\t"
1117 WRITEBGR24(%%REGc, %5, %%REGa)
c1b0bfb4 1118
6a4970ab 1119 :: "r" (&c->redDither),
77a49659
MN
1120 "m" (dummy), "m" (dummy), "m" (dummy),
1121 "r" (dest), "m" (dstW)
83c89c78 1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
c1b0bfb4 1123 );
bca11e75 1124 return;
e9e12f0e 1125 case PIX_FMT_BGR555:
8422aa88
MN
1126 YSCALEYUV2PACKEDX
1127 YSCALEYUV2RGBX
c1b0bfb4
MN
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1129#ifdef DITHER1XBPP
9b464428
FB
1130 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1131 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1132 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
1133#endif
1134
6e1c66bc 1135 WRITEBGR15(%4, %5, %%REGa)
8422aa88 1136 YSCALEYUV2PACKEDX_END
bca11e75 1137 return;
e9e12f0e 1138 case PIX_FMT_BGR565:
8422aa88
MN
1139 YSCALEYUV2PACKEDX
1140 YSCALEYUV2RGBX
c1b0bfb4
MN
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1142#ifdef DITHER1XBPP
9b464428
FB
1143 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1144 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1145 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
1146#endif
1147
6e1c66bc 1148 WRITEBGR16(%4, %5, %%REGa)
8422aa88 1149 YSCALEYUV2PACKEDX_END
bca11e75 1150 return;
e9e12f0e 1151 case PIX_FMT_YUYV422:
25593e29
MN
1152 YSCALEYUV2PACKEDX
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
6e1c66bc 1159 WRITEYUY2(%4, %5, %%REGa)
8422aa88 1160 YSCALEYUV2PACKEDX_END
bca11e75
MN
1161 return;
1162 }
1163 }
c1b0bfb4 1164#endif
a31de956 1165#ifdef HAVE_ALTIVEC
b9a6fae9
AC
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
e9e12f0e
LA
1168 if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
b9a6fae9
AC
1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1173 dest, dstW, dstY);
1174 else
a31de956 1175#endif
b9a6fae9
AC
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177 chrFilter, chrSrc, chrFilterSize,
1178 dest, dstW, dstY);
c1b0bfb4
MN
1179}
1180
c1b0bfb4
MN
1181/**
1182 * vertical bilinear scale YV12 to RGB
1183 */
25593e29 1184static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
cf7d1c1a 1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9
MN
1186{
1187 int yalpha1=yalpha^4095;
1188 int uvalpha1=uvalpha^4095;
cf7d1c1a 1189 int i;
d604bab9 1190
77a416e8 1191#if 0 //isn't used
1e621b18 1192 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1193 {
cf7d1c1a 1194 switch(dstFormat)
d604bab9 1195 {
cf7d1c1a 1196#ifdef HAVE_MMX
e9e12f0e 1197 case PIX_FMT_RGB32:
d604bab9
MN
1198 asm volatile(
1199
1200
1201FULL_YSCALEYUV2RGB
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1204
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1208
6e1c66bc
AJ
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
d604bab9 1211
6e1c66bc
AJ
1212 "add $4, %%"REG_a" \n\t"
1213 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1214 " jb 1b \n\t"
1215
1216
6e1c66bc 1217 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
d604bab9 1218 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1219 : "%"REG_a
d604bab9 1220 );
cf7d1c1a 1221 break;
e9e12f0e 1222 case PIX_FMT_BGR24:
d604bab9
MN
1223 asm volatile(
1224
1225FULL_YSCALEYUV2RGB
1226
1227 // lsb ... msb
1228 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1229 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230
1231 "movq %%mm3, %%mm1 \n\t"
1232 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1233 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234
1235 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1236 "psrlq $8, %%mm3 \n\t" // GR0BGR00
9b464428
FB
1237 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1238 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
d604bab9
MN
1239 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1240 "movq %%mm1, %%mm2 \n\t"
1241 "psllq $48, %%mm1 \n\t" // 000000BG
1242 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1243
1244 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1245 "psrld $16, %%mm2 \n\t" // R000R000
1246 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1247 "por %%mm2, %%mm1 \n\t" // RBGRR000
1248
6e1c66bc
AJ
1249 "mov %4, %%"REG_b" \n\t"
1250 "add %%"REG_a", %%"REG_b" \n\t"
d604bab9
MN
1251
1252#ifdef HAVE_MMX2
1253 //FIXME Alignment
6e1c66bc
AJ
1254 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1255 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
d604bab9 1256#else
6e1c66bc 1257 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1258 "psrlq $32, %%mm3 \n\t"
6e1c66bc
AJ
1259 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1260 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1261#endif
6e1c66bc
AJ
1262 "add $4, %%"REG_a" \n\t"
1263 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1264 " jb 1b \n\t"
1265
d1fac6cf 1266 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9 1267 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1268 : "%"REG_a, "%"REG_b
d604bab9 1269 );
cf7d1c1a 1270 break;
e9e12f0e 1271 case PIX_FMT_BGR555:
d604bab9
MN
1272 asm volatile(
1273
1274FULL_YSCALEYUV2RGB
1275#ifdef DITHER1XBPP
9b464428
FB
1276 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1277 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1278 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
1279#endif
1280 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1281 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1282 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1283
1284 "psrlw $3, %%mm3 \n\t"
1285 "psllw $2, %%mm1 \n\t"
1286 "psllw $7, %%mm0 \n\t"
9b464428
FB
1287 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1288 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9
MN
1289
1290 "por %%mm3, %%mm1 \n\t"
1291 "por %%mm1, %%mm0 \n\t"
1292
6e1c66bc 1293 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1294
6e1c66bc
AJ
1295 "add $4, %%"REG_a" \n\t"
1296 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1297 " jb 1b \n\t"
1298
d1fac6cf 1299 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9 1300 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1301 : "%"REG_a
d604bab9 1302 );
cf7d1c1a 1303 break;
e9e12f0e 1304 case PIX_FMT_BGR565:
d604bab9
MN
1305 asm volatile(
1306
1307FULL_YSCALEYUV2RGB
1308#ifdef DITHER1XBPP
9b464428
FB
1309 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1310 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1311 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
1312#endif
1313 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1314 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1315 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1316
1317 "psrlw $3, %%mm3 \n\t"
1318 "psllw $3, %%mm1 \n\t"
1319 "psllw $8, %%mm0 \n\t"
9b464428
FB
1320 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1321 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9
MN
1322
1323 "por %%mm3, %%mm1 \n\t"
1324 "por %%mm1, %%mm0 \n\t"
1325
6e1c66bc 1326 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1327
6e1c66bc
AJ
1328 "add $4, %%"REG_a" \n\t"
1329 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1330 " jb 1b \n\t"
1331
d1fac6cf 1332 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9 1333 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1334 : "%"REG_a
d604bab9 1335 );
cf7d1c1a
MN
1336 break;
1337#endif
e9e12f0e 1338 case PIX_FMT_BGR32:
cf7d1c1a 1339#ifndef HAVE_MMX
e9e12f0e 1340 case PIX_FMT_RGB32:
cf7d1c1a 1341#endif
e9e12f0e 1342 if(dstFormat==PIX_FMT_RGB32)
28bf81c9 1343 {
2ba1bff0 1344 int i;
df3c183a
MN
1345#ifdef WORDS_BIGENDIAN
1346 dest++;
1347#endif
28bf81c9
MN
1348 for(i=0;i<dstW;i++){
1349 // vertical linear interpolation && yuv2rgb in a single step:
1350 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1351 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1352 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1353 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1354 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1355 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1356 dest+= 4;
1357 }
1358 }
e9e12f0e 1359 else if(dstFormat==PIX_FMT_BGR24)
d604bab9 1360 {
96034638 1361 int i;
d1fac6cf 1362 for(i=0;i<dstW;i++){
d604bab9
MN
1363 // vertical linear interpolation && yuv2rgb in a single step:
1364 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1365 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1366 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
390b20a6
MN
1367 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1368 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1369 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
28bf81c9 1370 dest+= 3;
d604bab9
MN
1371 }
1372 }
e9e12f0e 1373 else if(dstFormat==PIX_FMT_BGR565)
d604bab9 1374 {
96034638 1375 int i;
d1fac6cf 1376 for(i=0;i<dstW;i++){
d604bab9
MN
1377 // vertical linear interpolation && yuv2rgb in a single step:
1378 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1379 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1380 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1381
d022ce5c 1382 ((uint16_t*)dest)[i] =
b18ea156
MN
1383 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1384 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1385 clip_table16r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1386 }
1387 }
e9e12f0e 1388 else if(dstFormat==PIX_FMT_BGR555)
d604bab9 1389 {
96034638 1390 int i;
d1fac6cf 1391 for(i=0;i<dstW;i++){
d604bab9
MN
1392 // vertical linear interpolation && yuv2rgb in a single step:
1393 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1394 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1395 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1396
d022ce5c 1397 ((uint16_t*)dest)[i] =
b18ea156
MN
1398 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1399 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1400 clip_table15r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1401 }
1402 }
d604bab9
MN
1403 }//FULL_UV_IPOL
1404 else
1405 {
cf7d1c1a 1406#endif // if 0
d604bab9 1407#ifdef HAVE_MMX
cf7d1c1a
MN
1408 switch(c->dstFormat)
1409 {
77a416e8 1410//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
e9e12f0e 1411 case PIX_FMT_RGB32:
d604bab9 1412 asm volatile(
46fe31a0
MN
1413 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1414 "mov %4, %%"REG_b" \n\t"
1415 "push %%"REG_BP" \n\t"
1416 YSCALEYUV2RGB(%%REGBP, %5)
1417 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1418 "pop %%"REG_BP" \n\t"
1419 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1420
1421 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1422 "a" (&c->redDither)
d604bab9 1423 );
cf7d1c1a 1424 return;
e9e12f0e 1425 case PIX_FMT_BGR24:
d604bab9 1426 asm volatile(
46fe31a0
MN
1427 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1428 "mov %4, %%"REG_b" \n\t"
1429 "push %%"REG_BP" \n\t"
1430 YSCALEYUV2RGB(%%REGBP, %5)
1431 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1432 "pop %%"REG_BP" \n\t"
1433 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1434 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435 "a" (&c->redDither)
d604bab9 1436 );
cf7d1c1a 1437 return;
e9e12f0e 1438 case PIX_FMT_BGR555:
d604bab9 1439 asm volatile(
46fe31a0
MN
1440 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1441 "mov %4, %%"REG_b" \n\t"
1442 "push %%"REG_BP" \n\t"
1443 YSCALEYUV2RGB(%%REGBP, %5)
d604bab9
MN
1444 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1445#ifdef DITHER1XBPP
9b464428
FB
1446 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1447 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1448 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1449#endif
1450
46fe31a0
MN
1451 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1452 "pop %%"REG_BP" \n\t"
1453 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
d604bab9 1454
46fe31a0
MN
1455 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1456 "a" (&c->redDither)
d604bab9 1457 );
cf7d1c1a 1458 return;
e9e12f0e 1459 case PIX_FMT_BGR565:
d604bab9 1460 asm volatile(
46fe31a0
MN
1461 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1462 "mov %4, %%"REG_b" \n\t"
1463 "push %%"REG_BP" \n\t"
1464 YSCALEYUV2RGB(%%REGBP, %5)
d604bab9
MN
1465 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1466#ifdef DITHER1XBPP
9b464428
FB
1467 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1468 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1469 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1470#endif
1471
46fe31a0
MN
1472 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1473 "pop %%"REG_BP" \n\t"
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1475 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1476 "a" (&c->redDither)
d604bab9 1477 );
cf7d1c1a 1478 return;
e9e12f0e 1479 case PIX_FMT_YUYV422:
25593e29 1480 asm volatile(
46fe31a0
MN
1481 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1482 "mov %4, %%"REG_b" \n\t"
1483 "push %%"REG_BP" \n\t"
1484 YSCALEYUV2PACKED(%%REGBP, %5)
1485 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1486 "pop %%"REG_BP" \n\t"
1487 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1488 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1489 "a" (&c->redDither)
25593e29
MN
1490 );
1491 return;
cf7d1c1a
MN
1492 default: break;
1493 }
1494#endif //HAVE_MMX
25593e29 1495YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
d604bab9
MN
1496}
1497
1498/**
1499 * YV12 to RGB without scaling or interpolating
1500 */
25593e29 1501static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
cf7d1c1a 1502 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1503{
c1b0bfb4 1504 const int yalpha1=0;
cf7d1c1a 1505 int i;
6a4970ab 1506
cf7d1c1a
MN
1507 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1508 const int yalpha= 4096; //FIXME ...
96034638 1509
1e621b18 1510 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1511 {
25593e29 1512 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
d604bab9
MN
1513 return;
1514 }
397c035e
MN
1515
1516#ifdef HAVE_MMX
497d4f99
MN
1517 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1518 {
cf7d1c1a 1519 switch(dstFormat)
d604bab9 1520 {
e9e12f0e 1521 case PIX_FMT_RGB32:
d604bab9 1522 asm volatile(
46fe31a0
MN
1523 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1524 "mov %4, %%"REG_b" \n\t"
1525 "push %%"REG_BP" \n\t"
1526 YSCALEYUV2RGB1(%%REGBP, %5)
1527 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1528 "pop %%"REG_BP" \n\t"
1529 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1530
1531 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1532 "a" (&c->redDither)
d604bab9 1533 );
cf7d1c1a 1534 return;
e9e12f0e 1535 case PIX_FMT_BGR24:
d604bab9 1536 asm volatile(
46fe31a0
MN
1537 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1538 "mov %4, %%"REG_b" \n\t"
1539 "push %%"REG_BP" \n\t"
1540 YSCALEYUV2RGB1(%%REGBP, %5)
1541 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1542 "pop %%"REG_BP" \n\t"
1543 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1544
1545 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1546 "a" (&c->redDither)
d604bab9 1547 );
cf7d1c1a 1548 return;
e9e12f0e 1549 case PIX_FMT_BGR555:
d604bab9 1550 asm volatile(
46fe31a0
MN
1551 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1552 "mov %4, %%"REG_b" \n\t"
1553 "push %%"REG_BP" \n\t"
1554 YSCALEYUV2RGB1(%%REGBP, %5)
d604bab9
MN
1555 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556#ifdef DITHER1XBPP
9b464428
FB
1557 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1558 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1559 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9 1560#endif
46fe31a0
MN
1561 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1562 "pop %%"REG_BP" \n\t"
1563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
e54d94ba 1564
46fe31a0
MN
1565 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1566 "a" (&c->redDither)
d604bab9 1567 );
cf7d1c1a 1568 return;
e9e12f0e 1569 case PIX_FMT_BGR565:
d604bab9 1570 asm volatile(
46fe31a0
MN
1571 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1572 "mov %4, %%"REG_b" \n\t"
1573 "push %%"REG_BP" \n\t"
1574 YSCALEYUV2RGB1(%%REGBP, %5)
d604bab9
MN
1575 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576#ifdef DITHER1XBPP
9b464428
FB
1577 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1578 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1579 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1580#endif
1581
46fe31a0
MN
1582 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
e54d94ba 1585
46fe31a0
MN
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587 "a" (&c->redDither)
d604bab9 1588 );
cf7d1c1a 1589 return;
e9e12f0e 1590 case PIX_FMT_YUYV422:
25593e29 1591 asm volatile(
46fe31a0
MN
1592 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1593 "mov %4, %%"REG_b" \n\t"
1594 "push %%"REG_BP" \n\t"
1595 YSCALEYUV2PACKED1(%%REGBP, %5)
1596 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1597 "pop %%"REG_BP" \n\t"
1598 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1599
1600 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1601 "a" (&c->redDither)
25593e29
MN
1602 );
1603 return;
d604bab9 1604 }
497d4f99
MN
1605 }
1606 else
1607 {
cf7d1c1a 1608 switch(dstFormat)
d604bab9 1609 {
e9e12f0e 1610 case PIX_FMT_RGB32:
497d4f99 1611 asm volatile(
46fe31a0
MN
1612 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1613 "mov %4, %%"REG_b" \n\t"
1614 "push %%"REG_BP" \n\t"
1615 YSCALEYUV2RGB1b(%%REGBP, %5)
1616 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1617 "pop %%"REG_BP" \n\t"
1618 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619
1620 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1621 "a" (&c->redDither)
497d4f99 1622 );
cf7d1c1a 1623 return;
e9e12f0e 1624 case PIX_FMT_BGR24:
497d4f99 1625 asm volatile(
46fe31a0
MN
1626 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1627 "mov %4, %%"REG_b" \n\t"
1628 "push %%"REG_BP" \n\t"
1629 YSCALEYUV2RGB1b(%%REGBP, %5)
1630 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1631 "pop %%"REG_BP" \n\t"
1632 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1633
1634 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1635 "a" (&c->redDither)
497d4f99 1636 );
cf7d1c1a 1637 return;
e9e12f0e 1638 case PIX_FMT_BGR555:
497d4f99 1639 asm volatile(
46fe31a0
MN
1640 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1641 "mov %4, %%"REG_b" \n\t"
1642 "push %%"REG_BP" \n\t"
1643 YSCALEYUV2RGB1b(%%REGBP, %5)
497d4f99
MN
1644 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645#ifdef DITHER1XBPP
9b464428
FB
1646 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1647 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1648 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1649#endif
46fe31a0
MN
1650 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1651 "pop %%"REG_BP" \n\t"
1652 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
e54d94ba 1653
46fe31a0
MN
1654 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1655 "a" (&c->redDither)
497d4f99 1656 );
cf7d1c1a 1657 return;
e9e12f0e 1658 case PIX_FMT_BGR565:
497d4f99 1659 asm volatile(
46fe31a0
MN
1660 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1661 "mov %4, %%"REG_b" \n\t"
1662 "push %%"REG_BP" \n\t"
1663 YSCALEYUV2RGB1b(%%REGBP, %5)
497d4f99
MN
1664 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665#ifdef DITHER1XBPP
9b464428
FB
1666 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1667 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1668 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1669#endif
d604bab9 1670
46fe31a0
MN
1671 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1672 "pop %%"REG_BP" \n\t"
1673 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
e54d94ba 1674
46fe31a0
MN
1675 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1676 "a" (&c->redDither)
497d4f99 1677 );
cf7d1c1a 1678 return;
e9e12f0e 1679 case PIX_FMT_YUYV422:
25593e29 1680 asm volatile(
46fe31a0
MN
1681 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1682 "mov %4, %%"REG_b" \n\t"
1683 "push %%"REG_BP" \n\t"
1684 YSCALEYUV2PACKED1b(%%REGBP, %5)
1685 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1686 "pop %%"REG_BP" \n\t"
1687 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1688
1689 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1690 "a" (&c->redDither)
25593e29
MN
1691 );
1692 return;
d604bab9 1693 }
497d4f99 1694 }
df3c183a 1695#endif
cf7d1c1a 1696 if( uvalpha < 2048 )
497d4f99 1697 {
25593e29 1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
cf7d1c1a 1699 }else{
25593e29 1700 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
497d4f99 1701 }
d604bab9
MN
1702}
1703
6ff0ad6b
MN
1704//FIXME yuy2* can read up to 7 samples too many
1705
7f526efd 1706static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1e621b18 1707{
6ff0ad6b
MN
1708#ifdef HAVE_MMX
1709 asm volatile(
1710 "movq "MANGLE(bm01010101)", %%mm2\n\t"
6e1c66bc 1711 "mov %0, %%"REG_a" \n\t"
6ff0ad6b 1712 "1: \n\t"
6e1c66bc
AJ
1713 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1714 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
6ff0ad6b
MN
1715 "pand %%mm2, %%mm0 \n\t"
1716 "pand %%mm2, %%mm1 \n\t"
1717 "packuswb %%mm1, %%mm0 \n\t"
6e1c66bc
AJ
1718 "movq %%mm0, (%2, %%"REG_a") \n\t"
1719 "add $8, %%"REG_a" \n\t"
6ff0ad6b 1720 " js 1b \n\t"
7f526efd 1721 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
6e1c66bc 1722 : "%"REG_a
6ff0ad6b 1723 );
1e621b18
MN
1724#else
1725 int i;
1726 for(i=0; i<width; i++)
1727 dst[i]= src[2*i];
1728#endif
1729}
1730
7f526efd 1731static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 1732{
c2271987 1733#ifdef HAVE_MMX
6ff0ad6b
MN
1734 asm volatile(
1735 "movq "MANGLE(bm01010101)", %%mm4\n\t"
6e1c66bc 1736 "mov %0, %%"REG_a" \n\t"
6ff0ad6b 1737 "1: \n\t"
6e1c66bc
AJ
1738 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1739 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
6ff0ad6b
MN
1740 "psrlw $8, %%mm0 \n\t"
1741 "psrlw $8, %%mm1 \n\t"
1742 "packuswb %%mm1, %%mm0 \n\t"
1743 "movq %%mm0, %%mm1 \n\t"
1744 "psrlw $8, %%mm0 \n\t"
1745 "pand %%mm4, %%mm1 \n\t"
1746 "packuswb %%mm0, %%mm0 \n\t"
1747 "packuswb %%mm1, %%mm1 \n\t"
c2271987
MN
1748 "movd %%mm0, (%3, %%"REG_a") \n\t"
1749 "movd %%mm1, (%2, %%"REG_a") \n\t"
6e1c66bc 1750 "add $4, %%"REG_a" \n\t"
6ff0ad6b 1751 " js 1b \n\t"
c2271987 1752 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
6e1c66bc 1753 : "%"REG_a
6ff0ad6b 1754 );
1e621b18
MN
1755#else
1756 int i;
1757 for(i=0; i<width; i++)
1758 {
c2271987
MN
1759 dstU[i]= src1[4*i + 1];
1760 dstV[i]= src1[4*i + 3];
1e621b18
MN
1761 }
1762#endif
0683a5c5 1763 assert(src1 == src2);
1e621b18
MN
1764}
1765
7322a67c 1766//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
7f526efd 1767static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
7322a67c
MN
1768{
1769#ifdef HAVE_MMX
1770 asm volatile(
6e1c66bc 1771 "mov %0, %%"REG_a" \n\t"
7322a67c 1772 "1: \n\t"
6e1c66bc
AJ
1773 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1774 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
7322a67c
MN
1775 "psrlw $8, %%mm0 \n\t"
1776 "psrlw $8, %%mm1 \n\t"
1777 "packuswb %%mm1, %%mm0 \n\t"
6e1c66bc
AJ
1778 "movq %%mm0, (%2, %%"REG_a") \n\t"
1779 "add $8, %%"REG_a" \n\t"
7322a67c 1780 " js 1b \n\t"
7f526efd 1781 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
6e1c66bc 1782 : "%"REG_a
7322a67c
MN
1783 );
1784#else
1785 int i;
1786 for(i=0; i<width; i++)
1787 dst[i]= src[2*i+1];
1788#endif
1789}
1790
7f526efd 1791static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
7322a67c 1792{
c2271987 1793#ifdef HAVE_MMX
7322a67c
MN
1794 asm volatile(
1795 "movq "MANGLE(bm01010101)", %%mm4\n\t"
6e1c66bc 1796 "mov %0, %%"REG_a" \n\t"
7322a67c 1797 "1: \n\t"
6e1c66bc
AJ
1798 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1799 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
7322a67c
MN
1800 "pand %%mm4, %%mm0 \n\t"
1801 "pand %%mm4, %%mm1 \n\t"
1802 "packuswb %%mm1, %%mm0 \n\t"
1803 "movq %%mm0, %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "pand %%mm4, %%mm1 \n\t"
1806 "packuswb %%mm0, %%mm0 \n\t"
1807 "packuswb %%mm1, %%mm1 \n\t"
c2271987
MN
1808 "movd %%mm0, (%3, %%"REG_a") \n\t"
1809 "movd %%mm1, (%2, %%"REG_a") \n\t"
6e1c66bc 1810 "add $4, %%"REG_a" \n\t"
7322a67c 1811 " js 1b \n\t"
c2271987 1812 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
6e1c66bc 1813 : "%"REG_a
7322a67c
MN
1814 );
1815#else
1816 int i;
1817 for(i=0; i<width; i++)
1818 {
c2271987
MN
1819 dstU[i]= src1[4*i + 0];
1820 dstV[i]= src1[4*i + 2];
7322a67c
MN
1821 }
1822#endif
0683a5c5 1823 assert(src1 == src2);
7322a67c
MN
1824}
1825
1e621b18
MN
1826static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827{
1e621b18
MN
1828 int i;
1829 for(i=0; i<width; i++)
1830 {
4e61e21c
MN
1831 int b= ((uint32_t*)src)[i]&0xFF;
1832 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 1833 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1e621b18 1834
4e61e21c 1835 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1e621b18 1836 }
1e621b18
MN
1837}
1838
1839static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840{
1e621b18 1841 int i;
c2271987 1842 assert(src1 == src2);
1e621b18
MN
1843 for(i=0; i<width; i++)
1844 {
4e61e21c
MN
1845 const int a= ((uint32_t*)src1)[2*i+0];
1846 const int e= ((uint32_t*)src1)[2*i+1];
c2271987
MN
1847 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848 const int h= (a&0x00FF00) + (e&0x00FF00);
4e61e21c
MN
1849 const int b= l&0x3FF;
1850 const int g= h>>8;
1851 const int r= l>>16;
1e621b18 1852
c2271987
MN
1853 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1e621b18 1855 }
1e621b18
MN
1856}
1857
7f526efd 1858static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1e621b18 1859{
ac6a2e45
MN
1860#ifdef HAVE_MMX
1861 asm volatile(
6e1c66bc 1862 "mov %2, %%"REG_a" \n\t"
854288bb
FB
1863 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1864 "movq "MANGLE(w1111)", %%mm5 \n\t"
ac6a2e45 1865 "pxor %%mm7, %%mm7 \n\t"
83c89c78 1866 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
4bff9ef9 1867 ASMALIGN(4)
ac6a2e45 1868 "1: \n\t"
83c89c78
JT
1869 PREFETCH" 64(%0, %%"REG_d") \n\t"
1870 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1871 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
ac6a2e45
MN
1872 "punpcklbw %%mm7, %%mm0 \n\t"
1873 "punpcklbw %%mm7, %%mm1 \n\t"
83c89c78
JT
1874 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1875 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
ac6a2e45
MN
1876 "punpcklbw %%mm7, %%mm2 \n\t"
1877 "punpcklbw %%mm7, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm1 \n\t"
1880 "pmaddwd %%mm6, %%mm2 \n\t"
1881 "pmaddwd %%mm6, %%mm3 \n\t"
1882#ifndef FAST_BGR2YV12
1883 "psrad $8, %%mm0 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1887#endif
1888 "packssdw %%mm1, %%mm0 \n\t"
1889 "packssdw %%mm3, %%mm2 \n\t"
1890 "pmaddwd %%mm5, %%mm0 \n\t"
1891 "pmaddwd %%mm5, %%mm2 \n\t"
1892 "packssdw %%mm2, %%mm0 \n\t"
1893 "psraw $7, %%mm0 \n\t"
1894
83c89c78
JT
1895 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1896 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
ac6a2e45
MN
1897 "punpcklbw %%mm7, %%mm4 \n\t"
1898 "punpcklbw %%mm7, %%mm1 \n\t"
83c89c78
JT
1899 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1900 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
ac6a2e45
MN
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm6, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm6, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907#ifndef FAST_BGR2YV12
1908 "psrad $8, %%mm4 \n\t"
1909 "psrad $8, %%mm1 \n\t"
1910 "psrad $8, %%mm2 \n\t"
1911 "psrad $8, %%mm3 \n\t"
1912#endif
1913 "packssdw %%mm1, %%mm4 \n\t"
1914 "packssdw %%mm3, %%mm2 \n\t"
1915 "pmaddwd %%mm5, %%mm4 \n\t"
1916 "pmaddwd %%mm5, %%mm2 \n\t"
83c89c78 1917 "add $24, %%"REG_d" \n\t"
ac6a2e45
MN
1918 "packssdw %%mm2, %%mm4 \n\t"
1919 "psraw $7, %%mm4 \n\t"
1920
1921 "packuswb %%mm4, %%mm0 \n\t"
854288bb 1922 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
ac6a2e45 1923
6e1c66bc
AJ
1924 "movq %%mm0, (%1, %%"REG_a") \n\t"
1925 "add $8, %%"REG_a" \n\t"
ac6a2e45 1926 " js 1b \n\t"
7f526efd 1927 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
83c89c78 1928 : "%"REG_a, "%"REG_d
ac6a2e45 1929 );
1e621b18
MN
1930#else
1931 int i;
1932 for(i=0; i<width; i++)
1933 {
1934 int b= src[i*3+0];
1935 int g= src[i*3+1];
1936 int r= src[i*3+2];
1937
9902f4e2 1938 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1e621b18
MN
1939 }
1940#endif
1941}
1942
7f526efd 1943static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 1944{
4342fc14
MN
1945#ifdef HAVE_MMX
1946 asm volatile(
c2271987 1947 "mov %3, %%"REG_a" \n\t"
854288bb
FB
1948 "movq "MANGLE(w1111)", %%mm5 \n\t"
1949 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
4342fc14 1950 "pxor %%mm7, %%mm7 \n\t"
83c89c78
JT
1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1952 "add %%"REG_d", %%"REG_d" \n\t"
4bff9ef9 1953 ASMALIGN(4)
4342fc14 1954 "1: \n\t"
83c89c78 1955 PREFETCH" 64(%0, %%"REG_d") \n\t"
4342fc14 1956#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
83c89c78 1957 "movq (%0, %%"REG_d"), %%mm0 \n\t"
83c89c78 1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14
MN
1959 "movq %%mm0, %%mm1 \n\t"
1960 "movq %%mm2, %%mm3 \n\t"
1961 "psrlq $24, %%mm0 \n\t"
1962 "psrlq $24, %%mm2 \n\t"
1963 PAVGB(%%mm1, %%mm0)
1964 PAVGB(%%mm3, %%mm2)
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1967#else
83c89c78 1968 "movd (%0, %%"REG_d"), %%mm0 \n\t"
83c89c78 1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14 1970 "punpcklbw %%mm7, %%mm0 \n\t"
4342fc14 1971 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 1972 "paddw %%mm2, %%mm0 \n\t"
83c89c78 1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
83c89c78 1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14 1975 "punpcklbw %%mm7, %%mm4 \n\t"
4342fc14 1976 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 1977 "paddw %%mm4, %%mm2 \n\t"
c2271987
MN
1978 "psrlw $1, %%mm0 \n\t"
1979 "psrlw $1, %%mm2 \n\t"
4342fc14 1980#endif
854288bb
FB
1981 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1982 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
6a4970ab 1983
4342fc14
MN
1984 "pmaddwd %%mm0, %%mm1 \n\t"
1985 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "pmaddwd %%mm6, %%mm0 \n\t"
1987 "pmaddwd %%mm6, %%mm2 \n\t"
1988#ifndef FAST_BGR2YV12
1989 "psrad $8, %%mm0 \n\t"
1990 "psrad $8, %%mm1 \n\t"
1991 "psrad $8, %%mm2 \n\t"
1992 "psrad $8, %%mm3 \n\t"
1993#endif
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "packssdw %%mm3, %%mm1 \n\t"
1996 "pmaddwd %%mm5, %%mm0 \n\t"
1997 "pmaddwd %%mm5, %%mm1 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1999 "psraw $7, %%mm0 \n\t"
2000
2001#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
83c89c78 2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
83c89c78 2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14
MN
2004 "movq %%mm4, %%mm1 \n\t"
2005 "movq %%mm2, %%mm3 \n\t"
2006 "psrlq $24, %%mm4 \n\t"
2007 "psrlq $24, %%mm2 \n\t"
2008 PAVGB(%%mm1, %%mm4)
2009 PAVGB(%%mm3, %%mm2)
2010 "punpcklbw %%mm7, %%mm4 \n\t"
2011 "punpcklbw %%mm7, %%mm2 \n\t"
2012#else
83c89c78 2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
83c89c78 2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14 2015 "punpcklbw %%mm7, %%mm4 \n\t"
4342fc14 2016 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 2017 "paddw %%mm2, %%mm4 \n\t"
83c89c78 2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
83c89c78 2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
4342fc14 2020 "punpcklbw %%mm7, %%mm5 \n\t"
4342fc14 2021 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 2022 "paddw %%mm5, %%mm2 \n\t"
854288bb 2023 "movq "MANGLE(w1111)", %%mm5 \n\t"
4342fc14
MN
2024 "psrlw $2, %%mm4 \n\t"
2025 "psrlw $2, %%mm2 \n\t"
2026#endif
854288bb
FB
2027 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2028 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
6a4970ab 2029
4342fc14
MN
2030 "pmaddwd %%mm4, %%mm1 \n\t"
2031 "pmaddwd %%mm2, %%mm3 \n\t"
2032 "pmaddwd %%mm6, %%mm4 \n\t"
2033 "pmaddwd %%mm6, %%mm2 \n\t"
2034#ifndef FAST_BGR2YV12
2035 "psrad $8, %%mm4 \n\t"
2036 "psrad $8, %%mm1 \n\t"
2037 "psrad $8, %%mm2 \n\t"
2038 "psrad $8, %%mm3 \n\t"
2039#endif
2040 "packssdw %%mm2, %%mm4 \n\t"
2041 "packssdw %%mm3, %%mm1 \n\t"
2042 "pmaddwd %%mm5, %%mm4 \n\t"
2043 "pmaddwd %%mm5, %%mm1 \n\t"
83c89c78 2044 "add $24, %%"REG_d" \n\t"
4342fc14
MN
2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2046 "psraw $7, %%mm4 \n\t"
6a4970ab 2047
4342fc14
MN
2048 "movq %%mm0, %%mm1 \n\t"
2049 "punpckldq %%mm4, %%mm0 \n\t"
2050 "punpckhdq %%mm4, %%mm1 \n\t"
2051 "packsswb %%mm1, %%mm0 \n\t"
854288bb 2052 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
4342fc14 2053
c2271987 2054 "movd %%mm0, (%1, %%"REG_a") \n\t"
4342fc14 2055 "punpckhdq %%mm0, %%mm0 \n\t"
c2271987 2056 "movd %%mm0, (%2, %%"REG_a") \n\t"
6e1c66bc 2057 "add $4, %%"REG_a" \n\t"
4342fc14 2058 " js 1b \n\t"
c2271987 2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
83c89c78 2060 : "%"REG_a, "%"REG_d
4342fc14 2061 );
1e621b18
MN
2062#else
2063 int i;
2064 for(i=0; i<width; i++)
2065 {
c2271987
MN
2066 int b= src1[6*i + 0] + src1[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4];
2068 int r= src1[6*i + 2] + src1[6*i + 5];
1e621b18 2069
c2271987
MN
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1e621b18
MN
2072 }
2073#endif
0683a5c5 2074 assert(src1 == src2);
1e621b18
MN
2075}
2076
6af250ea
MN
2077static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2078{
2079 int i;
2080 for(i=0; i<width; i++)
2081 {
4e61e21c 2082 int d= ((uint16_t*)src)[i];
6af250ea
MN
2083 int b= d&0x1F;
2084 int g= (d>>5)&0x3F;
2085 int r= (d>>11)&0x1F;
2086
2087 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2088 }
2089}
2090
2091static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2092{
2093 int i;
6264a515 2094 assert(src1==src2);
6af250ea
MN
2095 for(i=0; i<width; i++)
2096 {
4e61e21c 2097 int d0= ((uint32_t*)src1)[i];
6a4970ab 2098
c2271987
MN
2099 int dl= (d0&0x07E0F81F);
2100 int dh= ((d0>>5)&0x07C0F83F);
5bb9d9d8
MN
2101
2102 int dh2= (dh>>11) + (dh<<21);
2103 int d= dh2 + dl;
2104
2105 int b= d&0x7F;
2106 int r= (d>>11)&0x7F;
2107 int g= d>>21;
c2271987
MN
2108 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
6af250ea
MN
2110 }
2111}
2112
b72034dd
MN
2113static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2114{
2115 int i;
2116 for(i=0; i<width; i++)
2117 {
4e61e21c 2118 int d= ((uint16_t*)src)[i];
b72034dd
MN
2119 int b= d&0x1F;
2120 int g= (d>>5)&0x1F;
2121 int r= (d>>10)&0x1F;
2122
2123 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2124 }
2125}
2126
2127static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2128{
2129 int i;
c2271987 2130 assert(src1==src2);
b72034dd
MN
2131 for(i=0; i<width; i++)
2132 {
4e61e21c 2133 int d0= ((uint32_t*)src1)[i];
6a4970ab 2134
c2271987
MN
2135 int dl= (d0&0x03E07C1F);
2136 int dh= ((d0>>5)&0x03E0F81F);
b72034dd
MN
2137
2138 int dh2= (dh>>11) + (dh<<21);
2139 int d= dh2 + dl;
2140
2141 int b= d&0x7F;
2142 int r= (d>>10)&0x7F;
2143 int g= d>>21;
c2271987
MN
2144 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
b72034dd
MN
2146 }
2147}
2148
2149
a861d4d7
MN
2150static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151{
2152 int i;
2153 for(i=0; i<width; i++)
2154 {
4e61e21c
MN
2155 int r= ((uint32_t*)src)[i]&0xFF;
2156 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 2157 int b= (((uint32_t*)src)[i]>>16)&0xFF;
a861d4d7 2158
4e61e21c 2159 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
2160 }
2161}
2162
2163static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164{
2165 int i;
c2271987 2166 assert(src1==src2);
a861d4d7
MN
2167 for(i=0; i<width; i++)
2168 {
4e61e21c
MN
2169 const int a= ((uint32_t*)src1)[2*i+0];
2170 const int e= ((uint32_t*)src1)[2*i+1];
c2271987
MN
2171 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2172 const int h= (a&0x00FF00) + (e&0x00FF00);
4e61e21c
MN
2173 const int r= l&0x3FF;
2174 const int g= h>>8;
2175 const int b= l>>16;
a861d4d7 2176
c2271987
MN
2177 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
a861d4d7
MN
2179 }
2180}
2181
2182static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183{
2184 int i;
2185 for(i=0; i<width; i++)
2186 {
2187 int r= src[i*3+0];
2188 int g= src[i*3+1];
2189 int b= src[i*3+2];
2190
4e61e21c 2191 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
2192 }
2193}
2194
2195static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196{
2197 int i;
c2271987 2198 assert(src1==src2);
a861d4d7
MN
2199 for(i=0; i<width; i++)
2200 {
c2271987
MN
2201 int r= src1[6*i + 0] + src1[6*i + 3];
2202 int g= src1[6*i + 1] + src1[6*i + 4];
2203 int b= src1[6*i + 2] + src1[6*i + 5];
a861d4d7 2204
c2271987
MN
2205 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
a861d4d7
MN
2207 }
2208}
2209
a43fb6b3
LA
2210static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2211{
2212 int i;
2213 for(i=0; i<width; i++)
2214 {
2215 int d= ((uint16_t*)src)[i];
2216 int r= d&0x1F;
2217 int g= (d>>5)&0x3F;
2218 int b= (d>>11)&0x1F;
2219
2220 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2221 }
2222}
2223
2224static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2225{
2226 int i;
6264a515 2227 assert(src1 == src2);
a43fb6b3
LA
2228 for(i=0; i<width; i++)
2229 {
2230 int d0= ((uint32_t*)src1)[i];
6a4970ab 2231
6264a515
MN
2232 int dl= (d0&0x07E0F81F);
2233 int dh= ((d0>>5)&0x07C0F83F);
a43fb6b3
LA
2234
2235 int dh2= (dh>>11) + (dh<<21);
2236 int d= dh2 + dl;
2237
2238 int r= d&0x7F;
2239 int b= (d>>11)&0x7F;
2240 int g= d>>21;
6264a515
MN
2241 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2242 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
a43fb6b3
LA
2243 }
2244}
2245
2246static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2247{
2248 int i;
2249 for(i=0; i<width; i++)
2250 {
2251 int d= ((uint16_t*)src)[i];
2252 int r= d&0x1F;
2253 int g= (d>>5)&0x1F;
2254 int b= (d>>10)&0x1F;
2255
2256 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2257 }
2258}
2259
2260static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2261{
2262 int i;
6264a515 2263 assert(src1 == src2);
a43fb6b3
LA
2264 for(i=0; i<width; i++)
2265 {
2266 int d0= ((uint32_t*)src1)[i];
6a4970ab 2267
6264a515
MN
2268 int dl= (d0&0x03E07C1F);
2269 int dh= ((d0>>5)&0x03E0F81F);
a43fb6b3
LA
2270
2271 int dh2= (dh>>11) + (dh<<21);
2272 int d= dh2 + dl;
2273
2274 int g= d&0x7F;
2275 int r= (d>>10)&0x7F;
2276 int b= d>>21;
6264a515
MN
2277 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2278 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
a43fb6b3
LA
2279 }
2280}
1e621b18 2281
e28630fc
MN
2282static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2283{
2284 int i;
2285 for(i=0; i<width; i++)
2286 {
2287 int d= src[i];
e28630fc 2288
21c08a3f 2289 dst[i]= pal[d] & 0xFF;
e28630fc
MN
2290 }
2291}
2292
2293static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2294{
2295 int i;
2296 assert(src1 == src2);
2297 for(i=0; i<width; i++)
2298 {
fa65e2f6 2299 int p= pal[src1[i]];
e28630fc 2300
fa65e2f6
MN
2301 dstU[i]= p>>8;
2302 dstV[i]= p>>16;
e28630fc
MN
2303 }
2304}
2305
077ea8a7
MN
2306// Bilinear / Bicubic scaling
2307static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
7f526efd 2308 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 2309{
077ea8a7 2310#ifdef HAVE_MMX
c9b99ea6 2311 assert(filterSize % 4 == 0 && filterSize>0);
911406f2 2312 if(filterSize==4) // Always true for upscaling, sometimes for down, too.
077ea8a7 2313 {
6e1c66bc 2314 long counter= -2*dstW;
077ea8a7
MN
2315 filter-= counter*2;
2316 filterPos-= counter/2;
2317 dst-= counter/2;
2318 asm volatile(
83c89c78
JT
2319#if defined(PIC)
2320 "push %%"REG_b" \n\t"
2321#endif
077ea8a7 2322 "pxor %%mm7, %%mm7 \n\t"
9b464428 2323 "movq "MANGLE(w02)", %%mm6 \n\t"
6e1c66bc
AJ
2324 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2325 "mov %%"REG_a", %%"REG_BP" \n\t"
4bff9ef9 2326 ASMALIGN(4)
077ea8a7 2327 "1: \n\t"
a7b42d28
AJ
2328 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2329 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
6e1c66bc
AJ
2330 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2331 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2332 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2333 "movd (%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2334 "punpcklbw %%mm7, %%mm0 \n\t"
2335 "punpcklbw %%mm7, %%mm2 \n\t"
2336 "pmaddwd %%mm1, %%mm0 \n\t"
2337 "pmaddwd %%mm2, %%mm3 \n\t"
2338 "psrad $8, %%mm0 \n\t"
2339 "psrad $8, %%mm3 \n\t"
2340 "packssdw %%mm3, %%mm0 \n\t"
2341 "pmaddwd %%mm6, %%mm0 \n\t"
2342 "packssdw %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
2343 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2344 "add $4, %%"REG_BP" \n\t"
077ea8a7 2345 " jnc 1b \n\t"
e3d2500f 2346
6e1c66bc 2347 "pop %%"REG_BP" \n\t"
83c89c78
JT
2348#if defined(PIC)
2349 "pop %%"REG_b" \n\t"
2350#endif
077ea8a7
MN
2351 : "+a" (counter)
2352 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2353#if !defined(PIC)
6e1c66bc 2354 : "%"REG_b
83c89c78 2355#endif
077ea8a7
MN
2356 );
2357 }
2358 else if(filterSize==8)
2359 {
6e1c66bc 2360 long counter= -2*dstW;
077ea8a7
MN
2361 filter-= counter*4;
2362 filterPos-= counter/2;
2363 dst-= counter/2;
2364 asm volatile(
83c89c78
JT
2365#if defined(PIC)
2366 "push %%"REG_b" \n\t"
2367#endif
077ea8a7 2368 "pxor %%mm7, %%mm7 \n\t"
9b464428 2369 "movq "MANGLE(w02)", %%mm6 \n\t"
6e1c66bc
AJ
2370 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2371 "mov %%"REG_a", %%"REG_BP" \n\t"
4bff9ef9 2372 ASMALIGN(4)
077ea8a7 2373 "1: \n\t"
a7b42d28
AJ
2374 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2375 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
6e1c66bc
AJ
2376 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2377 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2378 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2379 "movd (%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2380 "punpcklbw %%mm7, %%mm0 \n\t"
2381 "punpcklbw %%mm7, %%mm2 \n\t"
2382 "pmaddwd %%mm1, %%mm0 \n\t"
2383 "pmaddwd %%mm2, %%mm3 \n\t"
2384
6e1c66bc
AJ
2385 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2386 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2387 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2388 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2389 "punpcklbw %%mm7, %%mm4 \n\t"
2390 "punpcklbw %%mm7, %%mm2 \n\t"
2391 "pmaddwd %%mm1, %%mm4 \n\t"
2392 "pmaddwd %%mm2, %%mm5 \n\t"
2393 "paddd %%mm4, %%mm0 \n\t"
2394 "paddd %%mm5, %%mm3 \n\t"
6a4970ab 2395
077ea8a7
MN
2396 "psrad $8, %%mm0 \n\t"
2397 "psrad $8, %%mm3 \n\t"
2398 "packssdw %%mm3, %%mm0 \n\t"
2399 "pmaddwd %%mm6, %%mm0 \n\t"
2400 "packssdw %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
2401 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2402 "add $4, %%"REG_BP" \n\t"
077ea8a7 2403 " jnc 1b \n\t"
c1b0bfb4 2404
6e1c66bc 2405 "pop %%"REG_BP" \n\t"
83c89c78
JT
2406#if defined(PIC)
2407 "pop %%"REG_b" \n\t"
2408#endif
077ea8a7
MN
2409 : "+a" (counter)
2410 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2411#if !defined(PIC)
6e1c66bc 2412 : "%"REG_b
83c89c78 2413#endif
077ea8a7
MN
2414 );
2415 }
2416 else
2417 {
20ffdcf9 2418 uint8_t *offset = src+filterSize;
6e1c66bc 2419 long counter= -2*dstW;
077ea8a7
MN
2420// filter-= counter*filterSize/2;
2421 filterPos-= counter/2;
2422 dst-= counter/2;
2423 asm volatile(
2424 "pxor %%mm7, %%mm7 \n\t"
9b464428 2425 "movq "MANGLE(w02)", %%mm6 \n\t"
4bff9ef9 2426 ASMALIGN(4)
077ea8a7 2427 "1: \n\t"
6e1c66bc 2428 "mov %2, %%"REG_c" \n\t"
a7b42d28 2429 "movzwl (%%"REG_c", %0), %%eax \n\t"
83c89c78 2430 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
6e1c66bc 2431 "mov %5, %%"REG_c" \n\t"
077ea8a7
MN
2432 "pxor %%mm4, %%mm4 \n\t"
2433 "pxor %%mm5, %%mm5 \n\t"
2434 "2: \n\t"
2435 "movq (%1), %%mm1 \n\t"
2436 "movq (%1, %6), %%mm3 \n\t"
6e1c66bc 2437 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
83c89c78 2438 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
077ea8a7
MN
2439 "punpcklbw %%mm7, %%mm0 \n\t"
2440 "punpcklbw %%mm7, %%mm2 \n\t"
2441 "pmaddwd %%mm1, %%mm0 \n\t"
2442 "pmaddwd %%mm2, %%mm3 \n\t"
2443 "paddd %%mm3, %%mm5 \n\t"
2444 "paddd %%mm0, %%mm4 \n\t"
6e1c66bc
AJ
2445 "add $8, %1 \n\t"
2446 "add $4, %%"REG_c" \n\t"
2447 "cmp %4, %%"REG_c" \n\t"
077ea8a7 2448 " jb 2b \n\t"
6e1c66bc 2449 "add %6, %1 \n\t"
077ea8a7
MN
2450 "psrad $8, %%mm4 \n\t"
2451 "psrad $8, %%mm5 \n\t"
2452 "packssdw %%mm5, %%mm4 \n\t"
2453 "pmaddwd %%mm6, %%mm4 \n\t"
2454 "packssdw %%mm4, %%mm4 \n\t"
6e1c66bc
AJ
2455 "mov %3, %%"REG_a" \n\t"
2456 "movd %%mm4, (%%"REG_a", %0) \n\t"
2457 "add $4, %0 \n\t"
077ea8a7 2458 " jnc 1b \n\t"
c1b0bfb4 2459
627690b5 2460 : "+r" (counter), "+r" (filter)
20ffdcf9 2461 : "m" (filterPos), "m" (dst), "m"(offset),
7f526efd 2462 "m" (src), "r" (filterSize*2)
83c89c78 2463 : "%"REG_a, "%"REG_c, "%"REG_d
077ea8a7
MN
2464 );
2465 }
2466#else
8c266f0c
RD
2467#ifdef HAVE_ALTIVEC
2468 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2469#else
077ea8a7
MN
2470 int i;
2471 for(i=0; i<dstW; i++)
2472 {
2473 int j;
2474 int srcPos= filterPos[i];
2475 int val=0;
c1b0bfb4 2476// printf("filterPos: %d\n", filterPos[i]);
077ea8a7
MN
2477 for(j=0; j<filterSize; j++)
2478 {
2479// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2480 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2481 }
2482// filter += hFilterSize;
adcec46a 2483 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
077ea8a7
MN
2484// dst[i] = val>>7;
2485 }
2486#endif
8c266f0c 2487#endif
077ea8a7 2488}
2ff198c1 2489 // *** horizontal scale Y line to temp buffer
065ee1ec 2490static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
28bf81c9 2491 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
6a4970ab 2492 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
b7dc6f66 2493 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e28630fc 2494 int32_t *mmx2FilterPos, uint8_t *pal)
077ea8a7 2495{
4884b9e5 2496 if(srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18
MN
2497 {
2498 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2499 src= formatConvBuffer;
2500 }
4884b9e5 2501 else if(srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c
MN
2502 {
2503 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2504 src= formatConvBuffer;
2505 }
e9e12f0e 2506 else if(srcFormat==PIX_FMT_RGB32)
1e621b18
MN
2507 {
2508 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2509 src= formatConvBuffer;
2510 }
e9e12f0e 2511 else if(srcFormat==PIX_FMT_BGR24)
1e621b18
MN
2512 {
2513 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2514 src= formatConvBuffer;
2515 }
e9e12f0e 2516 else if(srcFormat==PIX_FMT_BGR565)
6af250ea
MN
2517 {
2518 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2519 src= formatConvBuffer;
2520 }
e9e12f0e 2521 else if(srcFormat==PIX_FMT_BGR555)
b72034dd
MN
2522 {
2523 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2524 src= formatConvBuffer;
2525 }
e9e12f0e 2526 else if(srcFormat==PIX_FMT_BGR32)
a861d4d7
MN
2527 {
2528 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2529 src= formatConvBuffer;
2530 }
e9e12f0e 2531 else if(srcFormat==PIX_FMT_RGB24)
a861d4d7
MN
2532 {
2533 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2534 src= formatConvBuffer;
2535 }
a43fb6b3
LA
2536 else if(srcFormat==PIX_FMT_RGB565)
2537 {
2538 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2539 src= formatConvBuffer;
2540 }
2541 else if(srcFormat==PIX_FMT_RGB555)
2542 {
2543 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2544 src= formatConvBuffer;
2545 }
18064f5c 2546 else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc
MN
2547 {
2548 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2549 src= formatConvBuffer;
2550 }
1e621b18 2551
e3d2500f 2552#ifdef HAVE_MMX
77a416e8 2553 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
28bf81c9 2554 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2555#else
28bf81c9 2556 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2557#endif
077ea8a7
MN
2558 {
2559 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2560 }
2561 else // Fast Bilinear upscale / crap downscale
2562 {
3d6a30d9 2563#if defined(ARCH_X86)
2ff198c1 2564#ifdef HAVE_MMX2
96034638 2565 int i;
83c89c78
JT
2566#if defined(PIC)
2567 uint64_t ebxsave __attribute__((aligned(8)));
2568#endif
2ff198c1
MN
2569 if(canMMX2BeUsed)
2570 {
2571 asm volatile(
83c89c78
JT
2572#if defined(PIC)
2573 "mov %%"REG_b", %5 \n\t"
2574#endif
2ff198c1 2575 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
2576 "mov %0, %%"REG_c" \n\t"
2577 "mov %1, %%"REG_D" \n\t"
2578 "mov %2, %%"REG_d" \n\t"
2579 "mov %3, %%"REG_b" \n\t"
2580 "xor %%"REG_a", %%"REG_a" \n\t" // i
2581 PREFETCH" (%%"REG_c") \n\t"
2582 PREFETCH" 32(%%"REG_c") \n\t"
2583 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2584
6d606c4f
AJ
2585#ifdef ARCH_X86_64
2586
2587#define FUNNY_Y_CODE \
2588 "movl (%%"REG_b"), %%esi \n\t"\
2589 "call *%4 \n\t"\
2590 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2591 "add %%"REG_S", %%"REG_c" \n\t"\
2592 "add %%"REG_a", %%"REG_D" \n\t"\
2593 "xor %%"REG_a", %%"REG_a" \n\t"\
2594
2595#else
2596
2ff198c1 2597#define FUNNY_Y_CODE \
6d606c4f 2598 "movl (%%"REG_b"), %%esi \n\t"\
b7dc6f66 2599 "call *%4 \n\t"\
6d606c4f 2600 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
b6663a55 2601 "add %%"REG_a", %%"REG_D" \n\t"\
6e1c66bc 2602 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2603
6d606c4f
AJ
2604#endif
2605
2ff198c1
MN
2606FUNNY_Y_CODE
2607FUNNY_Y_CODE
2608FUNNY_Y_CODE
2609FUNNY_Y_CODE
2610FUNNY_Y_CODE
2611FUNNY_Y_CODE
2612FUNNY_Y_CODE
2613FUNNY_Y_CODE
2614
83c89c78
JT
2615#if defined(PIC)
2616 "mov %5, %%"REG_b" \n\t"
2617#endif
b7dc6f66
MN
2618 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2619 "m" (funnyYCode)
83c89c78
JT
2620#if defined(PIC)
2621 ,"m" (ebxsave)
2622#endif
2623 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2624#if !defined(PIC)
2625 ,"%"REG_b
2626#endif
2ff198c1 2627 );
af91b8b3 2628 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2ff198c1
MN
2629 }
2630 else
2631 {
2632#endif
065ee1ec
RD
2633 long xInc_shr16 = xInc >> 16;
2634 uint16_t xInc_mask = xInc & 0xffff;
2ff198c1
MN
2635 //NO MMX just normal asm ...
2636 asm volatile(
6e1c66bc 2637 "xor %%"REG_a", %%"REG_a" \n\t" // i
83c89c78 2638 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2ff198c1 2639 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
4bff9ef9 2640 ASMALIGN(4)
2ff198c1 2641 "1: \n\t"
83c89c78
JT
2642 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2643 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2644 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2645 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2646 "shll $16, %%edi \n\t"
2647 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2648 "mov %1, %%"REG_D" \n\t"
2ff198c1 2649 "shrl $9, %%esi \n\t"
6e1c66bc 2650 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1 2651 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
83c89c78 2652 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2ff198c1 2653
83c89c78
JT
2654 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2655 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2656 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2657 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2658 "shll $16, %%edi \n\t"
2659 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2660 "mov %1, %%"REG_D" \n\t"
2ff198c1 2661 "shrl $9, %%esi \n\t"
6e1c66bc 2662 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1 2663 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
83c89c78 2664 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2ff198c1
MN
2665
2666
6e1c66bc
AJ
2667 "add $2, %%"REG_a" \n\t"
2668 "cmp %2, %%"REG_a" \n\t"
2ff198c1
MN
2669 " jb 1b \n\t"
2670
2671
20ffdcf9 2672 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
83c89c78 2673 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2ff198c1
MN
2674 );
2675#ifdef HAVE_MMX2
77a416e8 2676 } //if MMX2 can't be used
2ff198c1
MN
2677#endif
2678#else
96034638
MN
2679 int i;
2680 unsigned int xpos=0;
2681 for(i=0;i<dstWidth;i++)
2682 {
2683 register unsigned int xx=xpos>>16;
2684 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2685 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2686 xpos+=xInc;
2687 }
2ff198c1 2688#endif
077ea8a7 2689 }
2ff198c1
MN
2690}
2691
/**
 * Horizontally scales one line of each of the two chroma planes.
 *
 * Both results go into the same buffer: the plane read from src1 is written
 * at dst[0..dstWidth-1] and the plane read from src2 at dst[2048..] (an
 * offset of 2048 uint16_t elements, i.e. 4096 bytes).
 *
 * @param dst              output buffer, see layout above
 * @param dstWidth         number of output samples produced per plane
 * @param src1             first input line; replaced by formatConvBuffer when
 *                         srcFormat is one of the formats converted below
 * @param src2             second input line; replaced by formatConvBuffer+2048
 *                         when srcFormat is one of the formats converted below
 * @param srcW             input width, forwarded to the conversion helpers and
 *                         to hScale, and used to clamp the tail in the MMX2 path
 * @param xInc             horizontal step per output sample; used here as 16.16
 *                         fixed point (integer part xInc>>16, fraction
 *                         xInc&0xFFFF)
 * @param flags            only SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed    non-zero selects the MMX2 fast-bilinear path
 * @param hChrFilter       forwarded to hScale
 * @param hChrFilterPos    forwarded to hScale
 * @param hChrFilterSize   forwarded to hScale
 * @param funnyUVCode      code pointer invoked through an indirect call
 *                         ("call *%4") by the MMX2 path
 * @param srcFormat        input pixel format, selects the conversion helper
 * @param formatConvBuffer scratch buffer receiving the converted planes
 * @param mmx2Filter       loaded into REG_d before calling funnyUVCode
 * @param mmx2FilterPos    loaded into REG_b before calling funnyUVCode
 * @param pal              forwarded to palToUV only
 *
 * For gray input formats the function returns without writing to dst.
 */
7f526efd 2692inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
28bf81c9 2693 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
1e621b18 2694 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
b7dc6f66 2695 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e28630fc 2696 int32_t *mmx2FilterPos, uint8_t *pal)
2ff198c1 2697{
/* Format dispatch: every matching branch runs a *ToUV helper that writes into
   formatConvBuffer and formatConvBuffer+2048, then redirects src1/src2 to
   those two areas. Formats matching no branch are scaled directly from the
   caller's src1/src2. */
e9e12f0e 2698 if(srcFormat==PIX_FMT_YUYV422)
1e621b18
MN
2699 {
2700 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2701 src1= formatConvBuffer;
2702 src2= formatConvBuffer+2048;
2703 }
e9e12f0e 2704 else if(srcFormat==PIX_FMT_UYVY422)
7322a67c
MN
2705 {
2706 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2707 src1= formatConvBuffer;
2708 src2= formatConvBuffer+2048;
2709 }
e9e12f0e 2710 else if(srcFormat==PIX_FMT_RGB32)
1e621b18
MN
2711 {
2712 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2713 src1= formatConvBuffer;
2714 src2= formatConvBuffer+2048;
2715 }
e9e12f0e 2716 else if(srcFormat==PIX_FMT_BGR24)
1e621b18
MN
2717 {
2718 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2719 src1= formatConvBuffer;
2720 src2= formatConvBuffer+2048;
2721 }
e9e12f0e 2722 else if(srcFormat==PIX_FMT_BGR565)
6af250ea
MN
2723 {
2724 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2725 src1= formatConvBuffer;
2726 src2= formatConvBuffer+2048;
2727 }
e9e12f0e 2728 else if(srcFormat==PIX_FMT_BGR555)
b72034dd
MN
2729 {
2730 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2731 src1= formatConvBuffer;
2732 src2= formatConvBuffer+2048;
2733 }
e9e12f0e 2734 else if(srcFormat==PIX_FMT_BGR32)
a861d4d7
MN
2735 {
2736 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2737 src1= formatConvBuffer;
2738 src2= formatConvBuffer+2048;
2739 }
e9e12f0e 2740 else if(srcFormat==PIX_FMT_RGB24)
a861d4d7
MN
2741 {
2742 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2743 src1= formatConvBuffer;
2744 src2= formatConvBuffer+2048;
2745 }
a43fb6b3
LA
2746 else if(srcFormat==PIX_FMT_RGB565)
2747 {
2748 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2749 src1= formatConvBuffer;
2750 src2= formatConvBuffer+2048;
2751 }
2752 else if(srcFormat==PIX_FMT_RGB555)
2753 {
2754 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2755 src1= formatConvBuffer;
2756 src2= formatConvBuffer+2048;
2757 }
6ff0ad6b
MN
2758 else if(isGray(srcFormat))
2759 {
/* gray input: nothing is scaled and dst is left untouched */
2760 return;
2761 }
18064f5c 2762 else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc
MN
2763 {
2764 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2765 src1= formatConvBuffer;
2766 src2= formatConvBuffer+2048;
2767 }
1e621b18 2768
e3d2500f 2769#ifdef HAVE_MMX
77a416e8 2770 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
28bf81c9 2771 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2772#else
28bf81c9 2773 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2774#endif
077ea8a7
MN
2775 {
/* filter-based path: one hScale call per plane, second plane at dst+2048 */
2776 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2777 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2778 }
2779 else // Fast Bilinear upscale / crap downscale
2780 {
3d6a30d9 2781#if defined(ARCH_X86)
2ff198c1 2782#ifdef HAVE_MMX2
96034638 2783 int i;
83c89c78
JT
2784#if defined(PIC)
/* Under PIC, REG_b is saved here and restored by hand at the end of the asm
   block instead of being listed as a clobber (presumably because it is
   reserved for position-independent addressing -- TODO confirm). */
2785 uint64_t ebxsave __attribute__((aligned(8)));
2786#endif
2ff198c1
MN
2787 if(canMMX2BeUsed)
2788 {
/* Operands: %0=src1 %1=dst %2=mmx2Filter %3=mmx2FilterPos %4=funnyUVCode
   %5=src2 and, under PIC only, %6=ebxsave. FUNNY_UV_CODE is expanded four
   times for the src1 plane and four times for the src2 plane. */
2789 asm volatile(
83c89c78
JT
2790#if defined(PIC)
2791 "mov %%"REG_b", %6 \n\t"
2792#endif
b7dc6f66 2793 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
2794 "mov %0, %%"REG_c" \n\t"
2795 "mov %1, %%"REG_D" \n\t"
2796 "mov %2, %%"REG_d" \n\t"
2797 "mov %3, %%"REG_b" \n\t"
2798 "xor %%"REG_a", %%"REG_a" \n\t" // i
2799 PREFETCH" (%%"REG_c") \n\t"
2800 PREFETCH" 32(%%"REG_c") \n\t"
2801 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2802
6d606c4f
AJ
2803#ifdef ARCH_X86_64
2804
2805#define FUNNY_UV_CODE \
2806 "movl (%%"REG_b"), %%esi \n\t"\
2807 "call *%4 \n\t"\
2808 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2809 "add %%"REG_S", %%"REG_c" \n\t"\
2810 "add %%"REG_a", %%"REG_D" \n\t"\
2811 "xor %%"REG_a", %%"REG_a" \n\t"\
2812
2813#else
2814
b7dc6f66 2815#define FUNNY_UV_CODE \
6e1c66bc 2816 "movl (%%"REG_b"), %%esi \n\t"\
b7dc6f66 2817 "call *%4 \n\t"\
6d606c4f 2818 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
6e1c66bc
AJ
2819 "add %%"REG_a", %%"REG_D" \n\t"\
2820 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2821
6d606c4f
AJ
2822#endif
2823
b7dc6f66
MN
2824FUNNY_UV_CODE
2825FUNNY_UV_CODE
2826FUNNY_UV_CODE
2827FUNNY_UV_CODE
6e1c66bc
AJ
2828 "xor %%"REG_a", %%"REG_a" \n\t" // i
2829 "mov %5, %%"REG_c" \n\t" // src
2830 "mov %1, %%"REG_D" \n\t" // buf1
2831 "add $4096, %%"REG_D" \n\t" // second plane: dst + 2048 uint16_t elements
2832 PREFETCH" (%%"REG_c") \n\t"
2833 PREFETCH" 32(%%"REG_c") \n\t"
2834 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2835
2836FUNNY_UV_CODE
2837FUNNY_UV_CODE
2838FUNNY_UV_CODE
2839FUNNY_UV_CODE
2840
83c89c78
JT
2841#if defined(PIC)
2842 "mov %6, %%"REG_b" \n\t"
2843#endif
b7dc6f66
MN
2844 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2845 "m" (funnyUVCode), "m" (src2)
83c89c78
JT
2846#if defined(PIC)
2847 ,"m" (ebxsave)
2848#endif
91d0bda2 2849 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78
JT
2850#if !defined(PIC)
2851 ,"%"REG_b
2852#endif
b7dc6f66 2853 );
/* Tail fix-up: walking back from the last output sample, every output whose
   source position (i*xInc)>>16 is at or beyond srcW-1 is overwritten with the
   last source sample times 128, in both planes. */
c1b0bfb4 2854 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2ff198c1 2855 {
c1b0bfb4
MN
2856// printf("%d %d %d\n", dstWidth, i, srcW);
2857 dst[i] = src1[srcW-1]*128;
2858 dst[i+2048] = src2[srcW-1]*128;
2ff198c1
MN
2859 }
2860 }
2861 else
2862 {
2863#endif
/* Plain x86 asm path. xInc is split into its integer part (added to xx with
   carry) and its 16-bit fraction (accumulated in cx).
   Operands: %0=src1 %1=dst %2=dstWidth %3=xInc_shr16 %4=xInc_mask %5=src2. */
20ffdcf9 2864 long xInc_shr16 = (long) (xInc >> 16);
6a4970ab 2865 uint16_t xInc_mask = xInc & 0xffff;
2ff198c1 2866 asm volatile(
6e1c66bc 2867 "xor %%"REG_a", %%"REG_a" \n\t" // i
83c89c78 2868 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2ff198c1 2869 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
4bff9ef9 2870 ASMALIGN(4)
2ff198c1 2871 "1: \n\t"
6e1c66bc 2872 "mov %0, %%"REG_S" \n\t"
83c89c78
JT
2873 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src1[xx]
2874 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src1[xx+1]
2ff198c1
MN
2875 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2876 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2877 "shll $16, %%edi \n\t"
2878 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2879 "mov %1, %%"REG_D" \n\t"
2ff198c1 2880 "shrl $9, %%esi \n\t"
c35afa2f 2881 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1 2882
83c89c78
JT
2883 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src2[xx]
2884 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src2[xx+1]
2ff198c1
MN
2885 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2886 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2887 "shll $16, %%edi \n\t"
2888 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2889 "mov %1, %%"REG_D" \n\t"
2ff198c1 2890 "shrl $9, %%esi \n\t"
6e1c66bc 2891 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1
MN
2892
2893 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
83c89c78 2894 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
6e1c66bc
AJ
2895 "add $1, %%"REG_a" \n\t"
2896 "cmp %2, %%"REG_a" \n\t"
2ff198c1
MN
2897 " jb 1b \n\t"
2898
dc77ef7f
GP
2899/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2900 which is needed to support GCC-4.0 */
2901#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2902 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2903#else
9cc768f6 2904 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2905#endif
2ff198c1 2906 "r" (src2)
83c89c78 2907 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2ff198c1
MN
2908 );
2909#ifdef HAVE_MMX2
77a416e8 2910 } //if MMX2 can't be used
2ff198c1
MN
2911#endif
2912#else
/* Portable C path: xpos is a 16.16 fixed-point source position; xalpha keeps
   the top 7 bits of its fraction (0..127), so xalpha^127 equals 127-xalpha
   and the two neighbouring samples are blended with weights summing to 127. */
96034638
MN
2913 int i;
2914 unsigned int xpos=0;
2915 for(i=0;i<dstWidth;i++)
2916 {
2917 register unsigned int xx=xpos>>16;
2918 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2919 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2920 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1faf0867
MN
2921/* slower
2922 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2923 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2924*/
96034638
MN
2925 xpos+=xInc;
2926 }
2ff198c1 2927#endif
077ea8a7
MN
2928 }
2929}
2930
3e499f53
MN
2931static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2932 int srcSliceH, uint8_t* dst[], int dstStride[]){
28bf81c9
MN
2933
2934 /* load a few things into local vars to make the code more readable? and faster */
2935 const int srcW= c->srcW;
2936 const int dstW= c->dstW;
2937 const int dstH= c->dstH;
2938 const int chrDstW= c->chrDstW;
e616aa93 2939 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2940 const int lumXInc= c->lumXInc;
2941 const int chrXInc= c->chrXInc;
fe8054c0 2942 const int dstFormat= c->dstFormat;
44c1035c 2943 const int srcFormat= c->srcFormat;
28bf81c9
MN
2944 const int flags= c->flags;
2945 const int canMMX2BeUsed= c->canMMX2BeUsed;
2946 int16_t *vLumFilterPos= c->vLumFilterPos;
2947 int16_t *vChrFilterPos= c->vChrFilterPos;
2948 int16_t *hLumFilterPos= c->hLumFilterPos;
2949 int16_t *hChrFilterPos= c->hChrFilterPos;
2950 int16_t *vLumFilter= c->vLumFilter;
2951 int16_t *vChrFilter= c->vChrFilter;
2952 int16_t *hLumFilter= c->hLumFilter;
2953 int16_t *hChrFilter= c->hChrFilter;
77a49659
MN
2954 int32_t *lumMmxFilter= c->lumMmxFilter;
2955 int32_t *chrMmxFilter= c->chrMmxFilter;
28bf81c9
MN
2956 const int vLumFilterSize= c->vLumFilterSize;
2957 const int vChrFilterSize= c->vChrFilterSize;
2958 const int hLumFilterSize= c->hLumFilterSize;
2959 const int hChrFilterSize= c->hChrFilterSize;
2960 int16_t **lumPixBuf= c->lumPixBuf;
2961 int16_t **chrPixBuf= c->chrPixBuf;
2962 const int vLumBufSize= c->vLumBufSize;
2963 const int vChrBufSize= c->vChrBufSize;
2964 uint8_t *funnyYCode= c->funnyYCode;
2965 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2966 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2967 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2968 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
d4e24275 2969 int lastDstY;
e28630fc 2970 uint8_t *pal=NULL;
28bf81c9
MN
2971
2972 /* vars which will change and which we need to store back in the context */
2973 int dstY= c->dstY;
2974 int lumBufIndex= c->lumBufIndex;
2975 int chrBufIndex= c->chrBufIndex;
2976 int lastInLumBuf= c->lastInLumBuf;
2977 int lastInChrBuf= c->lastInChrBuf;
6a4970ab 2978
5859233b 2979 if(isPacked(c->srcFormat)){
e28630fc 2980 pal= src[1];
1e621b18
MN
2981 src[0]=
2982 src[1]=
3e499f53 2983 src[2]= src[0];
5859233b 2984 srcStride[0]=
1e621b18 2985 srcStride[1]=
3e499f53 2986 srcStride[2]= srcStride[0];
6c7506de 2987 }
5859233b
MN
2988 srcStride[1]<<= c->vChrDrop;
2989 srcStride[2]<<= c->vChrDrop;
6c7506de 2990
c7a810cc
MN
2991// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2992// (int)dst[0], (int)dst[1], (int)dst[2]);
2993
2994#if 0 //self test FIXME move to a vfilter or something
2995{
2996static volatile int i=0;
2997i++;
e9e12f0e 2998if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
c7a810cc
MN
2999 selfTest(src, srcStride, c->srcW, c->srcH);
3000i--;
3001}
3002#endif
37079906
MN
3003
3004//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3005//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
3006
3007 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3008 {
3009 static int firstTime=1; //FIXME move this into the context perhaps
3010 if(flags & SWS_PRINT_INFO && firstTime)
3011 {
2d529db5 3012 av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
3013 "SwScaler: ->cannot do aligned memory acesses anymore\n");
3014 firstTime=0;
3015 }
3016 }
28bf81c9 3017
1e621b18
MN
3018 /* Note the user might start scaling the picture in the middle so this will not get executed
3019 this is not really intended but works currently, so ppl might do it */
28bf81c9
MN
3020 if(srcSliceY ==0){
3021 lumBufIndex=0;
3022 chrBufIndex=0;
6a4970ab 3023 dstY=0;
28bf81c9
MN
3024 lastInLumBuf= -1;
3025 lastInChrBuf= -1;
077ea8a7 3026 }
d3f41512 3027
d4e24275
MN
3028 lastDstY= dstY;
3029
c1b0bfb4 3030 for(;dstY < dstH; dstY++){
28bf81c9 3031 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
3032 const int chrDstY= dstY>>c->chrDstVSubSample;
3033 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3034 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 3035
c1b0bfb4
MN
3036 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3037 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3038 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3039 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 3040
379a2036
MN
3041//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3042// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
c7f822d9
MN
3043 //handle holes (FAST_BILINEAR & weird filters)
3044 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3045 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3046//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
3047 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3048 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 3049
c1b0bfb4 3050 // Do we have enough lines in this slice to output the dstY line
e616aa93 3051 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
3052 {
3053 //Do horizontal scaling
3054 while(lastInLumBuf < lastLumSrcY)
d3f41512 3055 {
28bf81c9 3056 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 3057 lumBufIndex++;
c7f822d9 3058// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
3059 ASSERT(lumBufIndex < 2*vLumBufSize)
3060 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3061 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3062// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
3063 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3064 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
6a4970ab 3065 funnyYCode, c->srcFormat, formatConvBuffer,
e28630fc 3066 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
c1b0bfb4
MN
3067 lastInLumBuf++;
3068 }
3069 while(lastInChrBuf < lastChrSrcY)
3070 {
e616aa93
MN
3071</