whitespace cosmetics
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
807e0c66
LA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
d026b45e 22 */
783e9cc9 23
/*
 * Per-instantiation instruction selection. Every helper macro is #undef'ed
 * first and then redefined according to HAVE_3DNOW / HAVE_MMX2, so the
 * definitions below replace any earlier ones.
 */
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec
MN
27#undef PREFETCH
28#undef PREFETCHW
29#undef EMMS
30#undef SFENCE
31
32#ifdef HAVE_3DNOW
33/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34#define EMMS "femms"
35#else
36#define EMMS "emms"
37#endif
38
/* Prefetch mnemonics: 3DNow! forms first, MMX2 forms second; otherwise the
 * string is only " # nop" (presumably an assembler comment, so no
 * instruction is emitted - confirm for the assembler in use). */
39#ifdef HAVE_3DNOW
40#define PREFETCH "prefetch"
41#define PREFETCHW "prefetchw"
e5091488 42#elif defined (HAVE_MMX2)
48a05cec
MN
43#define PREFETCH "prefetchnta"
44#define PREFETCHW "prefetcht0"
45#else
d904b5fc
NP
46#define PREFETCH " # nop"
47#define PREFETCHW " # nop"
48a05cec
MN
48#endif
49
/* SFENCE is only a real instruction when HAVE_MMX2 is defined. */
50#ifdef HAVE_MMX2
51#define SFENCE "sfence"
52#else
d904b5fc 53#define SFENCE " # nop"
48a05cec 54#endif
d3f41512 55
/* PAVGB(a,b): byte average, "pavgb" with MMX2 or "pavgusb" with 3DNow!.
 * There is no #else branch: PAVGB stays undefined when neither is set. */
d604bab9
MN
56#ifdef HAVE_MMX2
57#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58#elif defined (HAVE_3DNOW)
59#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60#endif
d3f41512 61
/* MOVNTQ(a,b): 64-bit store, "movntq" with MMX2 and plain "movq" otherwise.
 * The MOVNTQ wrapper forwards to REAL_MOVNTQ so that macro arguments are
 * expanded before REAL_MOVNTQ stringifies them with #. */
d604bab9 62#ifdef HAVE_MMX2
6e1c66bc 63#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 64#else
6e1c66bc 65#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 66#endif
6e1c66bc 67#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 68
a2faa401
RD
69#ifdef HAVE_ALTIVEC
70#include "swscale_altivec_template.c"
71#endif
72
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): a complete asm statement that
 * accumulates srcData * filterCoeff over a list of source pointers and
 * writes the result as unsigned bytes.
 *   x      - string literal: displacement added to every source address
 *   offset - string literal: displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1), indexed by REG_a
 *   width  - bound compared against REG_a after every 8 bytes (operand %2)
 * Operand %0 is &c->redDither, so a variable named c must be visible where
 * the macro is expanded. The filter list is walked in 16-byte steps: the
 * source pointer is read at offset 0 and the coefficients at offset 8 of
 * each entry; a null source pointer terminates the list.
 * Both accumulators start from the value at VROUNDER_OFFSET(%0) and are
 * shifted right by 3 before packing.
 * Clobbers REG_a, REG_d and REG_S; the MMX registers used (mm0, mm2-mm5)
 * are not in the clobber list.
 */
bca11e75 73#define YSCALEYUV2YV12X(x, offset, dest, width) \
2da0d70d
DB
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
8b2fce0d
MN
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
2da0d70d
DB
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t" /* next source pointer */\
87 "test %%"REG_S", %%"REG_S" \n\t" /* flags survive the MMX ops up to jnz */\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t" /* re-seed accumulators for the next 8 samples */\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t" /* rewind to the start of the filter list */\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
bca11e75
MN
108
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): same parameters and
 * operands as YSCALEYUV2YV12X (%0 = &c->redDither, %1 = dest, %2 = width),
 * but two source pointers are consumed per filter-list step. Their samples
 * are interleaved word-wise and multiplied with pmaddwd, so the sums are
 * kept as 32-bit values in mm4-mm7. After the list ends (null pointer) the
 * sums are shifted right by 16, packed to words, the value at
 * VROUNDER_OFFSET(%0) is added, and the result is shifted right by 3 and
 * packed to unsigned bytes.
 * NOTE(review): the second pointer of a pair is read from offset 4 of the
 * entry, which matches 4-byte pointers - confirm the list layout for
 * 64-bit builds.
 * Clobbers REG_a, REG_d and REG_S; mm0-mm7 are used but not listed.
 */
109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
2da0d70d
DB
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
8b2fce0d
MN
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
2da0d70d 122 "mov 4(%%"REG_d"), %%"REG_S" \n\t" /* second source pointer of the pair */\
8b2fce0d 123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
2da0d70d
DB
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
2da0d70d
DB
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t" /* first pointer of the next entry */\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t" /* rewind to the start of the filter list */\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
c1b0bfb4
MN
170
/*
 * YSCALEYUV2YV121: asm fragment (instruction strings only; the asm()
 * wrapper and operand list are supplied by the user of the macro).
 * Loads the index from operand %2, then per step reads 8 16-bit samples
 * from %0 + 2*index, shifts them right by 7, packs them to unsigned bytes
 * with saturation and stores 8 bytes at %1 + index.
 * The loop repeats until "add $8" sets the carry flag.
 * NOTE(review): presumably %2 is a negative count running up to zero, with
 * %0/%1 pointing at the ends of the buffers - confirm against the caller.
 */
171#define YSCALEYUV2YV121 \
2da0d70d
DB
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
c1b0bfb4
MN
183
184/*
2da0d70d
DB
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 190*/
/*
 * YSCALEYUV2PACKEDX: opening part of an asm statement; it starts
 * "asm volatile(" and leaves it open, so it must be followed by further
 * instruction strings and closed with YSCALEYUV2PACKEDX_END.
 * Defines outer label 1 (one pass per group of output samples, index in
 * REG_a) and runs two inner loops (label 2), each walking a filter list in
 * 16-byte steps until a null source pointer is found:
 *   - list at CHR_MMX_FILTER_OFFSET(%0): sums U into mm3 and V into mm4
 *     (V is read at displacement VOF from the same source pointer)
 *   - list at LUM_MMX_FILTER_OFFSET(%0): sums Y1 into mm1 and Y2 into mm7
 * All four accumulators start from the value at VROUNDER_OFFSET(%0).
 * Uses REG_a, REG_d, REG_S and mm0-mm5, mm7.
 */
25593e29 191#define YSCALEYUV2PACKEDX \
2da0d70d
DB
192 asm volatile(\
193 "xor %%"REG_a", %%"REG_a" \n\t"\
194 ASMALIGN(4)\
195 "nop \n\t"\
196 "1: \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
201 ASMALIGN(4)\
202 "2: \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
8b2fce0d 205 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
2da0d70d
DB
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
213 " jnz 2b \n\t"\
c1b0bfb4 214\
2da0d70d
DB
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists that close an asm
 * statement opened with "asm volatile(".
 * Operands: %0 = &c->redDither, %1-%3 = dummy (memory placeholders),
 * %4 = dest, %5 = dstW. The names c, dummy, dest and dstW must be in scope
 * at the expansion site. Clobbers REG_a, REG_d and REG_S.
 */
233#define YSCALEYUV2PACKEDX_END \
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW) \
237 : "%"REG_a, "%"REG_d, "%"REG_S \
238 );
8422aa88 239
/*
 * YSCALEYUV2PACKEDX_ACCURATE: like YSCALEYUV2PACKEDX it opens
 * "asm volatile(" without closing it and defines outer label 1, but both
 * filter lists are consumed two source pointers at a time with pmaddwd and
 * 32-bit sums.
 *   - chroma pass: sums are shifted right by 16, packed to words, the value
 *     at VROUNDER_OFFSET(%0) is added, and U / V are parked in memory at
 *     U_TEMP(%0) / V_TEMP(%0) because the luma pass reuses the registers
 *   - luma pass: same arithmetic, leaving Y1 in mm1 and Y2 in mm7
 * On exit U and V are reloaded into mm3 and mm4, i.e. the same register
 * assignment YSCALEYUV2PACKEDX ends with.
 * NOTE(review): the second pointer of each pair is read from offset 4 of
 * the entry, which matches 4-byte pointers - confirm for 64-bit builds.
 */
bca11e75 240#define YSCALEYUV2PACKEDX_ACCURATE \
2da0d70d
DB
241 asm volatile(\
242 "xor %%"REG_a", %%"REG_a" \n\t"\
243 ASMALIGN(4)\
244 "nop \n\t"\
245 "1: \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
252 ASMALIGN(4)\
253 "2: \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
8b2fce0d 255 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
2da0d70d
DB
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 266 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
2da0d70d
DB
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
277 " jnz 2b \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t" /* park U while the luma pass runs */\
288 "movq %%mm6, "V_TEMP"(%0) \n\t" /* park V while the luma pass runs */\
bca11e75 289\
2da0d70d
DB
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
296 ASMALIGN(4)\
297 "2: \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
321 " jnz 2b \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t" /* restore U */\
332 "movq "V_TEMP"(%0), %%mm4 \n\t" /* restore V */\
bca11e75 333
/*
 * YSCALEYUV2RGBX: asm fragment converting the values left in registers by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE to packed 8-bit channels.
 * Input:  mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (16-bit words).
 * Offsets and coefficients are read from operand %0 at U_OFFSET, V_OFFSET,
 * Y_OFFSET, UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF.
 * Each chroma word is duplicated (punpck?wd with itself) so that one
 * chroma value is applied to two luma values.
 * Output: mm2 = B, mm4 = G, mm5 = R as unsigned saturated bytes, mm7 = 0;
 * this is the register layout the WRITEBGR* macros expect.
 */
8422aa88 334#define YSCALEYUV2RGBX \
2da0d70d
DB
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t" /* ug + vg */\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB: compiled out by the surrounding #if 0.
 * Asm fragment that blends two luma lines (%0, %1) and two chroma lines
 * (%2, %3) with the weights taken from operands %6 (yalpha1) and
 * %7 (uvalpha1), then converts to B (mm3), R (mm0) and G (mm1) using the
 * global constants w80, w400, yCoeff, ubCoeff, ugCoeff, vrCoeff, vgCoeff.
 * It defines label 1 but contains no store and no loop branch.
 */
77a49659 370#if 0
d604bab9 371#define FULL_YSCALEYUV2RGB \
2da0d70d
DB
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
380 ASMALIGN(4)\
381 "1: \n\t"\
382 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
8b2fce0d 391 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
392 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
8b2fce0d 394 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
400\
401\
2da0d70d
DB
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
409\
410\
2da0d70d
DB
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
d604bab9 417\
2da0d70d
DB
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
d604bab9 421\
2da0d70d 422 "packuswb %%mm1, %%mm1 \n\t"
77a49659 423#endif
d604bab9 424
/*
 * REAL_YSCALEYUV2PACKED(index, c): asm fragment that blends two luma lines
 * (operands %0, %1) and two chroma lines (operands %2, %3) without colour
 * conversion.
 *   index - register used as loop index; it is zeroed here and label 1 is
 *           defined, so a following WRITE* macro can close the loop
 *   c     - base register of the context holding the filter coefficients
 * Result: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V, each derived from
 * (line1 >> 7) + pmulhw(line0 - line1, coefficient).
 * NOTE(review): before the loop the coefficients at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted
 * right by 3 and written back to memory, so the context is modified every
 * time this fragment runs - confirm callers re-initialise them.
 * YSCALEYUV2PACKED is the wrapper to use: it lets macro arguments be
 * expanded before REAL_YSCALEYUV2PACKED stringifies them with #.
 */
6e1c66bc 425#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432 "xor "#index", "#index" \n\t"\
433 ASMALIGN(4)\
434 "1: \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
437 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
457 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 460
6e1c66bc 461#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 462
/*
 * REAL_YSCALEYUV2RGB(index, c): asm fragment that blends two luma lines
 * (operands %0, %1) and two chroma lines (operands %2, %3) and converts
 * the result to packed 8-bit channels.
 *   index - register used as loop index; zeroed here, label 1 is defined
 *   c     - base register of the context; blend coefficients are read at
 *           CHR_/LUM_MMX_FILTER_OFFSET+8, conversion constants at U_OFFSET,
 *           V_OFFSET, Y_OFFSET and the *_COEFF displacements
 * Blend: (line1 >> 4) + pmulhw(line0 - line1, coefficient).
 * Output: mm2 = B, mm4 = G, mm5 = R as unsigned saturated bytes, mm7 = 0.
 * The fragment has no store or loop branch; a WRITEBGR* macro supplies them.
 * YSCALEYUV2RGB is the wrapper to use: it lets macro arguments be expanded
 * before REAL_YSCALEYUV2RGB stringifies them with #.
 */
6e1c66bc 463#define REAL_YSCALEYUV2RGB(index, c) \
2da0d70d
DB
464 "xor "#index", "#index" \n\t"\
465 ASMALIGN(4)\
466 "1: \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
469 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 527#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
6a4970ab 528
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant of
 * YSCALEYUV2PACKED. Only one luma line (operand %0) and one chroma line
 * (operand %2) are read; every value is just shifted right by 7.
 *   index - register used as loop index; zeroed here, label 1 is defined
 *   c     - accepted for signature parity with the other fragments; it is
 *           not referenced in the body
 * Result: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V.
 * YSCALEYUV2PACKED1 is the wrapper that expands its arguments before they
 * are stringified.
 */
6e1c66bc 529#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
530 "xor "#index", "#index" \n\t"\
531 ASMALIGN(4)\
532 "1: \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 534 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
6a4970ab 541
6e1c66bc 542#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 543
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line variant of YSCALEYUV2RGB.
 * Reads one luma line (operand %0) and one chroma line (operand %2), shifts
 * the samples right by 4 and converts them with the constants found at
 * U_OFFSET, V_OFFSET, Y_OFFSET and the *_COEFF displacements from c.
 *   index - register used as loop index; zeroed here, label 1 is defined
 *   c     - base register of the context holding the constants
 * Output: mm2 = B, mm4 = G, mm5 = R as unsigned saturated bytes, mm7 = 0.
 * YSCALEYUV2RGB1 is the wrapper that expands its arguments before they are
 * stringified.
 */
6e1c66bc 544#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
545 "xor "#index", "#index" \n\t"\
546 ASMALIGN(4)\
547 "1: \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 549 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 591#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 592
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like YSCALEYUV2PACKED1 for luma (one
 * line from operand %0, shifted right by 7), but chroma is the sum of the
 * two lines in operands %2 and %3 shifted right by 8 with a logical shift
 * (psrlw), i.e. their average at the same scale as the luma result.
 *   index - register used as loop index; zeroed here, label 1 is defined
 *   c     - not referenced in the body
 * Result: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V.
 * YSCALEYUV2PACKED1b is the wrapper that expands its arguments before they
 * are stringified.
 */
6e1c66bc 593#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
594 "xor "#index", "#index" \n\t"\
595 ASMALIGN(4)\
596 "1: \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
599 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
6e1c66bc 609#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 610
497d4f99 611// do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c): like YSCALEYUV2RGB1 for luma (one line
 * from operand %0, shifted right by 4), but chroma is the sum of the two
 * lines in operands %2 and %3 shifted right by 5 with a logical shift.
 * The existing FIXME notes that this sum may overflow 16 bits.
 *   index - register used as loop index; zeroed here, label 1 is defined
 *   c     - base register of the context holding offsets and coefficients
 * Output: mm2 = B, mm4 = G, mm5 = R as unsigned saturated bytes, mm7 = 0.
 * YSCALEYUV2RGB1b is the wrapper that expands its arguments before they
 * are stringified.
 */
6e1c66bc 612#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
613 "xor "#index", "#index" \n\t"\
614 ASMALIGN(4)\
615 "1: \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
618 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 663#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 664
/*
 * REAL_WRITEBGR32(dst, dstw, index): asm fragment that stores 8 pixels as
 * 4 bytes each and closes the loop opened by a YSCALEYUV2* fragment.
 *   dst   - destination base register
 *   dstw  - bound the index is compared against
 *   index - loop index register; advanced by 8 here
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * The channels are interleaved into four quadwords (two pixels each) and
 * stored with MOVNTQ at dst + index*4 + {0, 8, 16, 24}; the fourth byte of
 * each pixel comes from mm7, i.e. zero.
 * Ends with "jb 1b": label 1 is not defined here and must come from the
 * fragment placed before this one. Overwrites mm0-mm3, mm5 and mm6.
 * WRITEBGR32 is the wrapper that expands its arguments before they are
 * stringified.
 */
6e1c66bc 665#define REAL_WRITEBGR32(dst, dstw, index) \
2da0d70d
DB
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 679\
2da0d70d
DB
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
d604bab9 684\
2da0d70d
DB
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
687 " jb 1b \n\t"
6e1c66bc 688#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
d604bab9 689
/*
 * REAL_WRITEBGR16(dst, dstw, index): asm fragment that stores 8 pixels as
 * 2 bytes each and closes the loop opened by a YSCALEYUV2* fragment.
 *   dst   - destination base register
 *   dstw  - bound the index is compared against
 *   index - loop index register; advanced by 8 here
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * B and R are masked with the constant bF8 and G with bFC (presumably
 * byte masks keeping the top 5 and 6 bits - confirm the definitions).
 * B is shifted right by 3 and interleaved with R as the high byte; G is
 * widened to words, shifted left by 3 and OR-ed in.
 * Two quadwords are stored with MOVNTQ at dst + index*2 + {0, 8}.
 * Ends with "jb 1b": label 1 must be defined by the preceding fragment.
 * WRITEBGR16 is the wrapper that expands its arguments before they are
 * stringified.
 */
6e1c66bc 690#define REAL_WRITEBGR16(dst, dstw, index) \
2da0d70d
DB
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
d604bab9 695\
2da0d70d
DB
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
d604bab9 698\
2da0d70d
DB
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 703\
2da0d70d
DB
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
d604bab9 706\
2da0d70d
DB
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
d604bab9 709\
2da0d70d
DB
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 712\
2da0d70d
DB
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
715 " jb 1b \n\t"
6e1c66bc 716#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
d604bab9 717
/*
 * REAL_WRITEBGR15(dst, dstw, index): same structure as REAL_WRITEBGR16,
 * with these differences visible in the code: all three channels are
 * masked with bF8, R is additionally shifted right by 1, and G is shifted
 * left by 2 instead of 3 before being OR-ed in.
 *   dst   - destination base register
 *   dstw  - bound the index is compared against
 *   index - loop index register; advanced by 8 here
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Two quadwords are stored with MOVNTQ at dst + index*2 + {0, 8}.
 * Ends with "jb 1b": label 1 must be defined by the preceding fragment.
 * WRITEBGR15 is the wrapper that expands its arguments before they are
 * stringified.
 */
6e1c66bc 718#define REAL_WRITEBGR15(dst, dstw, index) \
2da0d70d
DB
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
d604bab9 724\
2da0d70d
DB
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
d604bab9 727\
2da0d70d
DB
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 732\
2da0d70d
DB
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
d604bab9 735\
2da0d70d
DB
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
d604bab9 738\
2da0d70d
DB
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 741\
2da0d70d
DB
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
744 " jb 1b \n\t"
6e1c66bc 745#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
f62255fb 746
/*
 * WRITEBGR24OLD(dst, dstw, index): asm fragment that stores 8 pixels as
 * 3 bytes each (24 bytes per iteration) and closes the loop opened by a
 * YSCALEYUV2* fragment.
 *   dst   - destination pointer register; used without an index and
 *           advanced by 24 here, so it is modified by the fragment
 *   dstw  - bound the index is compared against
 *   index - loop index register; advanced by 8 here
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * First builds four 0RGB0RGB quadwords exactly like WRITEBGR32, then
 * squeezes out the zero bytes with shifts and the mask constants
 * bm00000111, bm11111000 and bm00001111 to form three quadwords, stored
 * with MOVNTQ at dst + {0, 8, 16}.
 * Ends with "jb 1b": label 1 must be defined by the preceding fragment.
 * Unlike the other WRITE* macros there is no REAL_ wrapper pair, so
 * arguments are stringified without prior macro expansion.
 */
6542b44e 747#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 761\
2da0d70d
DB
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 770\
2da0d70d
DB
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 784\
2da0d70d
DB
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 793\
2da0d70d
DB
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
d604bab9 798\
2da0d70d
DB
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
801 " jb 1b \n\t"
d604bab9 802
/*
 * WRITEBGR24MMX: asm fragment that packs 8 pixels into 24 bytes of 3-byte
 * pixels. Same entry contract as stated in the body comment (mm2=B, mm4=G,
 * mm5=R, mm7=0), but the compaction is done with psllq + punpckhdq instead
 * of mask constants, and each 8-byte store is issued as soon as its quadword
 * is complete.
 *
 * Stores go to dst, 8(dst) and 16(dst) through MOVNTQ; dst is then advanced
 * by 24 and index by 8, and the fragment jumps back to local label 1 while
 * index is below dstw (unsigned compare, "jb").
 * All of mm0-mm7 are overwritten (mm7 is reused as scratch after the unpack).
 */
6542b44e 803#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 817\
2da0d70d
DB
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 822\
2da0d70d
DB
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 827\
2da0d70d
DB
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 832\
2da0d70d
DB
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
99d2cb72 838\
2da0d70d
DB
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 844\
2da0d70d
DB
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 849\
2da0d70d 850 "add $24, "#dst" \n\t"\
99d2cb72 851\
2da0d70d
DB
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
854 " jb 1b \n\t"
99d2cb72 855
/*
 * WRITEBGR24MMX2: asm fragment that packs 8 pixels into 24 bytes of 3-byte
 * pixels using pshufw plus the byte masks ff_M24A / ff_M24B / ff_M24C.
 *
 * Expected on entry: mm2=B, mm4=G, mm5=R, one byte per pixel. mm7 is loaded
 * with ff_M24C before it is ever read, so its incoming value is not used by
 * this variant.
 * Each of the three output quadwords is built by shuffling the needed words
 * of B, G and R into place, masking them, or-ing the three together and
 * storing through MOVNTQ at dst, 8(dst) and 16(dst). dst is then advanced by
 * 24 and index by 8, and the fragment jumps back to local label 1 while
 * index is below dstw (unsigned compare, "jb").
 * mm0, mm1, mm3, mm4, mm6 and mm7 are overwritten; mm2 and mm5 are only read.
 */
6542b44e 856#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 857 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
858 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
859 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
860 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 863\
2da0d70d
DB
864 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
865 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
866 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 867\
2da0d70d
DB
868 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
869 "por %%mm1, %%mm6 \n\t"\
870 "por %%mm3, %%mm6 \n\t"\
871 MOVNTQ(%%mm6, (dst))\
99d2cb72 872\
2da0d70d
DB
873 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
874 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
875 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
876 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 877\
5802683a 878 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
879 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
880 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 881\
2da0d70d
DB
882 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
883 "por %%mm3, %%mm6 \n\t"\
884 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 885\
2da0d70d
DB
886 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
887 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
888 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 889\
2da0d70d
DB
890 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
891 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 892 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 893\
2da0d70d
DB
894 "por %%mm1, %%mm3 \n\t"\
895 "por %%mm3, %%mm6 \n\t"\
896 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 897\
2da0d70d 898 "add $24, "#dst" \n\t"\
99d2cb72 899\
2da0d70d
DB
900 "add $8, "#index" \n\t"\
901 "cmp "#dstw", "#index" \n\t"\
902 " jb 1b \n\t"
99d2cb72
MN
903
/* Pick the 24-bit writer for this instantiation of the template: the pshufw
 * based variant when MMX2 is available, the plain MMX one otherwise. Any
 * definition left over from a previous inclusion is dropped first. */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
911
/*
 * REAL_WRITEYUY2: asm fragment that packs 8 pixels of 16-bit samples into
 * 16 bytes and stores them at dst + 2*index.
 *
 * Inputs are word vectors in mm1 and mm7 (packed together into 8 bytes in
 * mm1) and in mm3 and mm4 (each packed to bytes, then interleaved as
 * mm3,mm4,mm3,mm4,...). The final byte order is mm1/mm7 sample, mm3 sample,
 * mm1/mm7 sample, mm4 sample -- presumably Y,U,Y,V given the macro name
 * (confirm against the callers that set the registers up).
 * After the two MOVNTQ stores, index is advanced by 8 and the fragment jumps
 * back to local label 1 while index is below dstw (unsigned compare, "jb").
 * mm1, mm3, mm4 and mm7 are overwritten.
 */
6e1c66bc 912#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
913 "packuswb %%mm3, %%mm3 \n\t"\
914 "packuswb %%mm4, %%mm4 \n\t"\
915 "packuswb %%mm7, %%mm1 \n\t"\
916 "punpcklbw %%mm4, %%mm3 \n\t"\
917 "movq %%mm1, %%mm7 \n\t"\
918 "punpcklbw %%mm3, %%mm1 \n\t"\
919 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 920\
2da0d70d
DB
921 MOVNTQ(%%mm1, (dst, index, 2))\
922 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 923\
2da0d70d
DB
924 "add $8, "#index" \n\t"\
925 "cmp "#dstw", "#index" \n\t"\
926 " jb 1b \n\t"
/* Indirection layer so that macro arguments are expanded before
 * REAL_WRITEYUY2 stringifies dstw/index with '#'. */
6e1c66bc 927#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
928
929
77a49659 930static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
931 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
932 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 933{
c1b0bfb4 934#ifdef HAVE_MMX
2da0d70d
DB
935 if (c->flags & SWS_ACCURATE_RND){
936 if (uDest){
8b2fce0d
MN
937 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
2da0d70d 939 }
bca11e75 940
8b2fce0d 941 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
2da0d70d
DB
942 }else{
943 if (uDest){
8b2fce0d
MN
944 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
bca11e75 946 }
2da0d70d 947
8b2fce0d 948 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
2da0d70d 949 }
c1b0bfb4 950#else
a2faa401
RD
951#ifdef HAVE_ALTIVEC
952yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
953 chrFilter, chrSrc, chrFilterSize,
954 dest, uDest, vDest, dstW, chrDstW);
a2faa401 955#else //HAVE_ALTIVEC
5859233b 956yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
957 chrFilter, chrSrc, chrFilterSize,
958 dest, uDest, vDest, dstW, chrDstW);
a2faa401 959#endif //!HAVE_ALTIVEC
bc279024 960#endif /* HAVE_MMX */
c1b0bfb4 961}
2add307d 962
6118e52e 963static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
964 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
965 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
966{
967yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
968 chrFilter, chrSrc, chrFilterSize,
969 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
970}
971
c1b0bfb4 972static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
2da0d70d 973 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4
MN
974{
975#ifdef HAVE_MMX
1b0a4572 976 if (uDest)
2da0d70d
DB
977 {
978 asm volatile(
979 YSCALEYUV2YV121
980 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
981 "g" (-chrDstW)
982 : "%"REG_a
983 );
984
985 asm volatile(
986 YSCALEYUV2YV121
8b2fce0d 987 :: "r" (chrSrc + VOFW + chrDstW), "r" (vDest + chrDstW),
2da0d70d
DB
988 "g" (-chrDstW)
989 : "%"REG_a
990 );
991 }
992
993 asm volatile(
994 YSCALEYUV2YV121
995 :: "r" (lumSrc + dstW), "r" (dest + dstW),
996 "g" (-dstW)
997 : "%"REG_a
998 );
c1b0bfb4 999#else
2da0d70d
DB
1000 int i;
1001 for (i=0; i<dstW; i++)
1002 {
1003 int val= lumSrc[i]>>7;
1004
1005 if (val&256){
1006 if (val<0) val=0;
1007 else val=255;
1008 }
1009
1010 dest[i]= val;
1011 }
1012
1b0a4572 1013 if (uDest)
2da0d70d
DB
1014 for (i=0; i<chrDstW; i++)
1015 {
1016 int u=chrSrc[i]>>7;
8b2fce0d 1017 int v=chrSrc[i + VOFW]>>7;
2da0d70d
DB
1018
1019 if ((u|v)&256){
1020 if (u<0) u=0;
1021 else if (u>255) u=255;
1022 if (v<0) v=0;
1023 else if (v>255) v=255;
1024 }
1025
1026 uDest[i]= u;
1027 vDest[i]= v;
1028 }
c1b0bfb4 1029#endif
38858470
MN
1030}
1031
c1b0bfb4 1032
d604bab9
MN
1033/**
 * vertical scale YV12 to RGB
 *
 * Applies the vertical luma/chroma filters and writes one packed output line
 * to dest.
 *
 * With HAVE_MMX, the destination formats RGB32, BGR24, BGR555, BGR565 and
 * YUYV422 are produced by inline asm assembled from the YSCALEYUV2PACKEDX*
 * / YSCALEYUV2RGBX / WRITE* macros, and the function returns right after.
 * c->flags & SWS_ACCURATE_RND selects the *_ACCURATE opening macro.
 * Neither switch has a default label, so any other format falls out of the
 * switch and reaches the generic code at the bottom.
 *
 * Generic code: altivec_yuv2packedX() for the formats listed in the
 * HAVE_ALTIVEC condition, otherwise yuv2packedXinC().
 */
25593e29 1036static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1039{
bca11e75 1040#ifdef HAVE_MMX
/* placeholder passed for the unused memory operands %1..%3 of the asm */
f8d61128 1041 long dummy=0;
2da0d70d
DB
1042 if (c->flags & SWS_ACCURATE_RND){
1043 switch(c->dstFormat){
1044 case PIX_FMT_RGB32:
1045 YSCALEYUV2PACKEDX_ACCURATE
1046 YSCALEYUV2RGBX
1047 WRITEBGR32(%4, %5, %%REGa)
1048
1049 YSCALEYUV2PACKEDX_END
1050 return;
1051 case PIX_FMT_BGR24:
/* 3 bytes per pixel: REG_c = dest + 3*index is used as the write pointer,
   so the asm is closed by hand here with REG_c in the clobber list
   (YSCALEYUV2PACKEDX_END presumably supplies the operand list for the
   other cases -- its definition is outside this section) */
1052 YSCALEYUV2PACKEDX_ACCURATE
1053 YSCALEYUV2RGBX
1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c" \n\t"
1056 WRITEBGR24(%%REGc, %5, %%REGa)
1057
1058
1059 :: "r" (&c->redDither),
1060 "m" (dummy), "m" (dummy), "m" (dummy),
1061 "r" (dest), "m" (dstW)
1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1063 );
1064 return;
1065 case PIX_FMT_BGR555:
1066 YSCALEYUV2PACKEDX_ACCURATE
1067 YSCALEYUV2RGBX
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1069#ifdef DITHER1XBPP
2da0d70d
DB
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073#endif
1074
1075 WRITEBGR15(%4, %5, %%REGa)
1076 YSCALEYUV2PACKEDX_END
1077 return;
1078 case PIX_FMT_BGR565:
1079 YSCALEYUV2PACKEDX_ACCURATE
1080 YSCALEYUV2RGBX
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1082#ifdef DITHER1XBPP
2da0d70d
DB
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086#endif
1087
1088 WRITEBGR16(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1090 return;
1091 case PIX_FMT_YUYV422:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* no RGB conversion on this path: the samples stay in mm1/mm7 and mm3/mm4, which is what WRITEYUY2 consumes */
1094
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1101 return;
1102 }
bca11e75 1103 }else{
2da0d70d
DB
1104 switch(c->dstFormat)
1105 {
1106 case PIX_FMT_RGB32:
1107 YSCALEYUV2PACKEDX
1108 YSCALEYUV2RGBX
1109 WRITEBGR32(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1111 return;
1112 case PIX_FMT_BGR24:
/* same hand-closed asm as in the accurate-rounding branch: REG_c = dest + 3*index */
1113 YSCALEYUV2PACKEDX
1114 YSCALEYUV2RGBX
1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1116 "add %4, %%"REG_c" \n\t"
1117 WRITEBGR24(%%REGc, %5, %%REGa)
1118
1119 :: "r" (&c->redDither),
1120 "m" (dummy), "m" (dummy), "m" (dummy),
1121 "r" (dest), "m" (dstW)
1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123 );
1124 return;
1125 case PIX_FMT_BGR555:
1126 YSCALEYUV2PACKEDX
1127 YSCALEYUV2RGBX
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1129#ifdef DITHER1XBPP
2da0d70d
DB
1130 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1131 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1132 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1133#endif
1134
1135 WRITEBGR15(%4, %5, %%REGa)
1136 YSCALEYUV2PACKEDX_END
1137 return;
1138 case PIX_FMT_BGR565:
1139 YSCALEYUV2PACKEDX
1140 YSCALEYUV2RGBX
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1142#ifdef DITHER1XBPP
2da0d70d
DB
1143 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1144 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1145 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1146#endif
1147
1148 WRITEBGR16(%4, %5, %%REGa)
1149 YSCALEYUV2PACKEDX_END
1150 return;
1151 case PIX_FMT_YUYV422:
1152 YSCALEYUV2PACKEDX
1153 /* no RGB conversion on this path: the samples stay in mm1/mm7 and mm3/mm4, which is what WRITEYUY2 consumes */
1154
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
1159 WRITEYUY2(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1161 return;
bca11e75
MN
1162 }
1163 }
bc279024 1164#endif /* HAVE_MMX */
a31de956 1165#ifdef HAVE_ALTIVEC
2da0d70d
DB
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
1168 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1173 dest, dstW, dstY);
1174 else
1175#endif
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177 chrFilter, chrSrc, chrFilterSize,
1178 dest, dstW, dstY);
c1b0bfb4
MN
1179}
1180
c1b0bfb4
MN
1181/**
1182 * vertical bilinear scale YV12 to RGB
1183 */
25593e29 1184static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1186{
2da0d70d
DB
1187 int yalpha1=yalpha^4095;
1188 int uvalpha1=uvalpha^4095;
1189 int i;
d604bab9 1190
77a416e8 1191#if 0 //isn't used
2da0d70d
DB
1192 if (flags&SWS_FULL_CHR_H_INT)
1193 {
1194 switch(dstFormat)
1195 {
cf7d1c1a 1196#ifdef HAVE_MMX
2da0d70d
DB
1197 case PIX_FMT_RGB32:
1198 asm volatile(
d604bab9
MN
1199
1200
1201FULL_YSCALEYUV2RGB
2da0d70d
DB
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1204
2da0d70d
DB
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1208
2da0d70d
DB
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
d604bab9 1211
2da0d70d
DB
1212 "add $4, %%"REG_a" \n\t"
1213 "cmp %5, %%"REG_a" \n\t"
1214 " jb 1b \n\t"
d604bab9 1215
2da0d70d
DB
1216 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217 "m" (yalpha1), "m" (uvalpha1)
1218 : "%"REG_a
1219 );
1220 break;
1221 case PIX_FMT_BGR24:
1222 asm volatile(
d604bab9
MN
1223
1224FULL_YSCALEYUV2RGB
1225
2da0d70d
DB
1226 // lsb ... msb
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1229
2da0d70d
DB
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1233
2da0d70d
DB
1234 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1235 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1236 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1237 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1238 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1239 "movq %%mm1, %%mm2 \n\t"
1240 "psllq $48, %%mm1 \n\t" // 000000BG
1241 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
d604bab9 1242
2da0d70d
DB
1243 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1244 "psrld $16, %%mm2 \n\t" // R000R000
1245 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1246 "por %%mm2, %%mm1 \n\t" // RBGRR000
d604bab9 1247
2da0d70d
DB
1248 "mov %4, %%"REG_b" \n\t"
1249 "add %%"REG_a", %%"REG_b" \n\t"
d604bab9
MN
1250
1251#ifdef HAVE_MMX2
2da0d70d
DB
1252 //FIXME Alignment
1253 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1254 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1255#else
2da0d70d
DB
1256 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1257 "psrlq $32, %%mm3 \n\t"
1258 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1259 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1260#endif
1261 "add $4, %%"REG_a" \n\t"
1262 "cmp %5, %%"REG_a" \n\t"
1263 " jb 1b \n\t"
1264
1265 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266 "m" (yalpha1), "m" (uvalpha1)
1267 : "%"REG_a, "%"REG_b
1268 );
1269 break;
1270 case PIX_FMT_BGR555:
1271 asm volatile(
d604bab9
MN
1272
1273FULL_YSCALEYUV2RGB
1274#ifdef DITHER1XBPP
2da0d70d
DB
1275 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1276 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1277 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1278#endif
2da0d70d
DB
1279 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1280 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1281 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1282
2da0d70d
DB
1283 "psrlw $3, %%mm3 \n\t"
1284 "psllw $2, %%mm1 \n\t"
1285 "psllw $7, %%mm0 \n\t"
1286 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1287 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9 1288
2da0d70d
DB
1289 "por %%mm3, %%mm1 \n\t"
1290 "por %%mm1, %%mm0 \n\t"
d604bab9 1291
2da0d70d 1292 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1293
2da0d70d
DB
1294 "add $4, %%"REG_a" \n\t"
1295 "cmp %5, %%"REG_a" \n\t"
1296 " jb 1b \n\t"
d604bab9 1297
2da0d70d
DB
1298 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299 "m" (yalpha1), "m" (uvalpha1)
1300 : "%"REG_a
1301 );
1302 break;
1303 case PIX_FMT_BGR565:
1304 asm volatile(
d604bab9
MN
1305
1306FULL_YSCALEYUV2RGB
1307#ifdef DITHER1XBPP
2da0d70d
DB
1308 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1309 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1310 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1311#endif
2da0d70d
DB
1312 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1313 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1314 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1315
2da0d70d
DB
1316 "psrlw $3, %%mm3 \n\t"
1317 "psllw $3, %%mm1 \n\t"
1318 "psllw $8, %%mm0 \n\t"
1319 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1320 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9 1321
2da0d70d
DB
1322 "por %%mm3, %%mm1 \n\t"
1323 "por %%mm1, %%mm0 \n\t"
d604bab9 1324
2da0d70d 1325 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1326
2da0d70d
DB
1327 "add $4, %%"REG_a" \n\t"
1328 "cmp %5, %%"REG_a" \n\t"
1329 " jb 1b \n\t"
d604bab9 1330
2da0d70d
DB
1331 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332 "m" (yalpha1), "m" (uvalpha1)
1333 : "%"REG_a
1334 );
1335 break;
bc279024 1336#endif /* HAVE_MMX */
2da0d70d 1337 case PIX_FMT_BGR32:
cf7d1c1a 1338#ifndef HAVE_MMX
2da0d70d 1339 case PIX_FMT_RGB32:
cf7d1c1a 1340#endif
2da0d70d
DB
1341 if (dstFormat==PIX_FMT_RGB32)
1342 {
1343 int i;
df3c183a 1344#ifdef WORDS_BIGENDIAN
2da0d70d
DB
1345 dest++;
1346#endif
1347 for (i=0;i<dstW;i++){
1348 // vertical linear interpolation && yuv2rgb in a single step:
1349 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1351 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1352 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1355 dest+= 4;
1356 }
1357 }
1358 else if (dstFormat==PIX_FMT_BGR24)
1359 {
1360 int i;
1361 for (i=0;i<dstW;i++){
1362 // vertical linear interpolation && yuv2rgb in a single step:
1363 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1365 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1366 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1369 dest+= 3;
1370 }
1371 }
1372 else if (dstFormat==PIX_FMT_BGR565)
1373 {
1374 int i;
1375 for (i=0;i<dstW;i++){
1376 // vertical linear interpolation && yuv2rgb in a single step:
1377 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1379 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1380
1381 ((uint16_t*)dest)[i] =
1382 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1385 }
1386 }
1387 else if (dstFormat==PIX_FMT_BGR555)
1388 {
1389 int i;
1390 for (i=0;i<dstW;i++){
1391 // vertical linear interpolation && yuv2rgb in a single step:
1392 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1394 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1395
1396 ((uint16_t*)dest)[i] =
1397 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1400 }
1401 }
1402 }//FULL_UV_IPOL
1403 else
1404 {
cf7d1c1a 1405#endif // if 0
d604bab9 1406#ifdef HAVE_MMX
2da0d70d
DB
1407 switch(c->dstFormat)
1408 {
1409 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410 case PIX_FMT_RGB32:
1411 asm volatile(
1412 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1413 "mov %4, %%"REG_b" \n\t"
1414 "push %%"REG_BP" \n\t"
1415 YSCALEYUV2RGB(%%REGBP, %5)
1416 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417 "pop %%"REG_BP" \n\t"
1418 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1419
1420 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421 "a" (&c->redDither)
1422 );
1423 return;
1424 case PIX_FMT_BGR24:
1425 asm volatile(
1426 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1427 "mov %4, %%"REG_b" \n\t"
1428 "push %%"REG_BP" \n\t"
1429 YSCALEYUV2RGB(%%REGBP, %5)
1430 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431 "pop %%"REG_BP" \n\t"
1432 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1433 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434 "a" (&c->redDither)
1435 );
1436 return;
1437 case PIX_FMT_BGR555:
1438 asm volatile(
1439 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_b" \n\t"
1441 "push %%"REG_BP" \n\t"
1442 YSCALEYUV2RGB(%%REGBP, %5)
1443 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1444#ifdef DITHER1XBPP
2da0d70d
DB
1445 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1446 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1447 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1448#endif
1449
1450 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453
1454 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455 "a" (&c->redDither)
1456 );
1457 return;
1458 case PIX_FMT_BGR565:
1459 asm volatile(
1460 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1461 "mov %4, %%"REG_b" \n\t"
1462 "push %%"REG_BP" \n\t"
1463 YSCALEYUV2RGB(%%REGBP, %5)
1464 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1465#ifdef DITHER1XBPP
2da0d70d
DB
1466 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1467 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1468 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1469#endif
1470
1471 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472 "pop %%"REG_BP" \n\t"
1473 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475 "a" (&c->redDither)
1476 );
1477 return;
1478 case PIX_FMT_YUYV422:
1479 asm volatile(
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2PACKED(%%REGBP, %5)
1484 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485 "pop %%"REG_BP" \n\t"
1486 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 "a" (&c->redDither)
1489 );
1490 return;
1491 default: break;
1492 }
cf7d1c1a 1493#endif //HAVE_MMX
25593e29 1494YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
d604bab9
MN
1495}
1496
1497/**
 * YV12 to RGB without scaling or interpolating
 *
 * Writes one packed output line of dstW pixels to dest from a single luma
 * line (buf0) and the chroma lines uvbuf0 / uvbuf1.
 *
 * - If flags has SWS_FULL_CHR_H_INT set, the work is delegated to
 *   yuv2packed2() with buf0 passed for both luma lines and a yalpha of 0.
 * - Otherwise, with HAVE_MMX, the formats RGB32, BGR24, BGR555, BGR565 and
 *   YUYV422 are produced by inline asm and the function returns right after.
 *   uvalpha < 2048 selects the YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 fragments,
 *   anything else the "1b" variants. Neither switch has a default label, so
 *   other formats fall through.
 * - Whatever is left is handled by YSCALE_YUV_2_ANYRGB_C, again split on
 *   uvalpha < 2048.
 *
 * yalpha1, i and yalpha are not referenced directly in this function;
 * presumably the YSCALE_YUV_2_*_C macros use them by name (their definitions
 * are outside this section).
 */
25593e29 1500static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1501 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1502{
2da0d70d
DB
1503 const int yalpha1=0;
1504 int i;
6a4970ab 1505
2da0d70d
DB
1506 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1507 const int yalpha= 4096; //FIXME ...
96034638 1508
2da0d70d
DB
1509 if (flags&SWS_FULL_CHR_H_INT)
1510 {
1511 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1512 return;
1513 }
397c035e
MN
1514
1515#ifdef HAVE_MMX
/*
 * Every asm statement below follows the same frame:
 *  - REG_b is stored at ESP_OFFSET(%5), i.e. relative to &c->redDither, and
 *    restored from there at the end, so it is not listed as clobbered;
 *  - REG_b is then loaded with dest (%4) and used as the write pointer;
 *  - REG_BP is pushed, used as the loop index, and popped again;
 *  - 8280(%5) is the destination width (the note in yuv2packed2 says
 *    8280 == DSTW_OFFSET).
 */
e5091488 1516 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d
DB
1517 {
1518 switch(dstFormat)
1519 {
1520 case PIX_FMT_RGB32:
1521 asm volatile(
1522 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1523 "mov %4, %%"REG_b" \n\t"
1524 "push %%"REG_BP" \n\t"
1525 YSCALEYUV2RGB1(%%REGBP, %5)
1526 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1527 "pop %%"REG_BP" \n\t"
1528 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1529
1530 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1531 "a" (&c->redDither)
1532 );
1533 return;
1534 case PIX_FMT_BGR24:
1535 asm volatile(
1536 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1537 "mov %4, %%"REG_b" \n\t"
1538 "push %%"REG_BP" \n\t"
1539 YSCALEYUV2RGB1(%%REGBP, %5)
1540 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1541 "pop %%"REG_BP" \n\t"
1542 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1543
1544 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545 "a" (&c->redDither)
1546 );
1547 return;
1548 case PIX_FMT_BGR555:
1549 asm volatile(
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1555#ifdef DITHER1XBPP
2da0d70d
DB
1556 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1557 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1558 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1559#endif
1560 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1563
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565 "a" (&c->redDither)
1566 );
1567 return;
1568 case PIX_FMT_BGR565:
1569 asm volatile(
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1575#ifdef DITHER1XBPP
2da0d70d
DB
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1579#endif
1580
1581 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1582 "pop %%"REG_BP" \n\t"
1583 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1584
1585 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1586 "a" (&c->redDither)
1587 );
1588 return;
1589 case PIX_FMT_YUYV422:
1590 asm volatile(
1591 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1592 "mov %4, %%"REG_b" \n\t"
1593 "push %%"REG_BP" \n\t"
1594 YSCALEYUV2PACKED1(%%REGBP, %5)
1595 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1596 "pop %%"REG_BP" \n\t"
1597 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1598
1599 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1600 "a" (&c->redDither)
1601 );
1602 return;
1603 }
1604 }
1605 else
1606 {
/* uvalpha >= 2048: same dispatch, using the "1b" conversion fragments */
1607 switch(dstFormat)
1608 {
1609 case PIX_FMT_RGB32:
1610 asm volatile(
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2RGB1b(%%REGBP, %5)
1615 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1618
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620 "a" (&c->redDither)
1621 );
1622 return;
1623 case PIX_FMT_BGR24:
1624 asm volatile(
1625 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1626 "mov %4, %%"REG_b" \n\t"
1627 "push %%"REG_BP" \n\t"
1628 YSCALEYUV2RGB1b(%%REGBP, %5)
1629 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1630 "pop %%"REG_BP" \n\t"
1631 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1632
1633 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1634 "a" (&c->redDither)
1635 );
1636 return;
1637 case PIX_FMT_BGR555:
1638 asm volatile(
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1644#ifdef DITHER1XBPP
2da0d70d
DB
1645 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1646 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1647 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1648#endif
1649 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1652
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654 "a" (&c->redDither)
1655 );
1656 return;
1657 case PIX_FMT_BGR565:
1658 asm volatile(
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1664#ifdef DITHER1XBPP
2da0d70d
DB
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1668#endif
1669
1670 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1671 "pop %%"REG_BP" \n\t"
1672 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1673
1674 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1675 "a" (&c->redDither)
1676 );
1677 return;
1678 case PIX_FMT_YUYV422:
1679 asm volatile(
1680 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1681 "mov %4, %%"REG_b" \n\t"
1682 "push %%"REG_BP" \n\t"
1683 YSCALEYUV2PACKED1b(%%REGBP, %5)
1684 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1685 "pop %%"REG_BP" \n\t"
1686 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1687
1688 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1689 "a" (&c->redDither)
1690 );
1691 return;
1692 }
1693 }
bc279024 1694#endif /* HAVE_MMX */
/* generic C path: reached without MMX, or for formats the switches above do not list */
e5091488 1695 if (uvalpha < 2048)
2da0d70d
DB
1696 {
1697 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1698 }else{
1699 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1700 }
d604bab9
MN
1701}
1702
6ff0ad6b
MN
1703//FIXME yuy2* can read up to 7 samples too many
1704
7f526efd 1705static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1e621b18 1706{
6ff0ad6b 1707#ifdef HAVE_MMX
2da0d70d
DB
1708 asm volatile(
1709 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1710 "mov %0, %%"REG_a" \n\t"
1711 "1: \n\t"
1712 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1713 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1714 "pand %%mm2, %%mm0 \n\t"
1715 "pand %%mm2, %%mm1 \n\t"
1716 "packuswb %%mm1, %%mm0 \n\t"
1717 "movq %%mm0, (%2, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
1719 " js 1b \n\t"
1720 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1721 : "%"REG_a
1722 );
1e621b18 1723#else
2da0d70d
DB
1724 int i;
1725 for (i=0; i<width; i++)
1726 dst[i]= src[2*i];
1e621b18
MN
1727#endif
1728}
1729
7f526efd 1730static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 1731{
c2271987 1732#ifdef HAVE_MMX
2da0d70d
DB
1733 asm volatile(
1734 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1735 "mov %0, %%"REG_a" \n\t"
1736 "1: \n\t"
1737 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1738 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%3, %%"REG_a") \n\t"
1748 "movd %%mm1, (%2, %%"REG_a") \n\t"
1749 "add $4, %%"REG_a" \n\t"
1750 " js 1b \n\t"
1751 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1752 : "%"REG_a
1753 );
1e621b18 1754#else
2da0d70d
DB
1755 int i;
1756 for (i=0; i<width; i++)
1757 {
1758 dstU[i]= src1[4*i + 1];
1759 dstV[i]= src1[4*i + 3];
1760 }
1761#endif
1762 assert(src1 == src2);
1e621b18
MN
1763}
1764
4cf16bbe
DB
1765/* This is almost identical to the previous, and exists only because
1766 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
7f526efd 1767static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
7322a67c
MN
1768{
1769#ifdef HAVE_MMX
2da0d70d
DB
1770 asm volatile(
1771 "mov %0, %%"REG_a" \n\t"
1772 "1: \n\t"
1773 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1774 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1775 "psrlw $8, %%mm0 \n\t"
1776 "psrlw $8, %%mm1 \n\t"
1777 "packuswb %%mm1, %%mm0 \n\t"
1778 "movq %%mm0, (%2, %%"REG_a") \n\t"
1779 "add $8, %%"REG_a" \n\t"
1780 " js 1b \n\t"
1781 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1782 : "%"REG_a
1783 );
7322a67c 1784#else
2da0d70d
DB
1785 int i;
1786 for (i=0; i<width; i++)
1787 dst[i]= src[2*i+1];
7322a67c
MN
1788#endif
1789}
1790
7f526efd 1791static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
7322a67c 1792{
c2271987 1793#ifdef HAVE_MMX
2da0d70d
DB
1794 asm volatile(
1795 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1796 "mov %0, %%"REG_a" \n\t"
1797 "1: \n\t"
1798 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1799 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1800 "pand %%mm4, %%mm0 \n\t"
1801 "pand %%mm4, %%mm1 \n\t"
1802 "packuswb %%mm1, %%mm0 \n\t"
1803 "movq %%mm0, %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "pand %%mm4, %%mm1 \n\t"
1806 "packuswb %%mm0, %%mm0 \n\t"
1807 "packuswb %%mm1, %%mm1 \n\t"
1808 "movd %%mm0, (%3, %%"REG_a") \n\t"
1809 "movd %%mm1, (%2, %%"REG_a") \n\t"
1810 "add $4, %%"REG_a" \n\t"
1811 " js 1b \n\t"
1812 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1813 : "%"REG_a
1814 );
7322a67c 1815#else
2da0d70d
DB
1816 int i;
1817 for (i=0; i<width; i++)
1818 {
1819 dstU[i]= src1[4*i + 0];
1820 dstV[i]= src1[4*i + 2];
1821 }
1822#endif
1823 assert(src1 == src2);
7322a67c
MN
1824}
1825
1e621b18
MN
1826static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827{
2da0d70d
DB
1828 int i;
1829 for (i=0; i<width; i++)
1830 {
1831 int b= ((uint32_t*)src)[i]&0xFF;
1832 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1833 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1834
e5091488 1835 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1836 }
1e621b18
MN
1837}
1838
1839static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840{
2da0d70d
DB
1841 int i;
1842 assert(src1 == src2);
1843 for (i=0; i<width; i++)
1844 {
1845 const int a= ((uint32_t*)src1)[2*i+0];
1846 const int e= ((uint32_t*)src1)[2*i+1];
1847 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848 const int h= (a&0x00FF00) + (e&0x00FF00);
1849 const int b= l&0x3FF;
1850 const int g= h>>8;
1851 const int r= l>>16;
1852
1853 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1855 }
1e621b18
MN
1856}
1857
7f526efd 1858static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1e621b18 1859{
ac6a2e45 1860#ifdef HAVE_MMX
2da0d70d
DB
1861 asm volatile(
1862 "mov %2, %%"REG_a" \n\t"
5802683a
RD
1863 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1864 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2da0d70d
DB
1865 "pxor %%mm7, %%mm7 \n\t"
1866 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1867 ASMALIGN(4)
1868 "1: \n\t"
1869 PREFETCH" 64(%0, %%"REG_d") \n\t"
1870 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1871 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1872 "punpcklbw %%mm7, %%mm0 \n\t"
1873 "punpcklbw %%mm7, %%mm1 \n\t"
1874 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1875 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1876 "punpcklbw %%mm7, %%mm2 \n\t"
1877 "punpcklbw %%mm7, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm1 \n\t"
1880 "pmaddwd %%mm6, %%mm2 \n\t"
1881 "pmaddwd %%mm6, %%mm3 \n\t"
ac6a2e45 1882#ifndef FAST_BGR2YV12
2da0d70d
DB
1883 "psrad $8, %%mm0 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1887#endif
1888 "packssdw %%mm1, %%mm0 \n\t"
1889 "packssdw %%mm3, %%mm2 \n\t"
1890 "pmaddwd %%mm5, %%mm0 \n\t"
1891 "pmaddwd %%mm5, %%mm2 \n\t"
1892 "packssdw %%mm2, %%mm0 \n\t"
1893 "psraw $7, %%mm0 \n\t"
1894
1895 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1896 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm4 \n\t"
1898 "punpcklbw %%mm7, %%mm1 \n\t"
1899 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1900 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm6, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm6, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
ac6a2e45 1907#ifndef FAST_BGR2YV12
2da0d70d
DB
1908 "psrad $8, %%mm4 \n\t"
1909 "psrad $8, %%mm1 \n\t"
1910 "psrad $8, %%mm2 \n\t"
1911 "psrad $8, %%mm3 \n\t"
1912#endif
1913 "packssdw %%mm1, %%mm4 \n\t"
1914 "packssdw %%mm3, %%mm2 \n\t"
1915 "pmaddwd %%mm5, %%mm4 \n\t"
1916 "pmaddwd %%mm5, %%mm2 \n\t"
1917 "add $24, %%"REG_d" \n\t"
1918 "packssdw %%mm2, %%mm4 \n\t"
1919 "psraw $7, %%mm4 \n\t"
1920
1921 "packuswb %%mm4, %%mm0 \n\t"
5802683a 1922 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2da0d70d
DB
1923
1924 "movq %%mm0, (%1, %%"REG_a") \n\t"
1925 "add $8, %%"REG_a" \n\t"
1926 " js 1b \n\t"
1927 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928 : "%"REG_a, "%"REG_d
1929 );
1e621b18 1930#else
2da0d70d
DB
1931 int i;
1932 for (i=0; i<width; i++)
1933 {
1934 int b= src[i*3+0];
1935 int g= src[i*3+1];
1936 int r= src[i*3+2];
1e621b18 1937
e5091488 1938 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 1939 }
bc279024 1940#endif /* HAVE_MMX */
1e621b18
MN
1941}
1942
7f526efd 1943static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1e621b18 1944{
4342fc14 1945#ifdef HAVE_MMX
2da0d70d
DB
1946 asm volatile(
1947 "mov %3, %%"REG_a" \n\t"
5802683a
RD
1948 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1949 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2da0d70d
DB
1950 "pxor %%mm7, %%mm7 \n\t"
1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1952 "add %%"REG_d", %%"REG_d" \n\t"
1953 ASMALIGN(4)
1954 "1: \n\t"
1955 PREFETCH" 64(%0, %%"REG_d") \n\t"
4342fc14 1956#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2da0d70d
DB
1957 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1959 "movq %%mm0, %%mm1 \n\t"
1960 "movq %%mm2, %%mm3 \n\t"
1961 "psrlq $24, %%mm0 \n\t"
1962 "psrlq $24, %%mm2 \n\t"
1963 PAVGB(%%mm1, %%mm0)
1964 PAVGB(%%mm3, %%mm2)
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 1967#else
2da0d70d
DB
1968 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm2 \n\t"
1972 "paddw %%mm2, %%mm0 \n\t"
1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1975 "punpcklbw %%mm7, %%mm4 \n\t"
1976 "punpcklbw %%mm7, %%mm2 \n\t"
1977 "paddw %%mm4, %%mm2 \n\t"
1978 "psrlw $1, %%mm0 \n\t"
1979 "psrlw $1, %%mm2 \n\t"
1980#endif
5802683a
RD
1981 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1982 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2da0d70d
DB
1983
1984 "pmaddwd %%mm0, %%mm1 \n\t"
1985 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "pmaddwd %%mm6, %%mm0 \n\t"
1987 "pmaddwd %%mm6, %%mm2 \n\t"
4342fc14 1988#ifndef FAST_BGR2YV12
2da0d70d
DB
1989 "psrad $8, %%mm0 \n\t"
1990 "psrad $8, %%mm1 \n\t"
1991 "psrad $8, %%mm2 \n\t"
1992 "psrad $8, %%mm3 \n\t"
1993#endif
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "packssdw %%mm3, %%mm1 \n\t"
1996 "pmaddwd %%mm5, %%mm0 \n\t"
1997 "pmaddwd %%mm5, %%mm1 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1999 "psraw $7, %%mm0 \n\t"
4342fc14
MN
2000
2001#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2da0d70d
DB
2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2004 "movq %%mm4, %%mm1 \n\t"
2005 "movq %%mm2, %%mm3 \n\t"
2006 "psrlq $24, %%mm4 \n\t"
2007 "psrlq $24, %%mm2 \n\t"
2008 PAVGB(%%mm1, %%mm4)
2009 PAVGB(%%mm3, %%mm2)
2010 "punpcklbw %%mm7, %%mm4 \n\t"
2011 "punpcklbw %%mm7, %%mm2 \n\t"
4342fc14 2012#else
2da0d70d
DB
2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2015 "punpcklbw %%mm7, %%mm4 \n\t"
2016 "punpcklbw %%mm7, %%mm2 \n\t"
2017 "paddw %%mm2, %%mm4 \n\t"
2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2020 "punpcklbw %%mm7, %%mm5 \n\t"
2021 "punpcklbw %%mm7, %%mm2 \n\t"
2022 "paddw %%mm5, %%mm2 \n\t"
5802683a 2023 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2da0d70d
DB
2024 "psrlw $2, %%mm4 \n\t"
2025 "psrlw $2, %%mm2 \n\t"
2026#endif
5802683a
RD
2027 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2028 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2da0d70d
DB
2029
2030 "pmaddwd %%mm4, %%mm1 \n\t"
2031 "pmaddwd %%mm2, %%mm3 \n\t"
2032 "pmaddwd %%mm6, %%mm4 \n\t"
2033 "pmaddwd %%mm6, %%mm2 \n\t"
4342fc14 2034#ifndef FAST_BGR2YV12
2da0d70d
DB
2035 "psrad $8, %%mm4 \n\t"
2036 "psrad $8, %%mm1 \n\t"
2037 "psrad $8, %%mm2 \n\t"
2038 "psrad $8, %%mm3 \n\t"
2039#endif
2040 "packssdw %%mm2, %%mm4 \n\t"
2041 "packssdw %%mm3, %%mm1 \n\t"
2042 "pmaddwd %%mm5, %%mm4 \n\t"
2043 "pmaddwd %%mm5, %%mm1 \n\t"
2044 "add $24, %%"REG_d" \n\t"
2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2046 "psraw $7, %%mm4 \n\t"
2047
2048 "movq %%mm0, %%mm1 \n\t"
2049 "punpckldq %%mm4, %%mm0 \n\t"
2050 "punpckhdq %%mm4, %%mm1 \n\t"
2051 "packsswb %%mm1, %%mm0 \n\t"
5802683a 2052 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2da0d70d
DB
2053
2054 "movd %%mm0, (%1, %%"REG_a") \n\t"
2055 "punpckhdq %%mm0, %%mm0 \n\t"
2056 "movd %%mm0, (%2, %%"REG_a") \n\t"
2057 "add $4, %%"REG_a" \n\t"
2058 " js 1b \n\t"
2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2060 : "%"REG_a, "%"REG_d
2061 );
1e621b18 2062#else
2da0d70d
DB
2063 int i;
2064 for (i=0; i<width; i++)
2065 {
2066 int b= src1[6*i + 0] + src1[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4];
2068 int r= src1[6*i + 2] + src1[6*i + 5];
2069
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2072 }
bc279024 2073#endif /* HAVE_MMX */
2da0d70d 2074 assert(src1 == src2);
1e621b18
MN
2075}
2076
a680708d 2077static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
6af250ea 2078{
2da0d70d
DB
2079 int i;
2080 for (i=0; i<width; i++)
2081 {
2082 int d= ((uint16_t*)src)[i];
2083 int b= d&0x1F;
2084 int g= (d>>5)&0x3F;
2085 int r= (d>>11)&0x1F;
2086
2087 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2088 }
6af250ea
MN
2089}
2090
a680708d 2091static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
6af250ea 2092{
2da0d70d
DB
2093 int i;
2094 assert(src1==src2);
2095 for (i=0; i<width; i++)
2096 {
2097 int d0= ((uint32_t*)src1)[i];
2098
2099 int dl= (d0&0x07E0F81F);
2100 int dh= ((d0>>5)&0x07C0F83F);
2101
2102 int dh2= (dh>>11) + (dh<<21);
2103 int d= dh2 + dl;
2104
2105 int b= d&0x7F;
2106 int r= (d>>11)&0x7F;
2107 int g= d>>21;
2108 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2110 }
6af250ea
MN
2111}
2112
a680708d 2113static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
b72034dd 2114{
2da0d70d
DB
2115 int i;
2116 for (i=0; i<width; i++)
2117 {
2118 int d= ((uint16_t*)src)[i];
2119 int b= d&0x1F;
2120 int g= (d>>5)&0x1F;
2121 int r= (d>>10)&0x1F;
2122
2123 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2124 }
b72034dd
MN
2125}
2126
a680708d 2127static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
b72034dd 2128{
2da0d70d
DB
2129 int i;
2130 assert(src1==src2);
2131 for (i=0; i<width; i++)
2132 {
2133 int d0= ((uint32_t*)src1)[i];
2134
2135 int dl= (d0&0x03E07C1F);
2136 int dh= ((d0>>5)&0x03E0F81F);
2137
2138 int dh2= (dh>>11) + (dh<<21);
2139 int d= dh2 + dl;
2140
2141 int b= d&0x7F;
2142 int r= (d>>10)&0x7F;
2143 int g= d>>21;
2144 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2146 }
b72034dd
MN
2147}
2148
2149
a861d4d7
MN
2150static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151{
2da0d70d
DB
2152 int i;
2153 for (i=0; i<width; i++)
2154 {
2155 int r= ((uint32_t*)src)[i]&0xFF;
2156 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2157 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2158
e5091488 2159 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2160 }
a861d4d7
MN
2161}
2162
2163static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164{
2da0d70d
DB
2165 int i;
2166 assert(src1==src2);
2167 for (i=0; i<width; i++)
2168 {
2169 const int a= ((uint32_t*)src1)[2*i+0];
2170 const int e= ((uint32_t*)src1)[2*i+1];
2171 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2172 const int h= (a&0x00FF00) + (e&0x00FF00);
2173 const int r= l&0x3FF;
2174 const int g= h>>8;
2175 const int b= l>>16;
2176
2177 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2179 }
a861d4d7
MN
2180}
2181
2182static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183{
2da0d70d
DB
2184 int i;
2185 for (i=0; i<width; i++)
2186 {
2187 int r= src[i*3+0];
2188 int g= src[i*3+1];
2189 int b= src[i*3+2];
2190
e5091488 2191 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2192 }
a861d4d7
MN
2193}
2194
2195static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196{
2da0d70d
DB
2197 int i;
2198 assert(src1==src2);
2199 for (i=0; i<width; i++)
2200 {
2201 int r= src1[6*i + 0] + src1[6*i + 3];
2202 int g= src1[6*i + 1] + src1[6*i + 4];
2203 int b= src1[6*i + 2] + src1[6*i + 5];
2204
2205 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2207 }
a861d4d7
MN
2208}
2209
a680708d 2210static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
a43fb6b3 2211{
2da0d70d
DB
2212 int i;
2213 for (i=0; i<width; i++)
2214 {
2215 int d= ((uint16_t*)src)[i];
2216 int r= d&0x1F;
2217 int g= (d>>5)&0x3F;
2218 int b= (d>>11)&0x1F;
2219
2220 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2221 }
a43fb6b3
LA
2222}
2223
a680708d 2224static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
a43fb6b3 2225{
2da0d70d
DB
2226 int i;
2227 assert(src1 == src2);
2228 for (i=0; i<width; i++)
2229 {
2230 int d0= ((uint32_t*)src1)[i];
2231
2232 int dl= (d0&0x07E0F81F);
fa884294 2233 int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2da0d70d 2234
fa884294
IP
2235 int r= d&0x3F;
2236 int b= (d>>11)&0x3F;
2da0d70d
DB
2237 int g= d>>21;
2238 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2239 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2240 }
a43fb6b3
LA
2241}
2242
a680708d 2243static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
a43fb6b3 2244{
2da0d70d
DB
2245 int i;
2246 for (i=0; i<width; i++)
2247 {
2248 int d= ((uint16_t*)src)[i];
2249 int r= d&0x1F;
2250 int g= (d>>5)&0x1F;
2251 int b= (d>>10)&0x1F;
2252
2253 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2254 }
a43fb6b3
LA
2255}
2256
a680708d 2257static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
a43fb6b3 2258{
2da0d70d
DB
2259 int i;
2260 assert(src1 == src2);
2261 for (i=0; i<width; i++)
2262 {
2263 int d0= ((uint32_t*)src1)[i];
2264
2265 int dl= (d0&0x03E07C1F);
f96829d2 2266 int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2da0d70d 2267
f96829d2
IP
2268 int r= d&0x3F;
2269 int b= (d>>10)&0x3F;
2270 int g= d>>21;
2da0d70d
DB
2271 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2272 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2273 }
a43fb6b3 2274}
1e621b18 2275
e28630fc
MN
2276static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2277{
2da0d70d
DB
2278 int i;
2279 for (i=0; i<width; i++)
2280 {
2281 int d= src[i];
e28630fc 2282
2da0d70d
DB
2283 dst[i]= pal[d] & 0xFF;
2284 }
e28630fc
MN
2285}
2286
2287static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2288{
2da0d70d
DB
2289 int i;
2290 assert(src1 == src2);
2291 for (i=0; i<width; i++)
2292 {
2293 int p= pal[src1[i]];
2294
2295 dstU[i]= p>>8;
2296 dstV[i]= p>>16;
2297 }
e28630fc
MN
2298}
2299
077ea8a7
MN
2300// Bilinear / Bicubic scaling
/*
 * Horizontal FIR scaler for one line.
 *
 * As spelled out by the plain C path at the bottom, for every output index
 * i < dstW:
 *     val    = sum over j < filterSize of src[filterPos[i] + j] * filter[filterSize*i + j]
 *     dst[i] = av_clip(val>>7, 0, (1<<15)-1)
 *
 * Build variants:
 *  - HAVE_MMX: asserts that filterSize is a positive multiple of 4, with
 *    dedicated loops for filterSize 4 and 8 and a generic loop otherwise.
 *    Each asm iteration stores two output samples.
 *  - HAVE_ALTIVEC (without MMX): forwards all arguments to hScale_altivec_real().
 *  - otherwise: the plain C loop.
 *
 * srcW and xInc are not used by the MMX and C paths visible here; they are
 * only passed through to the AltiVec implementation.
 */
2301static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 2302 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 2303{
077ea8a7 2304#ifdef HAVE_MMX
2da0d70d
DB
2305 assert(filterSize % 4 == 0 && filterSize>0);
2306 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2307 {
/* counter runs from -2*dstW up to 0; the pointers are pre-biased so that
 * indexing them with the negative counter starts at their first element. */
2308 long counter= -2*dstW;
2309 filter-= counter*2;
2310 filterPos-= counter/2;
2311 dst-= counter/2;
2312 asm volatile(
83c89c78 2313#if defined(PIC)
2da0d70d
DB
2314 "push %%"REG_b" \n\t"
2315#endif
2316 "pxor %%mm7, %%mm7 \n\t"
2317 "movq "MANGLE(w02)", %%mm6 \n\t"
2318 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2319 "mov %%"REG_a", %%"REG_BP" \n\t"
2320 ASMALIGN(4)
2321 "1: \n\t"
2322 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2323 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2324 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2325 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2326 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2327 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2328 "punpcklbw %%mm7, %%mm0 \n\t"
2329 "punpcklbw %%mm7, %%mm2 \n\t"
2330 "pmaddwd %%mm1, %%mm0 \n\t"
2331 "pmaddwd %%mm2, %%mm3 \n\t"
2332 "psrad $8, %%mm0 \n\t"
2333 "psrad $8, %%mm3 \n\t"
2334 "packssdw %%mm3, %%mm0 \n\t"
2335 "pmaddwd %%mm6, %%mm0 \n\t"
2336 "packssdw %%mm0, %%mm0 \n\t"
2337 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2338 "add $4, %%"REG_BP" \n\t"
2339 " jnc 1b \n\t"
2340
2341 "pop %%"REG_BP" \n\t"
83c89c78 2342#if defined(PIC)
2da0d70d 2343 "pop %%"REG_b" \n\t"
83c89c78 2344#endif
2da0d70d
DB
2345 : "+a" (counter)
2346 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2347#if !defined(PIC)
2da0d70d
DB
2348 : "%"REG_b
2349#endif
2350 );
2351 }
2352 else if (filterSize==8)
2353 {
/* Same scheme as the 4-tap case, with twice as many coefficients per output. */
2354 long counter= -2*dstW;
2355 filter-= counter*4;
2356 filterPos-= counter/2;
2357 dst-= counter/2;
2358 asm volatile(
83c89c78 2359#if defined(PIC)
2da0d70d
DB
2360 "push %%"REG_b" \n\t"
2361#endif
2362 "pxor %%mm7, %%mm7 \n\t"
2363 "movq "MANGLE(w02)", %%mm6 \n\t"
2364 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2365 "mov %%"REG_a", %%"REG_BP" \n\t"
2366 ASMALIGN(4)
2367 "1: \n\t"
2368 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2369 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2370 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2371 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2372 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2373 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2374 "punpcklbw %%mm7, %%mm0 \n\t"
2375 "punpcklbw %%mm7, %%mm2 \n\t"
2376 "pmaddwd %%mm1, %%mm0 \n\t"
2377 "pmaddwd %%mm2, %%mm3 \n\t"
2378
2379 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2380 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2381 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2382 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2383 "punpcklbw %%mm7, %%mm4 \n\t"
2384 "punpcklbw %%mm7, %%mm2 \n\t"
2385 "pmaddwd %%mm1, %%mm4 \n\t"
2386 "pmaddwd %%mm2, %%mm5 \n\t"
2387 "paddd %%mm4, %%mm0 \n\t"
2388 "paddd %%mm5, %%mm3 \n\t"
2389
2390 "psrad $8, %%mm0 \n\t"
2391 "psrad $8, %%mm3 \n\t"
2392 "packssdw %%mm3, %%mm0 \n\t"
2393 "pmaddwd %%mm6, %%mm0 \n\t"
2394 "packssdw %%mm0, %%mm0 \n\t"
2395 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2396 "add $4, %%"REG_BP" \n\t"
2397 " jnc 1b \n\t"
2398
2399 "pop %%"REG_BP" \n\t"
83c89c78 2400#if defined(PIC)
2da0d70d 2401 "pop %%"REG_b" \n\t"
83c89c78 2402#endif
2da0d70d
DB
2403 : "+a" (counter)
2404 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2405#if !defined(PIC)
2da0d70d
DB
2406 : "%"REG_b
2407#endif
2408 );
2409 }
2410 else
2411 {
/* Generic case: the inner loop (label 2) consumes 4 taps per pass for two
 * outputs at once and stops when the tap pointer reaches offset
 * (src + filterSize). */
2412 uint8_t *offset = src+filterSize;
2413 long counter= -2*dstW;
2414 //filter-= counter*filterSize/2;
2415 filterPos-= counter/2;
2416 dst-= counter/2;
2417 asm volatile(
2418 "pxor %%mm7, %%mm7 \n\t"
2419 "movq "MANGLE(w02)", %%mm6 \n\t"
2420 ASMALIGN(4)
2421 "1: \n\t"
2422 "mov %2, %%"REG_c" \n\t"
2423 "movzwl (%%"REG_c", %0), %%eax \n\t"
2424 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2425 "mov %5, %%"REG_c" \n\t"
2426 "pxor %%mm4, %%mm4 \n\t"
2427 "pxor %%mm5, %%mm5 \n\t"
2428 "2: \n\t"
2429 "movq (%1), %%mm1 \n\t"
2430 "movq (%1, %6), %%mm3 \n\t"
2431 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2432 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2433 "punpcklbw %%mm7, %%mm0 \n\t"
2434 "punpcklbw %%mm7, %%mm2 \n\t"
2435 "pmaddwd %%mm1, %%mm0 \n\t"
2436 "pmaddwd %%mm2, %%mm3 \n\t"
2437 "paddd %%mm3, %%mm5 \n\t"
2438 "paddd %%mm0, %%mm4 \n\t"
2439 "add $8, %1 \n\t"
2440 "add $4, %%"REG_c" \n\t"
2441 "cmp %4, %%"REG_c" \n\t"
2442 " jb 2b \n\t"
2443 "add %6, %1 \n\t"
2444 "psrad $8, %%mm4 \n\t"
2445 "psrad $8, %%mm5 \n\t"
2446 "packssdw %%mm5, %%mm4 \n\t"
2447 "pmaddwd %%mm6, %%mm4 \n\t"
2448 "packssdw %%mm4, %%mm4 \n\t"
2449 "mov %3, %%"REG_a" \n\t"
2450 "movd %%mm4, (%%"REG_a", %0) \n\t"
2451 "add $4, %0 \n\t"
2452 " jnc 1b \n\t"
2453
2454 : "+r" (counter), "+r" (filter)
2455 : "m" (filterPos), "m" (dst), "m"(offset),
2456 "m" (src), "r" (filterSize*2)
2457 : "%"REG_a, "%"REG_c, "%"REG_d
2458 );
2459 }
077ea8a7 2460#else
8c266f0c 2461#ifdef HAVE_ALTIVEC
2da0d70d 2462 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2463#else
2da0d70d
DB
/* Reference implementation: one dot product of filterSize taps per output. */
2464 int i;
2465 for (i=0; i<dstW; i++)
2466 {
2467 int j;
2468 int srcPos= filterPos[i];
2469 int val=0;
2470 //printf("filterPos: %d\n", filterPos[i]);
2471 for (j=0; j<filterSize; j++)
2472 {
2473 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2474 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2475 }
2476 //filter += hFilterSize;
2477 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2478 //dst[i] = val>>7;
2479 }
bc279024
DB
2480#endif /* HAVE_ALTIVEC */
2481#endif /* HAVE_MMX */
077ea8a7 2482}
2ff198c1 2483 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into dst.
 *
 * Step 1: if srcFormat is one of the packed YUV, 16-bit gray, RGB/BGR or
 *         paletted formats tested below, the matching *ToY converter writes
 *         an 8-bit luma line into formatConvBuffer and src is redirected to
 *         it. For any other format src is used as passed in. pal is cast to
 *         uint32_t* for the paletted converter.
 * Step 2: unless SWS_FAST_BILINEAR is set in flags (and, in HAVE_MMX builds,
 *         canMMX2BeUsed is non-zero), the line is scaled by hScale() with
 *         hLumFilter / hLumFilterPos / hLumFilterSize.
 *         Otherwise a fast bilinear path is used:
 *           - x86 + MMX2 with canMMX2BeUsed: calls the code at funnyYCode
 *             eight times with mmx2Filter / mmx2FilterPos, then overwrites
 *             the trailing outputs whose source position is >= srcW-1 with
 *             src[srcW-1]*128;
 *           - other x86: a scalar asm loop producing two outputs per pass;
 *           - non-x86: the C loop at the bottom, which steps a 16.16
 *             fixed-point position by xInc and interpolates between src[xx]
 *             and src[xx+1] with a 7-bit weight.
 */
065ee1ec 2484static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2485 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2486 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2487 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2488 int32_t *mmx2FilterPos, uint8_t *pal)
077ea8a7 2489{
2da0d70d 2490 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2491 {
2da0d70d
DB
2492 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2493 src= formatConvBuffer;
1e621b18 2494 }
2da0d70d 2495 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2496 {
2da0d70d
DB
2497 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2498 src= formatConvBuffer;
7322a67c 2499 }
2da0d70d 2500 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2501 {
2da0d70d
DB
2502 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2503 src= formatConvBuffer;
1e621b18 2504 }
2da0d70d 2505 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2506 {
2da0d70d
DB
2507 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2508 src= formatConvBuffer;
1e621b18 2509 }
2da0d70d 2510 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2511 {
2da0d70d
DB
2512 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2513 src= formatConvBuffer;
6af250ea 2514 }
2da0d70d 2515 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2516 {
2da0d70d
DB
2517 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2518 src= formatConvBuffer;
b72034dd 2519 }
2da0d70d 2520 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2521 {
2da0d70d
DB
2522 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2523 src= formatConvBuffer;
a861d4d7 2524 }
2da0d70d 2525 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2526 {
2da0d70d
DB
2527 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2528 src= formatConvBuffer;
a861d4d7 2529 }
2da0d70d 2530 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2531 {
2da0d70d
DB
2532 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2533 src= formatConvBuffer;
a43fb6b3 2534 }
2da0d70d 2535 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2536 {
2da0d70d
DB
2537 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2538 src= formatConvBuffer;
a43fb6b3 2539 }
2da0d70d 2540 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2541 {
87cf861c 2542 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2da0d70d 2543 src= formatConvBuffer;
e28630fc 2544 }
1e621b18 2545
e3d2500f 2546#ifdef HAVE_MMX
ddda6fcd 2547 // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2da0d70d 2548 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2549#else
2da0d70d 2550 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2551#endif
077ea8a7 2552 {
2da0d70d 2553 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7
MN
2554 }
2555 else // Fast Bilinear upscale / crap downscale
2556 {
3d6a30d9 2557#if defined(ARCH_X86)
2ff198c1 2558#ifdef HAVE_MMX2
2da0d70d 2559 int i;
83c89c78 2560#if defined(PIC)
/* In PIC builds the asm saves and restores the b register through this
 * variable instead of listing it as a clobber. */
2da0d70d 2561 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2562#endif
2da0d70d
DB
2563 if (canMMX2BeUsed)
2564 {
2565 asm volatile(
83c89c78 2566#if defined(PIC)
2da0d70d
DB
2567 "mov %%"REG_b", %5 \n\t"
2568#endif
2569 "pxor %%mm7, %%mm7 \n\t"
2570 "mov %0, %%"REG_c" \n\t"
2571 "mov %1, %%"REG_D" \n\t"
2572 "mov %2, %%"REG_d" \n\t"
2573 "mov %3, %%"REG_b" \n\t"
2574 "xor %%"REG_a", %%"REG_a" \n\t" // i
2575 PREFETCH" (%%"REG_c") \n\t"
2576 PREFETCH" 32(%%"REG_c") \n\t"
2577 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2578
6d606c4f
AJ
2579#ifdef ARCH_X86_64
2580
2581#define FUNNY_Y_CODE \
2da0d70d
DB
2582 "movl (%%"REG_b"), %%esi \n\t"\
2583 "call *%4 \n\t"\
2584 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2585 "add %%"REG_S", %%"REG_c" \n\t"\
2586 "add %%"REG_a", %%"REG_D" \n\t"\
2587 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2588
2589#else
2590
2ff198c1 2591#define FUNNY_Y_CODE \
2da0d70d
DB
2592 "movl (%%"REG_b"), %%esi \n\t"\
2593 "call *%4 \n\t"\
2594 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2595 "add %%"REG_a", %%"REG_D" \n\t"\
2596 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2597
bc279024 2598#endif /* ARCH_X86_64 */
6d606c4f 2599
2ff198c1
MN
2600FUNNY_Y_CODE
2601FUNNY_Y_CODE
2602FUNNY_Y_CODE
2603FUNNY_Y_CODE
2604FUNNY_Y_CODE
2605FUNNY_Y_CODE
2606FUNNY_Y_CODE
2607FUNNY_Y_CODE
2608
83c89c78 2609#if defined(PIC)
2da0d70d 2610 "mov %5, %%"REG_b" \n\t"
83c89c78 2611#endif
2da0d70d
DB
2612 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2613 "m" (funnyYCode)
83c89c78 2614#if defined(PIC)
2da0d70d 2615 ,"m" (ebxsave)
83c89c78 2616#endif
2da0d70d 2617 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2618#if !defined(PIC)
2da0d70d
DB
2619 ,"%"REG_b
2620#endif
2621 );
/* Overwrite the outputs at the right edge, whose source position is at or
 * beyond the last source pixel, with that last pixel scaled by 128. */
2622 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2623 }
2624 else
2625 {
bc279024 2626#endif /* HAVE_MMX2 */
2da0d70d
DB
/* Split the 16.16 fixed-point step into its integer and fractional parts
 * for the add/adc pair in the loop below. */
2627 long xInc_shr16 = xInc >> 16;
2628 uint16_t xInc_mask = xInc & 0xffff;
2629 //NO MMX just normal asm ...
2630 asm volatile(
2631 "xor %%"REG_a", %%"REG_a" \n\t" // i
2632 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2633 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2634 ASMALIGN(4)
2635 "1: \n\t"
2636 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2637 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2638 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2639 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2640 "shll $16, %%edi \n\t"
2641 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2642 "mov %1, %%"REG_D" \n\t"
2643 "shrl $9, %%esi \n\t"
2644 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2645 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2646 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2647
2648 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2649 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2650 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2651 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2652 "shll $16, %%edi \n\t"
2653 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2654 "mov %1, %%"REG_D" \n\t"
2655 "shrl $9, %%esi \n\t"
2656 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2657 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2658 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2659
2660
2661 "add $2, %%"REG_a" \n\t"
2662 "cmp %2, %%"REG_a" \n\t"
2663 " jb 1b \n\t"
2664
2665
2666 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2667 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2668 );
2ff198c1 2669#ifdef HAVE_MMX2
2da0d70d 2670 } //if MMX2 can't be used
2ff198c1
MN
2671#endif
2672#else
2da0d70d
DB
/* Portable bilinear interpolation: xpos is a 16.16 fixed-point source
 * position, xalpha its top 7 fractional bits. Reads src[xx+1]. */
2673 int i;
2674 unsigned int xpos=0;
2675 for (i=0;i<dstWidth;i++)
2676 {
2677 register unsigned int xx=xpos>>16;
2678 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2679 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2680 xpos+=xInc;
2681 }
bc279024 2682#endif /* defined(ARCH_X86) */
077ea8a7 2683 }
2ff198c1
MN
2684}
2685
7f526efd 2686inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2687 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2688 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2689 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2690 int32_t *mmx2FilterPos, uint8_t *pal)
2ff198c1 2691{
2da0d70d 2692 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2693 {
8b2fce0d 2694 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2695 src1= formatConvBuffer;
8b2fce0d 2696 src2= formatConvBuffer+VOFW;
1e621b18 2697 }
2da0d70d 2698 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2699 {
8b2fce0d 2700 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2701 src1= formatConvBuffer;
8b2fce0d 2702 src2= formatConvBuffer+VOFW;
7322a67c 2703 }
2da0d70d 2704 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2705 {
8b2fce0d 2706 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2707 src1= formatConvBuffer;
8b2fce0d 2708 src2= formatConvBuffer+VOFW;
1e621b18 2709 }
2da0d70d 2710 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2711 {
8b2fce0d 2712 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2713 src1= formatConvBuffer;
8b2fce0d 2714 src2= formatConvBuffer+VOFW;
1e621b18 2715 }
2da0d70d 2716 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2717 {
8b2fce0d 2718 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2719 src1= formatConvBuffer;
8b2fce0d 2720 src2= formatConvBuffer+VOFW;
6af250ea 2721 }
2da0d70d 2722 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2723 {
8b2fce0d 2724 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2725 src1= formatConvBuffer;
8b2fce0d 2726 src2= formatConvBuffer+VOFW;
b72034dd 2727 }
2da0d70d 2728 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2729 {
8b2fce0d 2730 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2731 src1= formatConvBuffer;
8b2fce0d 2732 src2= formatConvBuffer+VOFW;
a861d4d7 2733 }
2da0d70d 2734 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2735 {
8b2fce0d 2736 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2737 src1= formatConvBuffer;
8b2fce0d 2738 src2= formatConvBuffer+VOFW;
a861d4d7 2739 }
2da0d70d 2740 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2741 {
8b2fce0d 2742 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2743 src1= formatConvBuffer;
8b2fce0d 2744 src2= formatConvBuffer+VOFW;
a43fb6b3 2745 }
2da0d70d 2746 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2747 {
8b2fce0d 2748 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d 2749 src1= formatConvBuffer;
8b2fce0d 2750 src2= formatConvBuffer+VOFW;
a43fb6b3 2751 }
2da0d70d 2752 else if (isGray(srcFormat))
6ff0ad6b 2753 {
2da0d70d 2754 return;
6ff0ad6b 2755 }
2da0d70d 2756 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2757 {
87cf861c 2758 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2da0d70d 2759 src1= formatConvBuffer;
8b2fce0d 2760 src2= formatConvBuffer+VOFW;
e28630fc 2761 }
1e621b18 2762
e3d2500f 2763#ifdef HAVE_MMX
ddda6fcd 2764 // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2da0d70d 2765 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2766#else
2da0d70d 2767 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2768#endif
077ea8a7 2769 {
2da0d70d 2770 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2771 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7
MN
2772 }
2773 else // Fast Bilinear upscale / crap downscale
2774 {
3d6a30d9 2775#if defined(ARCH_X86)
2ff198c1 2776#ifdef HAVE_MMX2
2da0d70d 2777 int i;
83c89c78 2778#if defined(PIC)
2da0d70d 2779 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2780#endif
2da0d70d
DB
2781 if (canMMX2BeUsed)
2782 {
2783 asm volatile(
83c89c78 2784#if defined(PIC)
2da0d70d
DB
2785 "mov %%"REG_b", %6 \n\t"
2786#endif
2787 "pxor %%mm7, %%mm7 \n\t"
2788 "mov %0, %%"REG_c" \n\t"
2789 "mov %1, %%"REG_D" \n\t"
2790 "mov %2, %%"REG_d" \n\t"
2791 "mov %3, %%"REG_b" \n\t"
2792 "xor %%"REG_a", %%"REG_a" \n\t" // i
2793 PREFETCH" (%%"REG_c") \n\t"
2794 PREFETCH" 32(%%"REG_c") \n\t"
2795 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2796
6d606c4f
AJ
2797#ifdef ARCH_X86_64
2798
2799#define FUNNY_UV_CODE \
2da0d70d
DB
2800 "movl (%%"REG_b"), %%esi \n\t"\
2801 "call *%4 \n\t"\
2802 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2803 "add %%"REG_S", %%"REG_c" \n\t"\
2804 "add %%"REG_a", %%"REG_D" \n\t"\
2805 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2806
2807#else
2808
b7dc6f66 2809#define FUNNY_UV_CODE \
2da0d70d
DB
2810 "movl (%%"REG_b"), %%esi \n\t"\
2811 "call *%4 \n\t"\
2812 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2813 "add %%"REG_a", %%"REG_D" \n\t"\
2814 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2815
bc279024 2816#endif /* ARCH_X86_64 */
6d606c4f 2817
b7dc6f66
MN
2818FUNNY_UV_CODE
2819FUNNY_UV_CODE
2820FUNNY_UV_CODE
2821FUNNY_UV_CODE
2da0d70d
DB
2822 "xor %%"REG_a", %%"REG_a" \n\t" // i
2823 "mov %5, %%"REG_c" \n\t" // src
2824 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2825 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2826 PREFETCH" (%%"REG_c") \n\t"
2827 PREFETCH" 32(%%"REG_c") \n\t"
2828 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2829
2830FUNNY_UV_CODE
2831FUNNY_UV_CODE
2832FUNNY_UV_CODE
2833FUNNY_UV_CODE
2834
83c89c78 2835#if defined(PIC)
2da0d70d 2836 "mov %6, %%"REG_b" \n\t"
83c89c78 2837#endif
2da0d70d
DB
2838 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2839 "m" (funnyUVCode), "m" (src2)
83c89c78 2840#if defined(PIC)
2da0d70d 2841 ,"m" (ebxsave)
83c89c78 2842#endif
2da0d70d 2843 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2844#if !defined(PIC)
2da0d70d
DB
2845 ,"%"REG_b
2846#endif
2847 );
2848 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2849 {
2850 //printf("%d %d %d\n", dstWidth, i, srcW);
2851 dst[i] = src1[srcW-1]*128;
8b2fce0d 2852 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2853 }
2854 }
2855 else
2856 {
bc279024 2857#endif /* HAVE_MMX2 */
2da0d70d
DB
2858 long xInc_shr16 = (long) (xInc >> 16);
2859 uint16_t xInc_mask = xInc & 0xffff;
2860 asm volatile(
2861 "xor %%"REG_a", %%"REG_a" \n\t" // i
2862 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2863 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2864 ASMALIGN(4)
2865 "1: \n\t"
2866 "mov %0, %%"REG_S" \n\t"
2867 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2868 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2869 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2870 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2871 "shll $16, %%edi \n\t"
2872 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2873 "mov %1, %%"REG_D" \n\t"
2874 "shrl $9, %%esi \n\t"
2875 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2876
2877 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2878 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2879 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2880 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2881 "shll $16, %%edi \n\t"
2882 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2883 "mov %1, %%"REG_D" \n\t"
2884 "shrl $9, %%esi \n\t"
8b2fce0d 2885 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2886
2887 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2888 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2889 "add $1, %%"REG_a" \n\t"
2890 "cmp %2, %%"REG_a" \n\t"
2891 " jb 1b \n\t"
2ff198c1 2892
dc77ef7f
GP
2893/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894 which is needed to support GCC-4.0 */
e5091488 2895#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2da0d70d 2896 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2897#else
2da0d70d 2898 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2899#endif
2da0d70d
DB
2900 "r" (src2)
2901 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2902 );
2ff198c1 2903#ifdef HAVE_MMX2
2da0d70d 2904 } //if MMX2 can't be used
2ff198c1
MN
2905#endif
2906#else
2da0d70d
DB
2907 int i;
2908 unsigned int xpos=0;
2909 for (i=0;i<dstWidth;i++)
2910 {
2911 register unsigned int xx=xpos>>16;
2912 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2913 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2914 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2915 /* slower
2916 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2917 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2918 */
2919 xpos+=xInc;
2920 }
bc279024 2921#endif /* defined(ARCH_X86) */
2da0d70d 2922 }
077ea8a7
MN
2923}
2924
/**
 * Scale one slice of source lines and write every destination line that can
 * be produced from it (together with lines buffered by earlier calls).
 *
 * Source lines are first scaled horizontally by hyscale/hcscale into the
 * lumPixBuf/chrPixBuf line buffers; each destination line is then produced
 * vertically from those buffers by one of the yuv2* output routines, chosen
 * from dstFormat and the vertical filter sizes.
 *
 * @param c         scaler context; dstY, lumBufIndex, chrBufIndex,
 *                  lastInLumBuf and lastInChrBuf are read on entry and
 *                  written back before returning
 * @param src       source plane pointers; for packed source formats src[1]
 *                  is saved as the palette pointer and src[0..2] are then
 *                  overwritten in place with src[0]
 * @param srcStride source strides; modified in place (packed formats copy
 *                  srcStride[0] into all three entries, and entries 1 and 2
 *                  are always shifted left by c->vChrDrop)
 * @param srcSliceY index of the first source line of this slice; a value of
 *                  0 resets the buffered state
 * @param srcSliceH number of source lines in this slice
 * @param dst       destination plane pointers
 * @param dstStride destination strides
 * @return number of destination lines written by this call
 */
3e499f53 2925static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2926 int srcSliceH, uint8_t* dst[], int dstStride[]){
2927
2928 /* load a few things into local vars to make the code more readable? and faster */
2929 const int srcW= c->srcW;
2930 const int dstW= c->dstW;
2931 const int dstH= c->dstH;
2932 const int chrDstW= c->chrDstW;
2933 const int chrSrcW= c->chrSrcW;
2934 const int lumXInc= c->lumXInc;
2935 const int chrXInc= c->chrXInc;
2936 const int dstFormat= c->dstFormat;
2937 const int srcFormat= c->srcFormat;
2938 const int flags= c->flags;
2939 const int canMMX2BeUsed= c->canMMX2BeUsed;
2940 int16_t *vLumFilterPos= c->vLumFilterPos;
2941 int16_t *vChrFilterPos= c->vChrFilterPos;
2942 int16_t *hLumFilterPos= c->hLumFilterPos;
2943 int16_t *hChrFilterPos= c->hChrFilterPos;
2944 int16_t *vLumFilter= c->vLumFilter;
2945 int16_t *vChrFilter= c->vChrFilter;
2946 int16_t *hLumFilter= c->hLumFilter;
2947 int16_t *hChrFilter= c->hChrFilter;
2948 int32_t *lumMmxFilter= c->lumMmxFilter;
2949 int32_t *chrMmxFilter= c->chrMmxFilter;
2950 const int vLumFilterSize= c->vLumFilterSize;
2951 const int vChrFilterSize= c->vChrFilterSize;
2952 const int hLumFilterSize= c->hLumFilterSize;
2953 const int hChrFilterSize= c->hChrFilterSize;
2954 int16_t **lumPixBuf= c->lumPixBuf;
2955 int16_t **chrPixBuf= c->chrPixBuf;
2956 const int vLumBufSize= c->vLumBufSize;
2957 const int vChrBufSize= c->vChrBufSize;
2958 uint8_t *funnyYCode= c->funnyYCode;
2959 uint8_t *funnyUVCode= c->funnyUVCode;
2960 uint8_t *formatConvBuffer= c->formatConvBuffer;
/* Chroma slice position/height: srcSliceY is shifted down, while the height
   uses -((-x) >> s), which rounds up instead of down. */
2961 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2962 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2963 int lastDstY;
2964 uint8_t *pal=NULL;
2965
2966 /* vars which will change and which we need to store back in the context */
2967 int dstY= c->dstY;
2968 int lumBufIndex= c->lumBufIndex;
2969 int chrBufIndex= c->chrBufIndex;
2970 int lastInLumBuf= c->lastInLumBuf;
2971 int lastInChrBuf= c->lastInChrBuf;
2972
/* Packed input: remember src[1] as the palette pointer, then make all three
   plane pointers and strides alias plane 0. This writes to the caller's
   src[] and srcStride[] arrays. */
2973 if (isPacked(c->srcFormat)){
2974 pal= src[1];
2975 src[0]=
2976 src[1]=
2977 src[2]= src[0];
2978 srcStride[0]=
2979 srcStride[1]=
2980 srcStride[2]= srcStride[0];
2981 }
/* NOTE(review): these shifts modify the caller's srcStride[] on every call;
   reusing the same array for several slices would shift it repeatedly when
   vChrDrop != 0 -- confirm that callers pass a fresh copy per call. */
2982 srcStride[1]<<= c->vChrDrop;
2983 srcStride[2]<<= c->vChrDrop;
2984
2985 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2986 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2987
2988#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2989 {
2990 static volatile int i=0;
2991 i++;
2992 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2993 selfTest(src, srcStride, c->srcW, c->srcH);
2994 i--;
2995 }
c7a810cc 2996#endif
37079906 2997
2da0d70d
DB
2998 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2999 //dstStride[0],dstStride[1],dstStride[2]);
3000
/* Warn at most once (function-static flag, shared by all contexts) when any
   destination stride is not a multiple of 8 and SWS_PRINT_INFO is set. */
3001 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3002 {
3003 static int firstTime=1; //FIXME move this into the context perhaps
3004 if (flags & SWS_PRINT_INFO && firstTime)
3005 {
4b0c30b7
BC
3006 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3007 " ->cannot do aligned memory acesses anymore\n");
2da0d70d
DB
3008 firstTime=0;
3009 }
3010 }
3011
3012 /* Note the user might start scaling the picture in the middle so this will not get executed
3013 this is not really intended but works currently, so ppl might do it */
3014 if (srcSliceY ==0){
3015 lumBufIndex=0;
3016 chrBufIndex=0;
3017 dstY=0;
3018 lastInLumBuf= -1;
3019 lastInChrBuf= -1;
3020 }
3021
/* Remember where this call started so the return value can report how many
   destination lines were produced. */
3022 lastDstY= dstY;
3023
/* One iteration per destination line; the loop is left early (break below)
   as soon as the current slice does not contain enough source lines. */
3024 for (;dstY < dstH; dstY++){
3025 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3026 const int chrDstY= dstY>>c->chrDstVSubSample;
3027 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3028 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3029
3030 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3031 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3032 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3033 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3034
3035 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3036 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3037 //handle holes (FAST_BILINEAR & weird filters)
3038 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3039 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3040 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3041 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3042 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3043
3044 // Do we have enough lines in this slice to output the dstY line
3045 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3046 {
3047 //Do horizontal scaling
// Scale only the luma lines still missing for this destination line.
3048 while(lastInLumBuf < lastLumSrcY)
3049 {
3050 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3051 lumBufIndex++;
3052 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3053 ASSERT(lumBufIndex < 2*vLumBufSize)
3054 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3055 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3056 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3057 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3058 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3059 funnyYCode, c->srcFormat, formatConvBuffer,
3060 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3061 lastInLumBuf++;
3062 }
// Same for chroma; the buffer bookkeeping still advances when the scaling
// call itself is skipped because source or destination is gray.
3063 while(lastInChrBuf < lastChrSrcY)
3064 {
3065 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3066 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3067 chrBufIndex++;
3068 ASSERT(chrBufIndex < 2*vChrBufSize)
3069 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3070 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3071 //FIXME replace parameters through context struct (some at least)
3072
3073 if (!(isGray(srcFormat) || isGray(dstFormat)))
3074 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3075 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3076 funnyUVCode, c->srcFormat, formatConvBuffer,
3077 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3078 lastInChrBuf++;
3079 }
3080 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3081 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3082 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3083 }
3084 else // not enough lines left in this slice -> load the rest in the buffer
3085 {
3086 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3087 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3088 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3089 vChrBufSize, vLumBufSize);*/
3090
3091 //Do horizontal scaling
// Buffer every remaining luma line of this slice for use by the next call.
3092 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3093 {
3094 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3095 lumBufIndex++;
3096 ASSERT(lumBufIndex < 2*vLumBufSize)
3097 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3098 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3099 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3100 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3101 funnyYCode, c->srcFormat, formatConvBuffer,
3102 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3103 lastInLumBuf++;
3104 }
// Buffer every remaining chroma line of this slice as well.
3105 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3106 {
3107 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3108 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3109 chrBufIndex++;
3110 ASSERT(chrBufIndex < 2*vChrBufSize)
3111 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3112 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3113
3114 if (!(isGray(srcFormat) || isGray(dstFormat)))
3115 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3116 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3117 funnyUVCode, c->srcFormat, formatConvBuffer,
3118 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3119 lastInChrBuf++;
3120 }
3121 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3122 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3123 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3124 break; //we can't output a dstY line so let's try with the next slice
3125 }
d3f41512 3126
c1b0bfb4 3127#ifdef HAVE_MMX
// Select the dither rows from the parity of dstY; b5Dither, g6Dither,
// g5Dither and r5Dither are not declared in this function.
0cb25594
CEH
3128 b5Dither= ff_dither8[dstY&1];
3129 g6Dither= ff_dither4[dstY&1];
3130 g5Dither= ff_dither8[dstY&1];
3131 r5Dither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
3132#endif
// All but the last two destination lines go through the RENAME()d output
// routines; the final two use the plain C routines in the else branch below.
3133 if (dstY < dstH-2)
3134 {
/* Pointer to the buffer entry holding the first source line needed for this
   destination line. The "+ vLumBufSize" offset and the ASSERTs against
   2*vLumBufSize suggest the pointer arrays hold two copies of the ring so the
   filter window never has to wrap -- TODO confirm at the allocation site. */
3135 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3136 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
6542b44e 3137#ifdef HAVE_MMX
2da0d70d
DB
3138 int i;
/* Fill the per-line filter tables read by the MMX vertical scalers.
   ACCURATE_RND layout: four int32 entries per PAIR of taps -- two line
   pointers, then two 16-bit coefficients packed into one 32-bit word that is
   stored twice.
   NOTE(review): the (int32_t) pointer casts keep only 32 bits of each pointer;
   this looks unsafe on 64-bit targets -- confirm this path is 32-bit only. */
3139 if (flags & SWS_ACCURATE_RND){
3140 for (i=0; i<vLumFilterSize; i+=2){
3141 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3142 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3143 lumMmxFilter[2*i+2]=
3144 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3145 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3146 }
3147 for (i=0; i<vChrFilterSize; i+=2){
3148 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3149 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3150 chrMmxFilter[2*i+2]=
3151 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3152 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
bca11e75 3153 }
2da0d70d
DB
3154 }else{
/* Default layout: four int32 entries per tap -- the line pointer split into
   its low and high 32-bit halves, then the coefficient replicated into both
   16-bit halves of a word (*0x10001), stored twice. */
3155 for (i=0; i<vLumFilterSize; i++)
3156 {
3157 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3158 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3159 lumMmxFilter[4*i+2]=
3160 lumMmxFilter[4*i+3]=
3161 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3162 }
3163 for (i=0; i<vChrFilterSize; i++)
3164 {
3165 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3166 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3167 chrMmxFilter[4*i+2]=
3168 chrMmxFilter[4*i+3]=
3169 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3170 }
3171 }
6542b44e 3172#endif
2da0d70d
DB
3173 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
// uDest is set to NULL on luma lines that do not start a new chroma line.
3174 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3175 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3176 RENAME(yuv2nv12X)(c,
3177 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3178 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3179 dest, uDest, dstW, chrDstW, dstFormat);
e3d2500f 3180 }
2da0d70d
DB
3181 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3182 {
3183 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3184 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3185 if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3186 {
// NOTE(review): this reads entry 0 of the line buffers rather than
// lumSrcPtr/chrSrcPtr; presumably the buffers hold a single line when both
// filter sizes are 1 -- confirm.
3187 int16_t *lumBuf = lumPixBuf[0];
3188 int16_t *chrBuf= chrPixBuf[0];
3189 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3190 }
3191 else //General YV12
3192 {
3193 RENAME(yuv2yuvX)(c,
3194 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3195 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3196 dest, uDest, vDest, dstW, chrDstW);
3197 }
3198 }
3199 else
3200 {
// Remaining destination formats are written through the yuv2packed* routines.
// NOTE(review): the vChrFilter lookups at 2*dstY+1 and dstY*vChrFilterSize in
// this branch use dstY, not chrDstY; the two are equal only when
// c->chrDstVSubSample is 0 -- presumably true for these formats, confirm.
3201 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3202 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3203 if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3204 {
3205 int chrAlpha= vChrFilter[2*dstY+1];
3206 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3207 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3208 }
3209 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3210 {
3211 int lumAlpha= vLumFilter[2*dstY+1];
3212 int chrAlpha= vChrFilter[2*dstY+1];
// Overwrite table entries 2 and 3 with the first tap's coefficient replicated
// into both 16-bit halves; under HAVE_MMX this replaces what the loops above
// stored there.
3213 lumMmxFilter[2]=
3214 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3215 chrMmxFilter[2]=
3216 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3217 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3218 dest, dstW, lumAlpha, chrAlpha, dstY);
3219 }
3220 else //General RGB
3221 {
3222 RENAME(yuv2packedX)(c,
3223 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3224 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3225 dest, dstW, dstY);
3226 }
3227 }
3228 }
3229 else // hmm looks like we can't use MMX here without overwriting this array's tail
3230 {
// Last two destination lines: same format dispatch as above, but always via
// the *inC routines and without building the MMX filter tables.
3231 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3232 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3233 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3234 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3235 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3236 yuv2nv12XinC(
3237 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3238 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3239 dest, uDest, dstW, chrDstW, dstFormat);
3240 }
3241 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3242 {
3243 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3244 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3245 yuv2yuvXinC(
3246 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3247 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3248 dest, uDest, vDest, dstW, chrDstW);
3249 }
3250 else
3251 {
3252 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3253 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3254 yuv2packedXinC(c,
3255 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3256 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3257 dest, dstW, dstY);
3258 }
3259 }
3260 }
17f715fa
MN
3261
3262#ifdef HAVE_MMX
// Issue the SFENCE and EMMS macro instructions once, after the line loop.
f018bc10
RD
3263 asm volatile(SFENCE:::"memory");
3264 asm volatile(EMMS:::"memory");
17f715fa 3265#endif
2da0d70d
DB
3266 /* store changed local vars back in the context */
3267 c->dstY= dstY;
3268 c->lumBufIndex= lumBufIndex;
3269 c->chrBufIndex= chrBufIndex;
3270 c->lastInLumBuf= lastInLumBuf;
3271 c->lastInChrBuf= lastInChrBuf;
d4e24275 3272
// Number of destination lines produced by this call.
2da0d70d 3273 return dstY - lastDstY;
627690b5 3274}