Merge variable declaration and export.
[libav.git] / libswscale / swscale_template.c
CommitLineData
fe8054c0 1/*
d026b45e
DB
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
b19bcbaa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d026b45e 19 *
8a322796
DB
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
d026b45e 22 */
783e9cc9 23
/* Remove any earlier definitions of the helper macros that are (re)defined
 * below -- presumably because this template is included more than once with
 * different HAVE_* settings (TODO confirm against the including file). */
6e1c66bc 24#undef REAL_MOVNTQ
541c4eb9 25#undef MOVNTQ
7d7f78b5 26#undef PAVGB
48a05cec
MN
27#undef PREFETCH
28#undef PREFETCHW
29#undef EMMS
30#undef SFENCE
31
/* EMMS: instruction mnemonic, as a string literal, selected at compile time:
 * "femms" when HAVE_3DNOW is defined, "emms" otherwise. */
32#ifdef HAVE_3DNOW
8a322796 33/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
48a05cec
MN
34#define EMMS "femms"
35#else
36#define EMMS "emms"
37#endif
38
/* PREFETCH / PREFETCHW: prefetch mnemonics as string literals.
 *   HAVE_3DNOW : "prefetch"    / "prefetchw"
 *   HAVE_MMX2  : "prefetchnta" / "prefetcht0"
 *   otherwise  : " # nop" -- the '#' presumably makes the assembler treat the
 *                rest of the line (including the operand written by the
 *                user of the macro) as a comment. */
39#ifdef HAVE_3DNOW
40#define PREFETCH "prefetch"
41#define PREFETCHW "prefetchw"
e5091488 42#elif defined (HAVE_MMX2)
48a05cec
MN
43#define PREFETCH "prefetchnta"
44#define PREFETCHW "prefetcht0"
45#else
d904b5fc
NP
46#define PREFETCH " # nop"
47#define PREFETCHW " # nop"
48a05cec
MN
48#endif
49
/* SFENCE: "sfence" when HAVE_MMX2 is defined, otherwise the same " # nop"
 * placeholder string used for the prefetch macros. */
50#ifdef HAVE_MMX2
51#define SFENCE "sfence"
52#else
d904b5fc 53#define SFENCE " # nop"
48a05cec 54#endif
d3f41512 55
d604bab9
MN
/* PAVGB(a,b): emits one packed-byte-average instruction line with operands
 * a, b: "pavgb" for HAVE_MMX2, "pavgusb" for HAVE_3DNOW.
 * Note there is no #else branch: with neither feature defined PAVGB stays
 * undefined (it was #undef'ed above), so it must not be used in that case. */
56#ifdef HAVE_MMX2
57#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58#elif defined (HAVE_3DNOW)
59#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60#endif
d3f41512 61
/* MOVNTQ(a,b): emits a 64-bit store of a to b as one asm line; uses "movntq"
 * when HAVE_MMX2 is defined and a plain "movq" otherwise.
 * The REAL_MOVNTQ / MOVNTQ pair is the usual two-level stringification idiom:
 * arguments passed to MOVNTQ are macro-expanded before REAL_MOVNTQ applies
 * the '#' operator to them (e.g. the REGa used by callers in this file). */
d604bab9 62#ifdef HAVE_MMX2
6e1c66bc 63#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
d604bab9 64#else
6e1c66bc 65#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
d604bab9 66#endif
6e1c66bc 67#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
d604bab9 68
a2faa401
RD
69#ifdef HAVE_ALTIVEC
70#include "swscale_altivec_template.c"
71#endif
72
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement: multi-tap vertical filter producing 8 output
 * bytes per outer iteration.
 *
 *   x      - string literal: byte displacement added to every source address
 *   offset - string literal: displacement from %0 to the filter list
 *   dest   - output pointer (%1), indexed by REG_a
 *   width  - loop bound (%2); the outer loop runs while REG_a < width
 *            (unsigned compare, REG_a advances by 8)
 *
 * %0 is &c->redDither; all *_OFFSET displacements are relative to it.
 *
 * Filter list layout as read by this code: 16-byte entries, source pointer
 * at +0, packed 16-bit coefficients at +8; a NULL source pointer terminates
 * the list.
 *
 * Per tap: acc += pmulhw(coeff, src) for two groups of four 16-bit samples
 * (mm3 and mm4, both seeded from VROUNDER_OFFSET(%0)). After the last tap
 * the sums are shifted right by 3, packed to unsigned bytes and stored.
 *
 * Label 1 serves both loops: the inner "jnz 1b" iterates over taps, and the
 * outer "jb 1b" re-enters after the accumulators, filter pointer and source
 * pointer have been reset.
 *
 * Clobbers REG_a, REG_d, REG_S. MMX registers and memory are not listed as
 * clobbers.
 */
bca11e75 73#define YSCALEYUV2YV12X(x, offset, dest, width) \
2da0d70d
DB
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
8b2fce0d
MN
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
2da0d70d
DB
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t" /* flags consumed by jnz below; the MMX ops in between leave them alone */\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t" /* next source pointer non-NULL: another tap */\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t" /* flags consumed by jb below; movq/lea/mov do not modify them */\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
bca11e75
MN
108
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface and operands as YSCALEYUV2YV12X (%0 = &c->redDither,
 * %1 = dest, %2 = width; clobbers REG_a, REG_d, REG_S), but accumulates in
 * 32 bits.
 *
 * Each filter entry (APCK_SIZE bytes) is read as: first source pointer at
 * +0, second source pointer at +APCK_PTR2, coefficient quadword at
 * +APCK_COEF. The 16-bit samples of the two source lines are interleaved
 * (punpck{l,h}wd) and multiplied-and-added with pmaddwd, so one pass through
 * the inner loop handles two taps -- assuming the coefficient quadword holds
 * the two taps' coefficients in alternating words (set up elsewhere, TODO
 * confirm). mm4/mm5 accumulate samples 0-3, mm6/mm7 samples 4-7.
 *
 * The inner loop ends when the first source pointer of the next entry is
 * NULL. The sums are then shifted right by 16, packed to words, the rounder
 * from VROUNDER_OFFSET(%0) is added, and the result is shifted right by 3,
 * packed to unsigned bytes and stored to dest.
 *
 * As in the non-accurate variant, label 1 is the target of both the inner
 * "jnz" and the outer "jb".
 */
109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
2da0d70d
DB
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
8b2fce0d
MN
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
1625216e 122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t" /* switch to the entry's second source line */\
8b2fce0d 123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
2da0d70d
DB
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
1625216e
MN
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t" /* first source pointer of the next entry */\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
c1b0bfb4
MN
170
/*
 * YSCALEYUV2YV121
 *
 * Asm body fragment (instruction strings only; the user supplies the
 * surrounding asm statement and operands). Converts 16-bit samples to bytes
 * without filtering: eight words are read from (%0 + 2*REG_a), shifted
 * right by 7, packed with unsigned saturation and stored to (%1 + REG_a).
 *
 * REG_a is initialised from %2 and advanced by 8; the loop repeats while
 * the addition does not carry. That only terminates as intended if %2 is a
 * negative count and %0/%1 are biased accordingly -- the caller is not
 * visible here, TODO confirm.
 */
171#define YSCALEYUV2YV121 \
2da0d70d
DB
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
c1b0bfb4 183
bf2bdde6
MN
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Same fragment as YSCALEYUV2YV121 but with rounding: mm7 is built as the
 * constant 64 in every word (all-ones >> 15 gives 1, << 6 gives 64) and is
 * added with signed saturation before the arithmetic shift right by 7.
 * Operand usage (%0 source, %1 destination, %2 initial index) and the
 * carry-terminated loop are identical to the non-accurate variant.
 */
184#define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
33a67bd6
MN
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
bf2bdde6
MN
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
201
c1b0bfb4 202/*
2da0d70d
DB
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
c1b0bfb4 208*/
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an "asm volatile(" statement and deliberately leaves it open: the
 * closing parenthesis, operand list and clobbers come from
 * YSCALEYUV2PACKEDX_END, and label 1 (the per-8-pixel outer loop) is only
 * defined here -- nothing in this macro jumps back to it.
 *
 * %0 is the base for the *_OFFSET displacements. REG_a is the output index
 * (zeroed here). Two inner loops (both labelled 2) walk filter lists with
 * the same 16-byte entry layout as YSCALEYUV2YV12X (pointer at +0,
 * coefficient at +8, NULL-pointer terminated):
 *   - chroma list at CHR_MMX_FILTER_OFFSET: U read at (src + REG_a),
 *     V at VOF(src + REG_a); sums in mm3 (U) and mm4 (V)
 *   - luma list at LUM_MMX_FILTER_OFFSET: two groups of four samples at
 *     (src + 2*REG_a) and +8; sums in mm1 (Y1) and mm7 (Y2)
 * All four accumulators are seeded from VROUNDER_OFFSET(%0).
 *
 * On exit: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V; REG_d and REG_S are
 * scratch.
 */
25593e29 209#define YSCALEYUV2PACKEDX \
2da0d70d
DB
210 asm volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
8b2fce0d 223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
2da0d70d
DB
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
c1b0bfb4 232\
2da0d70d
DB
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
237 ASMALIGN(4)\
238 "2: \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
249 " jnz 2b \n\t"\
250
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Closes the asm statement opened by YSCALEYUV2PACKEDX or
 * YSCALEYUV2PACKEDX_ACCURATE. Operands (inputs only, no outputs):
 *   %0      = &c->redDither
 *   %1..%3  = three "m" references to 'dummy' -- presumably placeholders
 *             that keep dest/dstW at %4/%5 (TODO confirm against the users)
 *   %4      = dest
 *   %5      = dstW
 * Clobbers REG_a, REG_d, REG_S. Expects 'c', 'dummy', 'dest' and 'dstW' to
 * be in scope at the expansion site.
 */
251#define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
256 );
8422aa88 257
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX with the same
 * contract: it opens "asm volatile(" (closed by YSCALEYUV2PACKEDX_END),
 * defines the outer-loop label 1, and leaves mm1 = Y1, mm7 = Y2, mm3 = U,
 * mm4 = V.
 *
 * Filter entries are APCK_SIZE bytes, read as: first source pointer at +0,
 * second source pointer at +APCK_PTR2, coefficient quadword at +APCK_COEF;
 * the list ends when the next entry's first pointer is NULL. Samples of the
 * two source lines are interleaved and combined with pmaddwd, as in
 * YSCALEYUV2YV12X_ACCURATE.
 *
 * All eight MMX registers are needed for one pass, so the finished chroma
 * words are parked in memory at U_TEMP(%0) / V_TEMP(%0) while luma is
 * computed, then reloaded into mm3 / mm4 at the end. This means the asm
 * writes to memory reachable from %0.
 */
bca11e75 258#define YSCALEYUV2PACKEDX_ACCURATE \
2da0d70d
DB
259 asm volatile(\
260 "xor %%"REG_a", %%"REG_a" \n\t"\
261 ASMALIGN(4)\
262 "nop \n\t"\
263 "1: \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
270 ASMALIGN(4)\
271 "2: \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
8b2fce0d 273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
1625216e 274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
1625216e 279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
2da0d70d
DB
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
8b2fce0d 284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
1625216e
MN
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
295 " jnz 2b \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t" /* park U: every MMX register is needed for the luma pass */\
306 "movq %%mm6, "V_TEMP"(%0) \n\t" /* park V */\
bca11e75 307\
2da0d70d
DB
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
314 ASMALIGN(4)\
315 "2: \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
1625216e 318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
2da0d70d
DB
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
1625216e 323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
2da0d70d
DB
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
1625216e
MN
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
2da0d70d
DB
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
339 " jnz 2b \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
bca11e75 351
/*
 * YSCALEYUV2RGBX
 *
 * Asm fragment: YUV -> RGB conversion for the 8 pixels left in registers by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.
 *
 * In : mm1 = Y (pixels 0-3), mm7 = Y (pixels 4-7), mm3 = U, mm4 = V,
 *      all as 16-bit words; %0 = base for the *_OFFSET / *_COEFF constants.
 * Out: mm2 = B, mm4 = G, mm5 = R as 8 unsigned bytes each, mm7 = 0.
 *      mm0, mm1, mm3, mm6 are scratch.
 *
 * Each chroma word is duplicated (punpck{l,h}wd of a register with itself)
 * so that one chroma sample is applied to two neighbouring luma samples.
 * In the comments, suffix 1 = pixels 0-3 and suffix 2 = pixels 4-7.
 */
8422aa88 352#define YSCALEYUV2RGBX \
2da0d70d
DB
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB -- compiled out by the surrounding "#if 0".
 *
 * Asm fragment that blends two luma lines (%0, %1) and two chroma lines
 * (%2, %3) using the weights broadcast from %6 (yalpha1) and %7 (uvalpha1),
 * then converts to R/G/B using the MANGLE()d global constants w80, w400,
 * yCoeff, ubCoeff, ugCoeff, vrCoeff and vgCoeff rather than context-relative
 * offsets. It leaves B in mm3, R in mm0 and G in mm1 (see the inline
 * comments). Kept verbatim; the inline comments are the original author's
 * and have not been re-verified.
 */
77a49659 388#if 0
d604bab9 389#define FULL_YSCALEYUV2RGB \
2da0d70d
DB
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
398 ASMALIGN(4)\
399 "1: \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
8b2fce0d 409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
8b2fce0d 412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
418\
419\
2da0d70d
DB
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
427\
428\
2da0d70d
DB
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
d604bab9 435\
2da0d70d
DB
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
d604bab9 439\
2da0d70d 440 "packuswb %%mm1, %%mm1 \n\t"
77a49659 441#endif
d604bab9 442
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm fragment: two-line vertical blend for packed-YUV output, 8 pixels per
 * iteration. YSCALEYUV2PACKED is the expanding wrapper (its arguments are
 * macro-expanded before REAL_* stringifies them with '#').
 *
 *   index - register used as loop index (zeroed here)
 *   c     - register holding the base for the *_FILTER_OFFSET displacements
 *   %0/%1 - luma lines buf0/buf1, %2/%3 - chroma lines uvbuf0/uvbuf1
 *           (names taken from the inline comments; V is read at VOF bytes
 *           past U)
 *
 * Result per sample: ((line0 - line1) * coeff >> 16) + (line1 >> 7), left
 * in mm1 = Y (pixels 0-3), mm7 = Y (pixels 4-7), mm3 = U, mm4 = V.
 *
 * NOTE(review): the prologue shifts the coefficients stored at
 * CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8 right by 3 and writes
 * them BACK to memory, so every expansion permanently rescales the stored
 * values. This is only correct if the caller rewrites those coefficients
 * before each use -- confirm at the call sites.
 *
 * Defines label 1 and does not jump to it; the loop is closed by whatever
 * output macro follows in the same asm statement.
 */
6e1c66bc 443#define REAL_YSCALEYUV2PACKED(index, c) \
2da0d70d
DB
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
451 ASMALIGN(4)\
452 "1: \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
475 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6a4970ab 478
6e1c66bc 479#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab 480
/*
 * REAL_YSCALEYUV2RGB(index, c) / YSCALEYUV2RGB(index, c)
 *
 * Asm fragment: two-line vertical blend followed by YUV -> RGB conversion,
 * 8 pixels per iteration. YSCALEYUV2RGB is the expanding wrapper.
 *
 *   index - register used as loop index (zeroed here)
 *   c     - register holding the base for the *_OFFSET / *_COEFF constants
 *   %0/%1 - luma lines buf0/buf1, %2/%3 - chroma lines uvbuf0/uvbuf1
 *           (names taken from the inline comments)
 *
 * Blend per sample: ((line0 - line1) * coeff >> 16) + (line1 >> 4), using
 * the coefficients at CHR_/LUM_MMX_FILTER_OFFSET+8. Unlike
 * REAL_YSCALEYUV2PACKED, the stored coefficients are only read here.
 *
 * Out: mm2 = B, mm4 = G, mm5 = R as 8 unsigned bytes each, mm7 = 0.
 * In the comments, suffix 1 = pixels 0-3 and suffix 2 = pixels 4-7.
 * Defines label 1; the loop is closed by the output macro that follows.
 */
6e1c66bc 481#define REAL_YSCALEYUV2RGB(index, c) \
2da0d70d
DB
482 "xor "#index", "#index" \n\t"\
483 ASMALIGN(4)\
484 "1: \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 545#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
6a4970ab 546
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Asm fragment: single-line (no vertical blend) load for packed-YUV output.
 * Reads luma from %0 and chroma from %2 (V at VOF bytes past U) and shifts
 * every 16-bit sample right by 7. Leaves mm1 = Y (pixels 0-3),
 * mm7 = Y (pixels 4-7), mm3 = U, mm4 = V.
 *
 * 'index' is zeroed and used as loop index; 'c' is accepted for signature
 * parity with the other variants but not referenced in the body.
 * Defines label 1; the loop is closed by the output macro that follows.
 */
6e1c66bc 547#define REAL_YSCALEYUV2PACKED1(index, c) \
2da0d70d
DB
548 "xor "#index", "#index" \n\t"\
549 ASMALIGN(4)\
550 "1: \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
6a4970ab 559
6e1c66bc 560#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab 561
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 *
 * Asm fragment: single-line (no vertical blend) YUV -> RGB conversion,
 * 8 pixels per iteration. Luma comes from %0 and chroma from %2 only
 * (V at VOF bytes past U); every sample is shifted right by 4 before the
 * colour-space arithmetic, which matches the two-line variant from the
 * U_OFFSET subtraction onwards.
 *
 *   index - register used as loop index (zeroed here)
 *   c     - register holding the base for the *_OFFSET / *_COEFF constants
 *
 * Out: mm2 = B, mm4 = G, mm5 = R as 8 unsigned bytes each, mm7 = 0.
 * In the comments, suffix 1 = pixels 0-3 and suffix 2 = pixels 4-7.
 * Defines label 1; the loop is closed by the output macro that follows.
 */
6e1c66bc 562#define REAL_YSCALEYUV2RGB1(index, c) \
2da0d70d
DB
563 "xor "#index", "#index" \n\t"\
564 ASMALIGN(4)\
565 "1: \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
8b2fce0d 567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
2da0d70d
DB
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 609#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 610
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 *
 * Asm fragment: like YSCALEYUV2PACKED1 for luma (single line from %0,
 * shifted right by 7), but chroma is the sum of the two chroma lines %2 and
 * %3 shifted right by 8 with a LOGICAL shift (psrlw), i.e. their average at
 * the same scale as a single line shifted by 7.
 * Leaves mm1 = Y (pixels 0-3), mm7 = Y (pixels 4-7), mm3 = U, mm4 = V.
 *
 * 'c' is not referenced in the body. Defines label 1; the loop is closed by
 * the output macro that follows.
 */
6e1c66bc 611#define REAL_YSCALEYUV2PACKED1b(index, c) \
2da0d70d
DB
612 "xor "#index", "#index" \n\t"\
613 ASMALIGN(4)\
614 "1: \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
6e1c66bc 627#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab 628
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 *
 * Asm fragment: YUV -> RGB conversion with single-line luma (%0) and chroma
 * taken as the sum of the two chroma lines %2 and %3 shifted right by 5
 * with a logical shift -- i.e. their average at the ">>4" scale used by
 * YSCALEYUV2RGB1. The original FIXME notes that the 16-bit sum may
 * overflow. From the U_OFFSET subtraction onwards the arithmetic matches
 * YSCALEYUV2RGB1.
 *
 * Out: mm2 = B, mm4 = G, mm5 = R as 8 unsigned bytes each, mm7 = 0.
 * In the comments, suffix 1 = pixels 0-3 and suffix 2 = pixels 4-7.
 * Defines label 1; the loop is closed by the output macro that follows.
 */
497d4f99 629// do vertical chrominance interpolation
6e1c66bc 630#define REAL_YSCALEYUV2RGB1b(index, c) \
2da0d70d
DB
631 "xor "#index", "#index" \n\t"\
632 ASMALIGN(4)\
633 "1: \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
8b2fce0d
MN
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
2da0d70d
DB
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 681#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 682
/*
 * REAL_WRITEBGR32(dst, dstw, index) / WRITEBGR32(dst, dstw, index)
 *
 * Asm fragment: stores 8 pixels as 32-bit values and closes the pixel loop.
 * WRITEBGR32 is the expanding wrapper around the stringifying REAL_ macro.
 *
 * In : mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Does: interleaves the planes into four quadwords of two pixels each
 *       (written "0RGB" in the comments, most significant byte first) and
 *       stores 32 bytes at dst + index*4 via MOVNTQ.
 * Then: index += 8 and jumps back to label 1 while index < dstw (unsigned).
 *
 * Label 1 is not defined here; it must come from the YSCALEYUV2* fragment
 * placed earlier in the same asm statement. mm0, mm1, mm3, mm6 are scratch.
 */
6e1c66bc 683#define REAL_WRITEBGR32(dst, dstw, index) \
2da0d70d
DB
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 697\
2da0d70d
DB
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
d604bab9 702\
2da0d70d
DB
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
705 " jb 1b \n\t"
6e1c66bc 706#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
d604bab9 707
/*
 * REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 *
 * Asm fragment: stores 8 pixels as 16-bit values and closes the pixel loop.
 *
 * In : mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Does: masks B and R with bF8 and G with bFC (globals defined elsewhere;
 *       by their names presumably 0xF8 / 0xFC in every byte -- TODO
 *       confirm), shifts B right by 3, pairs B with R into words, widens G
 *       to words and shifts it left by 3, then ORs the two. With those mask
 *       values each word is R in bits 15-11, G in bits 10-5, B in bits 4-0.
 *       The 64-bit shifts (psrlq/psllq) are safe per element only because
 *       the masked-off low bits leave room, so no bit crosses into a
 *       neighbouring field.
 *       Stores 16 bytes at dst + index*2 via MOVNTQ.
 * Then: index += 8 and jumps back to label 1 while index < dstw (unsigned).
 *
 * Label 1 must be defined by the preceding YSCALEYUV2* fragment.
 */
27a90b04 708#define REAL_WRITERGB16(dst, dstw, index) \
2da0d70d
DB
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
d604bab9 713\
2da0d70d
DB
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
d604bab9 716\
2da0d70d
DB
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 721\
2da0d70d
DB
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
d604bab9 724\
2da0d70d
DB
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
d604bab9 727\
2da0d70d
DB
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 730\
2da0d70d
DB
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
733 " jb 1b \n\t"
27a90b04 734#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 735
/*
 * REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 *
 * Asm fragment: 15-bit counterpart of WRITERGB16.
 *
 * In : mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Does: masks all three planes with bF8 (global defined elsewhere;
 *       presumably 0xF8 in every byte -- TODO confirm), shifts B right by 3
 *       and R right by 1, pairs B with R into words, widens G to words and
 *       shifts it left by 2, then ORs the two. With that mask value each
 *       word is R in bits 14-10, G in bits 9-5, B in bits 4-0, bit 15 clear.
 *       Stores 16 bytes at dst + index*2 via MOVNTQ.
 * Then: index += 8 and jumps back to label 1 while index < dstw (unsigned).
 *
 * Label 1 must be defined by the preceding YSCALEYUV2* fragment.
 */
27a90b04 736#define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d
DB
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
d604bab9 742\
2da0d70d
DB
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
d604bab9 745\
2da0d70d
DB
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 750\
2da0d70d
DB
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
d604bab9 753\
2da0d70d
DB
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
d604bab9 756\
2da0d70d
DB
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 759\
2da0d70d
DB
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
762 " jb 1b \n\t"
27a90b04 763#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb 764
/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * Asm fragment: stores 8 pixels as 24-bit values and closes the pixel loop.
 *
 * In : mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Does: first builds the same four "0RGB0RGB" quadwords as WRITEBGR32, then
 *       squeezes out the padding byte of each pixel using the masks
 *       bm00000111 / bm11111000 / bm00001111 (globals defined elsewhere)
 *       together with 64-bit shifts, producing three quadwords in
 *       mm0, mm2, mm3 that are stored at dst, dst+8, dst+16.
 * Then: dst += 24 (the destination register itself is advanced -- unlike
 *       the 16/32-bit writers, which address dst + index*scale), index += 8,
 *       and jumps back to label 1 while index < dstw (unsigned).
 *
 * Because 'dst' is modified, the operand bound to it must be writable in
 * the enclosing asm statement (not visible here). Label 1 must be defined
 * by the preceding YSCALEYUV2* fragment. This macro stringifies its
 * arguments directly; there is no expanding wrapper as for the REAL_*
 * writers.
 */
6542b44e 765#define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d
DB
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 779\
2da0d70d
DB
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 788\
2da0d70d
DB
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 802\
2da0d70d
DB
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 811\
2da0d70d
DB
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
d604bab9 816\
2da0d70d
DB
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
819 " jb 1b \n\t"
d604bab9 820
/*
 * WRITEBGR24MMX(dst, dstw, index): inline-asm fragment that packs 8 pixels
 * into 24 bytes (3 bytes per pixel) and stores them.
 *
 * Inputs: mm2=B, mm4=G, mm5=R, mm7=0. After the same initial interleave as
 * WRITEBGR24OLD, each of the four pixel pairs is compacted with a
 * "psllq $40" / "punpckhdq" pair instead of mask constants, and the three
 * output quadwords are stored as soon as each one is complete.
 *
 * All eight MMX registers are written; in particular mm7 is overwritten with
 * pixel data, so it no longer holds zero when the fragment jumps back.
 *
 * Stores go through MOVNTQ to dst, dst+8 and dst+16. dst is then advanced by
 * 24 and index by 8, and the fragment jumps back to local label 1 while
 * index < dstw (unsigned compare). Label 1 is not defined in this macro; it
 * has to be provided by the asm text placed in front of it.
 */
6542b44e 821#define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d
DB
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 835\
2da0d70d
DB
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 840\
2da0d70d
DB
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 845\
2da0d70d
DB
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 850\
2da0d70d
DB
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
99d2cb72 856\
2da0d70d
DB
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 862\
2da0d70d
DB
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 867\
2da0d70d 868 "add $24, "#dst" \n\t"\
99d2cb72 869\
2da0d70d
DB
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
872 " jb 1b \n\t"
99d2cb72 873
/*
 * WRITEBGR24MMX2(dst, dstw, index): inline-asm fragment that packs 8 pixels
 * into 24 bytes (3 bytes per pixel) using pshufw, and stores them.
 *
 * Inputs: mm2=B, mm4=G, mm5=R. mm0 and mm7 are loaded with the ff_M24A and
 * ff_M24C mask constants at the start (so mm7 does not stay zero), and
 * ff_M24B is applied straight from memory. For each of the three output
 * quadwords the wanted words of B, G and R are replicated with pshufw, masked
 * so that every channel keeps only its own byte positions, and OR-ed together
 * in mm6. G is shifted by one byte (psllq for the first quadword, "psrlq $8"
 * on mm4 for the other two), which leaves mm4 modified. mm1 and mm3 are
 * scratch; mm2 and mm5 are only read.
 *
 * Stores go through MOVNTQ to dst, dst+8 and dst+16. dst is then advanced by
 * 24 and index by 8, and the fragment jumps back to local label 1 while
 * index < dstw (unsigned compare). Label 1 is not defined in this macro; it
 * has to be provided by the asm text placed in front of it.
 */
6542b44e 874#define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d 875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a
RD
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d
DB
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 881\
2da0d70d
DB
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 885\
2da0d70d
DB
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
99d2cb72 890\
2da0d70d
DB
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 895\
5802683a 896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d
DB
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 899\
2da0d70d
DB
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
99d2cb72 903\
2da0d70d
DB
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 907\
2da0d70d
DB
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a 910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 911\
2da0d70d
DB
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 915\
2da0d70d 916 "add $24, "#dst" \n\t"\
99d2cb72 917\
2da0d70d
DB
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
920 " jb 1b \n\t"
99d2cb72
MN
921
/*
 * Select the 24-bit writer: the pshufw-based WRITEBGR24MMX2 when HAVE_MMX2 is
 * defined, the plain-MMX WRITEBGR24MMX otherwise. Any earlier definition of
 * WRITEBGR24 is dropped first (presumably because this template is compiled
 * more than once with different HAVE_* settings -- confirm against the
 * including file).
 */
922#ifdef HAVE_MMX2
7630f2e0 923#undef WRITEBGR24
6e1c66bc 924#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 925#else
7630f2e0 926#undef WRITEBGR24
6e1c66bc 927#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
99d2cb72
MN
928#endif
929
/*
 * REAL_WRITEYUY2(dst, dstw, index): inline-asm fragment that packs 8 pixels
 * into 16 bytes of interleaved output and stores them.
 *
 * The 16-bit words in mm1 (first four) and mm7 (last four) are saturated to
 * unsigned bytes into mm1. mm3 and mm4 are saturated to bytes and
 * byte-interleaved, mm3 first. Interleaving both results gives the output
 * byte order
 *     mm1/mm7 sample, mm3 sample, mm1/mm7 sample, mm4 sample, ...
 * (presumably Y, U, Y, V given the macro name; the register contents are set
 * up by the asm text placed in front of this macro).
 *
 * The two quadwords are stored with MOVNTQ at dst + 2*index and
 * dst + 2*index + 8. index is then advanced by 8 and the fragment jumps back
 * to local label 1 while index < dstw (unsigned compare). Label 1 is not
 * defined in this macro.
 *
 * WRITEYUY2 is the wrapper to use: it lets the preprocessor expand the
 * arguments before they are stringified with '#'.
 */
6e1c66bc 930#define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d
DB
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 938\
2da0d70d
DB
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 941\
2da0d70d
DB
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
944 " jb 1b \n\t"
6e1c66bc 945#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
946
947
/*
 * Produce one line of planar output (dest, plus uDest/vDest when uDest is
 * non-NULL) from the filter coefficients and source line pointers.
 *
 * With HAVE_MMX and SWS_BITEXACT clear in c->flags, the work is done entirely
 * by the YSCALEYUV2YV12X macro (its _ACCURATE variant when SWS_ACCURATE_RND is
 * set) and the function returns. Otherwise it is delegated to
 * yuv2yuvX_altivec_real() when HAVE_ALTIVEC is defined, or to yuv2yuvXinC().
 * The filtering itself is implemented by those macros/helpers, which are
 * defined outside this part of the file.
 */
77a49659 948static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 951{
c1b0bfb4 952#ifdef HAVE_MMX
f433c8ab 953 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
954 if (c->flags & SWS_ACCURATE_RND){
955 if (uDest){
/* Chroma: same filter table for both planes; the first argument is "0" for U and VOF for V. */
956 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
957 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
958 }
bca11e75 959
14014d47
MN
960 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
961 }else{
962 if (uDest){
963 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
964 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
965 }
2da0d70d 966
14014d47
MN
967 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
968 }
f433c8ab
MN
/* MMX path handled all planes; skip the generic implementations below. */
969 return;
970 }
971#endif
a2faa401
RD
972#ifdef HAVE_ALTIVEC
973yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
974 chrFilter, chrSrc, chrFilterSize,
975 dest, uDest, vDest, dstW, chrDstW);
a2faa401 976#else //HAVE_ALTIVEC
5859233b 977yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
978 chrFilter, chrSrc, chrFilterSize,
979 dest, uDest, vDest, dstW, chrDstW);
a2faa401 980#endif //!HAVE_ALTIVEC
c1b0bfb4 981}
2add307d 982
/*
 * Template entry point for the NV12-style output path. It has no
 * architecture-specific code: every argument except the context c is
 * forwarded unchanged to yuv2nv12XinC(). c itself is unused here.
 */
6118e52e 983static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
984 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
985 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e
VS
986{
987yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d
DB
988 chrFilter, chrSrc, chrFilterSize,
989 dest, uDest, dstW, chrDstW, dstFormat);
6118e52e
VS
990}
991
bf2bdde6 992static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
2da0d70d 993 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4 994{
f433c8ab 995 int i;
c1b0bfb4 996#ifdef HAVE_MMX
f433c8ab 997 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
998 long p= uDest ? 3 : 1;
999 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
1000 uint8_t *dst[3]= {dest, uDest, vDest};
1001 long counter[3] = {dstW, chrDstW, chrDstW};
2da0d70d 1002
14014d47
MN
1003 if (c->flags & SWS_ACCURATE_RND){
1004 while(p--){
1005 asm volatile(
1006 YSCALEYUV2YV121_ACCURATE
1007 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1008 "g" (-counter[p])
1009 : "%"REG_a
1010 );
1011 }
1012 }else{
1013 while(p--){
1014 asm volatile(
1015 YSCALEYUV2YV121
1016 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1017 "g" (-counter[p])
1018 : "%"REG_a
1019 );
1020 }
d78c1ea1 1021 }
f433c8ab
MN
1022 return;
1023 }
1024#endif
2da0d70d
DB
1025 for (i=0; i<dstW; i++)
1026 {
a1f3ffa3 1027 int val= (lumSrc[i]+64)>>7;
2da0d70d
DB
1028
1029 if (val&256){
1030 if (val<0) val=0;
1031 else val=255;
1032 }
1033
1034 dest[i]= val;
1035 }
1036
1b0a4572 1037 if (uDest)
2da0d70d
DB
1038 for (i=0; i<chrDstW; i++)
1039 {
a1f3ffa3
MN
1040 int u=(chrSrc[i ]+64)>>7;
1041 int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d
DB
1042
1043 if ((u|v)&256){
1044 if (u<0) u=0;
1045 else if (u>255) u=255;
1046 if (v<0) v=0;
1047 else if (v>255) v=255;
1048 }
1049
1050 uDest[i]= u;
1051 vDest[i]= v;
1052 }
38858470
MN
1053}
1054
c1b0bfb4 1055
d604bab9
MN
1056/**
1057 * vertical scale YV12 to RGB
 *
 * MMX path (HAVE_MMX defined and SWS_BITEXACT clear in c->flags): handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 * PIX_FMT_YUYV422. SWS_ACCURATE_RND selects between the
 * YSCALEYUV2PACKEDX_ACCURATE and YSCALEYUV2PACKEDX front ends. Every handled
 * case returns directly after its asm statement.
 *
 * Any other destination format, or a bit-exact request, falls through to
 * altivec_yuv2packedX() (HAVE_ALTIVEC, only for the formats tested below) or
 * to yuv2packedXinC().
 *
 * The asm statements are opened by the YSCALEYUV2PACKEDX* macros and closed
 * by YSCALEYUV2PACKEDX_END, both defined outside this part of the file. The
 * BGR24 cases spell out the operand list themselves: %0 = &c->redDither,
 * %4 = dest, %5 = dstW.
1058 */
25593e29 1059static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d
DB
1060 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1061 uint8_t *dest, long dstW, long dstY)
c1b0bfb4 1062{
bca11e75 1063#ifdef HAVE_MMX
f8d61128 1064 long dummy=0;
f433c8ab 1065 if(!(c->flags & SWS_BITEXACT)){
14014d47
MN
1066 if (c->flags & SWS_ACCURATE_RND){
1067 switch(c->dstFormat){
1068 case PIX_FMT_RGB32:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
1071 WRITEBGR32(%4, %5, %%REGa)
2da0d70d 1072
14014d47
MN
1073 YSCALEYUV2PACKEDX_END
1074 return;
1075 case PIX_FMT_BGR24:
1076 YSCALEYUV2PACKEDX_ACCURATE
1077 YSCALEYUV2RGBX
/* REG_c = dest + 3*REG_a: byte address of the current pixel for 3-byte pixels. */
1078 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1079 "add %4, %%"REG_c" \n\t"
1080 WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d
DB
1081
1082
14014d47
MN
1083 :: "r" (&c->redDither),
1084 "m" (dummy), "m" (dummy), "m" (dummy),
1085 "r" (dest), "m" (dstW)
1086 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1087 );
1088 return;
1089 case PIX_FMT_RGB555:
1090 YSCALEYUV2PACKEDX_ACCURATE
1091 YSCALEYUV2RGBX
1092 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1093#ifdef DITHER1XBPP
14014d47
MN
1094 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1095 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1096 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
2da0d70d
DB
1097#endif
1098
14014d47
MN
1099 WRITERGB15(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1101 return;
1102 case PIX_FMT_RGB565:
1103 YSCALEYUV2PACKEDX_ACCURATE
1104 YSCALEYUV2RGBX
1105 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 1106#ifdef DITHER1XBPP
14014d47 1107 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
92c7b471 1108 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
14014d47 1109 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
2da0d70d
DB
1110#endif
1111
14014d47
MN
1112 WRITERGB16(%4, %5, %%REGa)
1113 YSCALEYUV2PACKEDX_END
1114 return;
1115 case PIX_FMT_YUYV422:
1116 YSCALEYUV2PACKEDX_ACCURATE
1117 /* no RGB conversion in this case: mm3, mm4, mm1 and mm7 are shifted down and packed by WRITEYUY2 */
1118
1119 "psraw $3, %%mm3 \n\t"
1120 "psraw $3, %%mm4 \n\t"
1121 "psraw $3, %%mm1 \n\t"
1122 "psraw $3, %%mm7 \n\t"
1123 WRITEYUY2(%4, %5, %%REGa)
1124 YSCALEYUV2PACKEDX_END
1125 return;
1126 }
1127 }else{
1128 switch(c->dstFormat)
1129 {
1130 case PIX_FMT_RGB32:
1131 YSCALEYUV2PACKEDX
1132 YSCALEYUV2RGBX
1133 WRITEBGR32(%4, %5, %%REGa)
1134 YSCALEYUV2PACKEDX_END
1135 return;
1136 case PIX_FMT_BGR24:
1137 YSCALEYUV2PACKEDX
1138 YSCALEYUV2RGBX
/* REG_c = dest + 3*REG_a: byte address of the current pixel for 3-byte pixels. */
1139 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1140 "add %4, %%"REG_c" \n\t"
1141 WRITEBGR24(%%REGc, %5, %%REGa)
1142
1143 :: "r" (&c->redDither),
1144 "m" (dummy), "m" (dummy), "m" (dummy),
1145 "r" (dest), "m" (dstW)
1146 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1147 );
1148 return;
1149 case PIX_FMT_RGB555:
1150 YSCALEYUV2PACKEDX
1151 YSCALEYUV2RGBX
1152 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1153#ifdef DITHER1XBPP
14014d47
MN
1154 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1155 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1156 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d
DB
1157#endif
1158
14014d47
MN
1159 WRITERGB15(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1161 return;
1162 case PIX_FMT_RGB565:
1163 YSCALEYUV2PACKEDX
1164 YSCALEYUV2RGBX
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 1166#ifdef DITHER1XBPP
14014d47 1167 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
92c7b471 1168 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
14014d47 1169 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d
DB
1170#endif
1171
14014d47
MN
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1174 return;
1175 case PIX_FMT_YUYV422:
1176 YSCALEYUV2PACKEDX
1177 /* no RGB conversion in this case: mm3, mm4, mm1 and mm7 are shifted down and packed by WRITEYUY2 */
1178
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1185 return;
1186 }
bca11e75
MN
1187 }
1188 }
bc279024 1189#endif /* HAVE_MMX */
a31de956 1190#ifdef HAVE_ALTIVEC
2da0d70d
DB
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of altivec_yuv2packedX() */
1193 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1194 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1195 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1196 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1198 dest, dstW, dstY);
1199 else
1200#endif
1201 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1203 dest, dstW, dstY);
c1b0bfb4
MN
1204}
1205
c1b0bfb4
MN
1206/**
1207 * vertical bilinear scale YV12 to RGB
 *
 * Blends two source lines: buf0/buf1 for luma and uvbuf0/uvbuf1 for chroma,
 * weighted by yalpha and uvalpha. The complementary weights are computed
 * here as 4095 - yalpha and 4095 - uvalpha.
 *
 * The large "#if 0" section is disabled code and is never compiled.
 *
 * MMX path (HAVE_MMX defined and SWS_BITEXACT clear in c->flags): handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 * PIX_FMT_YUYV422 and returns. Every other case runs the C implementation
 * generated by YSCALE_YUV_2_ANYRGB_C at the end of the function.
1208 */
25593e29 1209static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1210 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 1211{
ac0ad729
MN
1212 int yalpha1=4095- yalpha;
1213 int uvalpha1=4095-uvalpha;
2da0d70d 1214 int i;
d604bab9 1215
77a416e8 1216#if 0 //isn't used
2da0d70d
DB
1217 if (flags&SWS_FULL_CHR_H_INT)
1218 {
1219 switch(dstFormat)
1220 {
cf7d1c1a 1221#ifdef HAVE_MMX
2da0d70d
DB
1222 case PIX_FMT_RGB32:
1223 asm volatile(
d604bab9
MN
1224
1225
1226FULL_YSCALEYUV2RGB
2da0d70d
DB
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1229
2da0d70d
DB
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1233
2da0d70d
DB
1234 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1235 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
d604bab9 1236
2da0d70d
DB
1237 "add $4, %%"REG_a" \n\t"
1238 "cmp %5, %%"REG_a" \n\t"
1239 " jb 1b \n\t"
d604bab9 1240
2da0d70d
DB
1241 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242 "m" (yalpha1), "m" (uvalpha1)
1243 : "%"REG_a
1244 );
1245 break;
1246 case PIX_FMT_BGR24:
1247 asm volatile(
d604bab9
MN
1248
1249FULL_YSCALEYUV2RGB
1250
2da0d70d
DB
1251 // lsb ... msb
1252 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1253 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
d604bab9 1254
2da0d70d
DB
1255 "movq %%mm3, %%mm1 \n\t"
1256 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1257 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
d604bab9 1258
2da0d70d
DB
1259 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1260 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1261 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1262 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1263 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1264 "movq %%mm1, %%mm2 \n\t"
1265 "psllq $48, %%mm1 \n\t" // 000000BG
1266 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
d604bab9 1267
2da0d70d
DB
1268 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1269 "psrld $16, %%mm2 \n\t" // R000R000
1270 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1271 "por %%mm2, %%mm1 \n\t" // RBGRR000
d604bab9 1272
2da0d70d
DB
1273 "mov %4, %%"REG_b" \n\t"
1274 "add %%"REG_a", %%"REG_b" \n\t"
d604bab9
MN
1275
1276#ifdef HAVE_MMX2
2da0d70d
DB
1277 //FIXME Alignment
1278 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1280#else
2da0d70d
DB
1281 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1282 "psrlq $32, %%mm3 \n\t"
1283 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1284 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1285#endif
1286 "add $4, %%"REG_a" \n\t"
1287 "cmp %5, %%"REG_a" \n\t"
1288 " jb 1b \n\t"
1289
1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291 "m" (yalpha1), "m" (uvalpha1)
1292 : "%"REG_a, "%"REG_b
1293 );
1294 break;
1295 case PIX_FMT_BGR555:
1296 asm volatile(
d604bab9
MN
1297
1298FULL_YSCALEYUV2RGB
1299#ifdef DITHER1XBPP
2da0d70d
DB
1300 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1301 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1302 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1303#endif
2da0d70d
DB
1304 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1305 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1306 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1307
2da0d70d
DB
1308 "psrlw $3, %%mm3 \n\t"
1309 "psllw $2, %%mm1 \n\t"
1310 "psllw $7, %%mm0 \n\t"
1311 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1312 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9 1313
2da0d70d
DB
1314 "por %%mm3, %%mm1 \n\t"
1315 "por %%mm1, %%mm0 \n\t"
d604bab9 1316
2da0d70d 1317 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1318
2da0d70d
DB
1319 "add $4, %%"REG_a" \n\t"
1320 "cmp %5, %%"REG_a" \n\t"
1321 " jb 1b \n\t"
d604bab9 1322
2da0d70d
DB
1323 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324 "m" (yalpha1), "m" (uvalpha1)
1325 : "%"REG_a
1326 );
1327 break;
1328 case PIX_FMT_BGR565:
1329 asm volatile(
d604bab9
MN
1330
1331FULL_YSCALEYUV2RGB
1332#ifdef DITHER1XBPP
92c7b471 1333 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
2da0d70d
DB
1334 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1335 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
d604bab9 1336#endif
2da0d70d
DB
1337 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1338 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1339 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
d604bab9 1340
2da0d70d
DB
1341 "psrlw $3, %%mm3 \n\t"
1342 "psllw $3, %%mm1 \n\t"
1343 "psllw $8, %%mm0 \n\t"
1344 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1345 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9 1346
2da0d70d
DB
1347 "por %%mm3, %%mm1 \n\t"
1348 "por %%mm1, %%mm0 \n\t"
d604bab9 1349
2da0d70d 1350 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1351
2da0d70d
DB
1352 "add $4, %%"REG_a" \n\t"
1353 "cmp %5, %%"REG_a" \n\t"
1354 " jb 1b \n\t"
d604bab9 1355
2da0d70d
DB
1356 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357 "m" (yalpha1), "m" (uvalpha1)
1358 : "%"REG_a
1359 );
1360 break;
bc279024 1361#endif /* HAVE_MMX */
2da0d70d 1362 case PIX_FMT_BGR32:
cf7d1c1a 1363#ifndef HAVE_MMX
2da0d70d 1364 case PIX_FMT_RGB32:
cf7d1c1a 1365#endif
2da0d70d
DB
1366 if (dstFormat==PIX_FMT_RGB32)
1367 {
1368 int i;
df3c183a 1369#ifdef WORDS_BIGENDIAN
2da0d70d
DB
1370 dest++;
1371#endif
1372 for (i=0;i<dstW;i++){
1373 // vertical linear interpolation && yuv2rgb in a single step:
1374 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1376 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1377 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380 dest+= 4;
1381 }
1382 }
1383 else if (dstFormat==PIX_FMT_BGR24)
1384 {
1385 int i;
1386 for (i=0;i<dstW;i++){
1387 // vertical linear interpolation && yuv2rgb in a single step:
1388 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1390 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1391 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394 dest+= 3;
1395 }
1396 }
1397 else if (dstFormat==PIX_FMT_BGR565)
1398 {
1399 int i;
1400 for (i=0;i<dstW;i++){
1401 // vertical linear interpolation && yuv2rgb in a single step:
1402 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1404 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1405
1406 ((uint16_t*)dest)[i] =
1407 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410 }
1411 }
1412 else if (dstFormat==PIX_FMT_BGR555)
1413 {
1414 int i;
1415 for (i=0;i<dstW;i++){
1416 // vertical linear interpolation && yuv2rgb in a single step:
1417 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
8b2fce0d 1419 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
2da0d70d
DB
1420
1421 ((uint16_t*)dest)[i] =
1422 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425 }
1426 }
1427 }//FULL_UV_IPOL
1428 else
1429 {
cf7d1c1a 1430#endif // if 0
d604bab9 1431#ifdef HAVE_MMX
f433c8ab 1432 if(!(c->flags & SWS_BITEXACT)){
2da0d70d
DB
1433 switch(c->dstFormat)
1434 {
1435 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436 case PIX_FMT_RGB32:
/*
 * Shared pattern for every case below. The operands are %0..%3 = the four
 * source buffers (fixed registers c, d, S, D), %4 = dest (memory) and
 * %5 = &c->redDither (register a). REG_b is saved into the context at
 * ESP_OFFSET(%5) and reloaded with dest. REG_BP is pushed and passed to the
 * scale and write macros as the running index; 8280(%5) is passed as the
 * loop limit. Both registers are restored before the statement ends.
 */
1437 asm volatile(
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1439 "mov %4, %%"REG_b" \n\t"
1440 "push %%"REG_BP" \n\t"
1441 YSCALEYUV2RGB(%%REGBP, %5)
1442 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443 "pop %%"REG_BP" \n\t"
1444 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445
1446 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447 "a" (&c->redDither)
1448 );
1449 return;
1450 case PIX_FMT_BGR24:
1451 asm volatile(
1452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1453 "mov %4, %%"REG_b" \n\t"
1454 "push %%"REG_BP" \n\t"
1455 YSCALEYUV2RGB(%%REGBP, %5)
1456 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457 "pop %%"REG_BP" \n\t"
1458 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460 "a" (&c->redDither)
1461 );
1462 return;
27a90b04 1463 case PIX_FMT_RGB555:
2da0d70d
DB
1464 asm volatile(
1465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1466 "mov %4, %%"REG_b" \n\t"
1467 "push %%"REG_BP" \n\t"
1468 YSCALEYUV2RGB(%%REGBP, %5)
1469 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1470#ifdef DITHER1XBPP
2da0d70d
DB
1471 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1472 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1473 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1474#endif
1475
27a90b04 1476 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1479
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481 "a" (&c->redDither)
1482 );
1483 return;
27a90b04 1484 case PIX_FMT_RGB565:
2da0d70d
DB
1485 asm volatile(
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB(%%REGBP, %5)
1490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1491#ifdef DITHER1XBPP
2da0d70d 1492 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
92c7b471 1493 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
2da0d70d
DB
1494 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1495#endif
1496
27a90b04 1497 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d
DB
1498 "pop %%"REG_BP" \n\t"
1499 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1500 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501 "a" (&c->redDither)
1502 );
1503 return;
1504 case PIX_FMT_YUYV422:
1505 asm volatile(
1506 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1507 "mov %4, %%"REG_b" \n\t"
1508 "push %%"REG_BP" \n\t"
1509 YSCALEYUV2PACKED(%%REGBP, %5)
1510 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511 "pop %%"REG_BP" \n\t"
1512 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "a" (&c->redDither)
1515 );
1516 return;
1517 default: break;
1518 }
f433c8ab 1519 }
cf7d1c1a 1520#endif //HAVE_MMX
/* Generic C implementation for every case not handled (and returned from) above. */
ec1bca2a 1521YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9
MN
1522}
1523
1524/**
1525 * YV12 to RGB without scaling or interpolating
 *
 * Single-luma-line variant of yuv2packed2(). When SWS_FULL_CHR_H_INT is set
 * in flags, the call is forwarded to yuv2packed2() with buf0 used for both
 * luma lines and yalpha = 0.
 *
 * MMX path (HAVE_MMX defined and SWS_BITEXACT clear in flags): handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 * PIX_FMT_YUYV422 and returns. uvalpha < 2048 selects the YSCALEYUV2RGB1 /
 * YSCALEYUV2PACKED1 macros, otherwise the "1b" variants are used. Those
 * macros are defined outside this part of the file.
 *
 * Every other case runs the C implementation generated by
 * YSCALE_YUV_2_ANYRGB_C, again selected by uvalpha < 2048.
 *
 * The locals yalpha1, buf1 and yalpha are not used by the code spelled out
 * here; per the FIXME notes they are needed by macro-generated code.
1526 */
25593e29 1527static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d 1528 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1529{
2da0d70d
DB
1530 const int yalpha1=0;
1531 int i;
6a4970ab 1532
8a322796 1533 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
2da0d70d 1534 const int yalpha= 4096; //FIXME ...
96034638 1535
2da0d70d
DB
1536 if (flags&SWS_FULL_CHR_H_INT)
1537 {
1538 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1539 return;
1540 }
397c035e
MN
1541
1542#ifdef HAVE_MMX
f433c8ab 1543 if(!(flags & SWS_BITEXACT)){
14014d47 1544 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
2da0d70d 1545 {
14014d47
MN
1546 switch(dstFormat)
1547 {
1548 case PIX_FMT_RGB32:
/*
 * Shared pattern for every asm statement in this function. The operands are
 * %0..%3 = buf0, buf1, uvbuf0, uvbuf1 (fixed registers c, d, S, D),
 * %4 = dest (memory) and %5 = &c->redDither (register a). REG_b is saved
 * into the context at ESP_OFFSET(%5) and reloaded with dest. REG_BP is
 * pushed and passed to the scale and write macros as the running index;
 * 8280(%5) is passed as the loop limit. Both registers are restored before
 * the statement ends.
 */
1549 asm volatile(
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555 "pop %%"REG_BP" \n\t"
1556 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1557
1558 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1559 "a" (&c->redDither)
1560 );
1561 return;
1562 case PIX_FMT_BGR24:
1563 asm volatile(
1564 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1565 "mov %4, %%"REG_b" \n\t"
1566 "push %%"REG_BP" \n\t"
1567 YSCALEYUV2RGB1(%%REGBP, %5)
1568 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1571
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573 "a" (&c->redDither)
1574 );
1575 return;
1576 case PIX_FMT_RGB555:
1577 asm volatile(
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2RGB1(%%REGBP, %5)
1582 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1583#ifdef DITHER1XBPP
14014d47
MN
1584 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1585 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1586 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d 1587#endif
14014d47
MN
1588 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589 "pop %%"REG_BP" \n\t"
1590 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1591
14014d47
MN
1592 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 "a" (&c->redDither)
1594 );
1595 return;
1596 case PIX_FMT_RGB565:
1597 asm volatile(
1598 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1599 "mov %4, %%"REG_b" \n\t"
1600 "push %%"REG_BP" \n\t"
1601 YSCALEYUV2RGB1(%%REGBP, %5)
1602 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 1603#ifdef DITHER1XBPP
14014d47 1604 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
92c7b471 1605 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
14014d47 1606 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d
DB
1607#endif
1608
14014d47
MN
1609 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610 "pop %%"REG_BP" \n\t"
1611 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1612
14014d47
MN
1613 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1614 "a" (&c->redDither)
1615 );
1616 return;
1617 case PIX_FMT_YUYV422:
1618 asm volatile(
1619 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1620 "mov %4, %%"REG_b" \n\t"
1621 "push %%"REG_BP" \n\t"
1622 YSCALEYUV2PACKED1(%%REGBP, %5)
1623 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624 "pop %%"REG_BP" \n\t"
1625 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1626
1627 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1628 "a" (&c->redDither)
1629 );
1630 return;
1631 }
2da0d70d 1632 }
14014d47 1633 else
2da0d70d 1634 {
14014d47
MN
1635 switch(dstFormat)
1636 {
1637 case PIX_FMT_RGB32:
1638 asm volatile(
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644 "pop %%"REG_BP" \n\t"
1645 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1646
1647 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1648 "a" (&c->redDither)
1649 );
1650 return;
1651 case PIX_FMT_BGR24:
1652 asm volatile(
1653 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1654 "mov %4, %%"REG_b" \n\t"
1655 "push %%"REG_BP" \n\t"
1656 YSCALEYUV2RGB1b(%%REGBP, %5)
1657 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658 "pop %%"REG_BP" \n\t"
1659 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1660
1661 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1662 "a" (&c->redDither)
1663 );
1664 return;
1665 case PIX_FMT_RGB555:
1666 asm volatile(
1667 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1668 "mov %4, %%"REG_b" \n\t"
1669 "push %%"REG_BP" \n\t"
1670 YSCALEYUV2RGB1b(%%REGBP, %5)
1671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1672#ifdef DITHER1XBPP
14014d47
MN
1673 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1674 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1675 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d 1676#endif
14014d47
MN
1677 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678 "pop %%"REG_BP" \n\t"
1679 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1680
14014d47
MN
1681 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1682 "a" (&c->redDither)
1683 );
1684 return;
1685 case PIX_FMT_RGB565:
1686 asm volatile(
1687 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1688 "mov %4, %%"REG_b" \n\t"
1689 "push %%"REG_BP" \n\t"
1690 YSCALEYUV2RGB1b(%%REGBP, %5)
1691 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
497d4f99 1692#ifdef DITHER1XBPP
14014d47 1693 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
92c7b471 1694 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
14014d47 1695 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
2da0d70d
DB
1696#endif
1697
14014d47
MN
1698 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699 "pop %%"REG_BP" \n\t"
1700 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
2da0d70d 1701
14014d47
MN
1702 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1703 "a" (&c->redDither)
1704 );
1705 return;
1706 case PIX_FMT_YUYV422:
1707 asm volatile(
1708 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1709 "mov %4, %%"REG_b" \n\t"
1710 "push %%"REG_BP" \n\t"
1711 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713 "pop %%"REG_BP" \n\t"
1714 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1715
1716 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1717 "a" (&c->redDither)
1718 );
1719 return;
1720 }
2da0d70d
DB
1721 }
1722 }
bc279024 1723#endif /* HAVE_MMX */
/* Generic C implementation for every case not handled (and returned from) above. */
e5091488 1724 if (uvalpha < 2048)
2da0d70d 1725 {
ec1bca2a 1726 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1727 }else{
ec1bca2a 1728 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
2da0d70d 1729 }
d604bab9
MN
1730}
1731
8a322796 1732//FIXME yuy2* can read up to 7 samples too much
6ff0ad6b 1733
896a22b8 1734static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1e621b18 1735{
6ff0ad6b 1736#ifdef HAVE_MMX
2da0d70d
DB
1737 asm volatile(
1738 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1739 "mov %0, %%"REG_a" \n\t"
1740 "1: \n\t"
1741 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1742 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1743 "pand %%mm2, %%mm0 \n\t"
1744 "pand %%mm2, %%mm1 \n\t"
1745 "packuswb %%mm1, %%mm0 \n\t"
1746 "movq %%mm0, (%2, %%"REG_a") \n\t"
1747 "add $8, %%"REG_a" \n\t"
1748 " js 1b \n\t"
1749 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1750 : "%"REG_a
1751 );
1e621b18 1752#else
2da0d70d
DB
1753 int i;
1754 for (i=0; i<width; i++)
1755 dst[i]= src[2*i];
1e621b18
MN
1756#endif
1757}
1758
896a22b8 1759static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 1760{
c2271987 1761#ifdef HAVE_MMX
2da0d70d
DB
1762 asm volatile(
1763 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1764 "mov %0, %%"REG_a" \n\t"
1765 "1: \n\t"
1766 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1767 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1768 "psrlw $8, %%mm0 \n\t"
1769 "psrlw $8, %%mm1 \n\t"
1770 "packuswb %%mm1, %%mm0 \n\t"
1771 "movq %%mm0, %%mm1 \n\t"
1772 "psrlw $8, %%mm0 \n\t"
1773 "pand %%mm4, %%mm1 \n\t"
1774 "packuswb %%mm0, %%mm0 \n\t"
1775 "packuswb %%mm1, %%mm1 \n\t"
1776 "movd %%mm0, (%3, %%"REG_a") \n\t"
1777 "movd %%mm1, (%2, %%"REG_a") \n\t"
1778 "add $4, %%"REG_a" \n\t"
1779 " js 1b \n\t"
1780 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1781 : "%"REG_a
1782 );
1e621b18 1783#else
2da0d70d
DB
1784 int i;
1785 for (i=0; i<width; i++)
1786 {
1787 dstU[i]= src1[4*i + 1];
1788 dstV[i]= src1[4*i + 3];
1789 }
1790#endif
1791 assert(src1 == src2);
1e621b18
MN
1792}
1793
4cf16bbe
DB
1794/* This is almost identical to the previous, and exists only because
1795 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
896a22b8 1796static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
7322a67c
MN
1797{
1798#ifdef HAVE_MMX
2da0d70d
DB
1799 asm volatile(
1800 "mov %0, %%"REG_a" \n\t"
1801 "1: \n\t"
1802 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1803 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "psrlw $8, %%mm1 \n\t"
1806 "packuswb %%mm1, %%mm0 \n\t"
1807 "movq %%mm0, (%2, %%"REG_a") \n\t"
1808 "add $8, %%"REG_a" \n\t"
1809 " js 1b \n\t"
1810 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1811 : "%"REG_a
1812 );
7322a67c 1813#else
2da0d70d
DB
1814 int i;
1815 for (i=0; i<width; i++)
1816 dst[i]= src[2*i+1];
7322a67c
MN
1817#endif
1818}
1819
896a22b8 1820static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
7322a67c 1821{
c2271987 1822#ifdef HAVE_MMX
2da0d70d
DB
1823 asm volatile(
1824 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1825 "mov %0, %%"REG_a" \n\t"
1826 "1: \n\t"
1827 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1828 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1829 "pand %%mm4, %%mm0 \n\t"
1830 "pand %%mm4, %%mm1 \n\t"
1831 "packuswb %%mm1, %%mm0 \n\t"
1832 "movq %%mm0, %%mm1 \n\t"
1833 "psrlw $8, %%mm0 \n\t"
1834 "pand %%mm4, %%mm1 \n\t"
1835 "packuswb %%mm0, %%mm0 \n\t"
1836 "packuswb %%mm1, %%mm1 \n\t"
1837 "movd %%mm0, (%3, %%"REG_a") \n\t"
1838 "movd %%mm1, (%2, %%"REG_a") \n\t"
1839 "add $4, %%"REG_a" \n\t"
1840 " js 1b \n\t"
1841 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1842 : "%"REG_a
1843 );
7322a67c 1844#else
2da0d70d
DB
1845 int i;
1846 for (i=0; i<width; i++)
1847 {
1848 dstU[i]= src1[4*i + 0];
1849 dstV[i]= src1[4*i + 2];
1850 }
1851#endif
1852 assert(src1 == src2);
7322a67c
MN
1853}
1854
214892ee 1855#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
896a22b8 1856static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
214892ee
MN
1857{\
1858 int i;\
1859 for (i=0; i<width; i++)\
1860 {\
1861 int b= (((type*)src)[i]>>shb)&maskb;\
1862 int g= (((type*)src)[i]>>shg)&maskg;\
1863 int r= (((type*)src)[i]>>shr)&maskr;\
1864\
1865 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1866 }\
1e621b18
MN
1867}
1868
214892ee
MN
1869BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1870BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1871BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1872BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1873BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1875
a0baa07a 1876#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
896a22b8 1877static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
a0baa07a
MN
1878{\
1879 int i;\
1880 for (i=0; i<width; i++)\
1881 {\
ba83d862
MN
1882 int b= (((type*)src)[i]&maskb)>>shb;\
1883 int g= (((type*)src)[i]&maskg)>>shg;\
1884 int r= (((type*)src)[i]&maskr)>>shr;\
a0baa07a
MN
1885\
1886 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1888 }\
ba83d862 1889}\
896a22b8 1890static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
ba83d862
MN
1891{\
1892 int i;\
1893 for (i=0; i<width; i++)\
1894 {\
1895 int pix0= ((type*)src)[2*i+0];\
1896 int pix1= ((type*)src)[2*i+1];\
1897 int g= (pix0&maskg)+(pix1&maskg);\
1898 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1900\
1901 g>>=shg;\
1902\
6b79dbce
MN
1903 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
ba83d862 1905 }\
2f60f629
MN
1906}
1907
ba83d862
MN
1908BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1909BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
a0baa07a
MN
1910BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1911BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1912BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1914
ac6a2e45 1915#ifdef HAVE_MMX
/**
 * MMX conversion of one line of 24-bit packed pixels to 8-bit luma.
 *
 * @param dst       output buffer, one byte per pixel
 * @param src       packed input line, three bytes per pixel
 * @param width     number of pixels
 * @param srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY* coefficients,
 *                  any other value selects the ff_rgb24toY* coefficients
 *
 * The main loop consumes 12 source bytes and stores 4 luma bytes per
 * iteration, with REG_a counting from -width up to 0.
 * NOTE(review): because each iteration always stores 4 bytes, a width that
 * is not a multiple of 4 appears to write past dst+width - confirm that
 * callers provide padded buffers.
 */
a35acd7f 1916static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1917{
1918
/* Load the coefficient pair into mm5/mm6. These registers are set in one
 * asm statement and consumed by the loop in a later one, so nothing that
 * touches MMX state may be placed in between. */
1919 if(srcFormat == PIX_FMT_BGR24){
1920 asm volatile(
ff9a056d
MN
1921 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1922 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1923 :
dfb09bd1
MN
1924 );
1925 }else{
1926 asm volatile(
ff9a056d
MN
1927 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1928 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1929 :
dfb09bd1
MN
1930 );
1931 }
1932
2da0d70d 1933 asm volatile(
dfb09bd1
MN
1934 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1935 "mov %2, %%"REG_a" \n\t"
1936 "pxor %%mm7, %%mm7 \n\t"
1937 "1: \n\t"
1938 PREFETCH" 64(%0) \n\t"
/* Four overlapping 4-byte loads at offsets 0, 2, 6 and 8 cover the 12
 * source bytes of this iteration. */
1939 "movd (%0), %%mm0 \n\t"
1940 "movd 2(%0), %%mm1 \n\t"
1941 "movd 6(%0), %%mm2 \n\t"
1942 "movd 8(%0), %%mm3 \n\t"
1943 "add $12, %0 \n\t"
/* Widen bytes to words (mm7 is zero) and multiply-accumulate with the
 * coefficient pair. */
1944 "punpcklbw %%mm7, %%mm0 \n\t"
1945 "punpcklbw %%mm7, %%mm1 \n\t"
1946 "punpcklbw %%mm7, %%mm2 \n\t"
1947 "punpcklbw %%mm7, %%mm3 \n\t"
1948 "pmaddwd %%mm5, %%mm0 \n\t"
1949 "pmaddwd %%mm6, %%mm1 \n\t"
1950 "pmaddwd %%mm5, %%mm2 \n\t"
1951 "pmaddwd %%mm6, %%mm3 \n\t"
1952 "paddd %%mm1, %%mm0 \n\t"
1953 "paddd %%mm3, %%mm2 \n\t"
/* Add the offset held in mm4, scale down by 15 bits and pack to bytes. */
1954 "paddd %%mm4, %%mm0 \n\t"
1955 "paddd %%mm4, %%mm2 \n\t"
1956 "psrad $15, %%mm0 \n\t"
1957 "psrad $15, %%mm2 \n\t"
1958 "packssdw %%mm2, %%mm0 \n\t"
1959 "packuswb %%mm0, %%mm0 \n\t"
1960 "movd %%mm0, (%1, %%"REG_a") \n\t"
1961 "add $4, %%"REG_a" \n\t"
1962 " js 1b \n\t"
1963 : "+r" (src)
1964 : "r" (dst+width), "g" (-width)
1965 : "%"REG_a
2da0d70d 1966 );
dfb09bd1
MN
1967}
1968
/**
 * MMX conversion of one line of 24-bit packed pixels to 8-bit U and V.
 *
 * @param dstU      output U samples, one byte per pixel
 * @param dstV      output V samples, one byte per pixel
 * @param src       packed input line, three bytes per pixel
 * @param width     number of pixels
 * @param srcFormat selects the coefficient row: index 1 of ff_bgr24toUV
 *                  for PIX_FMT_RGB24, index 0 otherwise
 *
 * The coefficient row is passed as memory operand %4 and is read at byte
 * offsets 0, 8, 16 and 24. The loop consumes 12 source bytes and stores
 * 4 bytes to each output plane per iteration, with REG_a counting from
 * -width up to 0.
 * NOTE(review): the "m" operand names only element [0] of the row while the
 * asm reads 32 bytes from it - confirm this is acceptable for the compilers
 * in use.
 */
a35acd7f 1969static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
dfb09bd1
MN
1970{
1971 asm volatile(
/* mm6 caches the fourth coefficient quad-word for the whole loop. */
1972 "movq 24+%4, %%mm6 \n\t"
1973 "mov %3, %%"REG_a" \n\t"
1974 "pxor %%mm7, %%mm7 \n\t"
1975 "1: \n\t"
1976 PREFETCH" 64(%0) \n\t"
/* First half of the iteration: source bytes loaded at offsets 0 and 2. */
1977 "movd (%0), %%mm0 \n\t"
1978 "movd 2(%0), %%mm1 \n\t"
1979 "punpcklbw %%mm7, %%mm0 \n\t"
1980 "punpcklbw %%mm7, %%mm1 \n\t"
1981 "movq %%mm0, %%mm2 \n\t"
1982 "movq %%mm1, %%mm3 \n\t"
1983 "pmaddwd %4, %%mm0 \n\t"
1984 "pmaddwd 8+%4, %%mm1 \n\t"
1985 "pmaddwd 16+%4, %%mm2 \n\t"
1986 "pmaddwd %%mm6, %%mm3 \n\t"
1987 "paddd %%mm1, %%mm0 \n\t"
1988 "paddd %%mm3, %%mm2 \n\t"
1989
/* Second half: source bytes loaded at offsets 6 and 8, same arithmetic. */
1990 "movd 6(%0), %%mm1 \n\t"
1991 "movd 8(%0), %%mm3 \n\t"
1992 "add $12, %0 \n\t"
1993 "punpcklbw %%mm7, %%mm1 \n\t"
1994 "punpcklbw %%mm7, %%mm3 \n\t"
1995 "movq %%mm1, %%mm4 \n\t"
1996 "movq %%mm3, %%mm5 \n\t"
1997 "pmaddwd %4, %%mm1 \n\t"
1998 "pmaddwd 8+%4, %%mm3 \n\t"
1999 "pmaddwd 16+%4, %%mm4 \n\t"
2000 "pmaddwd %%mm6, %%mm5 \n\t"
2001 "paddd %%mm3, %%mm1 \n\t"
2002 "paddd %%mm5, %%mm4 \n\t"
2003
/* Add the offset, scale down by 15 bits, pack to bytes; mm0 is stored
 * through operand %1 (dstU) and mm2 through operand %2 (dstV). */
2004 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
2005 "paddd %%mm3, %%mm0 \n\t"
2006 "paddd %%mm3, %%mm2 \n\t"
2007 "paddd %%mm3, %%mm1 \n\t"
2008 "paddd %%mm3, %%mm4 \n\t"
2009 "psrad $15, %%mm0 \n\t"
2010 "psrad $15, %%mm2 \n\t"
2011 "psrad $15, %%mm1 \n\t"
2012 "psrad $15, %%mm4 \n\t"
2013 "packssdw %%mm1, %%mm0 \n\t"
2014 "packssdw %%mm4, %%mm2 \n\t"
2015 "packuswb %%mm0, %%mm0 \n\t"
2016 "packuswb %%mm2, %%mm2 \n\t"
2017 "movd %%mm0, (%1, %%"REG_a") \n\t"
2018 "movd %%mm2, (%2, %%"REG_a") \n\t"
2019 "add $4, %%"REG_a" \n\t"
2020 " js 1b \n\t"
2021 : "+r" (src)
2022 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2023 : "%"REG_a
2024 );
2025}
2026#endif
2027
896a22b8 2028static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
dfb09bd1
MN
2029{
2030#ifdef HAVE_MMX
a35acd7f 2031 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1e621b18 2032#else
2da0d70d
DB
2033 int i;
2034 for (i=0; i<width; i++)
2035 {
2036 int b= src[i*3+0];
2037 int g= src[i*3+1];
2038 int r= src[i*3+2];
1e621b18 2039
e5091488 2040 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2041 }
bc279024 2042#endif /* HAVE_MMX */
1e621b18
MN
2043}
2044
896a22b8 2045static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1e621b18 2046{
4342fc14 2047#ifdef HAVE_MMX
a35acd7f 2048 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1e621b18 2049#else
2da0d70d
DB
2050 int i;
2051 for (i=0; i<width; i++)
2052 {
dfb09bd1
MN
2053 int b= src1[3*i + 0];
2054 int g= src1[3*i + 1];
2055 int r= src1[3*i + 2];
2da0d70d 2056
dfb09bd1
MN
2057 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2059 }
bc279024 2060#endif /* HAVE_MMX */
2da0d70d 2061 assert(src1 == src2);
1e621b18
MN
2062}
2063
896a22b8 2064static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2065{
2066 int i;
2067 for (i=0; i<width; i++)
2068 {
2069 int b= src1[6*i + 0] + src1[6*i + 3];
2070 int g= src1[6*i + 1] + src1[6*i + 4];
2071 int r= src1[6*i + 2] + src1[6*i + 5];
2072
2073 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2075 }
2076 assert(src1 == src2);
2077}
2078
896a22b8 2079static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
a861d4d7 2080{
dfb09bd1 2081#ifdef HAVE_MMX
a35acd7f 2082 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
dfb09bd1 2083#else
2da0d70d
DB
2084 int i;
2085 for (i=0; i<width; i++)
2086 {
2087 int r= src[i*3+0];
2088 int g= src[i*3+1];
2089 int b= src[i*3+2];
2090
e5091488 2091 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2da0d70d 2092 }
dfb09bd1 2093#endif
a861d4d7
MN
2094}
2095
896a22b8 2096static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
a861d4d7 2097{
2da0d70d
DB
2098 int i;
2099 assert(src1==src2);
dfb09bd1 2100#ifdef HAVE_MMX
a35acd7f 2101 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
dfb09bd1 2102#else
2da0d70d
DB
2103 for (i=0; i<width; i++)
2104 {
dfb09bd1
MN
2105 int r= src1[3*i + 0];
2106 int g= src1[3*i + 1];
2107 int b= src1[3*i + 2];
2da0d70d 2108
dfb09bd1
MN
2109 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2da0d70d 2111 }
dfb09bd1 2112#endif
a861d4d7
MN
2113}
2114
896a22b8 2115static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2f60f629
MN
2116{
2117 int i;
2118 assert(src1==src2);
2119 for (i=0; i<width; i++)
2120 {
e09d7eef
MN
2121 int r= src1[6*i + 0] + src1[6*i + 3];
2122 int g= src1[6*i + 1] + src1[6*i + 4];
2123 int b= src1[6*i + 2] + src1[6*i + 5];
2f60f629
MN
2124
2125 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2127 }
2128}
2129
1e621b18 2130
97b93389 2131static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
e28630fc 2132{
2da0d70d
DB
2133 int i;
2134 for (i=0; i<width; i++)
2135 {
2136 int d= src[i];
e28630fc 2137
2da0d70d
DB
2138 dst[i]= pal[d] & 0xFF;
2139 }
e28630fc
MN
2140}
2141
97b93389 2142static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
e28630fc 2143{
2da0d70d
DB
2144 int i;
2145 assert(src1 == src2);
2146 for (i=0; i<width; i++)
2147 {
2148 int p= pal[src1[i]];
2149
2150 dstU[i]= p>>8;
2151 dstV[i]= p>>16;
2152 }
e28630fc
MN
2153}
2154
896a22b8 2155static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3d05e078
MN
2156{
2157 int i, j;
2158 for (i=0; i<width/8; i++){
3a5ba0c3
LB
2159 int d= ~src[i];
2160 for(j=0; j<8; j++)
2161 dst[8*i+j]= ((d>>(7-j))&1)*255;
2162 }
2163}
2164
896a22b8 2165static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
3a5ba0c3
LB
2166{
2167 int i, j;
2168 for (i=0; i<width/8; i++){
2169 int d= src[i];
78454dfc
MN
2170 for(j=0; j<8; j++)
2171 dst[8*i+j]= ((d>>(7-j))&1)*255;
3d05e078
MN
2172 }
2173}
2174
8a322796 2175// bilinear / bicubic scaling
/**
 * Horizontally scale one line of 8-bit samples with an arbitrary FIR filter.
 *
 * As spelled out by the plain C fallback at the bottom, each output is
 *     dst[i] = min((sum over j of src[filterPos[i] + j] * filter[filterSize*i + j]) >> 7, 32767)
 *
 * @param dst        output line of dstW 16-bit samples
 * @param dstW       number of output samples
 * @param src        input line of 8-bit samples
 * @param srcW       only forwarded to the AltiVec implementation
 * @param xInc       only forwarded to the AltiVec implementation
 * @param filter     filterSize coefficients per output sample
 * @param filterPos  first source index for each output sample
 * @param filterSize number of taps; the MMX paths assert that it is a
 *                   positive multiple of 4
 *
 * The MMX build has three variants: specialised loops for filterSize 4 and
 * 8, and a generic loop for larger sizes. All three compute two output
 * samples per iteration and narrow the results with packssdw (signed
 * saturation).
 */
077ea8a7 2176static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2da0d70d 2177 int16_t *filter, int16_t *filterPos, long filterSize)
2ff198c1 2178{
077ea8a7 2179#ifdef HAVE_MMX
2da0d70d
DB
2180 assert(filterSize % 4 == 0 && filterSize>0);
2181 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2182 {
/* counter runs from -2*dstW up to 0; filter, filterPos and dst are moved to
 * the end of their arrays so that the negative counter indexes them. */
2183 long counter= -2*dstW;
2184 filter-= counter*2;
2185 filterPos-= counter/2;
2186 dst-= counter/2;
2187 asm volatile(
83c89c78 2188#if defined(PIC)
2da0d70d
DB
2189 "push %%"REG_b" \n\t"
2190#endif
2191 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2192 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2193 "mov %%"REG_a", %%"REG_BP" \n\t"
2194 ASMALIGN(4)
2195 "1: \n\t"
/* eax/ebx receive the source positions of the two output samples. */
2196 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2197 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2198 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2199 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2200 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2201 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2202 "punpcklbw %%mm7, %%mm0 \n\t"
2203 "punpcklbw %%mm7, %%mm2 \n\t"
2204 "pmaddwd %%mm1, %%mm0 \n\t"
2205 "pmaddwd %%mm2, %%mm3 \n\t"
ef423a66
MN
2206 "movq %%mm0, %%mm4 \n\t"
2207 "punpckldq %%mm3, %%mm0 \n\t"
2208 "punpckhdq %%mm3, %%mm4 \n\t"
2209 "paddd %%mm4, %%mm0 \n\t"
2210 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2211 "packssdw %%mm0, %%mm0 \n\t"
2212 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2213 "add $4, %%"REG_BP" \n\t"
2214 " jnc 1b \n\t"
2215
2216 "pop %%"REG_BP" \n\t"
83c89c78 2217#if defined(PIC)
2da0d70d 2218 "pop %%"REG_b" \n\t"
83c89c78 2219#endif
2da0d70d
DB
2220 : "+a" (counter)
2221 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2222#if !defined(PIC)
2da0d70d
DB
2223 : "%"REG_b
2224#endif
2225 );
2226 }
2227 else if (filterSize==8)
2228 {
/* Same scheme as the 4-tap case, with two groups of four taps summed per
 * output sample. */
2229 long counter= -2*dstW;
2230 filter-= counter*4;
2231 filterPos-= counter/2;
2232 dst-= counter/2;
2233 asm volatile(
83c89c78 2234#if defined(PIC)
2da0d70d
DB
2235 "push %%"REG_b" \n\t"
2236#endif
2237 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2238 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2239 "mov %%"REG_a", %%"REG_BP" \n\t"
2240 ASMALIGN(4)
2241 "1: \n\t"
2242 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2243 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2244 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2245 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2246 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2247 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2248 "punpcklbw %%mm7, %%mm0 \n\t"
2249 "punpcklbw %%mm7, %%mm2 \n\t"
2250 "pmaddwd %%mm1, %%mm0 \n\t"
2251 "pmaddwd %%mm2, %%mm3 \n\t"
2252
2253 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2254 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2255 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2256 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2257 "punpcklbw %%mm7, %%mm4 \n\t"
2258 "punpcklbw %%mm7, %%mm2 \n\t"
2259 "pmaddwd %%mm1, %%mm4 \n\t"
2260 "pmaddwd %%mm2, %%mm5 \n\t"
2261 "paddd %%mm4, %%mm0 \n\t"
2262 "paddd %%mm5, %%mm3 \n\t"
ef423a66
MN
2263 "movq %%mm0, %%mm4 \n\t"
2264 "punpckldq %%mm3, %%mm0 \n\t"
2265 "punpckhdq %%mm3, %%mm4 \n\t"
2266 "paddd %%mm4, %%mm0 \n\t"
2267 "psrad $7, %%mm0 \n\t"
2da0d70d
DB
2268 "packssdw %%mm0, %%mm0 \n\t"
2269 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2270 "add $4, %%"REG_BP" \n\t"
2271 " jnc 1b \n\t"
2272
2273 "pop %%"REG_BP" \n\t"
83c89c78 2274#if defined(PIC)
2da0d70d 2275 "pop %%"REG_b" \n\t"
83c89c78 2276#endif
2da0d70d
DB
2277 : "+a" (counter)
2278 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
83c89c78 2279#if !defined(PIC)
2da0d70d
DB
2280 : "%"REG_b
2281#endif
2282 );
2283 }
2284 else
2285 {
/* Generic tap count: the inner loop (label 2) walks REG_c from src up to
 * offset = src+filterSize, four taps at a time, while the filter pointer
 * (operand %1) advances through the coefficients of two output samples
 * that lie filterSize*2 bytes (operand %6) apart. */
2286 uint8_t *offset = src+filterSize;
2287 long counter= -2*dstW;
2288 //filter-= counter*filterSize/2;
2289 filterPos-= counter/2;
2290 dst-= counter/2;
2291 asm volatile(
2292 "pxor %%mm7, %%mm7 \n\t"
2da0d70d
DB
2293 ASMALIGN(4)
2294 "1: \n\t"
2295 "mov %2, %%"REG_c" \n\t"
2296 "movzwl (%%"REG_c", %0), %%eax \n\t"
2297 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2298 "mov %5, %%"REG_c" \n\t"
2299 "pxor %%mm4, %%mm4 \n\t"
2300 "pxor %%mm5, %%mm5 \n\t"
2301 "2: \n\t"
2302 "movq (%1), %%mm1 \n\t"
2303 "movq (%1, %6), %%mm3 \n\t"
2304 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2305 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm0 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2308 "pmaddwd %%mm1, %%mm0 \n\t"
2309 "pmaddwd %%mm2, %%mm3 \n\t"
2310 "paddd %%mm3, %%mm5 \n\t"
2311 "paddd %%mm0, %%mm4 \n\t"
2312 "add $8, %1 \n\t"
2313 "add $4, %%"REG_c" \n\t"
2314 "cmp %4, %%"REG_c" \n\t"
2315 " jb 2b \n\t"
2316 "add %6, %1 \n\t"
ef423a66
MN
2317 "movq %%mm4, %%mm0 \n\t"
2318 "punpckldq %%mm5, %%mm4 \n\t"
2319 "punpckhdq %%mm5, %%mm0 \n\t"
2320 "paddd %%mm0, %%mm4 \n\t"
2321 "psrad $7, %%mm4 \n\t"
2da0d70d
DB
2322 "packssdw %%mm4, %%mm4 \n\t"
2323 "mov %3, %%"REG_a" \n\t"
2324 "movd %%mm4, (%%"REG_a", %0) \n\t"
2325 "add $4, %0 \n\t"
2326 " jnc 1b \n\t"
2327
2328 : "+r" (counter), "+r" (filter)
2329 : "m" (filterPos), "m" (dst), "m"(offset),
2330 "m" (src), "r" (filterSize*2)
2331 : "%"REG_a, "%"REG_c, "%"REG_d
2332 );
2333 }
077ea8a7 2334#else
8c266f0c 2335#ifdef HAVE_ALTIVEC
2da0d70d 2336 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c 2337#else
/* Portable reference implementation. */
2da0d70d
DB
2338 int i;
2339 for (i=0; i<dstW; i++)
2340 {
2341 int j;
2342 int srcPos= filterPos[i];
2343 int val=0;
2344 //printf("filterPos: %d\n", filterPos[i]);
2345 for (j=0; j<filterSize; j++)
2346 {
2347 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2348 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2349 }
2350 //filter += hFilterSize;
881c4294 2351 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2da0d70d
DB
2352 //dst[i] = val>>7;
2353 }
bc279024
DB
2354#endif /* HAVE_ALTIVEC */
2355#endif /* HAVE_MMX */
077ea8a7 2356}
2ff198c1 2357 // *** horizontal scale Y line to temp buffer
/**
 * Produce one horizontally scaled luma line.
 *
 * Three stages:
 *  1. Depending on srcFormat, the input line is first converted to 8-bit
 *     luma in formatConvBuffer by one of the *ToY helpers, and src is
 *     redirected to that buffer. Formats not listed in the chain are used
 *     as they are.
 *  2. The 8-bit line is scaled into dst, either with the general filter
 *     (hScale) or, when SWS_FAST_BILINEAR is set, with a fast bilinear
 *     path (MMX2 generated code, plain x86 asm, or portable C).
 *  3. If c->srcRange differs from c->dstRange and the destination format is
 *     neither RGB nor BGR, a fixed-point linear remap is applied to dst.
 *
 * @param c                scaler context; srcRange, dstRange and dstFormat are read
 * @param dst              output line of dstWidth 16-bit samples
 * @param dstWidth         number of output samples
 * @param src              input line in srcFormat
 * @param srcW             number of input pixels
 * @param xInc             horizontal step, 16.16 fixed point in the bilinear paths
 * @param flags            SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed    non-zero if the generated MMX2 scaler may be called
 * @param hLumFilter       coefficients for hScale
 * @param hLumFilterPos    source positions for hScale
 * @param hLumFilterSize   taps per output sample for hScale
 * @param funnyYCode       generated code called by the MMX2 path
 * @param srcFormat        pixel format of src
 * @param formatConvBuffer scratch line for the converted 8-bit luma
 * @param mmx2Filter       table handed to the generated MMX2 code
 * @param mmx2FilterPos    table handed to the generated MMX2 code
 * @param pal              palette, passed on to every *ToY helper
 */
6bc0c792 2358static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d
DB
2359 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2360 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2361 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2362 int32_t *mmx2FilterPos, uint32_t *pal)
077ea8a7 2363{
/* yuy2ToY keeps every even byte, which also serves 16-bit big-endian gray. */
2da0d70d 2364 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 2365 {
896a22b8 2366 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2367 src= formatConvBuffer;
1e621b18 2368 }
/* uyvyToY keeps every odd byte, which also serves 16-bit little-endian gray. */
2da0d70d 2369 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c 2370 {
896a22b8 2371 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2372 src= formatConvBuffer;
7322a67c 2373 }
2da0d70d 2374 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2375 {
896a22b8 2376 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2377 src= formatConvBuffer;
1e621b18 2378 }
9990e426
MN
2379 else if (srcFormat==PIX_FMT_RGB32_1)
2380 {
896a22b8 2381 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2382 src= formatConvBuffer;
2383 }
2da0d70d 2384 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2385 {
896a22b8 2386 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2387 src= formatConvBuffer;
1e621b18 2388 }
2da0d70d 2389 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2390 {
896a22b8 2391 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2392 src= formatConvBuffer;
6af250ea 2393 }
2da0d70d 2394 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2395 {
896a22b8 2396 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2397 src= formatConvBuffer;
b72034dd 2398 }
2da0d70d 2399 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2400 {
896a22b8 2401 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2402 src= formatConvBuffer;
a861d4d7 2403 }
9990e426
MN
2404 else if (srcFormat==PIX_FMT_BGR32_1)
2405 {
896a22b8 2406 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
9990e426
MN
2407 src= formatConvBuffer;
2408 }
2da0d70d 2409 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2410 {
896a22b8 2411 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2412 src= formatConvBuffer;
a861d4d7 2413 }
2da0d70d 2414 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2415 {
896a22b8 2416 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2417 src= formatConvBuffer;
a43fb6b3 2418 }
2da0d70d 2419 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2420 {
896a22b8 2421 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2422 src= formatConvBuffer;
a43fb6b3 2423 }
2da0d70d 2424 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2425 {
e48a79c9 2426 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2da0d70d 2427 src= formatConvBuffer;
e28630fc 2428 }
3a5ba0c3
LB
2429 else if (srcFormat==PIX_FMT_MONOBLACK)
2430 {
896a22b8 2431 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
3a5ba0c3
LB
2432 src= formatConvBuffer;
2433 }
2434 else if (srcFormat==PIX_FMT_MONOWHITE)
3d05e078 2435 {
896a22b8 2436 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
3d05e078
MN
2437 src= formatConvBuffer;
2438 }
1e621b18 2439
e3d2500f 2440#ifdef HAVE_MMX
8a322796 2441 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2442 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2443#else
2da0d70d 2444 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2445#endif
077ea8a7 2446 {
2da0d70d 2447 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 2448 }
8a322796 2449 else // fast bilinear upscale / crap downscale
077ea8a7 2450 {
3d6a30d9 2451#if defined(ARCH_X86)
2ff198c1 2452#ifdef HAVE_MMX2
2da0d70d 2453 int i;
83c89c78 2454#if defined(PIC)
/* Spill slot for REG_b, which the asm saves and restores by hand under PIC
 * instead of listing it as a clobber. */
2da0d70d 2455 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2456#endif
2da0d70d
DB
2457 if (canMMX2BeUsed)
2458 {
2459 asm volatile(
83c89c78 2460#if defined(PIC)
2da0d70d
DB
2461 "mov %%"REG_b", %5 \n\t"
2462#endif
2463 "pxor %%mm7, %%mm7 \n\t"
2464 "mov %0, %%"REG_c" \n\t"
2465 "mov %1, %%"REG_D" \n\t"
2466 "mov %2, %%"REG_d" \n\t"
2467 "mov %3, %%"REG_b" \n\t"
2468 "xor %%"REG_a", %%"REG_a" \n\t" // i
2469 PREFETCH" (%%"REG_c") \n\t"
2470 PREFETCH" 32(%%"REG_c") \n\t"
2471 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2472
6d606c4f
AJ
2473#ifdef ARCH_X86_64
2474
2475#define FUNNY_Y_CODE \
2da0d70d
DB
2476 "movl (%%"REG_b"), %%esi \n\t"\
2477 "call *%4 \n\t"\
2478 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2479 "add %%"REG_S", %%"REG_c" \n\t"\
2480 "add %%"REG_a", %%"REG_D" \n\t"\
2481 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2482
2483#else
2484
2ff198c1 2485#define FUNNY_Y_CODE \
2da0d70d
DB
2486 "movl (%%"REG_b"), %%esi \n\t"\
2487 "call *%4 \n\t"\
2488 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489 "add %%"REG_a", %%"REG_D" \n\t"\
2490 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2491
bc279024 2492#endif /* ARCH_X86_64 */
6d606c4f 2493
/* The generated code is invoked eight times in a row. */
2ff198c1
MN
2494FUNNY_Y_CODE
2495FUNNY_Y_CODE
2496FUNNY_Y_CODE
2497FUNNY_Y_CODE
2498FUNNY_Y_CODE
2499FUNNY_Y_CODE
2500FUNNY_Y_CODE
2501FUNNY_Y_CODE
2502
83c89c78 2503#if defined(PIC)
2da0d70d 2504 "mov %5, %%"REG_b" \n\t"
83c89c78 2505#endif
2da0d70d
DB
2506 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507 "m" (funnyYCode)
83c89c78 2508#if defined(PIC)
2da0d70d 2509 ,"m" (ebxsave)
83c89c78 2510#endif
2da0d70d 2511 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2512#if !defined(PIC)
2da0d70d
DB
2513 ,"%"REG_b
2514#endif
2515 );
/* Outputs whose source position is at or beyond the last input sample are
 * overwritten with that last sample, scaled by 128. */
2516 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2517 }
2518 else
2519 {
bc279024 2520#endif /* HAVE_MMX2 */
/* Split the 16.16 step into its integer part and its 16-bit fraction. */
2da0d70d
DB
2521 long xInc_shr16 = xInc >> 16;
2522 uint16_t xInc_mask = xInc & 0xffff;
2523 //NO MMX just normal asm ...
2524 asm volatile(
2525 "xor %%"REG_a", %%"REG_a" \n\t" // i
2526 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2527 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2528 ASMALIGN(4)
2529 "1: \n\t"
2530 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2531 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2532 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2533 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2534 "shll $16, %%edi \n\t"
2535 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2536 "mov %1, %%"REG_D" \n\t"
2537 "shrl $9, %%esi \n\t"
2538 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2539 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2540 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2541
2542 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2543 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2544 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2545 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2546 "shll $16, %%edi \n\t"
2547 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2548 "mov %1, %%"REG_D" \n\t"
2549 "shrl $9, %%esi \n\t"
2550 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2551 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2552 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2553
2554
2555 "add $2, %%"REG_a" \n\t"
2556 "cmp %2, %%"REG_a" \n\t"
2557 " jb 1b \n\t"
2558
2559
2560 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2561 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2562 );
2ff198c1 2563#ifdef HAVE_MMX2
2da0d70d 2564 } //if MMX2 can't be used
2ff198c1
MN
2565#endif
2566#else
/* Portable bilinear interpolation: xpos is a 16.16 fixed-point source
 * position, xalpha its top 7 fraction bits. */
2da0d70d
DB
2567 int i;
2568 unsigned int xpos=0;
2569 for (i=0;i<dstWidth;i++)
2570 {
2571 register unsigned int xx=xpos>>16;
2572 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2573 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2574 xpos+=xInc;
2575 }
bc279024 2576#endif /* defined(ARCH_X86) */
077ea8a7 2577 }
6bc0c792
MN
2578
/* Range conversion of the scaled line; each direction is a linear remap in
 * fixed point with a final shift of 14 bits. */
2579 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2580 int i;
2581 //FIXME all pal and rgb srcFormats could do this conversion as well
2582 //FIXME all scalers more complex than bilinear could do half of this transform
2583 if(c->srcRange){
2584 for (i=0; i<dstWidth; i++)
2585 dst[i]= (dst[i]*14071 + 33561947)>>14;
2586 }else{
2587 for (i=0; i<dstWidth; i++)
aa13b0fc 2588 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792
MN
2589 }
2590 }
2ff198c1
MN
2591}
2592
6bc0c792 2593inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d
DB
2594 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
e48a79c9 2597 int32_t *mmx2FilterPos, uint32_t *pal)
2ff198c1 2598{
2da0d70d 2599 if (srcFormat==PIX_FMT_YUYV422)
1e621b18 2600 {
896a22b8 2601 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2602 src1= formatConvBuffer;
8b2fce0d 2603 src2= formatConvBuffer+VOFW;
1e621b18 2604 }
2da0d70d 2605 else if (srcFormat==PIX_FMT_UYVY422)
7322a67c 2606 {
896a22b8 2607 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2608 src1= formatConvBuffer;
8b2fce0d 2609 src2= formatConvBuffer+VOFW;
7322a67c 2610 }
2da0d70d 2611 else if (srcFormat==PIX_FMT_RGB32)
1e621b18 2612 {
2f60f629 2613 if(c->chrSrcHSubSample)
896a22b8 2614 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2615 else
896a22b8 2616 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2617 src1= formatConvBuffer;
8b2fce0d 2618 src2= formatConvBuffer+VOFW;
1e621b18 2619 }
9990e426
MN
2620 else if (srcFormat==PIX_FMT_RGB32_1)
2621 {
2f60f629 2622 if(c->chrSrcHSubSample)
896a22b8 2623 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2624 else
896a22b8 2625 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2626 src1= formatConvBuffer;
2627 src2= formatConvBuffer+VOFW;
2628 }
2da0d70d 2629 else if (srcFormat==PIX_FMT_BGR24)
1e621b18 2630 {
2f60f629 2631 if(c->chrSrcHSubSample)
896a22b8 2632 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2633 else
896a22b8 2634 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2635 src1= formatConvBuffer;
8b2fce0d 2636 src2= formatConvBuffer+VOFW;
1e621b18 2637 }
2da0d70d 2638 else if (srcFormat==PIX_FMT_BGR565)
6af250ea 2639 {
2f60f629 2640 if(c->chrSrcHSubSample)
896a22b8 2641 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2642 else
896a22b8 2643 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2644 src1= formatConvBuffer;
8b2fce0d 2645 src2= formatConvBuffer+VOFW;
6af250ea 2646 }
2da0d70d 2647 else if (srcFormat==PIX_FMT_BGR555)
b72034dd 2648 {
2f60f629 2649 if(c->chrSrcHSubSample)
896a22b8 2650 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2651 else
896a22b8 2652 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2653 src1= formatConvBuffer;
8b2fce0d 2654 src2= formatConvBuffer+VOFW;
b72034dd 2655 }
2da0d70d 2656 else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 2657 {
2f60f629 2658 if(c->chrSrcHSubSample)
896a22b8 2659 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2660 else
896a22b8 2661 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2662 src1= formatConvBuffer;
8b2fce0d 2663 src2= formatConvBuffer+VOFW;
a861d4d7 2664 }
9990e426
MN
2665 else if (srcFormat==PIX_FMT_BGR32_1)
2666 {
2f60f629 2667 if(c->chrSrcHSubSample)
896a22b8 2668 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2f60f629 2669 else
896a22b8 2670 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
9990e426
MN
2671 src1= formatConvBuffer;
2672 src2= formatConvBuffer+VOFW;
2673 }
2da0d70d 2674 else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 2675 {
2f60f629 2676 if(c->chrSrcHSubSample)
896a22b8 2677 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2678 else
896a22b8 2679 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2680 src1= formatConvBuffer;
8b2fce0d 2681 src2= formatConvBuffer+VOFW;
a861d4d7 2682 }
2da0d70d 2683 else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 2684 {
2f60f629 2685 if(c->chrSrcHSubSample)
896a22b8 2686 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2687 else
896a22b8 2688 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2689 src1= formatConvBuffer;
8b2fce0d 2690 src2= formatConvBuffer+VOFW;
a43fb6b3 2691 }
2da0d70d 2692 else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 2693 {
2f60f629 2694 if(c->chrSrcHSubSample)
896a22b8 2695 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2f60f629 2696 else
896a22b8 2697 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2698 src1= formatConvBuffer;
8b2fce0d 2699 src2= formatConvBuffer+VOFW;
a43fb6b3 2700 }
4bb9adcf 2701 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
6ff0ad6b 2702 {
2da0d70d 2703 return;
6ff0ad6b 2704 }
2da0d70d 2705 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc 2706 {
e48a79c9 2707 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2da0d70d 2708 src1= formatConvBuffer;
8b2fce0d 2709 src2= formatConvBuffer+VOFW;
e28630fc 2710 }
1e621b18 2711
e3d2500f 2712#ifdef HAVE_MMX
8a322796 2713 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d 2714 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2715#else
2da0d70d 2716 if (!(flags&SWS_FAST_BILINEAR))
e3d2500f 2717#endif
077ea8a7 2718 {
2da0d70d 2719 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d 2720 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 2721 }
8a322796 2722 else // fast bilinear upscale / crap downscale
077ea8a7 2723 {
3d6a30d9 2724#if defined(ARCH_X86)
2ff198c1 2725#ifdef HAVE_MMX2
2da0d70d 2726 int i;
83c89c78 2727#if defined(PIC)
2da0d70d 2728 uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 2729#endif
2da0d70d
DB
2730 if (canMMX2BeUsed)
2731 {
2732 asm volatile(
83c89c78 2733#if defined(PIC)
2da0d70d
DB
2734 "mov %%"REG_b", %6 \n\t"
2735#endif
2736 "pxor %%mm7, %%mm7 \n\t"
2737 "mov %0, %%"REG_c" \n\t"
2738 "mov %1, %%"REG_D" \n\t"
2739 "mov %2, %%"REG_d" \n\t"
2740 "mov %3, %%"REG_b" \n\t"
2741 "xor %%"REG_a", %%"REG_a" \n\t" // i
2742 PREFETCH" (%%"REG_c") \n\t"
2743 PREFETCH" 32(%%"REG_c") \n\t"
2744 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2745
6d606c4f
AJ
2746#ifdef ARCH_X86_64
2747
2748#define FUNNY_UV_CODE \
2da0d70d
DB
2749 "movl (%%"REG_b"), %%esi \n\t"\
2750 "call *%4 \n\t"\
2751 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2752 "add %%"REG_S", %%"REG_c" \n\t"\
2753 "add %%"REG_a", %%"REG_D" \n\t"\
2754 "xor %%"REG_a", %%"REG_a" \n\t"\
6d606c4f
AJ
2755
2756#else
2757
b7dc6f66 2758#define FUNNY_UV_CODE \
2da0d70d
DB
2759 "movl (%%"REG_b"), %%esi \n\t"\
2760 "call *%4 \n\t"\
2761 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762 "add %%"REG_a", %%"REG_D" \n\t"\
2763 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2764
bc279024 2765#endif /* ARCH_X86_64 */
6d606c4f 2766
b7dc6f66
MN
2767FUNNY_UV_CODE
2768FUNNY_UV_CODE
2769FUNNY_UV_CODE
2770FUNNY_UV_CODE
2da0d70d
DB
2771 "xor %%"REG_a", %%"REG_a" \n\t" // i
2772 "mov %5, %%"REG_c" \n\t" // src
2773 "mov %1, %%"REG_D" \n\t" // buf1
8b2fce0d 2774 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d
DB
2775 PREFETCH" (%%"REG_c") \n\t"
2776 PREFETCH" 32(%%"REG_c") \n\t"
2777 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2778
2779FUNNY_UV_CODE
2780FUNNY_UV_CODE
2781FUNNY_UV_CODE
2782FUNNY_UV_CODE
2783
83c89c78 2784#if defined(PIC)
2da0d70d 2785 "mov %6, %%"REG_b" \n\t"
83c89c78 2786#endif
2da0d70d
DB
2787 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788 "m" (funnyUVCode), "m" (src2)
83c89c78 2789#if defined(PIC)
2da0d70d 2790 ,"m" (ebxsave)
83c89c78 2791#endif
2da0d70d 2792 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 2793#if !defined(PIC)
2da0d70d
DB
2794 ,"%"REG_b
2795#endif
2796 );
2797 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2798 {
2799 //printf("%d %d %d\n", dstWidth, i, srcW);
2800 dst[i] = src1[srcW-1]*128;
8b2fce0d 2801 dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d
DB
2802 }
2803 }
2804 else
2805 {
bc279024 2806#endif /* HAVE_MMX2 */
2da0d70d
DB
2807 long xInc_shr16 = (long) (xInc >> 16);
2808 uint16_t xInc_mask = xInc & 0xffff;
2809 asm volatile(
2810 "xor %%"REG_a", %%"REG_a" \n\t" // i
2811 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2812 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2813 ASMALIGN(4)
2814 "1: \n\t"
2815 "mov %0, %%"REG_S" \n\t"
2816 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2817 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2818 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2819 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820 "shll $16, %%edi \n\t"
2821 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822 "mov %1, %%"REG_D" \n\t"
2823 "shrl $9, %%esi \n\t"
2824 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2825
2826 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2827 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2828 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2829 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830 "shll $16, %%edi \n\t"
2831 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832 "mov %1, %%"REG_D" \n\t"
2833 "shrl $9, %%esi \n\t"
8b2fce0d 2834 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d
DB
2835
2836 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2837 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2838 "add $1, %%"REG_a" \n\t"
2839 "cmp %2, %%"REG_a" \n\t"
2840 " jb 1b \n\t"
2ff198c1 2841
8a322796
DB
2842/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843 which is needed to support GCC 4.0. */
e5091488 2844#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2da0d70d 2845 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2846#else
2da0d70d 2847 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f 2848#endif
2da0d70d
DB
2849 "r" (src2)
2850 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2851 );
2ff198c1 2852#ifdef HAVE_MMX2
2da0d70d 2853 } //if MMX2 can't be used
2ff198c1
MN
2854#endif
2855#else
2da0d70d
DB
2856 int i;
2857 unsigned int xpos=0;
2858 for (i=0;i<dstWidth;i++)
2859 {
2860 register unsigned int xx=xpos>>16;
2861 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
8b2fce0d 2863 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d
DB
2864 /* slower
2865 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
8b2fce0d 2866 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d
DB
2867 */
2868 xpos+=xInc;
2869 }
bc279024 2870#endif /* defined(ARCH_X86) */
2da0d70d 2871 }
6bc0c792
MN
2872 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2873 int i;
2874 //FIXME all pal and rgb srcFormats could do this conversion as well
2875 //FIXME all scalers more complex than bilinear could do half of this transform
2876 if(c->srcRange){
2877 for (i=0; i<dstWidth; i++){
2878 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2879 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2880 }
2881 }else{
2882 for (i=0; i<dstWidth; i++){
aa13b0fc
MN
2883 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2884 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
6bc0c792
MN
2885 }
2886 }
2887 }
077ea8a7
MN
2888}
2889
3e499f53 2890static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2da0d70d
DB
2891 int srcSliceH, uint8_t* dst[], int dstStride[]){
2892
2893 /* load a few things into local vars to make the code more readable? and faster */
2894 const int srcW= c->srcW;
2895 const int dstW= c->dstW;
2896 const int dstH= c->dstH;
2897 const int chrDstW= c->chrDstW;
2898 const int chrSrcW= c->chrSrcW;
2899 const int lumXInc= c->lumXInc;
2900 const int chrXInc= c->chrXInc;
2901 const int dstFormat= c->dstFormat;
2902 const int srcFormat= c->srcFormat;
2903 const int flags= c->flags;
2904 const int canMMX2BeUsed= c->canMMX2BeUsed;
2905 int16_t *vLumFilterPos= c->vLumFilterPos;
2906 int16_t *vChrFilterPos= c->vChrFilterPos;
2907 int16_t *hLumFilterPos= c->hLumFilterPos;
2908 int16_t *hChrFilterPos= c->hChrFilterPos;
2909 int16_t *vLumFilter= c->vLumFilter;
2910 int16_t *vChrFilter= c->vChrFilter;
2911 int16_t *hLumFilter= c->hLumFilter;
2912 int16_t *hChrFilter= c->hChrFilter;
2913 int32_t *lumMmxFilter= c->lumMmxFilter;
2914 int32_t *chrMmxFilter= c->chrMmxFilter;
2915 const int vLumFilterSize= c->vLumFilterSize;
2916 const int vChrFilterSize= c->vChrFilterSize;
2917 const int hLumFilterSize= c->hLumFilterSize;
2918 const int hChrFilterSize= c->hChrFilterSize;
2919 int16_t **lumPixBuf= c->lumPixBuf;
2920 int16_t **chrPixBuf= c->chrPixBuf;
2921 const int vLumBufSize= c->vLumBufSize;
2922 const int vChrBufSize= c->vChrBufSize;
2923 uint8_t *funnyYCode= c->funnyYCode;
2924 uint8_t *funnyUVCode= c->funnyUVCode;
2925 uint8_t *formatConvBuffer= c->formatConvBuffer;
2926 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2927 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2928 int lastDstY;
e48a79c9 2929 uint32_t *pal=NULL;
2da0d70d 2930
8a322796 2931 /* vars which will change and which we need to store back in the context */
2da0d70d
DB
2932 int dstY= c->dstY;
2933 int lumBufIndex= c->lumBufIndex;
2934 int chrBufIndex= c->chrBufIndex;
2935 int lastInLumBuf= c->lastInLumBuf;
2936 int lastInChrBuf= c->lastInChrBuf;
2937
2938 if (isPacked(c->srcFormat)){
e48a79c9 2939 pal= (uint32_t *)src[1];
2da0d70d
DB
2940 src[0]=
2941 src[1]=
2942 src[2]= src[0];
2943 srcStride[0]=
2944 srcStride[1]=
2945 srcStride[2]= srcStride[0];
2946 }
2947 srcStride[1]<<= c->vChrDrop;
2948 srcStride[2]<<= c->vChrDrop;
2949
2950 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2951 // (int)dst[0], (int)dst[1], (int)dst[2]);
c7a810cc
MN
2952
2953#if 0 //self test FIXME move to a vfilter or something
2da0d70d
DB
2954 {
2955 static volatile int i=0;
2956 i++;
2957 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2958 selfTest(src, srcStride, c->srcW, c->srcH);
2959 i--;
2960 }
c7a810cc 2961#endif
37079906 2962
2da0d70d
DB
2963 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2964 //dstStride[0],dstStride[1],dstStride[2]);
2965
2966 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2967 {
2968 static int firstTime=1; //FIXME move this into the context perhaps
2969 if (flags & SWS_PRINT_INFO && firstTime)
2970 {
4b0c30b7 2971 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
8a322796 2972 " ->cannot do aligned memory accesses anymore\n");
2da0d70d
DB
2973 firstTime=0;
2974 }
2975 }
2976
8a322796
DB
2977 /* Note the user might start scaling the picture in the middle so this
2978 will not get executed. This is not really intended but works
2979 currently, so people might do it. */
2da0d70d
DB
2980 if (srcSliceY ==0){
2981 lumBufIndex=0;
2982 chrBufIndex=0;
2983 dstY=0;
2984 lastInLumBuf= -1;
2985 lastInChrBuf= -1;
2986 }
2987
2988 lastDstY= dstY;
2989
2990 for (;dstY < dstH; dstY++){
2991 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2992 const int chrDstY= dstY>>c->chrDstVSubSample;
2993 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2994 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2995
2996 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2997 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2998 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2999 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3000
3001 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3002 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3003 //handle holes (FAST_BILINEAR & weird filters)
3004 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3005 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3006 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
fcc402b1
LB
3007 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3008 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2da0d70d
DB
3009
3010 // Do we have enough lines in this slice to output the dstY line
3011 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3012 {
3013 //Do horizontal scaling
3014 while(lastInLumBuf < lastLumSrcY)
3015 {
3016 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3017 lumBufIndex++;
3018 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
fcc402b1
LB
3019 assert(lumBufIndex < 2*vLumBufSize);
3020 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3021 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2da0d70d 3022 //printf("%d %d\n", lumBufIndex, vLumBufSize);
6bc0c792 3023 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
3024 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3025 funnyYCode, c->srcFormat, formatConvBuffer,
3026 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3027 lastInLumBuf++;
3028 }
3029 while(lastInChrBuf < lastChrSrcY)
3030 {
3031 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3032 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3033 chrBufIndex++;
fcc402b1
LB
3034 assert(chrBufIndex < 2*vChrBufSize);
3035 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3036 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3037 //FIXME replace parameters through context struct (some at least)
3038
3039 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3040 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
3041 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3042 funnyUVCode, c->srcFormat, formatConvBuffer,
3043 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3044 lastInChrBuf++;
3045 }
3046 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3047 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3048 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3049 }
3050 else // not enough lines left in this slice -> load the rest in the buffer
3051 {
3052 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3053 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3054 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3055 vChrBufSize, vLumBufSize);*/
3056
3057 //Do horizontal scaling
3058 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3059 {
3060 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3061 lumBufIndex++;
fcc402b1
LB
3062 assert(lumBufIndex < 2*vLumBufSize);
3063 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3064 assert(lastInLumBuf + 1 - srcSliceY >= 0);
6bc0c792 3065 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2da0d70d
DB
3066 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3067 funnyYCode, c->srcFormat, formatConvBuffer,
3068 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3069 lastInLumBuf++;
3070 }
3071 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3072 {
3073 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3074 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3075 chrBufIndex++;
fcc402b1
LB
3076 assert(chrBufIndex < 2*vChrBufSize);
3077 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3078 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2da0d70d
DB
3079
3080 if (!(isGray(srcFormat) || isGray(dstFormat)))
6bc0c792 3081 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2da0d70d
DB
3082 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3083 funnyUVCode, c->srcFormat, formatConvBuffer,
3084 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3085 lastInChrBuf++;
3086 }
3087 //wrap buf index around to stay inside the ring buffer
e5091488
BF
3088 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3089 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2da0d70d
DB
3090 break; //we can't output a dstY line so let's try with the next slice
3091 }
d3f41512 3092
c1b0bfb4 3093#ifdef HAVE_MMX
0cb25594 3094 b5Dither= ff_dither8[dstY&1];
92c7b471
CEH
3095 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
3096 g5Dither= ff_dither8[dstY&1];
3097 else
3098 g5Dither= ff_dither4[dstY&1];
0cb25594 3099 r5Dither= ff_dither8[(dstY+1)&1];
2da0d70d
DB
3100#endif
3101 if (dstY < dstH-2)
3102 {
3103 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3104 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
6542b44e 3105#ifdef HAVE_MMX
2da0d70d
DB
3106 int i;
3107 if (flags & SWS_ACCURATE_RND){
1625216e 3108 int s= APCK_SIZE / 8;
2da0d70d 3109 for (i=0; i<vLumFilterSize; i+=2){
1625216e
MN
3110 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3111 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3112 lumMmxFilter[s*i+APCK_COEF/4 ]=
3113 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2da0d70d
DB
3114 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3115 }
3116 for (i=0; i<vChrFilterSize; i+=2){
1625216e
MN
3117 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3118 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3119 chrMmxFilter[s*i+APCK_COEF/4 ]=
3120 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2da0d70d 3121 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
bca11e75 3122 }
2da0d70d
DB
3123 }else{
3124 for (i=0; i<vLumFilterSize; i++)
3125 {
3126 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3127 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3128 lumMmxFilter[4*i+2]=
3129 lumMmxFilter[4*i+3]=
3130 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3131 }
3132 for (i=0; i<vChrFilterSize; i++)
3133 {
3134 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3135 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3136 chrMmxFilter[4*i+2]=
3137 chrMmxFilter[4*i+3]=
3138