/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
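
/* The REAL_MOVNTQ indirection is presumably there so that macro arguments (for
   example %%REGa) are expanded by the preprocessor before the # operator
   stringizes them.  Illustrative expansion, assuming REGa expands to eax on ia-32:
       MOVNTQ(%%mm3, (%1, %%REGa))
    -> REAL_MOVNTQ(%%mm3, (%1, %%eax))
    -> "movntq %%mm3, (%1, %%eax) \n\t"
*/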

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

77a49659 68#define YSCALEYUV2YV12X(x, offset) \
6e1c66bc 69 "xor %%"REG_a", %%"REG_a" \n\t"\
379a2036
MN
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
6e1c66bc
AJ
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
74 ".balign 16 \n\t" /* FIXME Unroll? */\
75 "1: \n\t"\
6e1c66bc
AJ
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4
MN
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
c1b0bfb4
MN
86 " jnz 1b \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
6e1c66bc
AJ
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
379a2036
MN
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
6e1c66bc
AJ
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
97 "jb 1b \n\t"
98
99#define YSCALEYUV2YV121 \
6e1c66bc 100 "mov %2, %%"REG_a" \n\t"\
c1b0bfb4
MN
101 ".balign 16 \n\t" /* FIXME Unroll? */\
102 "1: \n\t"\
6e1c66bc
AJ
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
c1b0bfb4
MN
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
6e1c66bc
AJ
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
c1b0bfb4
MN
110 "jnc 1b \n\t"

/*
   :: "m" (-lumFilterSize), "m" (-chrFilterSize),
      "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
      "r" (dest), "m" (dstW),
      "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
   : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
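
#if 0
/* Rough scalar sketch (illustration only, never compiled) of the per-pixel work
   done by YSCALEYUV2YV12X above: a multi-tap weighted sum over the source lines,
   scaled back down and clipped to 8 bits.  The real C fallback for this path is
   yuv2yuvXinC(); the function and variable names here are purely illustrative. */
static inline int vScaleOnePixelSketch(int16_t **src, int16_t *filter,
                                       int filterSize, int i)
{
	int val= 1<<18;       /* rounding; the MMX code keeps a pre-scaled rounder at VROUNDER_OFFSET */
	int j;
	for(j=0; j<filterSize; j++)
		val+= src[j][i]*filter[j];   /* srcData * filterCoeff */
	val>>=19;                            /* back to 8 bit range */
	if(val<0)   val= 0;                  /* clip, like packuswb does */
	if(val>255) val= 255;
	return val;
}
#endif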
25593e29 119#define YSCALEYUV2PACKEDX \
6e1c66bc 120 "xor %%"REG_a", %%"REG_a" \n\t"\
c1b0bfb4 121 ".balign 16 \n\t"\
77a49659 122 "nop \n\t"\
c1b0bfb4 123 "1: \n\t"\
6e1c66bc
AJ
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
379a2036
MN
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
77a49659 128 ".balign 16 \n\t"\
c1b0bfb4 129 "2: \n\t"\
6e1c66bc
AJ
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
6e1c66bc 139 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4
MN
140 " jnz 2b \n\t"\
141\
6e1c66bc
AJ
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
379a2036
MN
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
77a49659 146 ".balign 16 \n\t"\
c1b0bfb4 147 "2: \n\t"\
6e1c66bc
AJ
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
c1b0bfb4
MN
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
6e1c66bc 157 "test %%"REG_S", %%"REG_S" \n\t"\
c1b0bfb4 158 " jnz 2b \n\t"\
25593e29
MN
159
160
161#define YSCALEYUV2RGBX \
162 YSCALEYUV2PACKEDX\
77a49659
MN
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
c1b0bfb4
MN
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
77a49659
MN
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
c1b0bfb4 169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
77a49659
MN
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
c1b0bfb4
MN
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
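
/* In scalar terms, YSCALEYUV2RGBX turns the filtered Y/U/V values into RGB
   roughly as follows (the coefficients and offsets are the fixed-point values
   stored in the SwsContext at Y_COEFF, Y_OFFSET, UB_COEFF, UG_COEFF, VG_COEFF,
   VR_COEFF, U_OFFSET and V_OFFSET; the UG/VG coefficients are negative, so the
   plain additions below are correct):
       Y' = (Y - Y_OFFSET) * Y_COEFF
       B  = Y' + (U - U_OFFSET) * UB_COEFF
       G  = Y' + (U - U_OFFSET) * UG_COEFF + (V - V_OFFSET) * VG_COEFF
       R  = Y' + (V - V_OFFSET) * VR_COEFF
   followed by packing with unsigned saturation (packuswb) into bytes. */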
77a49659 198#if 0
d604bab9
MN
199#define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
6e1c66bc 207 "xor %%"REG_a", %%"REG_a" \n\t"\
cff6ecd7 208 ".balign 16 \n\t"\
d604bab9 209 "1: \n\t"\
6e1c66bc
AJ
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
d604bab9
MN
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
6e1c66bc 219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
d604bab9
MN
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6e1c66bc 222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
d604bab9
MN
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428
FB
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
228\
229\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
9b464428 232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
d604bab9 233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428 234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
d604bab9 235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428 236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
237\
238\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
9b464428
FB
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
245\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
249\
250 "packuswb %%mm1, %%mm1 \n\t"
77a49659 251#endif
d604bab9 252
6e1c66bc 253#define REAL_YSCALEYUV2PACKED(index, c) \
6542b44e
MN
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
6e1c66bc 260 "xor "#index", "#index" \n\t"\
25593e29
MN
261 ".balign 16 \n\t"\
262 "1: \n\t"\
6542b44e
MN
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
25593e29
MN
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
6542b44e 269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
25593e29
MN
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
6542b44e
MN
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
25593e29
MN
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
6542b44e
MN
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
25593e29
MN
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
288
6e1c66bc
AJ
289#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
290
291#define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
cff6ecd7 293 ".balign 16 \n\t"\
d604bab9 294 "1: \n\t"\
6542b44e
MN
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
d604bab9
MN
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
6542b44e 301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
d604bab9
MN
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
6542b44e
MN
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
6542b44e
MN
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
d604bab9 314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
6542b44e
MN
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
d604bab9
MN
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
6542b44e
MN
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
d604bab9
MN
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
6542b44e
MN
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
d604bab9
MN
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 355#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
25593e29 356
6e1c66bc
AJ
357#define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
25593e29
MN
359 ".balign 16 \n\t"\
360 "1: \n\t"\
e54d94ba
MN
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
25593e29
MN
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
e54d94ba
MN
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
25593e29
MN
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
369
6e1c66bc
AJ
370#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
371
372#define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
cff6ecd7 374 ".balign 16 \n\t"\
d604bab9 375 "1: \n\t"\
e54d94ba
MN
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
d604bab9
MN
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
e54d94ba
MN
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
e54d94ba
MN
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
d604bab9 386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
e54d94ba
MN
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
497d4f99
MN
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
e54d94ba
MN
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
497d4f99
MN
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 419#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 420
6e1c66bc
AJ
421#define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
25593e29
MN
423 ".balign 16 \n\t"\
424 "1: \n\t"\
e54d94ba
MN
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
25593e29
MN
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
e54d94ba
MN
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
25593e29
MN
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
6e1c66bc 437#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
25593e29 438
// do vertical chrominance interpolation
6e1c66bc
AJ
440#define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
cff6ecd7 442 ".balign 16 \n\t"\
497d4f99 443 "1: \n\t"\
e54d94ba
MN
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397c035e
MN
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
c1b0bfb4
MN
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
e54d94ba
MN
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
497d4f99
MN
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
e54d94ba
MN
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
497d4f99 458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
e54d94ba
MN
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
d604bab9
MN
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
e54d94ba
MN
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
d604bab9
MN
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 491#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 492
6e1c66bc 493#define REAL_WRITEBGR32(dst, dstw, index) \
d604bab9
MN
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
507\
6542b44e
MN
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
d604bab9 512\
6e1c66bc
AJ
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
d604bab9 515 " jb 1b \n\t"
6e1c66bc 516#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
d604bab9 517
6e1c66bc 518#define REAL_WRITEBGR16(dst, dstw, index) \
9b464428
FB
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb 522 "psrlq $3, %%mm2 \n\t"\
d604bab9 523\
f62255fb
MN
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
d604bab9 526\
f62255fb
MN
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 531\
f62255fb
MN
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
d604bab9
MN
534\
535 "por %%mm3, %%mm2 \n\t"\
d604bab9 536 "por %%mm4, %%mm1 \n\t"\
d604bab9 537\
6542b44e
MN
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 540\
6e1c66bc
AJ
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
d604bab9 543 " jb 1b \n\t"
6e1c66bc 544#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
d604bab9 545
6e1c66bc 546#define REAL_WRITEBGR15(dst, dstw, index) \
9b464428
FB
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb
MN
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
d604bab9 552\
f62255fb
MN
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
d604bab9 555\
f62255fb
MN
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 560\
f62255fb
MN
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
d604bab9
MN
563\
564 "por %%mm3, %%mm2 \n\t"\
d604bab9 565 "por %%mm4, %%mm1 \n\t"\
d604bab9 566\
6542b44e
MN
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 569\
6e1c66bc
AJ
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
d604bab9 572 " jb 1b \n\t"
6e1c66bc 573#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
f62255fb 574
6542b44e 575#define WRITEBGR24OLD(dst, dstw, index) \
d604bab9
MN
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
a525ce8d
MN
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9
MN
589\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
9b464428
FB
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
d604bab9
MN
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
598\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
9b464428 603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
d604bab9
MN
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
9b464428
FB
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
d604bab9
MN
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
612\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
9b464428
FB
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
d604bab9
MN
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
621\
6542b44e
MN
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
6e1c66bc 625 "add $24, "#dst" \n\t"\
d604bab9 626\
6e1c66bc
AJ
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
d604bab9
MN
629 " jb 1b \n\t"
630
6542b44e 631#define WRITEBGR24MMX(dst, dstw, index) \
99d2cb72
MN
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
645\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
650\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
655\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
660\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
6542b44e 665 MOVNTQ(%%mm0, (dst))\
99d2cb72
MN
666\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
6542b44e 671 MOVNTQ(%%mm6, 8(dst))\
99d2cb72
MN
672\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
6542b44e 676 MOVNTQ(%%mm5, 16(dst))\
99d2cb72 677\
6e1c66bc 678 "add $24, "#dst" \n\t"\
99d2cb72 679\
6e1c66bc
AJ
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
99d2cb72
MN
682 " jb 1b \n\t"
683
6542b44e 684#define WRITEBGR24MMX2(dst, dstw, index) \
99d2cb72 685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
9b464428
FB
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
99d2cb72
MN
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
691\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
695\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
6542b44e 699 MOVNTQ(%%mm6, (dst))\
99d2cb72
MN
700\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
705\
9b464428 706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
99d2cb72
MN
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
709\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
6542b44e 712 MOVNTQ(%%mm6, 8(dst))\
99d2cb72
MN
713\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
717\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
9b464428 720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72
MN
721\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
6542b44e 724 MOVNTQ(%%mm6, 16(dst))\
99d2cb72 725\
6e1c66bc 726 "add $24, "#dst" \n\t"\
99d2cb72 727\
6e1c66bc
AJ
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
99d2cb72
MN
730 " jb 1b \n\t"
731
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

6e1c66bc 740#define REAL_WRITEYUY2(dst, dstw, index) \
25593e29
MN
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
748\
6542b44e
MN
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 751\
6e1c66bc
AJ
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
25593e29 754 " jb 1b \n\t"
6e1c66bc 755#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29
MN
756
757
77a49659 758static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
c1b0bfb4 759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
6542b44e 760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
38858470 761{
c1b0bfb4
MN
762#ifdef HAVE_MMX
763 if(uDest != NULL)
764 {
765 asm volatile(
77a49659
MN
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
bf0c1b62 768 "r" (uDest), "p" ((long)chrDstW)
6e1c66bc 769 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
770 );
771
772 asm volatile(
77a49659
MN
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
bf0c1b62 775 "r" (vDest), "p" ((long)chrDstW)
6e1c66bc 776 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
777 );
778 }
779
780 asm volatile(
77a49659
MN
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
bf0c1b62 783 "r" (dest), "p" ((long)dstW)
6e1c66bc 784 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
785 );
786#else
a2faa401
RD
787#ifdef HAVE_ALTIVEC
788yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
791#else //HAVE_ALTIVEC
5859233b 792yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
e3d2500f 793 chrFilter, chrSrc, chrFilterSize,
5859233b 794 dest, uDest, vDest, dstW, chrDstW);
a2faa401 795#endif //!HAVE_ALTIVEC
7630f2e0 796#endif
c1b0bfb4 797}
2add307d 798
6118e52e
VS
799static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
802{
803yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804 chrFilter, chrSrc, chrFilterSize,
805 dest, uDest, dstW, chrDstW, dstFormat);
806}
807
c1b0bfb4 808static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
e616aa93 809 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
c1b0bfb4
MN
810{
811#ifdef HAVE_MMX
812 if(uDest != NULL)
38858470 813 {
c1b0bfb4
MN
814 asm volatile(
815 YSCALEYUV2YV121
e616aa93 816 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
6e1c66bc
AJ
817 "g" ((long)-chrDstW)
818 : "%"REG_a
c1b0bfb4
MN
819 );
820
821 asm volatile(
822 YSCALEYUV2YV121
e616aa93 823 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
6e1c66bc
AJ
824 "g" ((long)-chrDstW)
825 : "%"REG_a
c1b0bfb4 826 );
38858470
MN
827 }
828
c1b0bfb4
MN
829 asm volatile(
830 YSCALEYUV2YV121
831 :: "r" (lumSrc + dstW), "r" (dest + dstW),
6e1c66bc
AJ
832 "g" ((long)-dstW)
833 : "%"REG_a
c1b0bfb4
MN
834 );
835#else
c1b0bfb4
MN
836 int i;
837 for(i=0; i<dstW; i++)
38858470 838 {
c1b0bfb4 839 int val= lumSrc[i]>>7;
44c1035c
MN
840
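		/* for the range produced by the >>7, bit 8 is set exactly when val lies outside 0..255 */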
841 if(val&256){
842 if(val<0) val=0;
843 else val=255;
844 }
c1b0bfb4 845
44c1035c 846 dest[i]= val;
c1b0bfb4
MN
847 }
848
849 if(uDest != NULL)
e616aa93 850 for(i=0; i<chrDstW; i++)
38858470 851 {
c1b0bfb4
MN
852 int u=chrSrc[i]>>7;
853 int v=chrSrc[i + 2048]>>7;
854
44c1035c
MN
855 if((u|v)&256){
856 if(u<0) u=0;
857 else if (u>255) u=255;
858 if(v<0) v=0;
859 else if (v>255) v=255;
860 }
861
862 uDest[i]= u;
863 vDest[i]= v;
38858470 864 }
c1b0bfb4 865#endif
38858470
MN
866}
867

/**
 * vertical scale YV12 to RGB
 */
25593e29 872static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
c1b0bfb4 873 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
77a49659 874 uint8_t *dest, int dstW, int dstY)
c1b0bfb4 875{
77a49659 876 int dummy=0;
cf7d1c1a 877 switch(c->dstFormat)
c1b0bfb4
MN
878 {
879#ifdef HAVE_MMX
cf7d1c1a 880 case IMGFMT_BGR32:
c1b0bfb4
MN
881 {
882 asm volatile(
883 YSCALEYUV2RGBX
6e1c66bc 884 WRITEBGR32(%4, %5, %%REGa)
c1b0bfb4 885
77a49659
MN
886 :: "r" (&c->redDither),
887 "m" (dummy), "m" (dummy), "m" (dummy),
888 "r" (dest), "m" (dstW)
6e1c66bc 889 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
890 );
891 }
cf7d1c1a
MN
892 break;
893 case IMGFMT_BGR24:
c1b0bfb4
MN
894 {
895 asm volatile(
896 YSCALEYUV2RGBX
6e1c66bc
AJ
897 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898 "add %4, %%"REG_b" \n\t"
899 WRITEBGR24(%%REGb, %5, %%REGa)
c1b0bfb4 900
77a49659
MN
901 :: "r" (&c->redDither),
902 "m" (dummy), "m" (dummy), "m" (dummy),
903 "r" (dest), "m" (dstW)
6e1c66bc 904 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
c1b0bfb4
MN
905 );
906 }
cf7d1c1a
MN
907 break;
908 case IMGFMT_BGR15:
c1b0bfb4
MN
909 {
910 asm volatile(
911 YSCALEYUV2RGBX
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
913#ifdef DITHER1XBPP
9b464428
FB
914 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
917#endif
918
6e1c66bc 919 WRITEBGR15(%4, %5, %%REGa)
c1b0bfb4 920
77a49659
MN
921 :: "r" (&c->redDither),
922 "m" (dummy), "m" (dummy), "m" (dummy),
923 "r" (dest), "m" (dstW)
6e1c66bc 924 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
925 );
926 }
cf7d1c1a
MN
927 break;
928 case IMGFMT_BGR16:
c1b0bfb4
MN
929 {
930 asm volatile(
931 YSCALEYUV2RGBX
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
933#ifdef DITHER1XBPP
9b464428
FB
934 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
937#endif
938
6e1c66bc 939 WRITEBGR16(%4, %5, %%REGa)
c1b0bfb4 940
77a49659
MN
941 :: "r" (&c->redDither),
942 "m" (dummy), "m" (dummy), "m" (dummy),
943 "r" (dest), "m" (dstW)
6e1c66bc 944 : "%"REG_a, "%"REG_d, "%"REG_S
c1b0bfb4
MN
945 );
946 }
cf7d1c1a 947 break;
25593e29
MN
948 case IMGFMT_YUY2:
949 {
950 asm volatile(
951 YSCALEYUV2PACKEDX
952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
953
954 "psraw $3, %%mm3 \n\t"
955 "psraw $3, %%mm4 \n\t"
956 "psraw $3, %%mm1 \n\t"
957 "psraw $3, %%mm7 \n\t"
6e1c66bc 958 WRITEYUY2(%4, %5, %%REGa)
25593e29 959
77a49659
MN
960 :: "r" (&c->redDither),
961 "m" (dummy), "m" (dummy), "m" (dummy),
962 "r" (dest), "m" (dstW)
6e1c66bc 963 : "%"REG_a, "%"REG_d, "%"REG_S
25593e29
MN
964 );
965 }
966 break;
c1b0bfb4 967#endif
cf7d1c1a 968 default:
a31de956
MN
969#ifdef HAVE_ALTIVEC
970 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, dstW, dstY);
973#else
25593e29 974 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
cf7d1c1a
MN
975 chrFilter, chrSrc, chrFilterSize,
976 dest, dstW, dstY);
a31de956 977#endif
cf7d1c1a
MN
978 break;
979 }
c1b0bfb4
MN
980}
981
/**
 * vertical bilinear scale YV12 to RGB
 */
25593e29 985static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
cf7d1c1a 986 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9
MN
987{
988 int yalpha1=yalpha^4095;
989 int uvalpha1=uvalpha^4095;
cf7d1c1a 990 int i;
d604bab9 991
#if 0 // isn't used
1e621b18 993 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 994 {
cf7d1c1a 995 switch(dstFormat)
d604bab9 996 {
cf7d1c1a
MN
997#ifdef HAVE_MMX
998 case IMGFMT_BGR32:
d604bab9
MN
999 asm volatile(
1000
1001
1002FULL_YSCALEYUV2RGB
1003 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1004 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1005
1006 "movq %%mm3, %%mm1 \n\t"
1007 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1008 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1009
6e1c66bc
AJ
1010 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
d604bab9 1012
6e1c66bc
AJ
1013 "add $4, %%"REG_a" \n\t"
1014 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1015 " jb 1b \n\t"
1016
1017
6e1c66bc 1018 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
d604bab9 1019 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1020 : "%"REG_a
d604bab9 1021 );
cf7d1c1a
MN
1022 break;
1023 case IMGFMT_BGR24:
d604bab9
MN
1024 asm volatile(
1025
1026FULL_YSCALEYUV2RGB
1027
1028 // lsb ... msb
1029 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1030 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1031
1032 "movq %%mm3, %%mm1 \n\t"
1033 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1034 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1035
1036 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1037 "psrlq $8, %%mm3 \n\t" // GR0BGR00
9b464428
FB
1038 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
d604bab9
MN
1040 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1041 "movq %%mm1, %%mm2 \n\t"
1042 "psllq $48, %%mm1 \n\t" // 000000BG
1043 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1044
1045 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1046 "psrld $16, %%mm2 \n\t" // R000R000
1047 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1048 "por %%mm2, %%mm1 \n\t" // RBGRR000
1049
6e1c66bc
AJ
1050 "mov %4, %%"REG_b" \n\t"
1051 "add %%"REG_a", %%"REG_b" \n\t"
d604bab9
MN
1052
1053#ifdef HAVE_MMX2
1054 //FIXME Alignment
6e1c66bc
AJ
1055 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
d604bab9 1057#else
6e1c66bc 1058 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1059 "psrlq $32, %%mm3 \n\t"
6e1c66bc
AJ
1060 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
d604bab9 1062#endif
6e1c66bc
AJ
1063 "add $4, %%"REG_a" \n\t"
1064 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1065 " jb 1b \n\t"
1066
d1fac6cf 1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9 1068 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1069 : "%"REG_a, "%"REG_b
d604bab9 1070 );
cf7d1c1a
MN
1071 break;
1072 case IMGFMT_BGR15:
d604bab9
MN
1073 asm volatile(
1074
1075FULL_YSCALEYUV2RGB
1076#ifdef DITHER1XBPP
9b464428
FB
1077 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
1080#endif
1081 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1082 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1083 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1084
1085 "psrlw $3, %%mm3 \n\t"
1086 "psllw $2, %%mm1 \n\t"
1087 "psllw $7, %%mm0 \n\t"
9b464428
FB
1088 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1089 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9
MN
1090
1091 "por %%mm3, %%mm1 \n\t"
1092 "por %%mm1, %%mm0 \n\t"
1093
6e1c66bc 1094 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1095
6e1c66bc
AJ
1096 "add $4, %%"REG_a" \n\t"
1097 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1098 " jb 1b \n\t"
1099
d1fac6cf 1100 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9 1101 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1102 : "%"REG_a
d604bab9 1103 );
cf7d1c1a
MN
1104 break;
1105 case IMGFMT_BGR16:
d604bab9
MN
1106 asm volatile(
1107
1108FULL_YSCALEYUV2RGB
1109#ifdef DITHER1XBPP
9b464428
FB
1110 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
1113#endif
1114 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1115 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1116 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1117
1118 "psrlw $3, %%mm3 \n\t"
1119 "psllw $3, %%mm1 \n\t"
1120 "psllw $8, %%mm0 \n\t"
9b464428
FB
1121 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1122 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9
MN
1123
1124 "por %%mm3, %%mm1 \n\t"
1125 "por %%mm1, %%mm0 \n\t"
1126
6e1c66bc 1127 MOVNTQ(%%mm0, (%4, %%REGa, 2))
d604bab9 1128
6e1c66bc
AJ
1129 "add $4, %%"REG_a" \n\t"
1130 "cmp %5, %%"REG_a" \n\t"
d604bab9
MN
1131 " jb 1b \n\t"
1132
d1fac6cf 1133 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9 1134 "m" (yalpha1), "m" (uvalpha1)
6e1c66bc 1135 : "%"REG_a
d604bab9 1136 );
cf7d1c1a
MN
1137 break;
1138#endif
1139 case IMGFMT_RGB32:
1140#ifndef HAVE_MMX
1141 case IMGFMT_BGR32:
1142#endif
28bf81c9
MN
1143 if(dstFormat==IMGFMT_BGR32)
1144 {
2ba1bff0 1145 int i;
df3c183a
MN
1146#ifdef WORDS_BIGENDIAN
1147 dest++;
1148#endif
28bf81c9
MN
1149 for(i=0;i<dstW;i++){
1150 // vertical linear interpolation && yuv2rgb in a single step:
1151 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1157 dest+= 4;
1158 }
1159 }
1160 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1161 {
96034638 1162 int i;
d1fac6cf 1163 for(i=0;i<dstW;i++){
d604bab9
MN
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
390b20a6
MN
1168 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
28bf81c9 1171 dest+= 3;
d604bab9
MN
1172 }
1173 }
28bf81c9 1174 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1175 {
96034638 1176 int i;
d1fac6cf 1177 for(i=0;i<dstW;i++){
d604bab9
MN
1178 // vertical linear interpolation && yuv2rgb in a single step:
1179 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1182
d022ce5c 1183 ((uint16_t*)dest)[i] =
b18ea156
MN
1184 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186 clip_table16r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1187 }
1188 }
28bf81c9 1189 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1190 {
96034638 1191 int i;
d1fac6cf 1192 for(i=0;i<dstW;i++){
d604bab9
MN
1193 // vertical linear interpolation && yuv2rgb in a single step:
1194 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1197
d022ce5c 1198 ((uint16_t*)dest)[i] =
b18ea156
MN
1199 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201 clip_table15r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1202 }
1203 }
d604bab9
MN
1204 }//FULL_UV_IPOL
1205 else
1206 {
cf7d1c1a 1207#endif // if 0
d604bab9 1208#ifdef HAVE_MMX
cf7d1c1a
MN
1209 switch(c->dstFormat)
1210 {
// Note: 8280 == DSTW_OFFSET, but the preprocessor can't handle that there :(
cf7d1c1a 1212 case IMGFMT_BGR32:
d604bab9 1213 asm volatile(
6e1c66bc
AJ
1214 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1215 "mov %4, %%"REG_SP" \n\t"
1216 YSCALEYUV2RGB(%%REGa, %5)
1217 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
6542b44e
MN
1219
1220 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1221 "r" (&c->redDither)
6e1c66bc 1222 : "%"REG_a
d604bab9 1223 );
cf7d1c1a
MN
1224 return;
1225 case IMGFMT_BGR24:
d604bab9 1226 asm volatile(
6e1c66bc
AJ
1227 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1228 "mov %4, %%"REG_SP" \n\t"
1229 YSCALEYUV2RGB(%%REGa, %5)
1230 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
6542b44e
MN
1232 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1233 "r" (&c->redDither)
6e1c66bc 1234 : "%"REG_a
d604bab9 1235 );
cf7d1c1a
MN
1236 return;
1237 case IMGFMT_BGR15:
d604bab9 1238 asm volatile(
6e1c66bc
AJ
1239 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1240 "mov %4, %%"REG_SP" \n\t"
1241 YSCALEYUV2RGB(%%REGa, %5)
d604bab9
MN
1242 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1243#ifdef DITHER1XBPP
9b464428
FB
1244 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1247#endif
1248
6e1c66bc
AJ
1249 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
d604bab9 1251
6542b44e
MN
1252 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1253 "r" (&c->redDither)
6e1c66bc 1254 : "%"REG_a
d604bab9 1255 );
cf7d1c1a
MN
1256 return;
1257 case IMGFMT_BGR16:
d604bab9 1258 asm volatile(
6e1c66bc
AJ
1259 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1260 "mov %4, %%"REG_SP" \n\t"
1261 YSCALEYUV2RGB(%%REGa, %5)
d604bab9
MN
1262 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1263#ifdef DITHER1XBPP
9b464428
FB
1264 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1267#endif
1268
6e1c66bc
AJ
1269 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
6542b44e
MN
1271 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1272 "r" (&c->redDither)
6e1c66bc 1273 : "%"REG_a
d604bab9 1274 );
cf7d1c1a 1275 return;
25593e29
MN
1276 case IMGFMT_YUY2:
1277 asm volatile(
6e1c66bc
AJ
1278 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1279 "mov %4, %%"REG_SP" \n\t"
1280 YSCALEYUV2PACKED(%%REGa, %5)
1281 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
6542b44e
MN
1283 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1284 "r" (&c->redDither)
6e1c66bc 1285 : "%"REG_a
25593e29
MN
1286 );
1287 return;
cf7d1c1a
MN
1288 default: break;
1289 }
1290#endif //HAVE_MMX
25593e29 1291YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
d604bab9
MN
1292}

/**
 * YV12 to RGB without scaling or interpolating
 */
25593e29 1297static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
cf7d1c1a 1298 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1299{
c1b0bfb4 1300 const int yalpha1=0;
cf7d1c1a
MN
1301 int i;
1302
1303 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1304 const int yalpha= 4096; //FIXME ...
96034638 1305
1e621b18 1306 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1307 {
25593e29 1308 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
d604bab9
MN
1309 return;
1310 }
397c035e
MN
1311
1312#ifdef HAVE_MMX
497d4f99
MN
	if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
1314 {
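	// this branch uses only uvbuf0 (YSCALEYUV2RGB1); the else branch averages
	// uvbuf0 and uvbuf1 (YSCALEYUV2RGB1b)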
cf7d1c1a 1315 switch(dstFormat)
d604bab9 1316 {
cf7d1c1a 1317 case IMGFMT_BGR32:
d604bab9 1318 asm volatile(
6e1c66bc
AJ
1319 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1320 "mov %4, %%"REG_SP" \n\t"
1321 YSCALEYUV2RGB1(%%REGa, %5)
1322 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1323 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1324
1325 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1326 "r" (&c->redDither)
6e1c66bc 1327 : "%"REG_a
d604bab9 1328 );
cf7d1c1a
MN
1329 return;
1330 case IMGFMT_BGR24:
d604bab9 1331 asm volatile(
6e1c66bc
AJ
1332 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1333 "mov %4, %%"REG_SP" \n\t"
1334 YSCALEYUV2RGB1(%%REGa, %5)
1335 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1336 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1337
1338 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1339 "r" (&c->redDither)
6e1c66bc 1340 : "%"REG_a
d604bab9 1341 );
cf7d1c1a
MN
1342 return;
1343 case IMGFMT_BGR15:
d604bab9 1344 asm volatile(
6e1c66bc
AJ
1345 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1346 "mov %4, %%"REG_SP" \n\t"
1347 YSCALEYUV2RGB1(%%REGa, %5)
d604bab9
MN
1348 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1349#ifdef DITHER1XBPP
9b464428
FB
1350 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1351 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1352 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9 1353#endif
6e1c66bc
AJ
1354 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1355 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1356
1357 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1358 "r" (&c->redDither)
6e1c66bc 1359 : "%"REG_a
d604bab9 1360 );
cf7d1c1a
MN
1361 return;
1362 case IMGFMT_BGR16:
d604bab9 1363 asm volatile(
6e1c66bc
AJ
1364 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1365 "mov %4, %%"REG_SP" \n\t"
1366 YSCALEYUV2RGB1(%%REGa, %5)
d604bab9
MN
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368#ifdef DITHER1XBPP
9b464428
FB
1369 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1371 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1372#endif
1373
6e1c66bc
AJ
1374 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1375 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1376
1377 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1378 "r" (&c->redDither)
6e1c66bc 1379 : "%"REG_a
d604bab9 1380 );
cf7d1c1a 1381 return;
25593e29
MN
1382 case IMGFMT_YUY2:
1383 asm volatile(
6e1c66bc
AJ
1384 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_SP" \n\t"
1386 YSCALEYUV2PACKED1(%%REGa, %5)
1387 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1388 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1389
1390 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1391 "r" (&c->redDither)
6e1c66bc 1392 : "%"REG_a
25593e29
MN
1393 );
1394 return;
d604bab9 1395 }
497d4f99
MN
1396 }
1397 else
1398 {
cf7d1c1a 1399 switch(dstFormat)
d604bab9 1400 {
cf7d1c1a 1401 case IMGFMT_BGR32:
497d4f99 1402 asm volatile(
6e1c66bc
AJ
1403 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1404 "mov %4, %%"REG_SP" \n\t"
1405 YSCALEYUV2RGB1b(%%REGa, %5)
1406 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1407 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1408
1409 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1410 "r" (&c->redDither)
6e1c66bc 1411 : "%"REG_a
497d4f99 1412 );
cf7d1c1a
MN
1413 return;
1414 case IMGFMT_BGR24:
497d4f99 1415 asm volatile(
6e1c66bc
AJ
1416 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1417 "mov %4, %%"REG_SP" \n\t"
1418 YSCALEYUV2RGB1b(%%REGa, %5)
1419 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1420 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1421
1422 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1423 "r" (&c->redDither)
6e1c66bc 1424 : "%"REG_a
497d4f99 1425 );
cf7d1c1a
MN
1426 return;
1427 case IMGFMT_BGR15:
497d4f99 1428 asm volatile(
6e1c66bc
AJ
1429 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_SP" \n\t"
1431 YSCALEYUV2RGB1b(%%REGa, %5)
497d4f99
MN
1432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1433#ifdef DITHER1XBPP
9b464428
FB
1434 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1435 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1436 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1437#endif
6e1c66bc
AJ
1438 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1439 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1440
1441 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1442 "r" (&c->redDither)
6e1c66bc 1443 : "%"REG_a
497d4f99 1444 );
cf7d1c1a
MN
1445 return;
1446 case IMGFMT_BGR16:
497d4f99 1447 asm volatile(
6e1c66bc
AJ
1448 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1449 "mov %4, %%"REG_SP" \n\t"
1450 YSCALEYUV2RGB1b(%%REGa, %5)
497d4f99
MN
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1452#ifdef DITHER1XBPP
9b464428
FB
1453 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1454 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1455 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1456#endif
d604bab9 1457
6e1c66bc
AJ
1458 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1459 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1460
1461 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1462 "r" (&c->redDither)
6e1c66bc 1463 : "%"REG_a
497d4f99 1464 );
cf7d1c1a 1465 return;
25593e29
MN
1466 case IMGFMT_YUY2:
1467 asm volatile(
6e1c66bc
AJ
1468 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_SP" \n\t"
1470 YSCALEYUV2PACKED1b(%%REGa, %5)
1471 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1472 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
e54d94ba
MN
1473
1474 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1475 "r" (&c->redDither)
6e1c66bc 1476 : "%"REG_a
25593e29
MN
1477 );
1478 return;
d604bab9 1479 }
497d4f99 1480 }
df3c183a 1481#endif
cf7d1c1a 1482 if( uvalpha < 2048 )
497d4f99 1483 {
25593e29 1484 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
cf7d1c1a 1485 }else{
25593e29 1486 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
497d4f99 1487 }
d604bab9
MN
1488}
1489
//FIXME yuy2* can read up to 7 samples too many

1e621b18
MN
1492static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1493{
6ff0ad6b
MN
1494#ifdef HAVE_MMX
1495 asm volatile(
1496 "movq "MANGLE(bm01010101)", %%mm2\n\t"
6e1c66bc 1497 "mov %0, %%"REG_a" \n\t"
6ff0ad6b 1498 "1: \n\t"
6e1c66bc
AJ
1499 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1500 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
6ff0ad6b
MN
1501 "pand %%mm2, %%mm0 \n\t"
1502 "pand %%mm2, %%mm1 \n\t"
1503 "packuswb %%mm1, %%mm0 \n\t"
6e1c66bc
AJ
1504 "movq %%mm0, (%2, %%"REG_a") \n\t"
1505 "add $8, %%"REG_a" \n\t"
6ff0ad6b 1506 " js 1b \n\t"
6e1c66bc
AJ
1507 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1508 : "%"REG_a
6ff0ad6b 1509 );
1e621b18
MN
1510#else
1511 int i;
1512 for(i=0; i<width; i++)
1513 dst[i]= src[2*i];
1514#endif
1515}
1516
1517static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1518{
6ff0ad6b
MN
1519#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1520 asm volatile(
1521 "movq "MANGLE(bm01010101)", %%mm4\n\t"
6e1c66bc 1522 "mov %0, %%"REG_a" \n\t"
6ff0ad6b 1523 "1: \n\t"
6e1c66bc
AJ
1524 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1525 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1526 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1527 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
6ff0ad6b
MN
1528 PAVGB(%%mm2, %%mm0)
1529 PAVGB(%%mm3, %%mm1)
1530 "psrlw $8, %%mm0 \n\t"
1531 "psrlw $8, %%mm1 \n\t"
1532 "packuswb %%mm1, %%mm0 \n\t"
1533 "movq %%mm0, %%mm1 \n\t"
1534 "psrlw $8, %%mm0 \n\t"
1535 "pand %%mm4, %%mm1 \n\t"
1536 "packuswb %%mm0, %%mm0 \n\t"
1537 "packuswb %%mm1, %%mm1 \n\t"
6e1c66bc
AJ
1538 "movd %%mm0, (%4, %%"REG_a") \n\t"
1539 "movd %%mm1, (%3, %%"REG_a") \n\t"
1540 "add $4, %%"REG_a" \n\t"
6ff0ad6b 1541 " js 1b \n\t"
6e1c66bc
AJ
1542 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1543 : "%"REG_a
6ff0ad6b 1544 );
1e621b18
MN
1545#else
1546 int i;
1547 for(i=0; i<width; i++)
1548 {
1549 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1550 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1551 }
1552#endif
1553}
1554
7322a67c
MN
1555//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1556static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
1557{
1558#ifdef HAVE_MMX
1559 asm volatile(
6e1c66bc 1560 "mov %0, %%"REG_a" \n\t"
7322a67c 1561 "1: \n\t"
6e1c66bc
AJ
1562 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1563 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
7322a67c
MN
1564 "psrlw $8, %%mm0 \n\t"
1565 "psrlw $8, %%mm1 \n\t"
1566 "packuswb %%mm1, %%mm0 \n\t"
6e1c66bc
AJ
1567 "movq %%mm0, (%2, %%"REG_a") \n\t"
1568 "add $8, %%"REG_a" \n\t"
7322a67c 1569 " js 1b \n\t"
6e1c66bc
AJ
1570 : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1571 : "%"REG_a
7322a67c
MN
1572 );
1573#else
1574 int i;
1575 for(i=0; i<width; i++)
1576 dst[i]= src[2*i+1];
1577#endif
1578}
1579
1580static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1581{
1582#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583 asm volatile(
1584 "movq "MANGLE(bm01010101)", %%mm4\n\t"
6e1c66bc 1585 "mov %0, %%"REG_a" \n\t"
7322a67c 1586 "1: \n\t"
6e1c66bc
AJ
1587 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1588 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1589 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1590 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
7322a67c
MN
1591 PAVGB(%%mm2, %%mm0)
1592 PAVGB(%%mm3, %%mm1)
1593 "pand %%mm4, %%mm0 \n\t"
1594 "pand %%mm4, %%mm1 \n\t"
1595 "packuswb %%mm1, %%mm0 \n\t"
1596 "movq %%mm0, %%mm1 \n\t"
1597 "psrlw $8, %%mm0 \n\t"
1598 "pand %%mm4, %%mm1 \n\t"
1599 "packuswb %%mm0, %%mm0 \n\t"
1600 "packuswb %%mm1, %%mm1 \n\t"
6e1c66bc
AJ
1601 "movd %%mm0, (%4, %%"REG_a") \n\t"
1602 "movd %%mm1, (%3, %%"REG_a") \n\t"
1603 "add $4, %%"REG_a" \n\t"
7322a67c 1604 " js 1b \n\t"
6e1c66bc
AJ
1605 : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1606 : "%"REG_a
7322a67c
MN
1607 );
1608#else
1609 int i;
1610 for(i=0; i<width; i++)
1611 {
1612 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1613 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1614 }
1615#endif
1616}
1617
1e621b18
MN
1618static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1619{
1e621b18
MN
1620 int i;
1621 for(i=0; i<width; i++)
1622 {
4e61e21c
MN
1623 int b= ((uint32_t*)src)[i]&0xFF;
1624 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 1625 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1e621b18 1626
4e61e21c 1627 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1e621b18 1628 }
1e621b18
MN
1629}
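/* A note on the rounding constant used by the packed-RGB luma converters in this file:
   33<<(RGB2YUV_SHIFT-1) == (16<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-1)), i.e. the +16
   black-level offset and a +0.5 rounding term folded into one addend, so the single
   shift yields Y = round((RY*r + GY*g + BY*b) / 2^RGB2YUV_SHIFT) + 16. */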
1630
1631static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1632{
1e621b18
MN
1633 int i;
1634 for(i=0; i<width; i++)
1635 {
4e61e21c
MN
1636 const int a= ((uint32_t*)src1)[2*i+0];
1637 const int e= ((uint32_t*)src1)[2*i+1];
1638 const int c= ((uint32_t*)src2)[2*i+0];
1639 const int d= ((uint32_t*)src2)[2*i+1];
1640 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1641 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1642 const int b= l&0x3FF;
1643 const int g= h>>8;
1644 const int r= l>>16;
1e621b18
MN
1645
1646 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1647 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1648 }
1e621b18
MN
1649}
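/* For readability, a plain per-channel sketch of what the packed 0xFF00FF/0x00FF00
   summing above computes (the _ref name is only illustrative; the block is kept under
   #if 0 so it is never compiled): */
#if 0
static inline void bgr32ToUV_ref(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int b=0, g=0, r=0, j;
		for(j=0; j<2; j++) // 2x2 box: two neighbouring pixels from each of the two lines
		{
			uint32_t p1= ((uint32_t*)src1)[2*i+j];
			uint32_t p2= ((uint32_t*)src2)[2*i+j];
			b += ( p1     &0xFF) + ( p2     &0xFF);
			g += ((p1>> 8)&0xFF) + ((p2>> 8)&0xFF);
			r += ((p1>>16)&0xFF) + ((p2>>16)&0xFF);
		}
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}
#endif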
1650
1651static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1652{
ac6a2e45
MN
1653#ifdef HAVE_MMX
1654 asm volatile(
6e1c66bc 1655 "mov %2, %%"REG_a" \n\t"
854288bb
FB
1656 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1657 "movq "MANGLE(w1111)", %%mm5 \n\t"
ac6a2e45 1658 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc 1659 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
ac6a2e45
MN
1660 ".balign 16 \n\t"
1661 "1: \n\t"
6e1c66bc
AJ
1662 PREFETCH" 64(%0, %%"REG_b") \n\t"
1663 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1664 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
ac6a2e45
MN
1665 "punpcklbw %%mm7, %%mm0 \n\t"
1666 "punpcklbw %%mm7, %%mm1 \n\t"
6e1c66bc
AJ
1667 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1668 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
ac6a2e45
MN
1669 "punpcklbw %%mm7, %%mm2 \n\t"
1670 "punpcklbw %%mm7, %%mm3 \n\t"
1671 "pmaddwd %%mm6, %%mm0 \n\t"
1672 "pmaddwd %%mm6, %%mm1 \n\t"
1673 "pmaddwd %%mm6, %%mm2 \n\t"
1674 "pmaddwd %%mm6, %%mm3 \n\t"
1675#ifndef FAST_BGR2YV12
1676 "psrad $8, %%mm0 \n\t"
1677 "psrad $8, %%mm1 \n\t"
1678 "psrad $8, %%mm2 \n\t"
1679 "psrad $8, %%mm3 \n\t"
1680#endif
1681 "packssdw %%mm1, %%mm0 \n\t"
1682 "packssdw %%mm3, %%mm2 \n\t"
1683 "pmaddwd %%mm5, %%mm0 \n\t"
1684 "pmaddwd %%mm5, %%mm2 \n\t"
1685 "packssdw %%mm2, %%mm0 \n\t"
1686 "psraw $7, %%mm0 \n\t"
1687
6e1c66bc
AJ
1688 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1689 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
ac6a2e45
MN
1690 "punpcklbw %%mm7, %%mm4 \n\t"
1691 "punpcklbw %%mm7, %%mm1 \n\t"
6e1c66bc
AJ
1692 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1693 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
ac6a2e45
MN
1694 "punpcklbw %%mm7, %%mm2 \n\t"
1695 "punpcklbw %%mm7, %%mm3 \n\t"
1696 "pmaddwd %%mm6, %%mm4 \n\t"
1697 "pmaddwd %%mm6, %%mm1 \n\t"
1698 "pmaddwd %%mm6, %%mm2 \n\t"
1699 "pmaddwd %%mm6, %%mm3 \n\t"
1700#ifndef FAST_BGR2YV12
1701 "psrad $8, %%mm4 \n\t"
1702 "psrad $8, %%mm1 \n\t"
1703 "psrad $8, %%mm2 \n\t"
1704 "psrad $8, %%mm3 \n\t"
1705#endif
1706 "packssdw %%mm1, %%mm4 \n\t"
1707 "packssdw %%mm3, %%mm2 \n\t"
1708 "pmaddwd %%mm5, %%mm4 \n\t"
1709 "pmaddwd %%mm5, %%mm2 \n\t"
6e1c66bc 1710 "add $24, %%"REG_b" \n\t"
ac6a2e45
MN
1711 "packssdw %%mm2, %%mm4 \n\t"
1712 "psraw $7, %%mm4 \n\t"
1713
1714 "packuswb %%mm4, %%mm0 \n\t"
854288bb 1715 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
ac6a2e45 1716
6e1c66bc
AJ
1717 "movq %%mm0, (%1, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
ac6a2e45 1719 " js 1b \n\t"
6e1c66bc
AJ
1720 : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1721 : "%"REG_a, "%"REG_b
ac6a2e45 1722 );
1e621b18
MN
1723#else
1724 int i;
1725 for(i=0; i<width; i++)
1726 {
1727 int b= src[i*3+0];
1728 int g= src[i*3+1];
1729 int r= src[i*3+2];
1730
9902f4e2 1731 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1e621b18
MN
1732 }
1733#endif
1734}
1735
1736static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1737{
4342fc14
MN
1738#ifdef HAVE_MMX
1739 asm volatile(
6e1c66bc 1740 "mov %4, %%"REG_a" \n\t"
854288bb
FB
1741 "movq "MANGLE(w1111)", %%mm5 \n\t"
1742 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
4342fc14 1743 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
1744 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1745 "add %%"REG_b", %%"REG_b" \n\t"
4342fc14
MN
1746 ".balign 16 \n\t"
1747 "1: \n\t"
6e1c66bc
AJ
1748 PREFETCH" 64(%0, %%"REG_b") \n\t"
1749 PREFETCH" 64(%1, %%"REG_b") \n\t"
4342fc14 1750#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
6e1c66bc
AJ
1751 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1752 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1753 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1754 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1755 PAVGB(%%mm1, %%mm0)
1756 PAVGB(%%mm3, %%mm2)
1757 "movq %%mm0, %%mm1 \n\t"
1758 "movq %%mm2, %%mm3 \n\t"
1759 "psrlq $24, %%mm0 \n\t"
1760 "psrlq $24, %%mm2 \n\t"
1761 PAVGB(%%mm1, %%mm0)
1762 PAVGB(%%mm3, %%mm2)
1763 "punpcklbw %%mm7, %%mm0 \n\t"
1764 "punpcklbw %%mm7, %%mm2 \n\t"
1765#else
6e1c66bc
AJ
1766 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1767 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1768 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1769 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1770 "punpcklbw %%mm7, %%mm0 \n\t"
1771 "punpcklbw %%mm7, %%mm1 \n\t"
1772 "punpcklbw %%mm7, %%mm2 \n\t"
1773 "punpcklbw %%mm7, %%mm3 \n\t"
1774 "paddw %%mm1, %%mm0 \n\t"
1775 "paddw %%mm3, %%mm2 \n\t"
1776 "paddw %%mm2, %%mm0 \n\t"
6e1c66bc
AJ
1777 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1778 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1779 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1780 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1781 "punpcklbw %%mm7, %%mm4 \n\t"
1782 "punpcklbw %%mm7, %%mm1 \n\t"
1783 "punpcklbw %%mm7, %%mm2 \n\t"
1784 "punpcklbw %%mm7, %%mm3 \n\t"
1785 "paddw %%mm1, %%mm4 \n\t"
1786 "paddw %%mm3, %%mm2 \n\t"
1787 "paddw %%mm4, %%mm2 \n\t"
1788 "psrlw $2, %%mm0 \n\t"
1789 "psrlw $2, %%mm2 \n\t"
1790#endif
854288bb
FB
1791 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1792 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1793
1794 "pmaddwd %%mm0, %%mm1 \n\t"
1795 "pmaddwd %%mm2, %%mm3 \n\t"
1796 "pmaddwd %%mm6, %%mm0 \n\t"
1797 "pmaddwd %%mm6, %%mm2 \n\t"
1798#ifndef FAST_BGR2YV12
1799 "psrad $8, %%mm0 \n\t"
1800 "psrad $8, %%mm1 \n\t"
1801 "psrad $8, %%mm2 \n\t"
1802 "psrad $8, %%mm3 \n\t"
1803#endif
1804 "packssdw %%mm2, %%mm0 \n\t"
1805 "packssdw %%mm3, %%mm1 \n\t"
1806 "pmaddwd %%mm5, %%mm0 \n\t"
1807 "pmaddwd %%mm5, %%mm1 \n\t"
1808 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1809 "psraw $7, %%mm0 \n\t"
1810
1811#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
6e1c66bc
AJ
1812 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1813 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1814 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1815 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1816 PAVGB(%%mm1, %%mm4)
1817 PAVGB(%%mm3, %%mm2)
1818 "movq %%mm4, %%mm1 \n\t"
1819 "movq %%mm2, %%mm3 \n\t"
1820 "psrlq $24, %%mm4 \n\t"
1821 "psrlq $24, %%mm2 \n\t"
1822 PAVGB(%%mm1, %%mm4)
1823 PAVGB(%%mm3, %%mm2)
1824 "punpcklbw %%mm7, %%mm4 \n\t"
1825 "punpcklbw %%mm7, %%mm2 \n\t"
1826#else
6e1c66bc
AJ
1827 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1828 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1829 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1830 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "punpcklbw %%mm7, %%mm1 \n\t"
1833 "punpcklbw %%mm7, %%mm2 \n\t"
1834 "punpcklbw %%mm7, %%mm3 \n\t"
1835 "paddw %%mm1, %%mm4 \n\t"
1836 "paddw %%mm3, %%mm2 \n\t"
1837 "paddw %%mm2, %%mm4 \n\t"
6e1c66bc
AJ
1838 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1839 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1840 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1841 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
4342fc14
MN
1842 "punpcklbw %%mm7, %%mm5 \n\t"
1843 "punpcklbw %%mm7, %%mm1 \n\t"
1844 "punpcklbw %%mm7, %%mm2 \n\t"
1845 "punpcklbw %%mm7, %%mm3 \n\t"
1846 "paddw %%mm1, %%mm5 \n\t"
1847 "paddw %%mm3, %%mm2 \n\t"
1848 "paddw %%mm5, %%mm2 \n\t"
854288bb 1849 "movq "MANGLE(w1111)", %%mm5 \n\t"
4342fc14
MN
1850 "psrlw $2, %%mm4 \n\t"
1851 "psrlw $2, %%mm2 \n\t"
1852#endif
854288bb
FB
1853 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1854 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1855
1856 "pmaddwd %%mm4, %%mm1 \n\t"
1857 "pmaddwd %%mm2, %%mm3 \n\t"
1858 "pmaddwd %%mm6, %%mm4 \n\t"
1859 "pmaddwd %%mm6, %%mm2 \n\t"
1860#ifndef FAST_BGR2YV12
1861 "psrad $8, %%mm4 \n\t"
1862 "psrad $8, %%mm1 \n\t"
1863 "psrad $8, %%mm2 \n\t"
1864 "psrad $8, %%mm3 \n\t"
1865#endif
1866 "packssdw %%mm2, %%mm4 \n\t"
1867 "packssdw %%mm3, %%mm1 \n\t"
1868 "pmaddwd %%mm5, %%mm4 \n\t"
1869 "pmaddwd %%mm5, %%mm1 \n\t"
6e1c66bc 1870 "add $24, %%"REG_b" \n\t"
4342fc14
MN
1871 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1872 "psraw $7, %%mm4 \n\t"
1873
1874 "movq %%mm0, %%mm1 \n\t"
1875 "punpckldq %%mm4, %%mm0 \n\t"
1876 "punpckhdq %%mm4, %%mm1 \n\t"
1877 "packsswb %%mm1, %%mm0 \n\t"
854288bb 1878 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
4342fc14 1879
6e1c66bc 1880 "movd %%mm0, (%2, %%"REG_a") \n\t"
4342fc14 1881 "punpckhdq %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
1882 "movd %%mm0, (%3, %%"REG_a") \n\t"
1883 "add $4, %%"REG_a" \n\t"
4342fc14 1884 " js 1b \n\t"
6e1c66bc
AJ
1885 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1886 : "%"REG_a, "%"REG_b
4342fc14 1887 );
1e621b18
MN
1888#else
1889 int i;
1890 for(i=0; i<width; i++)
1891 {
1892 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1894 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1895
1896 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1897 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1898 }
1899#endif
1900}
1901
6af250ea
MN
1902static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1903{
1904 int i;
1905 for(i=0; i<width; i++)
1906 {
4e61e21c 1907 int d= ((uint16_t*)src)[i];
6af250ea
MN
1908 int b= d&0x1F;
1909 int g= (d>>5)&0x3F;
1910 int r= (d>>11)&0x1F;
1911
1912 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1913 }
1914}
1915
1916static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1917{
1918 int i;
1919 for(i=0; i<width; i++)
1920 {
4e61e21c
MN
1921 int d0= ((uint32_t*)src1)[i];
1922 int d1= ((uint32_t*)src2)[i];
5bb9d9d8
MN
1923
1924 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1925 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1926
1927 int dh2= (dh>>11) + (dh<<21);
1928 int d= dh2 + dl;
1929
1930 int b= d&0x7F;
1931 int r= (d>>11)&0x7F;
1932 int g= d>>21;
6af250ea
MN
1933 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1934 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1935 }
1936}
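/* The dl/dh bit gymnastics above just sum the 5/6/5-bit channels of a 2x2 box without
   unpacking each pixel; an equivalent but slower sketch (illustrative only, kept under
   #if 0 and never compiled): */
#if 0
static inline void bgr16ToUV_ref(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int b=0, g=0, r=0, j;
		for(j=0; j<2; j++) // 2x2 box: two neighbouring pixels from each of the two lines
		{
			int p1= ((uint16_t*)src1)[2*i+j];
			int p2= ((uint16_t*)src2)[2*i+j];
			b += ( p1     &0x1F) + ( p2     &0x1F);
			g += ((p1>> 5)&0x3F) + ((p2>> 5)&0x3F);
			r += ((p1>>11)&0x1F) + ((p2>>11)&0x1F);
		}
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
#endif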
1937
b72034dd
MN
1938static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1939{
1940 int i;
1941 for(i=0; i<width; i++)
1942 {
4e61e21c 1943 int d= ((uint16_t*)src)[i];
b72034dd
MN
1944 int b= d&0x1F;
1945 int g= (d>>5)&0x1F;
1946 int r= (d>>10)&0x1F;
1947
1948 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1949 }
1950}
1951
1952static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1953{
1954 int i;
1955 for(i=0; i<width; i++)
1956 {
4e61e21c
MN
1957 int d0= ((uint32_t*)src1)[i];
1958 int d1= ((uint32_t*)src2)[i];
b72034dd
MN
1959
1960 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1961 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1962
1963 int dh2= (dh>>11) + (dh<<21);
1964 int d= dh2 + dl;
1965
1966 int b= d&0x7F;
1967 int r= (d>>10)&0x7F;
1968 int g= d>>21;
b72034dd
MN
1969 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1970 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1971 }
1972}
1973
1974
a861d4d7
MN
1975static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1976{
1977 int i;
1978 for(i=0; i<width; i++)
1979 {
4e61e21c
MN
1980 int r= ((uint32_t*)src)[i]&0xFF;
1981 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 1982 int b= (((uint32_t*)src)[i]>>16)&0xFF;
a861d4d7 1983
4e61e21c 1984 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
1985 }
1986}
1987
1988static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1989{
1990 int i;
1991 for(i=0; i<width; i++)
1992 {
4e61e21c
MN
1993 const int a= ((uint32_t*)src1)[2*i+0];
1994 const int e= ((uint32_t*)src1)[2*i+1];
1995 const int c= ((uint32_t*)src2)[2*i+0];
1996 const int d= ((uint32_t*)src2)[2*i+1];
1997 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1998 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1999 const int r= l&0x3FF;
2000 const int g= h>>8;
2001 const int b= l>>16;
a861d4d7
MN
2002
2003 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2004 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2005 }
2006}
2007
2008static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2009{
2010 int i;
2011 for(i=0; i<width; i++)
2012 {
2013 int r= src[i*3+0];
2014 int g= src[i*3+1];
2015 int b= src[i*3+2];
2016
4e61e21c 2017 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
2018 }
2019}
2020
2021static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2022{
2023 int i;
2024 for(i=0; i<width; i++)
2025 {
2026 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2027 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2028 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2029
2030 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2031 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2032 }
2033}
2034
1e621b18 2035
077ea8a7
MN
2036// Bilinear / Bicubic scaling
2037static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2038 int16_t *filter, int16_t *filterPos, int filterSize)
2ff198c1 2039{
077ea8a7 2040#ifdef HAVE_MMX
c9b99ea6 2041 assert(filterSize % 4 == 0 && filterSize>0);
077ea8a7
MN
2042 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2043 {
6e1c66bc 2044 long counter= -2*dstW;
077ea8a7
MN
2045 filter-= counter*2;
2046 filterPos-= counter/2;
2047 dst-= counter/2;
2048 asm volatile(
2049 "pxor %%mm7, %%mm7 \n\t"
9b464428 2050 "movq "MANGLE(w02)", %%mm6 \n\t"
6e1c66bc
AJ
2051 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2052 "mov %%"REG_a", %%"REG_BP" \n\t"
077ea8a7
MN
2053 ".balign 16 \n\t"
2054 "1: \n\t"
a7b42d28
AJ
2055 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2056 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
6e1c66bc
AJ
2057 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2058 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2059 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2060 "movd (%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2061 "punpcklbw %%mm7, %%mm0 \n\t"
2062 "punpcklbw %%mm7, %%mm2 \n\t"
2063 "pmaddwd %%mm1, %%mm0 \n\t"
2064 "pmaddwd %%mm2, %%mm3 \n\t"
2065 "psrad $8, %%mm0 \n\t"
2066 "psrad $8, %%mm3 \n\t"
2067 "packssdw %%mm3, %%mm0 \n\t"
2068 "pmaddwd %%mm6, %%mm0 \n\t"
2069 "packssdw %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
2070 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2071 "add $4, %%"REG_BP" \n\t"
077ea8a7 2072 " jnc 1b \n\t"
e3d2500f 2073
6e1c66bc 2074 "pop %%"REG_BP" \n\t"
077ea8a7
MN
2075 : "+a" (counter)
2076 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
6e1c66bc 2077 : "%"REG_b
077ea8a7
MN
2078 );
2079 }
2080 else if(filterSize==8)
2081 {
6e1c66bc 2082 long counter= -2*dstW;
077ea8a7
MN
2083 filter-= counter*4;
2084 filterPos-= counter/2;
2085 dst-= counter/2;
2086 asm volatile(
2087 "pxor %%mm7, %%mm7 \n\t"
9b464428 2088 "movq "MANGLE(w02)", %%mm6 \n\t"
6e1c66bc
AJ
2089 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2090 "mov %%"REG_a", %%"REG_BP" \n\t"
077ea8a7
MN
2091 ".balign 16 \n\t"
2092 "1: \n\t"
a7b42d28
AJ
2093 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2094 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
6e1c66bc
AJ
2095 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2096 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2097 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2098 "movd (%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2099 "punpcklbw %%mm7, %%mm0 \n\t"
2100 "punpcklbw %%mm7, %%mm2 \n\t"
2101 "pmaddwd %%mm1, %%mm0 \n\t"
2102 "pmaddwd %%mm2, %%mm3 \n\t"
2103
6e1c66bc
AJ
2104 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2105 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2106 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2107 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
077ea8a7
MN
2108 "punpcklbw %%mm7, %%mm4 \n\t"
2109 "punpcklbw %%mm7, %%mm2 \n\t"
2110 "pmaddwd %%mm1, %%mm4 \n\t"
2111 "pmaddwd %%mm2, %%mm5 \n\t"
2112 "paddd %%mm4, %%mm0 \n\t"
2113 "paddd %%mm5, %%mm3 \n\t"
2114
2115 "psrad $8, %%mm0 \n\t"
2116 "psrad $8, %%mm3 \n\t"
2117 "packssdw %%mm3, %%mm0 \n\t"
2118 "pmaddwd %%mm6, %%mm0 \n\t"
2119 "packssdw %%mm0, %%mm0 \n\t"
6e1c66bc
AJ
2120 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2121 "add $4, %%"REG_BP" \n\t"
077ea8a7 2122 " jnc 1b \n\t"
c1b0bfb4 2123
6e1c66bc 2124 "pop %%"REG_BP" \n\t"
077ea8a7
MN
2125 : "+a" (counter)
2126 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
6e1c66bc 2127 : "%"REG_b
077ea8a7
MN
2128 );
2129 }
2130 else
2131 {
20ffdcf9 2132 uint8_t *offset = src+filterSize;
6e1c66bc 2133 long counter= -2*dstW;
077ea8a7
MN
2134// filter-= counter*filterSize/2;
2135 filterPos-= counter/2;
2136 dst-= counter/2;
2137 asm volatile(
2138 "pxor %%mm7, %%mm7 \n\t"
9b464428 2139 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
2140 ".balign 16 \n\t"
2141 "1: \n\t"
6e1c66bc 2142 "mov %2, %%"REG_c" \n\t"
a7b42d28
AJ
2143 "movzwl (%%"REG_c", %0), %%eax \n\t"
2144 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
6e1c66bc 2145 "mov %5, %%"REG_c" \n\t"
077ea8a7
MN
2146 "pxor %%mm4, %%mm4 \n\t"
2147 "pxor %%mm5, %%mm5 \n\t"
2148 "2: \n\t"
2149 "movq (%1), %%mm1 \n\t"
2150 "movq (%1, %6), %%mm3 \n\t"
6e1c66bc
AJ
2151 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2152 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
077ea8a7
MN
2153 "punpcklbw %%mm7, %%mm0 \n\t"
2154 "punpcklbw %%mm7, %%mm2 \n\t"
2155 "pmaddwd %%mm1, %%mm0 \n\t"
2156 "pmaddwd %%mm2, %%mm3 \n\t"
2157 "paddd %%mm3, %%mm5 \n\t"
2158 "paddd %%mm0, %%mm4 \n\t"
6e1c66bc
AJ
2159 "add $8, %1 \n\t"
2160 "add $4, %%"REG_c" \n\t"
2161 "cmp %4, %%"REG_c" \n\t"
077ea8a7 2162 " jb 2b \n\t"
6e1c66bc 2163 "add %6, %1 \n\t"
077ea8a7
MN
2164 "psrad $8, %%mm4 \n\t"
2165 "psrad $8, %%mm5 \n\t"
2166 "packssdw %%mm5, %%mm4 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "packssdw %%mm4, %%mm4 \n\t"
6e1c66bc
AJ
2169 "mov %3, %%"REG_a" \n\t"
2170 "movd %%mm4, (%%"REG_a", %0) \n\t"
2171 "add $4, %0 \n\t"
077ea8a7 2172 " jnc 1b \n\t"
c1b0bfb4 2173
627690b5 2174 : "+r" (counter), "+r" (filter)
20ffdcf9 2175 : "m" (filterPos), "m" (dst), "m"(offset),
6e1c66bc
AJ
2176 "m" (src), "r" ((long)filterSize*2)
2177 : "%"REG_b, "%"REG_a, "%"REG_c
077ea8a7
MN
2178 );
2179 }
2180#else
8c266f0c
RD
2181#ifdef HAVE_ALTIVEC
2182 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2183#else
077ea8a7
MN
2184 int i;
2185 for(i=0; i<dstW; i++)
2186 {
2187 int j;
2188 int srcPos= filterPos[i];
2189 int val=0;
c1b0bfb4 2190// printf("filterPos: %d\n", filterPos[i]);
077ea8a7
MN
2191 for(j=0; j<filterSize; j++)
2192 {
2193// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2194 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2195 }
2196// filter += hFilterSize;
2197 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2198// dst[i] = val>>7;
2199 }
2200#endif
8c266f0c 2201#endif
077ea8a7 2202}
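/* hScale computes dst[i] = sum_j src[filterPos[i]+j] * filter[filterSize*i + j], shifts
   the sum down by 7 and clamps it to 15 bits; with coefficients that sum to roughly
   1<<14 per output pixel this keeps dst at the "sample<<7" fixed point that the
   vertical scalers consume. */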
2ff198c1 2203 // *** horizontal scale Y line to temp buffer
28bf81c9
MN
2204static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2205 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1e621b18 2206 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
b7dc6f66
MN
2207 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2208 int32_t *mmx2FilterPos)
077ea8a7 2209{
1e621b18
MN
2210 if(srcFormat==IMGFMT_YUY2)
2211 {
2212 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2213 src= formatConvBuffer;
2214 }
7322a67c
MN
2215 else if(srcFormat==IMGFMT_UYVY)
2216 {
2217 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2218 src= formatConvBuffer;
2219 }
1e621b18
MN
2220 else if(srcFormat==IMGFMT_BGR32)
2221 {
2222 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2223 src= formatConvBuffer;
2224 }
2225 else if(srcFormat==IMGFMT_BGR24)
2226 {
2227 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2228 src= formatConvBuffer;
2229 }
6af250ea
MN
2230 else if(srcFormat==IMGFMT_BGR16)
2231 {
2232 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2233 src= formatConvBuffer;
2234 }
b72034dd
MN
2235 else if(srcFormat==IMGFMT_BGR15)
2236 {
2237 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2238 src= formatConvBuffer;
2239 }
a861d4d7
MN
2240 else if(srcFormat==IMGFMT_RGB32)
2241 {
2242 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2243 src= formatConvBuffer;
2244 }
2245 else if(srcFormat==IMGFMT_RGB24)
2246 {
2247 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2248 src= formatConvBuffer;
2249 }
1e621b18 2250
e3d2500f 2251#ifdef HAVE_MMX
77a416e8 2252 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
28bf81c9 2253 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2254#else
28bf81c9 2255 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2256#endif
077ea8a7
MN
2257 {
2258 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2259 }
2260 else // Fast Bilinear upscale / crap downscale
2261 {
6e1c66bc 2262#if defined(ARCH_X86) || defined(ARCH_X86_64)
2ff198c1 2263#ifdef HAVE_MMX2
96034638 2264 int i;
2ff198c1
MN
2265 if(canMMX2BeUsed)
2266 {
2267 asm volatile(
2268 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
2269 "mov %0, %%"REG_c" \n\t"
2270 "mov %1, %%"REG_D" \n\t"
2271 "mov %2, %%"REG_d" \n\t"
2272 "mov %3, %%"REG_b" \n\t"
2273 "xor %%"REG_a", %%"REG_a" \n\t" // i
2274 PREFETCH" (%%"REG_c") \n\t"
2275 PREFETCH" 32(%%"REG_c") \n\t"
2276 PREFETCH" 64(%%"REG_c") \n\t"
99cefd0b 2277
6d606c4f
AJ
2278#ifdef ARCH_X86_64
2279
2280#define FUNNY_Y_CODE \
2281 "movl (%%"REG_b"), %%esi \n\t"\
2282 "call *%4 \n\t"\
2283 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2284 "add %%"REG_S", %%"REG_c" \n\t"\
2285 "add %%"REG_a", %%"REG_D" \n\t"\
2286 "xor %%"REG_a", %%"REG_a" \n\t"\
2287
2288#else
2289
2ff198c1 2290#define FUNNY_Y_CODE \
6d606c4f 2291 "movl (%%"REG_b"), %%esi \n\t"\
b7dc6f66 2292 "call *%4 \n\t"\
6d606c4f 2293 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
b6663a55 2294 "add %%"REG_a", %%"REG_D" \n\t"\
6e1c66bc 2295 "xor %%"REG_a", %%"REG_a" \n\t"\
99cefd0b 2296
6d606c4f
AJ
2297#endif
2298
2ff198c1
MN
2299FUNNY_Y_CODE
2300FUNNY_Y_CODE
2301FUNNY_Y_CODE
2302FUNNY_Y_CODE
2303FUNNY_Y_CODE
2304FUNNY_Y_CODE
2305FUNNY_Y_CODE
2306FUNNY_Y_CODE
2307
b7dc6f66
MN
2308 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2309 "m" (funnyYCode)
b6663a55 2310 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2ff198c1 2311 );
af91b8b3 2312 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2ff198c1
MN
2313 }
2314 else
2315 {
2316#endif
20ffdcf9
MN
2317 int xInc_shr16 = xInc >> 16;
2318 int xInc_mask = xInc & 0xffff;
2ff198c1
MN
2319 //NO MMX just normal asm ...
2320 asm volatile(
6e1c66bc
AJ
2321 "xor %%"REG_a", %%"REG_a" \n\t" // i
2322 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2ff198c1 2323 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2324 ".balign 16 \n\t"
2ff198c1 2325 "1: \n\t"
6e1c66bc
AJ
2326 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2327 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2328 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2329 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2330 "shll $16, %%edi \n\t"
2331 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2332 "mov %1, %%"REG_D" \n\t"
2ff198c1 2333 "shrl $9, %%esi \n\t"
6e1c66bc 2334 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1 2335 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
6e1c66bc 2336 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
2ff198c1 2337
6e1c66bc
AJ
2338 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2339 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2340 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2341 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2342 "shll $16, %%edi \n\t"
2343 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2344 "mov %1, %%"REG_D" \n\t"
2ff198c1 2345 "shrl $9, %%esi \n\t"
6e1c66bc 2346 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1 2347 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
6e1c66bc 2348 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
2ff198c1
MN
2349
2350
6e1c66bc
AJ
2351 "add $2, %%"REG_a" \n\t"
2352 "cmp %2, %%"REG_a" \n\t"
2ff198c1
MN
2353 " jb 1b \n\t"
2354
2355
20ffdcf9 2356 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
6e1c66bc 2357 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2ff198c1
MN
2358 );
2359#ifdef HAVE_MMX2
77a416e8 2360 } //if MMX2 can't be used
2ff198c1
MN
2361#endif
2362#else
96034638
MN
2363 int i;
2364 unsigned int xpos=0;
2365 for(i=0;i<dstWidth;i++)
2366 {
2367 register unsigned int xx=xpos>>16;
2368 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2369 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2370 xpos+=xInc;
2371 }
2ff198c1 2372#endif
077ea8a7 2373 }
2ff198c1
MN
2374}
2375
28bf81c9
MN
2376inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2377 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
1e621b18 2378 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
b7dc6f66
MN
2379 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2380 int32_t *mmx2FilterPos)
2ff198c1 2381{
1e621b18
MN
2382 if(srcFormat==IMGFMT_YUY2)
2383 {
2384 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385 src1= formatConvBuffer;
2386 src2= formatConvBuffer+2048;
2387 }
7322a67c
MN
2388 else if(srcFormat==IMGFMT_UYVY)
2389 {
2390 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391 src1= formatConvBuffer;
2392 src2= formatConvBuffer+2048;
2393 }
1e621b18
MN
2394 else if(srcFormat==IMGFMT_BGR32)
2395 {
2396 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+2048;
2399 }
2400 else if(srcFormat==IMGFMT_BGR24)
2401 {
2402 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403 src1= formatConvBuffer;
2404 src2= formatConvBuffer+2048;
2405 }
6af250ea
MN
2406 else if(srcFormat==IMGFMT_BGR16)
2407 {
2408 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409 src1= formatConvBuffer;
2410 src2= formatConvBuffer+2048;
2411 }
b72034dd
MN
2412 else if(srcFormat==IMGFMT_BGR15)
2413 {
2414 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415 src1= formatConvBuffer;
2416 src2= formatConvBuffer+2048;
2417 }
a861d4d7
MN
2418 else if(srcFormat==IMGFMT_RGB32)
2419 {
2420 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421 src1= formatConvBuffer;
2422 src2= formatConvBuffer+2048;
2423 }
2424 else if(srcFormat==IMGFMT_RGB24)
2425 {
2426 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2427 src1= formatConvBuffer;
2428 src2= formatConvBuffer+2048;
2429 }
6ff0ad6b
MN
2430 else if(isGray(srcFormat))
2431 {
2432 return;
2433 }
1e621b18 2434
e3d2500f 2435#ifdef HAVE_MMX
77a416e8 2436 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
28bf81c9 2437 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2438#else
28bf81c9 2439 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2440#endif
077ea8a7
MN
2441 {
2442 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2443 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2444 }
2445 else // Fast Bilinear upscale / crap downscale
2446 {
6e1c66bc 2447#if defined(ARCH_X86) || defined(ARCH_X86_64)
2ff198c1 2448#ifdef HAVE_MMX2
96034638 2449 int i;
2ff198c1
MN
2450 if(canMMX2BeUsed)
2451 {
2452 asm volatile(
b7dc6f66 2453 "pxor %%mm7, %%mm7 \n\t"
6e1c66bc
AJ
2454 "mov %0, %%"REG_c" \n\t"
2455 "mov %1, %%"REG_D" \n\t"
2456 "mov %2, %%"REG_d" \n\t"
2457 "mov %3, %%"REG_b" \n\t"
2458 "xor %%"REG_a", %%"REG_a" \n\t" // i
2459 PREFETCH" (%%"REG_c") \n\t"
2460 PREFETCH" 32(%%"REG_c") \n\t"
2461 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66 2462
6d606c4f
AJ
2463#ifdef ARCH_X86_64
2464
2465#define FUNNY_UV_CODE \
2466 "movl (%%"REG_b"), %%esi \n\t"\
2467 "call *%4 \n\t"\
2468 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469 "add %%"REG_S", %%"REG_c" \n\t"\
2470 "add %%"REG_a", %%"REG_D" \n\t"\
2471 "xor %%"REG_a", %%"REG_a" \n\t"\
2472
2473#else
2474
b7dc6f66 2475#define FUNNY_UV_CODE \
6e1c66bc 2476 "movl (%%"REG_b"), %%esi \n\t"\
b7dc6f66 2477 "call *%4 \n\t"\
6d606c4f 2478 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
6e1c66bc
AJ
2479 "add %%"REG_a", %%"REG_D" \n\t"\
2480 "xor %%"REG_a", %%"REG_a" \n\t"\
b7dc6f66 2481
6d606c4f
AJ
2482#endif
2483
b7dc6f66
MN
2484FUNNY_UV_CODE
2485FUNNY_UV_CODE
2486FUNNY_UV_CODE
2487FUNNY_UV_CODE
6e1c66bc
AJ
2488 "xor %%"REG_a", %%"REG_a" \n\t" // i
2489 "mov %5, %%"REG_c" \n\t" // src
2490 "mov %1, %%"REG_D" \n\t" // buf1
2491 "add $4096, %%"REG_D" \n\t"
2492 PREFETCH" (%%"REG_c") \n\t"
2493 PREFETCH" 32(%%"REG_c") \n\t"
2494 PREFETCH" 64(%%"REG_c") \n\t"
b7dc6f66
MN
2495
2496FUNNY_UV_CODE
2497FUNNY_UV_CODE
2498FUNNY_UV_CODE
2499FUNNY_UV_CODE
2500
2501 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2502 "m" (funnyUVCode), "m" (src2)
6d606c4f 2503 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
b7dc6f66 2504 );
c1b0bfb4 2505 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2ff198c1 2506 {
c1b0bfb4
MN
2507// printf("%d %d %d\n", dstWidth, i, srcW);
2508 dst[i] = src1[srcW-1]*128;
2509 dst[i+2048] = src2[srcW-1]*128;
2ff198c1
MN
2510 }
2511 }
2512 else
2513 {
2514#endif
20ffdcf9
MN
2515 long xInc_shr16 = (long) (xInc >> 16);
2516 int xInc_mask = xInc & 0xffff;
2ff198c1 2517 asm volatile(
6e1c66bc
AJ
2518 "xor %%"REG_a", %%"REG_a" \n\t" // i
2519 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2ff198c1 2520 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2521 ".balign 16 \n\t"
2ff198c1 2522 "1: \n\t"
6e1c66bc
AJ
2523 "mov %0, %%"REG_S" \n\t"
2524 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2525 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2526 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2527 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528 "shll $16, %%edi \n\t"
2529 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2530 "mov %1, %%"REG_D" \n\t"
2ff198c1 2531 "shrl $9, %%esi \n\t"
6e1c66bc 2532 "movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t"
2ff198c1 2533
6e1c66bc
AJ
2534 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2535 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2ff198c1
MN
2536 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2537 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538 "shll $16, %%edi \n\t"
2539 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
6e1c66bc 2540 "mov %1, %%"REG_D" \n\t"
2ff198c1 2541 "shrl $9, %%esi \n\t"
6e1c66bc 2542 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2ff198c1
MN
2543
2544 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
6e1c66bc
AJ
2545 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
2546 "add $1, %%"REG_a" \n\t"
2547 "cmp %2, %%"REG_a" \n\t"
2ff198c1
MN
2548 " jb 1b \n\t"
2549
1a2f5491 2550 :: "m" (src1), "m" (dst), "mp" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2ff198c1 2551 "r" (src2)
6e1c66bc 2552 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2ff198c1
MN
2553 );
2554#ifdef HAVE_MMX2
77a416e8 2555 } //if MMX2 can't be used
2ff198c1
MN
2556#endif
2557#else
96034638
MN
2558 int i;
2559 unsigned int xpos=0;
2560 for(i=0;i<dstWidth;i++)
2561 {
2562 register unsigned int xx=xpos>>16;
2563 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2564 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2565 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1faf0867
MN
2566/* slower
2567 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2568 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2569*/
96034638
MN
2570 xpos+=xInc;
2571 }
2ff198c1 2572#endif
077ea8a7
MN
2573 }
2574}
2575
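// Both fast-bilinear paths above (hyscale/hcscale) step through the source with xInc as
// a 16.16 fixed-point increment (roughly srcW/dstW scaled by 1<<16): xx = xpos>>16 is
// the left source sample and xalpha = (xpos&0xFFFF)>>9 a 7-bit blend weight, so e.g. a
// 2x upscale (xInc ~= 0x8000) alternates xalpha between 0 and 64. The interpolated
// result stays at (approximately) the same "sample<<7" scale as the hScale output.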
3e499f53
MN
2576static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2577 int srcSliceH, uint8_t* dst[], int dstStride[]){
28bf81c9
MN
2578
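	/* Overview: swScale is called once per slice of the source picture. Each needed
	   source line is scaled horizontally into the lumPixBuf/chrPixBuf ring buffers, and
	   the vertical filter then combines vLumFilterSize/vChrFilterSize of those buffered
	   lines per output line, so an output line may depend on lines from more than one
	   slice (hence lastInLumBuf/lastInChrBuf being kept in the context between calls). */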
2579 /* load a few things into local vars to make the code more readable and faster */
2580 const int srcW= c->srcW;
2581 const int dstW= c->dstW;
2582 const int dstH= c->dstH;
2583 const int chrDstW= c->chrDstW;
e616aa93 2584 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2585 const int lumXInc= c->lumXInc;
2586 const int chrXInc= c->chrXInc;
fe8054c0 2587 const int dstFormat= c->dstFormat;
44c1035c 2588 const int srcFormat= c->srcFormat;
28bf81c9
MN
2589 const int flags= c->flags;
2590 const int canMMX2BeUsed= c->canMMX2BeUsed;
2591 int16_t *vLumFilterPos= c->vLumFilterPos;
2592 int16_t *vChrFilterPos= c->vChrFilterPos;
2593 int16_t *hLumFilterPos= c->hLumFilterPos;
2594 int16_t *hChrFilterPos= c->hChrFilterPos;
2595 int16_t *vLumFilter= c->vLumFilter;
2596 int16_t *vChrFilter= c->vChrFilter;
2597 int16_t *hLumFilter= c->hLumFilter;
2598 int16_t *hChrFilter= c->hChrFilter;
77a49659
MN
2599 int32_t *lumMmxFilter= c->lumMmxFilter;
2600 int32_t *chrMmxFilter= c->chrMmxFilter;
28bf81c9
MN
2601 const int vLumFilterSize= c->vLumFilterSize;
2602 const int vChrFilterSize= c->vChrFilterSize;
2603 const int hLumFilterSize= c->hLumFilterSize;
2604 const int hChrFilterSize= c->hChrFilterSize;
2605 int16_t **lumPixBuf= c->lumPixBuf;
2606 int16_t **chrPixBuf= c->chrPixBuf;
2607 const int vLumBufSize= c->vLumBufSize;
2608 const int vChrBufSize= c->vChrBufSize;
2609 uint8_t *funnyYCode= c->funnyYCode;
2610 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2611 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2612 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2613 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
d4e24275 2614 int lastDstY;
28bf81c9
MN
2615
2616 /* vars which will change and which we need to store back in the context */
2617 int dstY= c->dstY;
2618 int lumBufIndex= c->lumBufIndex;
2619 int chrBufIndex= c->chrBufIndex;
2620 int lastInLumBuf= c->lastInLumBuf;
2621 int lastInChrBuf= c->lastInChrBuf;
5859233b 2622
5859233b 2623 if(isPacked(c->srcFormat)){
1e621b18
MN
2624 src[0]=
2625 src[1]=
3e499f53 2626 src[2]= src[0];
5859233b 2627 srcStride[0]=
1e621b18 2628 srcStride[1]=
3e499f53 2629 srcStride[2]= srcStride[0];
6c7506de 2630 }
5859233b
MN
2631 srcStride[1]<<= c->vChrDrop;
2632 srcStride[2]<<= c->vChrDrop;
6c7506de 2633
c7a810cc
MN
2634// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2635// (int)dst[0], (int)dst[1], (int)dst[2]);
2636
2637#if 0 //self test FIXME move to a vfilter or something
2638{
2639static volatile int i=0;
2640i++;
2641if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2642 selfTest(src, srcStride, c->srcW, c->srcH);
2643i--;
2644}
2645#endif
37079906
MN
2646
2647//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2648//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2649
2650 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2651 {
2652 static int firstTime=1; //FIXME move this into the context perhaps
2653 if(flags & SWS_PRINT_INFO && firstTime)
2654 {
3ec38777 2655 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
2656 "SwScaler: ->cannot do aligned memory accesses anymore\n");
2657 firstTime=0;
2658 }
2659 }
28bf81c9 2660
1e621b18
MN
2661 /* Note: the user might start scaling in the middle of the picture, so this will not get executed.
2662 This is not really intended, but it works currently, so people might do it. */
28bf81c9
MN
2663 if(srcSliceY ==0){
2664 lumBufIndex=0;
2665 chrBufIndex=0;
1e621b18 2666 dstY=0;
28bf81c9
MN
2667 lastInLumBuf= -1;
2668 lastInChrBuf= -1;
077ea8a7 2669 }
d3f41512 2670
d4e24275
MN
2671 lastDstY= dstY;
2672
c1b0bfb4 2673 for(;dstY < dstH; dstY++){
28bf81c9 2674 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
2675 const int chrDstY= dstY>>c->chrDstVSubSample;
2676 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2677 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 2678
c1b0bfb4
MN
2679 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2680 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2681 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2682 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2683
379a2036
MN
2684//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2685// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
c7f822d9
MN
2686 //handle holes (FAST_BILINEAR & weird filters)
2687 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2688 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2689//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2690 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2691 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2692
c1b0bfb4 2693 // Do we have enough lines in this slice to output the dstY line
e616aa93 2694 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
2695 {
2696 //Do horizontal scaling
2697 while(lastInLumBuf < lastLumSrcY)
d3f41512 2698 {
28bf81c9 2699 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2700 lumBufIndex++;
c7f822d9 2701// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2702 ASSERT(lumBufIndex < 2*vLumBufSize)
2703 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2704 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2705// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2706 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2707 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2708 funnyYCode, c->srcFormat, formatConvBuffer,
2709 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2710 lastInLumBuf++;
2711 }
2712 while(lastInChrBuf < lastChrSrcY)
2713 {
e616aa93
MN
2714 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2715 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2716 chrBufIndex++;
2717 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2718 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2719 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
28bf81c9 2720 //FIXME pass these parameters through the context struct instead (at least some of them)
44c1035c
MN
2721
2722 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2723 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2724 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2725 funnyUVCode, c->srcFormat, formatConvBuffer,
2726 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4 2727 lastInChrBuf++;
d3f41512 2728 }
c1b0bfb4
MN
2729 //wrap buf index around to stay inside the ring buffer
2730 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2731 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2732 }
c1b0bfb4 2733 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2734 {
c1b0bfb4
MN
2735/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2736 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2737 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
e616aa93
MN
2738 vChrBufSize, vLumBufSize);*/
2739
c1b0bfb4
MN
2740 //Do horizontal scaling
2741 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2742 {
28bf81c9 2743 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2744 lumBufIndex++;
2745 ASSERT(lumBufIndex < 2*vLumBufSize)
2746 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2747 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2748 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2749 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2750 funnyYCode, c->srcFormat, formatConvBuffer,
2751 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2752 lastInLumBuf++;
2753 }
e616aa93 2754 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
c1b0bfb4 2755 {
e616aa93
MN
2756 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2757 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2758 chrBufIndex++;
2759 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2760 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2761 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
44c1035c
MN
2762
2763 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2764 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2765 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2766 funnyUVCode, c->srcFormat, formatConvBuffer,
2767 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4
MN
2768 lastInChrBuf++;
2769 }
2770 //wrap buf index around to stay inside the ring buffer
2771 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2772 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
77a416e8 2773 break; //we can't output a dstY line so let's try with the next slice
2ff198c1 2774 }
d3f41512 2775
c1b0bfb4
MN
2776#ifdef HAVE_MMX
2777 b5Dither= dither8[dstY&1];
2778 g6Dither= dither4[dstY&1];
2779 g5Dither= dither8[dstY&1];
2780 r5Dither= dither8[(dstY+1)&1];
2781#endif
28bf81c9 2782 if(dstY < dstH-2)
e3d2500f 2783 {
6542b44e
MN
2784 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2785 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2786#ifdef HAVE_MMX
2787 int i;
2788 for(i=0; i<vLumFilterSize; i++)
2789 {
2790 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2791 lumMmxFilter[4*i+2]=
2792 lumMmxFilter[4*i+3]=
2793 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2794 }
2795 for(i=0; i<vChrFilterSize; i++)
2796 {
2797 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2798 chrMmxFilter[4*i+2]=
2799 chrMmxFilter[4*i+3]=
2800 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2801 }
2802#endif
6118e52e
VS
2803 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2804 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2805 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2806 RENAME(yuv2nv12X)(c,
2807 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2808 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2809 dest, uDest, dstW, chrDstW, dstFormat);
2810 }
2811 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
0f25d72b 2812 {
df1b2c14
MN
2813 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2814 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
c1b0bfb4 2815 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2816 {
c1b0bfb4
MN
2817 int16_t *lumBuf = lumPixBuf[0];
2818 int16_t *chrBuf= chrPixBuf[0];
e616aa93 2819 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
c1b0bfb4
MN
2820 }
2821 else //General YV12
2822 {
77a49659 2823 RENAME(yuv2yuvX)(c,
e616aa93
MN
2824 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2825 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6542b44e 2826 dest, uDest, vDest, dstW, chrDstW);
2ff198c1 2827 }
0f25d72b 2828 }
c1b0bfb4 2829 else
2ff198c1 2830 {
c1b0bfb4
MN
2831 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2832 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2833 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2834 {
2835 int chrAlpha= vChrFilter[2*dstY+1];
25593e29 2836 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2837 dest, dstW, chrAlpha, dstFormat, flags, dstY);
c1b0bfb4
MN
2838 }
2839 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2840 {
2841 int lumAlpha= vLumFilter[2*dstY+1];
2842 int chrAlpha= vChrFilter[2*dstY+1];
25593e29 2843 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2844 dest, dstW, lumAlpha, chrAlpha, dstY);
c1b0bfb4
MN
2845 }
2846 else //General RGB
2847 {
25593e29 2848 RENAME(yuv2packedX)(c,
c1b0bfb4
MN
2849 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2850 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
77a49659 2851 dest, dstW, dstY);
c1b0bfb4
MN
2852 }
2853 }
e3d2500f 2854 }
77a416e8 2855 else // hmm looks like we can't use MMX here without overwriting this array's tail
e3d2500f
MN
2856 {
2857 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2858 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
6118e52e
VS
2859 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2860 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2861 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2862 yuv2nv12XinC(
2863 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2864 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2865 dest, uDest, dstW, chrDstW, dstFormat);
2866 }
2867 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
e3d2500f 2868 {
df1b2c14
MN
2869 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2870 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
5859233b 2871 yuv2yuvXinC(
e616aa93
MN
2872 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2873 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
5859233b 2874 dest, uDest, vDest, dstW, chrDstW);
e3d2500f
MN
2875 }
2876 else
2877 {
2878 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2879 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
25593e29 2880 yuv2packedXinC(c,
e3d2500f
MN
2881 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2882 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
cf7d1c1a 2883 dest, dstW, dstY);
e3d2500f
MN
2884 }
2885 }
c1b0bfb4 2886 }
17f715fa
MN
2887
2888#ifdef HAVE_MMX
2889 __asm __volatile(SFENCE:::"memory");
1faf0867 2890 __asm __volatile(EMMS:::"memory");
17f715fa 2891#endif
28bf81c9
MN
2892 /* store changed local vars back in the context */
2893 c->dstY= dstY;
2894 c->lumBufIndex= lumBufIndex;
2895 c->chrBufIndex= chrBufIndex;
2896 c->lastInLumBuf= lastInLumBuf;
2897 c->lastInChrBuf= lastInChrBuf;
d4e24275
MN
2898
2899 return dstY - lastDstY;
627690b5 2900}