make most resample filter parameters selectable at runtime
[libav.git] / postproc / swscale_template.c
CommitLineData
fe8054c0 1/*
5427e242 2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
fe8054c0
MN
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
/* Per-CPU instruction selection: these macros are re-#defined for every
   RENAME()d instantiation of this template, so clear any previous values. */
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch hints: 3DNow! and MMX2 each have their own spelling; on plain
   MMX there is no prefetch instruction, so substitute an asm no-op. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* Store fence is only needed (and available) with the non-temporal stores
   of MMX2; otherwise it degrades to a no-op. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
/* Packed byte average: MMX2 spells it pavgb, 3DNow! pavgusb; undefined on
   plain MMX (code using PAVGB must be guarded accordingly). */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* MOVNTQ: non-temporal (cache-bypassing) store on MMX2, plain movq
   elsewhere. Non-temporal stores require a trailing SFENCE. */
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
65
/* Vertical multi-tap scale of one 16bit plane to 8bit output.
   %0 = &c->redDither (base for the VROUNDER/filter offsets), %1 = dest,
   %2 = output width. `offset` selects the lum/chr MMX filter list; `x` is a
   byte offset into the source rows (0 for U, 4096 for V interleaved chroma).
   The filter list is a (coeff, srcPtr) array terminated by a NULL pointer;
   the inner loop (label 1/jnz over %%esi) accumulates coeff*src into
   mm3/mm4, then shifts down by 3 and packs to bytes. */
#define YSCALEYUV2YV12X(x, offset) \
	"xorl %%eax, %%eax		\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	"leal " offset "(%0), %%edx	\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	".balign 16			\n\t" /* FIXME Unroll? */\
	"1:				\n\t"\
	"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
	"movq " #x "(%%esi, %%eax, 2), %%mm2	\n\t" /* srcData */\
	"movq 8+" #x "(%%esi, %%eax, 2), %%mm5	\n\t" /* srcData */\
	"addl $16, %%edx		\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"testl %%esi, %%esi		\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	" jnz 1b			\n\t"\
	"psraw $3, %%mm3		\n\t"\
	"psraw $3, %%mm4		\n\t"\
	"packuswb %%mm4, %%mm3		\n\t"\
	MOVNTQ(%%mm3, (%1, %%eax))\
	"addl $8, %%eax			\n\t"\
	"cmpl %2, %%eax			\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	"leal " offset "(%0), %%edx	\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"jb 1b				\n\t"
96
/* 1-tap (copy) vertical scale: shift 15bit samples down to 8bit and pack.
   %0 = src end, %1 = dest end, %2 = -width; %%eax counts up to 0, so the
   loop exits via the carry flag (jnc). No clipping beyond packuswb. */
#define YSCALEYUV2YV121 \
	"movl %2, %%eax			\n\t"\
	".balign 16			\n\t" /* FIXME Unroll? */\
	"1:				\n\t"\
	"movq (%0, %%eax, 2), %%mm0	\n\t"\
	"movq 8(%0, %%eax, 2), %%mm1	\n\t"\
	"psraw $7, %%mm0		\n\t"\
	"psraw $7, %%mm1		\n\t"\
	"packuswb %%mm1, %%mm0		\n\t"\
	MOVNTQ(%%mm0, (%1, %%eax))\
	"addl $8, %%eax			\n\t"\
	"jnc 1b				\n\t"
109
110/*
111 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
112 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
113 "r" (dest), "m" (dstW),
114 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
115 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
116*/
/* Multi-tap vertical scale for packed output: runs the chroma filter list
   first (results in mm3=U, mm4=V), then the luma filter list (mm1=Y1,
   mm7=Y2). Both inner loops walk a NULL-terminated (coeff, srcPtr) list at
   CHR_/LUM_MMX_FILTER_OFFSET from %0 (&c->redDither). No shifting/packing
   is done here; the caller's colorspace/pack macro consumes the raw sums. */
#define YSCALEYUV2PACKEDX \
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"nop				\n\t"\
	"1:				\n\t"\
	"leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx	\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	".balign 16			\n\t"\
	"2:				\n\t"\
	"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
	"movq (%%esi, %%eax), %%mm2	\n\t" /* UsrcData */\
	"movq 4096(%%esi, %%eax), %%mm5	\n\t" /* VsrcData */\
	"addl $16, %%edx		\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	"testl %%esi, %%esi		\n\t"\
	" jnz 2b			\n\t"\
\
	"leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx	\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
	"movq %%mm1, %%mm7		\n\t"\
	".balign 16			\n\t"\
	"2:				\n\t"\
	"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
	"movq (%%esi, %%eax, 2), %%mm2	\n\t" /* Y1srcData */\
	"movq 8(%%esi, %%eax, 2), %%mm5	\n\t" /* Y2srcData */\
	"addl $16, %%edx		\n\t"\
	"movl (%%edx), %%esi		\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm1		\n\t"\
	"paddw %%mm5, %%mm7		\n\t"\
	"testl %%esi, %%esi		\n\t"\
	" jnz 2b			\n\t"
158
/* Multi-tap vertical scale + YUV->RGB conversion. Extends
   YSCALEYUV2PACKEDX by applying the per-context colorspace matrix
   (U_OFFSET..Y_COEFF live relative to %0) and interleaving/packing.
   Leaves mm2/mm0=B, mm4/mm3=G, mm5/mm6=R (low/high halves), mm7=0 for the
   WRITE* output macros. */
#define YSCALEYUV2RGBX \
	YSCALEYUV2PACKEDX\
	"psubw "U_OFFSET"(%0), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"(%0), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"(%0), %%mm3	\n\t"\
	"pmulhw "VG_COEFF"(%0), %%mm4	\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"pmulhw "UB_COEFF"(%0), %%mm2	\n\t"\
	"pmulhw "VR_COEFF"(%0), %%mm5	\n\t"\
	"psubw "Y_OFFSET"(%0), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"(%0), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"(%0), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"(%0), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
/* Disabled legacy full-chroma-resolution YUV->RGB path using global MANGLEd
   constants instead of the per-context tables; kept for reference only. */
#if 0
#define FULL_YSCALEYUV2RGB \
	"pxor %%mm7, %%mm7		\n\t"\
	"movd %6, %%mm6			\n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"movd %7, %%mm5			\n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* 8(U-128)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
\
\
	"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
	"psraw $4, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
	"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "MANGLE(w400)", %%mm0	\n\t" /* (V-128)8*/\
\
\
	"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	"paddw %%mm1, %%mm3		\n\t" /* B*/\
	"paddw %%mm1, %%mm0		\n\t" /* R*/\
	"packuswb %%mm3, %%mm3		\n\t"\
\
	"packuswb %%mm0, %%mm0		\n\t"\
	"paddw %%mm4, %%mm2		\n\t"\
	"paddw %%mm2, %%mm1		\n\t" /* G*/\
\
	"packuswb %%mm1, %%mm1		\n\t"
#endif
d604bab9 250
6542b44e
MN
/* 2-tap vertical interpolation between buf0/buf1 (luma) and uvbuf0/uvbuf1
   (chroma) for packed YUV output. First pre-shifts the stored filter
   coefficients down by 3 (done once; note this rewrites the context's
   filter slots in place). Results: mm3=U, mm4=V, mm1=Y1, mm7=Y2. */
#define YSCALEYUV2PACKED(index, c) \
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
	"psraw $3, %%mm0		\n\t"\
	"psraw $3, %%mm1		\n\t"\
	"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
	"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $7, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $7, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq 8(%0, "#index", 2), %%mm6	\n\t" /*buf0[eax]*/\
	"movq 8(%1, "#index", 2), %%mm7	\n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $7, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $7, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
6542b44e
MN
/* 2-tap vertical interpolation + YUV->RGB conversion using the per-context
   matrix at #c. Output register layout matches YSCALEYUV2RGBX (B in
   mm2/mm0, G in mm4/mm3, R in mm5/mm6, mm7=0) for the WRITE* macros. */
#define YSCALEYUV2RGB(index, c) \
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
	"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
25593e29 351
e54d94ba
MN
/* 1-tap (no vertical blend) variant for packed YUV output: just shifts the
   single source row down to 8bit range. mm3=U, mm4=V, mm1=Y1, mm7=Y2. */
#define YSCALEYUV2PACKED1(index, c) \
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $7, %%mm3		\n\t" \
	"psraw $7, %%mm4		\n\t" \
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $7, %%mm1		\n\t" \
	"psraw $7, %%mm7		\n\t"
e54d94ba
MN
/* 1-tap (single source row) YUV->RGB conversion using the per-context
   matrix at #c. Same output register layout as YSCALEYUV2RGB. */
#define YSCALEYUV2RGB1(index, c) \
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
412
e54d94ba
MN
/* 1-tap luma with averaged (2-row) chroma for packed YUV output; used when
   the two chroma source rows should be blended 50/50. */
#define YSCALEYUV2PACKED1b(index, c) \
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $8, %%mm3		\n\t" \
	"psrlw $8, %%mm4		\n\t" \
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $7, %%mm1		\n\t" \
	"psraw $7, %%mm7		\n\t"
429
// do vertical chrominance interpolation
/* 1-tap luma with 50/50-averaged chroma, then YUV->RGB via the per-context
   matrix. Output register layout matches YSCALEYUV2RGB. */
#define YSCALEYUV2RGB1b(index, c) \
	"xorl "#index", "#index"	\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $5, %%mm3		\n\t" /*FIXME might overflow*/\
	"psrlw $5, %%mm4		\n\t" /*FIXME might overflow*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
482
/* Pack the B/G/R byte registers into 32bit BGR0 pixels and store 8 pixels,
   then advance/loop (shares loop label "1" with the preceding scaler). */
#define WRITEBGR32(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	MOVNTQ(%%mm0, (dst, index, 4))\
	MOVNTQ(%%mm2, 8(dst, index, 4))\
	MOVNTQ(%%mm1, 16(dst, index, 4))\
	MOVNTQ(%%mm3, 24(dst, index, 4))\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
506
/* Pack B/G/R bytes into RGB565 (5-6-5) and store 8 pixels, then loop. */
#define WRITEBGR16(dst, dstw, index) \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $3, %%mm3		\n\t"\
	"psllq $3, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (dst, index, 2))\
	MOVNTQ(%%mm1, 8(dst, index, 2))\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
533
/* Pack B/G/R bytes into RGB555 (5-5-5) and store 8 pixels, then loop. */
#define WRITEBGR15(dst, dstw, index) \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
	"psrlq $1, %%mm5		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $2, %%mm3		\n\t"\
	"psllq $2, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (dst, index, 2))\
	MOVNTQ(%%mm1, 8(dst, index, 2))\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
f62255fb 561
/* Legacy 24bit BGR packer using shift/mask constants (superseded by the
   WRITEBGR24MMX/MMX2 variants below, kept for reference). Note it advances
   dst by 24 bytes itself, unlike the indexed 16/32bit writers. */
#define WRITEBGR24OLD(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
	"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
\
	"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
	"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
	"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
\
	"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
	"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
	"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
\
	MOVNTQ(%%mm0, (dst))\
	MOVNTQ(%%mm2, 8(dst))\
	MOVNTQ(%%mm3, 16(dst))\
	"addl $24, "#dst"		\n\t"\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
617
/* 24bit BGR packer for plain MMX: builds four 0RGBRGB0 qwords, then
   shifts/merges them into three contiguous 8-byte stores (24 bytes =
   8 pixels). Advances dst by 24 itself. */
#define WRITEBGR24MMX(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
	"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (dst))\
\
	"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(dst))\
\
	"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
	MOVNTQ(%%mm5, 16(dst))\
\
	"addl $24, "#dst"		\n\t"\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
670
/* 24bit BGR packer for MMX2: uses pshufw plus the M24A/M24B/M24C masks to
   gather B/G/R bytes directly into the three output qwords. Advances dst
   by 24 itself. */
#define WRITEBGR24MMX2(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq "MANGLE(M24A)", %%mm0	\n\t"\
	"movq "MANGLE(M24C)", %%mm7	\n\t"\
	"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1		\n\t" /* B2 B1 B0 */\
	"pand %%mm0, %%mm3		\n\t" /* G2 G1 G0 */\
	"pand %%mm7, %%mm6		\n\t" /* R1 R0 */\
\
	"psllq $8, %%mm3		\n\t" /* G2 G1 G0 */\
	"por %%mm1, %%mm6		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, (dst))\
\
	"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
	"pand "MANGLE(M24B)", %%mm1	\n\t" /* B5 B4 B3 */\
	"pand %%mm7, %%mm3		\n\t" /* G4 G3 */\
	"pand %%mm0, %%mm6		\n\t" /* R4 R3 R2 */\
\
	"por %%mm1, %%mm3		\n\t" /* B5 G4 B4 G3 B3 */\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 8(dst))\
\
	"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
	"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1		\n\t" /* B7 B6 */\
	"pand %%mm0, %%mm3		\n\t" /* G7 G6 G5 */\
	"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7 R6 R5 */\
\
	"por %%mm1, %%mm3		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 16(dst))\
\
	"addl $24, "#dst"		\n\t"\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
718
/* Select the 24bit writer implementation for this template instantiation. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
726
/* Pack mm1/mm7 (Y) with mm3 (U) and mm4 (V) into YUYV order and store
   8 pixels (16 bytes), then loop. */
#define WRITEYUY2(dst, dstw, index) \
	"packuswb %%mm3, %%mm3		\n\t"\
	"packuswb %%mm4, %%mm4		\n\t"\
	"packuswb %%mm7, %%mm1		\n\t"\
	"punpcklbw %%mm4, %%mm3		\n\t"\
	"movq %%mm1, %%mm7		\n\t"\
	"punpcklbw %%mm3, %%mm1		\n\t"\
	"punpckhbw %%mm3, %%mm7		\n\t"\
\
	MOVNTQ(%%mm1, (dst, index, 2))\
	MOVNTQ(%%mm7, 8(dst, index, 2))\
\
	"addl $8, "#index"		\n\t"\
	"cmpl "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
742
743
/**
 * Multi-tap vertical scale to planar YV12 output.
 * On MMX the filter coefficients/pointers are read from the context
 * (relative to &c->redDither) by YSCALEYUV2YV12X; chroma (4096 = V plane
 * offset within chrSrc rows) is skipped when uDest is NULL.
 * Non-MMX builds fall back to AltiVec or plain C implementations.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (uDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);

		asm volatile(
				YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (vDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);
	}

	asm volatile(
			YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
			:: "r" (&c->redDither),
			"r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
		);
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
	      chrFilter, chrSrc, chrFilterSize,
	      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
	    chrFilter, chrSrc, chrFilterSize,
	    dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
2add307d 784
/**
 * 1:1 vertical pass (no filtering): convert one line of 16.7-ish
 * fixed-point samples (15-bit, stored as int16) back to 8-bit planar
 * YV12 by shifting right by 7 and clipping to 0..255.
 * V samples are read from chrSrc + 2048 (shared U/V buffer layout).
 * uDest==NULL skips chroma. MMX path uses the YSCALEYUV2YV121 asm,
 * iterating from -width up to 0 over src+width / dst+width.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
				"g" (-chrDstW)
				: "%eax"
			);

		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
				"g" (-chrDstW)
				: "%eax"
			);
	}

	asm volatile(
			YSCALEYUV2YV121
			:: "r" (lumSrc + dstW), "r" (dest + dstW),
			"g" (-dstW)
			: "%eax"
		);
#else
	int i;
	for(i=0; i<dstW; i++)
	{
		int val= lumSrc[i]>>7;

		/* val is in [-256,511] here, so bit 8 is set exactly when the
		   value is out of the 0..255 range (negative or >255) */
		if(val&256){
			if(val<0) val=0;
			else val=255;
		}

		dest[i]= val;
	}

	if(uDest != NULL)
		for(i=0; i<chrDstW; i++)
		{
			int u=chrSrc[i]>>7;
			int v=chrSrc[i + 2048]>>7;

			/* same bit-8 out-of-range test, applied to both at once */
			if((u|v)&256){
				if(u<0) u=0;
				else if (u>255) u=255;
				if(v<0) v=0;
				else if (v>255) v=255;
			}

			uDest[i]= u;
			vDest[i]= v;
		}
#endif
}
844
c1b0bfb4 845
d604bab9
MN
/**
 * vertical scale YV12 to RGB
 *
 * Multi-tap vertical filter straight into a packed destination format.
 * On MMX, dispatches on c->dstFormat to a YSCALEYUV2RGBX/PACKEDX asm
 * body followed by the matching WRITE* store macro; any other format
 * (or non-MMX build) goes through the AltiVec or C fallback.
 * The "m"(dummy) operands only pad the asm operand numbering so that
 * dest is %4 and dstW is %5, as the WRITE* macros expect.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
			    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
			    uint8_t *dest, int dstW, int dstY)
{
	int dummy=0;
	switch(c->dstFormat)
	{
#ifdef HAVE_MMX
	case IMGFMT_BGR32:
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR24:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* BGR24 stores 3 bytes/pixel: ebx = dest + 3*index */
				"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
				"addl %4, %%ebx			\n\t"
				WRITEBGR24(%%ebx, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
			);
		}
		break;
	case IMGFMT_BGR15:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR16:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				/* 5-6-5 layout: green uses the 6-bit dither table */
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_YUY2:
		{
			asm volatile(
				YSCALEYUV2PACKEDX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

				"psraw $3, %%mm3		\n\t"
				"psraw $3, %%mm4		\n\t"
				"psraw $3, %%mm1		\n\t"
				"psraw $3, %%mm7		\n\t"
				WRITEYUY2(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
#endif
	default:
#ifdef HAVE_ALTIVEC
		altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
#else
		yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
#endif
		break;
	}
}
958
c1b0bfb4
MN
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Two-line vertical linear interpolation (weights yalpha/uvalpha out of
 * 4096; the *1 variables are the complementary weights) into a packed
 * destination format. The MMX cases borrow %esp as an extra pointer
 * register: the real stack pointer is saved into the context at
 * ESP_OFFSET and restored after the loop, which is why these asm blocks
 * must not be interrupted by anything that uses the stack.
 * The large #if 0 region is a dead SWS_FULL_CHR_H_INT path kept for
 * reference only.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

#if 0 //isn't used
	if(flags&SWS_FULL_CHR_H_INT)
	{
		switch(dstFormat)
		{
#ifdef HAVE_MMX
		case IMGFMT_BGR32:
			asm volatile(


FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%eax, 4))
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"


			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
		case IMGFMT_BGR24:
			asm volatile(

FULL_YSCALEYUV2RGB

								// lsb ... msb
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
			"psrlq $8, %%mm3		\n\t" // GR0BGR00
			"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
			"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
			"por %%mm2, %%mm3		\n\t" // BGRBGR00
			"movq %%mm1, %%mm2		\n\t"
			"psllq $48, %%mm1		\n\t" // 000000BG
			"por %%mm1, %%mm3		\n\t" // BGRBGRBG

			"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
			"psrld $16, %%mm2		\n\t" // R000R000
			"psrlq $24, %%mm1		\n\t" // 0BGR0000
			"por %%mm2, %%mm1		\n\t" // RBGRR000

			"movl %4, %%ebx			\n\t"
			"addl %%eax, %%ebx		\n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
			"movd %%mm3, (%%ebx, %%eax, 2)	\n\t"
			"psrlq $32, %%mm3		\n\t"
			"movd %%mm3, 4(%%ebx, %%eax, 2)	\n\t"
			"movd %%mm1, 8(%%ebx, %%eax, 2)	\n\t"
#endif
			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
			break;
		case IMGFMT_BGR15:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $2, %%mm1		\n\t"
			"psllw $7, %%mm0		\n\t"
			"pand "MANGLE(g15Mask)", %%mm1	\n\t"
			"pand "MANGLE(r15Mask)", %%mm0	\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
		case IMGFMT_BGR16:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $3, %%mm1		\n\t"
			"psllw $8, %%mm0		\n\t"
			"pand "MANGLE(g16Mask)", %%mm1	\n\t"
			"pand "MANGLE(r16Mask)", %%mm0	\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
#endif
	case IMGFMT_RGB32:
#ifndef HAVE_MMX
	case IMGFMT_BGR32:
#endif
		if(dstFormat==IMGFMT_BGR32)
		{
			int i;
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 3;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
	}//FULL_UV_IPOL
	else
	{
#endif // if 0
#ifdef HAVE_MMX
	switch(c->dstFormat)
	{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
	case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
	case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
	case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
	case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
	case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2PACKED(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
	default: break;
	}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
1270
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Single-source-line variant: yalpha is fixed (buf1 aliases buf0).
 * With SWS_FULL_CHR_H_INT set it simply defers to yuv2packed2.
 * uvalpha < 2048 selects the fast path that reuses one chroma line
 * (YSCALEYUV2RGB1/PACKED1) — per the comment below this shifts chroma
 * by half a pixel; otherwise the two chroma lines are averaged
 * (the *1b macro variants). Like yuv2packed2, the MMX cases park the
 * real %esp in the context at ESP_OFFSET while the loop runs.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
	const int yalpha1=0;
	int i;

	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...

	if(flags&SWS_FULL_CHR_H_INT)
	{
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
		return;
	}

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2PACKED1(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
	else
	{
		/* uvalpha >= 2048: average the two chroma source lines (the
		   "b" macro variants) instead of reusing just one */
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5)		\n\t"
				"movl %4, %%esp			\n\t"
				YSCALEYUV2PACKED1b(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp		\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
#endif
	if( uvalpha < 2048 )
	{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
	}else{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
	}
}
1466
6ff0ad6b
MN
//FIXME the yuy2* input functions can read up to 7 samples too many
1468
1e621b18
MN
/* Extract the luma (even) bytes of a YUY2 line into dst.
   MMX path masks with bm01010101 and packs 16 input bytes -> 8 luma
   bytes per iteration, looping from -width up to 0. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t"
		"movl %0, %%eax			\n\t"
		"1:				\n\t"
		"movq (%1, %%eax,2), %%mm0	\n\t"
		"movq 8(%1, %%eax,2), %%mm1	\n\t"
		"pand %%mm2, %%mm0		\n\t"
		"pand %%mm2, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
1493
/* Extract and vertically average the chroma of two YUY2 lines:
   dstU[i] = avg of the U bytes (offset 1 of each 4-byte pair),
   dstV[i] = avg of the V bytes (offset 3). The MMX2/3DNow path uses
   PAVGB (rounding average), the C path uses (a+b)>>1 (truncating) --
   they can differ by 1 LSB. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"movl %0, %%eax			\n\t"
		"1:				\n\t"
		"movq (%1, %%eax,4), %%mm0	\n\t"
		"movq 8(%1, %%eax,4), %%mm1	\n\t"
		"movq (%2, %%eax,4), %%mm2	\n\t"
		"movq 8(%2, %%eax,4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0		\n\t"
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%eax)	\n\t"
		"movd %%mm1, (%3, %%eax)	\n\t"
		"addl $4, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
1531
7322a67c
MN
/* Almost identical to yuy2ToY; it exists only because calling
   yuy2ToY/UV with (dst, src+1, ...) would make every access unaligned.
   UYVY stores luma in the odd bytes, so extract with a shift instead
   of a mask. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %0, %%eax			\n\t"
		"1:				\n\t"
		"movq (%1, %%eax,2), %%mm0	\n\t"
		"movq 8(%1, %%eax,2), %%mm1	\n\t"
		"psrlw $8, %%mm0		\n\t"
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
1556
/* UYVY counterpart of yuy2ToUV: chroma sits in the even bytes
   (U at offset 0, V at offset 2 of each 4-byte pair). Averages the two
   source lines; MMX2/3DNow uses rounding PAVGB, C uses truncating >>1. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"movl %0, %%eax			\n\t"
		"1:				\n\t"
		"movq (%1, %%eax,4), %%mm0	\n\t"
		"movq 8(%1, %%eax,4), %%mm1	\n\t"
		"movq (%2, %%eax,4), %%mm2	\n\t"
		"movq 8(%2, %%eax,4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0		\n\t"
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%eax)	\n\t"
		"movd %%mm1, (%3, %%eax)	\n\t"
		"addl $4, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}
1594
1e621b18
MN
1595static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1596{
1597#ifdef HAVE_MMXFIXME
1598#else
1599 int i;
1600 for(i=0; i<width; i++)
1601 {
4e61e21c
MN
1602 int b= ((uint32_t*)src)[i]&0xFF;
1603 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 1604 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1e621b18 1605
4e61e21c 1606 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1e621b18
MN
1607 }
1608#endif
1609}
1610
1611static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1612{
1613#ifdef HAVE_MMXFIXME
1614#else
1615 int i;
1616 for(i=0; i<width; i++)
1617 {
4e61e21c
MN
1618 const int a= ((uint32_t*)src1)[2*i+0];
1619 const int e= ((uint32_t*)src1)[2*i+1];
1620 const int c= ((uint32_t*)src2)[2*i+0];
1621 const int d= ((uint32_t*)src2)[2*i+1];
1622 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1623 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1624 const int b= l&0x3FF;
1625 const int g= h>>8;
1626 const int r= l>>16;
1e621b18
MN
1627
1628 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1629 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1630 }
1631#endif
1632}
1633
/* Convert one line of 24-bit BGR pixels to 8-bit luma.
   MMX path: processes 8 pixels (24 bytes) per iteration, expanding
   groups of 4 pixels to words, applying the bgr2YCoeff dot product
   via pmaddwd, reducing with w1111 and adding bgr2YOffset; the extra
   psrad $8 steps are skipped when FAST_BGR2YV12 trades precision for
   speed. C path mirrors bgr32ToY but reads 3 bytes per pixel. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %2, %%eax			\n\t"
		"movq "MANGLE(bgr2YCoeff)", %%mm6		\n\t"
		"movq "MANGLE(w1111)", %%mm5		\n\t"
		"pxor %%mm7, %%mm7		\n\t"
		"leal (%%eax, %%eax, 2), %%ebx	\n\t"
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 64(%0, %%ebx)		\n\t"
		"movd (%0, %%ebx), %%mm0	\n\t"
		"movd 3(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"movd 6(%0, %%ebx), %%mm2	\n\t"
		"movd 9(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm0		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm0		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm0		\n\t"
		"pmaddwd %%mm5, %%mm2		\n\t"
		"packssdw %%mm2, %%mm0		\n\t"
		"psraw $7, %%mm0		\n\t"

		"movd 12(%0, %%ebx), %%mm4	\n\t"
		"movd 15(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"movd 18(%0, %%ebx), %%mm2	\n\t"
		"movd 21(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm4		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm4		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm4		\n\t"
		"pmaddwd %%mm5, %%mm2		\n\t"
		"addl $24, %%ebx		\n\t"
		"packssdw %%mm2, %%mm4		\n\t"
		"psraw $7, %%mm4		\n\t"

		"packuswb %%mm4, %%mm0		\n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"

		"movq %%mm0, (%1, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}
1718
1719static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1720{
4342fc14
MN
1721#ifdef HAVE_MMX
1722 asm volatile(
1723 "movl %4, %%eax \n\t"
854288bb
FB
1724 "movq "MANGLE(w1111)", %%mm5 \n\t"
1725 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
4342fc14
MN
1726 "pxor %%mm7, %%mm7 \n\t"
1727 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1728 "addl %%ebx, %%ebx \n\t"
1729 ".balign 16 \n\t"
1730 "1: \n\t"
1731 PREFETCH" 64(%0, %%ebx) \n\t"
1732 PREFETCH" 64(%1, %%ebx) \n\t"
1733#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1734 "movq (%0, %%ebx), %%mm0 \n\t"
1735 "movq (%1, %%ebx), %%mm1 \n\t"
1736 "movq 6(%0, %%ebx), %%mm2 \n\t"
1737 "movq 6(%1, %%ebx), %%mm3 \n\t"
1738 PAVGB(%%mm1, %%mm0)
1739 PAVGB(%%mm3, %%mm2)
1740 "movq %%mm0, %%mm1 \n\t"
1741 "movq %%mm2, %%mm3 \n\t"
1742 "psrlq $24, %%mm0 \n\t"
1743 "psrlq $24, %%mm2 \n\t"
1744 PAVGB(%%mm1, %%mm0)
1745 PAVGB(%%mm3, %%mm2)
1746 "punpcklbw %%mm7, %%mm0 \n\t"
1747 "punpcklbw %%mm7, %%mm2 \n\t"
1748#else
1749 "movd (%0, %%ebx), %%mm0 \n\t"
1750 "movd (%1, %%ebx), %%mm1 \n\t"
1751 "movd 3(%0, %%ebx), %%mm2 \n\t"
1752 "movd 3(%1, %%ebx), %%mm3 \n\t"
1753 "punpcklbw %%mm7, %%mm0 \n\t"
1754 "punpcklbw %%mm7, %%mm1 \n\t"
1755 "punpcklbw %%mm7, %%mm2 \n\t"
1756 "punpcklbw %%mm7, %%mm3 \n\t"
1757 "paddw %%mm1, %%mm0 \n\t"
1758 "paddw %%mm3, %%mm2 \n\t"
1759 "paddw %%mm2, %%mm0 \n\t"
1760 "movd 6(%0, %%ebx), %%mm4 \n\t"
1761 "movd 6(%1, %%ebx), %%mm1 \n\t"
1762 "movd 9(%0, %%ebx), %%mm2 \n\t"
1763 "movd 9(%1, %%ebx), %%mm3 \n\t"
1764 "punpcklbw %%mm7, %%mm4 \n\t"
1765 "punpcklbw %%mm7, %%mm1 \n\t"
1766 "punpcklbw %%mm7, %%mm2 \n\t"
1767 "punpcklbw %%mm7, %%mm3 \n\t"
1768 "paddw %%mm1, %%mm4 \n\t"
1769 "paddw %%mm3, %%mm2 \n\t"
1770 "paddw %%mm4, %%mm2 \n\t"
1771 "psrlw $2, %%mm0 \n\t"
1772 "psrlw $2, %%mm2 \n\t"
1773#endif
854288bb
FB
1774 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1775 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1776
1777 "pmaddwd %%mm0, %%mm1 \n\t"
1778 "pmaddwd %%mm2, %%mm3 \n\t"
1779 "pmaddwd %%mm6, %%mm0 \n\t"
1780 "pmaddwd %%mm6, %%mm2 \n\t"
1781#ifndef FAST_BGR2YV12
1782 "psrad $8, %%mm0 \n\t"
1783 "psrad $8, %%mm1 \n\t"
1784 "psrad $8, %%mm2 \n\t"
1785 "psrad $8, %%mm3 \n\t"
1786#endif
1787 "packssdw %%mm2, %%mm0 \n\t"
1788 "packssdw %%mm3, %%mm1 \n\t"
1789 "pmaddwd %%mm5, %%mm0 \n\t"
1790 "pmaddwd %%mm5, %%mm1 \n\t"
1791 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1792 "psraw $7, %%mm0 \n\t"
1793
1794#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1795 "movq 12(%0, %%ebx), %%mm4 \n\t"
1796 "movq 12(%1, %%ebx), %%mm1 \n\t"
1797 "movq 18(%0, %%ebx), %%mm2 \n\t"
1798 "movq 18(%1, %%ebx), %%mm3 \n\t"
1799 PAVGB(%%mm1, %%mm4)
1800 PAVGB(%%mm3, %%mm2)
1801 "movq %%mm4, %%mm1 \n\t"
1802 "movq %%mm2, %%mm3 \n\t"
1803 "psrlq $24, %%mm4 \n\t"
1804 "psrlq $24, %%mm2 \n\t"
1805 PAVGB(%%mm1, %%mm4)
1806 PAVGB(%%mm3, %%mm2)
1807 "punpcklbw %%mm7, %%mm4 \n\t"
1808 "punpcklbw %%mm7, %%mm2 \n\t"
1809#else
1810 "movd 12(%0, %%ebx), %%mm4 \n\t"
1811 "movd 12(%1, %%ebx), %%mm1 \n\t"
1812 "movd 15(%0, %%ebx), %%mm2 \n\t"
1813 "movd 15(%1, %%ebx), %%mm3 \n\t"
1814 "punpcklbw %%mm7, %%mm4 \n\t"
1815 "punpcklbw %%mm7, %%mm1 \n\t"
1816 "punpcklbw %%mm7, %%mm2 \n\t"
1817 "punpcklbw %%mm7, %%mm3 \n\t"
1818 "paddw %%mm1, %%mm4 \n\t"
1819 "paddw %%mm3, %%mm2 \n\t"
1820 "paddw %%mm2, %%mm4 \n\t"
1821 "movd 18(%0, %%ebx), %%mm5 \n\t"
1822 "movd 18(%1, %%ebx), %%mm1 \n\t"
1823 "movd 21(%0, %%ebx), %%mm2 \n\t"
1824 "movd 21(%1, %%ebx), %%mm3 \n\t"
1825 "punpcklbw %%mm7, %%mm5 \n\t"
1826 "punpcklbw %%mm7, %%mm1 \n\t"
1827 "punpcklbw %%mm7, %%mm2 \n\t"
1828 "punpcklbw %%mm7, %%mm3 \n\t"
1829 "paddw %%mm1, %%mm5 \n\t"
1830 "paddw %%mm3, %%mm2 \n\t"
1831 "paddw %%mm5, %%mm2 \n\t"
854288bb 1832 "movq "MANGLE(w1111)", %%mm5 \n\t"
4342fc14
MN
1833 "psrlw $2, %%mm4 \n\t"
1834 "psrlw $2, %%mm2 \n\t"
1835#endif
854288bb
FB
1836 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1837 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1838
1839 "pmaddwd %%mm4, %%mm1 \n\t"
1840 "pmaddwd %%mm2, %%mm3 \n\t"
1841 "pmaddwd %%mm6, %%mm4 \n\t"
1842 "pmaddwd %%mm6, %%mm2 \n\t"
1843#ifndef FAST_BGR2YV12
1844 "psrad $8, %%mm4 \n\t"
1845 "psrad $8, %%mm1 \n\t"
1846 "psrad $8, %%mm2 \n\t"
1847 "psrad $8, %%mm3 \n\t"
1848#endif
1849 "packssdw %%mm2, %%mm4 \n\t"
1850 "packssdw %%mm3, %%mm1 \n\t"
1851 "pmaddwd %%mm5, %%mm4 \n\t"
1852 "pmaddwd %%mm5, %%mm1 \n\t"
1853 "addl $24, %%ebx \n\t"
1854 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1855 "psraw $7, %%mm4 \n\t"
1856
1857 "movq %%mm0, %%mm1 \n\t"
1858 "punpckldq %%mm4, %%mm0 \n\t"
1859 "punpckhdq %%mm4, %%mm1 \n\t"
1860 "packsswb %%mm1, %%mm0 \n\t"
854288bb 1861 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
4342fc14
MN
1862
1863 "movd %%mm0, (%2, %%eax) \n\t"
1864 "punpckhdq %%mm0, %%mm0 \n\t"
1865 "movd %%mm0, (%3, %%eax) \n\t"
1866 "addl $4, %%eax \n\t"
1867 " js 1b \n\t"
1868 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1869 : "%eax", "%ebx"
1870 );
1e621b18
MN
1871#else
1872 int i;
1873 for(i=0; i<width; i++)
1874 {
1875 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1876 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1877 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1878
1879 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1880 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1881 }
1882#endif
1883}
1884
6af250ea
MN
1885static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1886{
1887 int i;
1888 for(i=0; i<width; i++)
1889 {
4e61e21c 1890 int d= ((uint16_t*)src)[i];
6af250ea
MN
1891 int b= d&0x1F;
1892 int g= (d>>5)&0x3F;
1893 int r= (d>>11)&0x1F;
1894
1895 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1896 }
1897}
1898
1899static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1900{
1901 int i;
1902 for(i=0; i<width; i++)
1903 {
4e61e21c
MN
1904 int d0= ((uint32_t*)src1)[i];
1905 int d1= ((uint32_t*)src2)[i];
5bb9d9d8
MN
1906
1907 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1908 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1909
1910 int dh2= (dh>>11) + (dh<<21);
1911 int d= dh2 + dl;
1912
1913 int b= d&0x7F;
1914 int r= (d>>11)&0x7F;
1915 int g= d>>21;
6af250ea
MN
1916 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1917 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1918 }
1919}
1920
b72034dd
MN
1921static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1922{
1923 int i;
1924 for(i=0; i<width; i++)
1925 {
4e61e21c 1926 int d= ((uint16_t*)src)[i];
b72034dd
MN
1927 int b= d&0x1F;
1928 int g= (d>>5)&0x1F;
1929 int r= (d>>10)&0x1F;
1930
1931 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1932 }
1933}
1934
1935static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1936{
1937 int i;
1938 for(i=0; i<width; i++)
1939 {
4e61e21c
MN
1940 int d0= ((uint32_t*)src1)[i];
1941 int d1= ((uint32_t*)src2)[i];
b72034dd
MN
1942
1943 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1944 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1945
1946 int dh2= (dh>>11) + (dh<<21);
1947 int d= dh2 + dl;
1948
1949 int b= d&0x7F;
1950 int r= (d>>10)&0x7F;
1951 int g= d>>21;
b72034dd
MN
1952 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1953 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1954 }
1955}
1956
1957
a861d4d7
MN
1958static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1959{
1960 int i;
1961 for(i=0; i<width; i++)
1962 {
4e61e21c
MN
1963 int r= ((uint32_t*)src)[i]&0xFF;
1964 int g= (((uint32_t*)src)[i]>>8)&0xFF;
3e499f53 1965 int b= (((uint32_t*)src)[i]>>16)&0xFF;
a861d4d7 1966
4e61e21c 1967 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
1968 }
1969}
1970
1971static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1972{
1973 int i;
1974 for(i=0; i<width; i++)
1975 {
4e61e21c
MN
1976 const int a= ((uint32_t*)src1)[2*i+0];
1977 const int e= ((uint32_t*)src1)[2*i+1];
1978 const int c= ((uint32_t*)src2)[2*i+0];
1979 const int d= ((uint32_t*)src2)[2*i+1];
1980 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1981 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1982 const int r= l&0x3FF;
1983 const int g= h>>8;
1984 const int b= l>>16;
a861d4d7
MN
1985
1986 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1987 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1988 }
1989}
1990
1991static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1992{
1993 int i;
1994 for(i=0; i<width; i++)
1995 {
1996 int r= src[i*3+0];
1997 int g= src[i*3+1];
1998 int b= src[i*3+2];
1999
4e61e21c 2000 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
a861d4d7
MN
2001 }
2002}
2003
2004static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2005{
2006 int i;
2007 for(i=0; i<width; i++)
2008 {
2009 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2010 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2011 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2012
2013 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2014 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2015 }
2016}
2017
1e621b18 2018
077ea8a7
MN
2019// Bilinear / Bicubic scaling
// Horizontal scaler: for each of the dstW output samples, multiply-accumulate
// filterSize source pixels (starting at filterPos[i]) with 16 bit filter
// coefficients and store the clipped 15 bit result into dst.
// MMX paths exist for filterSize 4 and 8; the generic MMX loop handles any
// multiple of 4. NOTE(review): counter/pointer bias tricks below make the
// loop indices count up towards zero so "jnc" can terminate the loops.
2020static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2021 int16_t *filter, int16_t *filterPos, int filterSize)
2ff198c1 2022{
077ea8a7 2023#ifdef HAVE_MMX
c9b99ea6 2024 assert(filterSize % 4 == 0 && filterSize>0);
077ea8a7
MN
// filterSize==4: the common case for upscaling (4-tap filters)
2025 if(filterSize==4) // allways true for upscaling, sometimes for down too
2026 {
2027 int counter= -2*dstW;
2028 filter-= counter*2;
2029 filterPos-= counter/2;
2030 dst-= counter/2;
2031 asm volatile(
2032 "pxor %%mm7, %%mm7 \n\t"
9b464428 2033 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
2034 "pushl %%ebp \n\t" // we use 7 regs here ...
2035 "movl %%eax, %%ebp \n\t"
2036 ".balign 16 \n\t"
2037 "1: \n\t"
2038 "movzwl (%2, %%ebp), %%eax \n\t"
2039 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2040 "movq (%1, %%ebp, 4), %%mm1 \n\t"
2041 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
2042 "movd (%3, %%eax), %%mm0 \n\t"
2043 "movd (%3, %%ebx), %%mm2 \n\t"
2044 "punpcklbw %%mm7, %%mm0 \n\t"
2045 "punpcklbw %%mm7, %%mm2 \n\t"
2046 "pmaddwd %%mm1, %%mm0 \n\t"
2047 "pmaddwd %%mm2, %%mm3 \n\t"
2048 "psrad $8, %%mm0 \n\t"
2049 "psrad $8, %%mm3 \n\t"
2050 "packssdw %%mm3, %%mm0 \n\t"
2051 "pmaddwd %%mm6, %%mm0 \n\t"
2052 "packssdw %%mm0, %%mm0 \n\t"
2053 "movd %%mm0, (%4, %%ebp) \n\t"
2054 "addl $4, %%ebp \n\t"
2055 " jnc 1b \n\t"
e3d2500f 2056
077ea8a7
MN
2057 "popl %%ebp \n\t"
2058 : "+a" (counter)
2059 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2060 : "%ebx"
2061 );
2062 }
// filterSize==8: same scheme, two multiply-accumulate rounds per output
2063 else if(filterSize==8)
2064 {
2065 int counter= -2*dstW;
2066 filter-= counter*4;
2067 filterPos-= counter/2;
2068 dst-= counter/2;
2069 asm volatile(
2070 "pxor %%mm7, %%mm7 \n\t"
9b464428 2071 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
2072 "pushl %%ebp \n\t" // we use 7 regs here ...
2073 "movl %%eax, %%ebp \n\t"
2074 ".balign 16 \n\t"
2075 "1: \n\t"
2076 "movzwl (%2, %%ebp), %%eax \n\t"
2077 "movzwl 2(%2, %%ebp), %%ebx \n\t"
2078 "movq (%1, %%ebp, 8), %%mm1 \n\t"
2079 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
2080 "movd (%3, %%eax), %%mm0 \n\t"
2081 "movd (%3, %%ebx), %%mm2 \n\t"
2082 "punpcklbw %%mm7, %%mm0 \n\t"
2083 "punpcklbw %%mm7, %%mm2 \n\t"
2084 "pmaddwd %%mm1, %%mm0 \n\t"
2085 "pmaddwd %%mm2, %%mm3 \n\t"
2086
2087 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
2088 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
2089 "movd 4(%3, %%eax), %%mm4 \n\t"
2090 "movd 4(%3, %%ebx), %%mm2 \n\t"
2091 "punpcklbw %%mm7, %%mm4 \n\t"
2092 "punpcklbw %%mm7, %%mm2 \n\t"
2093 "pmaddwd %%mm1, %%mm4 \n\t"
2094 "pmaddwd %%mm2, %%mm5 \n\t"
2095 "paddd %%mm4, %%mm0 \n\t"
2096 "paddd %%mm5, %%mm3 \n\t"
2097
2098 "psrad $8, %%mm0 \n\t"
2099 "psrad $8, %%mm3 \n\t"
2100 "packssdw %%mm3, %%mm0 \n\t"
2101 "pmaddwd %%mm6, %%mm0 \n\t"
2102 "packssdw %%mm0, %%mm0 \n\t"
2103 "movd %%mm0, (%4, %%ebp) \n\t"
2104 "addl $4, %%ebp \n\t"
2105 " jnc 1b \n\t"
c1b0bfb4 2106
077ea8a7
MN
2107 "popl %%ebp \n\t"
2108 : "+a" (counter)
2109 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2110 : "%ebx"
2111 );
2112 }
// generic MMX path: inner loop (label 2) walks the filter in 4-tap steps
2113 else
2114 {
2115 int counter= -2*dstW;
2116// filter-= counter*filterSize/2;
2117 filterPos-= counter/2;
2118 dst-= counter/2;
2119 asm volatile(
2120 "pxor %%mm7, %%mm7 \n\t"
9b464428 2121 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
2122 ".balign 16 \n\t"
2123 "1: \n\t"
2124 "movl %2, %%ecx \n\t"
2125 "movzwl (%%ecx, %0), %%eax \n\t"
2126 "movzwl 2(%%ecx, %0), %%ebx \n\t"
2127 "movl %5, %%ecx \n\t"
2128 "pxor %%mm4, %%mm4 \n\t"
2129 "pxor %%mm5, %%mm5 \n\t"
2130 "2: \n\t"
2131 "movq (%1), %%mm1 \n\t"
2132 "movq (%1, %6), %%mm3 \n\t"
2133 "movd (%%ecx, %%eax), %%mm0 \n\t"
2134 "movd (%%ecx, %%ebx), %%mm2 \n\t"
2135 "punpcklbw %%mm7, %%mm0 \n\t"
2136 "punpcklbw %%mm7, %%mm2 \n\t"
2137 "pmaddwd %%mm1, %%mm0 \n\t"
2138 "pmaddwd %%mm2, %%mm3 \n\t"
2139 "paddd %%mm3, %%mm5 \n\t"
2140 "paddd %%mm0, %%mm4 \n\t"
2141 "addl $8, %1 \n\t"
2142 "addl $4, %%ecx \n\t"
2143 "cmpl %4, %%ecx \n\t"
2144 " jb 2b \n\t"
2145 "addl %6, %1 \n\t"
2146 "psrad $8, %%mm4 \n\t"
2147 "psrad $8, %%mm5 \n\t"
2148 "packssdw %%mm5, %%mm4 \n\t"
2149 "pmaddwd %%mm6, %%mm4 \n\t"
2150 "packssdw %%mm4, %%mm4 \n\t"
2151 "movl %3, %%eax \n\t"
2152 "movd %%mm4, (%%eax, %0) \n\t"
2153 "addl $4, %0 \n\t"
2154 " jnc 1b \n\t"
c1b0bfb4 2155
627690b5
MN
2156 : "+r" (counter), "+r" (filter)
2157 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
077ea8a7 2158 "m" (src), "r" (filterSize*2)
e2f5a2a9 2159 : "%ebx", "%eax", "%ecx"
077ea8a7
MN
2160 );
2161 }
2162#else
8c266f0c
RD
2163#ifdef HAVE_ALTIVEC
2164 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2165#else
077ea8a7
MN
// portable C reference: plain multiply-accumulate per output sample,
// clipped to the unsigned 15 bit range expected by the vertical scaler
2166 int i;
2167 for(i=0; i<dstW; i++)
2168 {
2169 int j;
2170 int srcPos= filterPos[i];
2171 int val=0;
c1b0bfb4 2172// printf("filterPos: %d\n", filterPos[i]);
077ea8a7
MN
2173 for(j=0; j<filterSize; j++)
2174 {
2175// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2176 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2177 }
2178// filter += hFilterSize;
2179 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2180// dst[i] = val>>7;
2181 }
2182#endif
8c266f0c 2183#endif
2184}
2ff198c1 2185 // *** horizontal scale Y line to temp buffer
28bf81c9
MN
// Horizontally scale one luma line into the 16 bit temp buffer dst.
// Packed/RGB input formats are first converted to 8 bit luma into
// formatConvBuffer; then either the generic hScale() filter is applied or,
// for SWS_FAST_BILINEAR, a fast bilinear path (MMX2 "funny code" generated
// at init time, plain x86 asm, or portable C) is used.
2186static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2187 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1e621b18 2188 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
b7dc6f66
MN
2189 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2190 int32_t *mmx2FilterPos)
077ea8a7 2191{
1e621b18
MN
// convert non-planar sources to a plain 8 bit luma line first
2192 if(srcFormat==IMGFMT_YUY2)
2193 {
2194 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2195 src= formatConvBuffer;
2196 }
7322a67c
MN
2197 else if(srcFormat==IMGFMT_UYVY)
2198 {
2199 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2200 src= formatConvBuffer;
2201 }
1e621b18
MN
2202 else if(srcFormat==IMGFMT_BGR32)
2203 {
2204 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2205 src= formatConvBuffer;
2206 }
2207 else if(srcFormat==IMGFMT_BGR24)
2208 {
2209 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2210 src= formatConvBuffer;
2211 }
6af250ea
MN
2212 else if(srcFormat==IMGFMT_BGR16)
2213 {
2214 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2215 src= formatConvBuffer;
2216 }
b72034dd
MN
2217 else if(srcFormat==IMGFMT_BGR15)
2218 {
2219 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2220 src= formatConvBuffer;
2221 }
a861d4d7
MN
2222 else if(srcFormat==IMGFMT_RGB32)
2223 {
2224 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2225 src= formatConvBuffer;
2226 }
2227 else if(srcFormat==IMGFMT_RGB24)
2228 {
2229 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2230 src= formatConvBuffer;
2231 }
1e621b18 2232
e3d2500f 2233#ifdef HAVE_MMX
77a416e8 2234 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
28bf81c9 2235 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2236#else
28bf81c9 2237 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2238#endif
077ea8a7
MN
2239 {
2240 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2241 }
2242 else // Fast Bilinear upscale / crap downscale
2243 {
2ff198c1 2244#ifdef ARCH_X86
2ff198c1 2245#ifdef HAVE_MMX2
96034638 2246 int i;
2ff198c1
MN
// MMX2 path: calls the runtime-generated scaler code (funnyYCode)
// eight times, advancing src/dst between calls via mmx2FilterPos
2247 if(canMMX2BeUsed)
2248 {
2249 asm volatile(
2250 "pxor %%mm7, %%mm7 \n\t"
b7dc6f66
MN
2251 "movl %0, %%ecx \n\t"
2252 "movl %1, %%edi \n\t"
2253 "movl %2, %%edx \n\t"
2254 "movl %3, %%ebx \n\t"
2ff198c1 2255 "xorl %%eax, %%eax \n\t" // i
b7dc6f66
MN
2256 PREFETCH" (%%ecx) \n\t"
2257 PREFETCH" 32(%%ecx) \n\t"
2258 PREFETCH" 64(%%ecx) \n\t"
99cefd0b 2259
2ff198c1 2260#define FUNNY_Y_CODE \
b7dc6f66
MN
2261 "movl (%%ebx), %%esi \n\t"\
2262 "call *%4 \n\t"\
2263 "addl (%%ebx, %%eax), %%ecx \n\t"\
2264 "addl %%eax, %%edi \n\t"\
2265 "xorl %%eax, %%eax \n\t"\
99cefd0b 2266
2ff198c1
MN
2267FUNNY_Y_CODE
2268FUNNY_Y_CODE
2269FUNNY_Y_CODE
2270FUNNY_Y_CODE
2271FUNNY_Y_CODE
2272FUNNY_Y_CODE
2273FUNNY_Y_CODE
2274FUNNY_Y_CODE
2275
b7dc6f66
MN
2276 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2277 "m" (funnyYCode)
2ff198c1
MN
2278 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2279 );
// pad the tail: outputs whose source index would read past srcW-1
af91b8b3 2280 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2ff198c1
MN
2281 }
2282 else
2283 {
2284#endif
2285 //NO MMX just normal asm ...
2286 asm volatile(
2287 "xorl %%eax, %%eax \n\t" // i
2288 "xorl %%ebx, %%ebx \n\t" // xx
2289 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2290 ".balign 16 \n\t"
2ff198c1
MN
2291 "1: \n\t"
2292 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2293 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2294 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2295 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2296 "shll $16, %%edi \n\t"
2297 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2298 "movl %1, %%edi \n\t"
2299 "shrl $9, %%esi \n\t"
2300 "movw %%si, (%%edi, %%eax, 2) \n\t"
2301 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2302 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2303
2304 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2305 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2306 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2307 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2308 "shll $16, %%edi \n\t"
2309 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2310 "movl %1, %%edi \n\t"
2311 "shrl $9, %%esi \n\t"
2312 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2313 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2314 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2315
2316
2317 "addl $2, %%eax \n\t"
2318 "cmpl %2, %%eax \n\t"
2319 " jb 1b \n\t"
2320
2321
2322 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2323 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2324 );
2325#ifdef HAVE_MMX2
77a416e8 2326 } //if MMX2 can't be used
2ff198c1
MN
2327#endif
2328#else
96034638
MN
// portable C bilinear: 16.16 fixed point position, 7 bit blend factor
2329 int i;
2330 unsigned int xpos=0;
2331 for(i=0;i<dstWidth;i++)
2332 {
2333 register unsigned int xx=xpos>>16;
2334 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2335 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2336 xpos+=xInc;
2337 }
2ff198c1 2338#endif
077ea8a7 2339 }
2ff198c1
MN
2340}
2341
28bf81c9
MN
// Horizontally scale one pair of chroma lines (U into dst, V into dst+2048).
// Mirrors hyscale(): packed/RGB inputs are first converted into
// formatConvBuffer (U) and formatConvBuffer+2048 (V); then either the
// generic hScale() filter or the SWS_FAST_BILINEAR fast path is used.
// Gray sources have no chroma and return early.
2342inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2343 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
1e621b18 2344 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
b7dc6f66
MN
2345 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2346 int32_t *mmx2FilterPos)
2ff198c1 2347{
1e621b18
MN
// convert non-planar sources to plain 8 bit U/V lines first
2348 if(srcFormat==IMGFMT_YUY2)
2349 {
2350 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2351 src1= formatConvBuffer;
2352 src2= formatConvBuffer+2048;
2353 }
7322a67c
MN
2354 else if(srcFormat==IMGFMT_UYVY)
2355 {
2356 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2357 src1= formatConvBuffer;
2358 src2= formatConvBuffer+2048;
2359 }
1e621b18
MN
2360 else if(srcFormat==IMGFMT_BGR32)
2361 {
2362 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2363 src1= formatConvBuffer;
2364 src2= formatConvBuffer+2048;
2365 }
2366 else if(srcFormat==IMGFMT_BGR24)
2367 {
2368 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2369 src1= formatConvBuffer;
2370 src2= formatConvBuffer+2048;
2371 }
6af250ea
MN
2372 else if(srcFormat==IMGFMT_BGR16)
2373 {
2374 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2375 src1= formatConvBuffer;
2376 src2= formatConvBuffer+2048;
2377 }
b72034dd
MN
2378 else if(srcFormat==IMGFMT_BGR15)
2379 {
2380 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2381 src1= formatConvBuffer;
2382 src2= formatConvBuffer+2048;
2383 }
a861d4d7
MN
2384 else if(srcFormat==IMGFMT_RGB32)
2385 {
2386 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2387 src1= formatConvBuffer;
2388 src2= formatConvBuffer+2048;
2389 }
2390 else if(srcFormat==IMGFMT_RGB24)
2391 {
2392 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2393 src1= formatConvBuffer;
2394 src2= formatConvBuffer+2048;
2395 }
6ff0ad6b
MN
2396 else if(isGray(srcFormat))
2397 {
2398 return;
2399 }
1e621b18 2400
e3d2500f 2401#ifdef HAVE_MMX
77a416e8 2402 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
28bf81c9 2403 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2404#else
28bf81c9 2405 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2406#endif
077ea8a7
MN
2407 {
2408 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2409 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2410 }
2411 else // Fast Bilinear upscale / crap downscale
2412 {
2ff198c1
MN
2413#ifdef ARCH_X86
2414#ifdef HAVE_MMX2
96034638 2415 int i;
2ff198c1
MN
// MMX2 path: runtime-generated scaler (funnyUVCode) is invoked four times
// for the U plane, then four times for the V plane at dst+4096 bytes
2416 if(canMMX2BeUsed)
2417 {
2418 asm volatile(
b7dc6f66
MN
2419 "pxor %%mm7, %%mm7 \n\t"
2420 "movl %0, %%ecx \n\t"
2421 "movl %1, %%edi \n\t"
2422 "movl %2, %%edx \n\t"
2423 "movl %3, %%ebx \n\t"
2424 "xorl %%eax, %%eax \n\t" // i
2425 PREFETCH" (%%ecx) \n\t"
2426 PREFETCH" 32(%%ecx) \n\t"
2427 PREFETCH" 64(%%ecx) \n\t"
2428
2429#define FUNNY_UV_CODE \
2430 "movl (%%ebx), %%esi \n\t"\
2431 "call *%4 \n\t"\
2432 "addl (%%ebx, %%eax), %%ecx \n\t"\
2433 "addl %%eax, %%edi \n\t"\
2434 "xorl %%eax, %%eax \n\t"\
2435
2436FUNNY_UV_CODE
2437FUNNY_UV_CODE
2438FUNNY_UV_CODE
2439FUNNY_UV_CODE
2440 "xorl %%eax, %%eax \n\t" // i
2441 "movl %5, %%ecx \n\t" // src
2442 "movl %1, %%edi \n\t" // buf1
2443 "addl $4096, %%edi \n\t"
2444 PREFETCH" (%%ecx) \n\t"
2445 PREFETCH" 32(%%ecx) \n\t"
2446 PREFETCH" 64(%%ecx) \n\t"
2447
2448FUNNY_UV_CODE
2449FUNNY_UV_CODE
2450FUNNY_UV_CODE
2451FUNNY_UV_CODE
2452
2453 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2454 "m" (funnyUVCode), "m" (src2)
2455 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2456 );
// pad the tail: outputs whose source index would read past srcW-1
c1b0bfb4 2457 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2ff198c1 2458 {
c1b0bfb4
MN
2459// printf("%d %d %d\n", dstWidth, i, srcW);
2460 dst[i] = src1[srcW-1]*128;
2461 dst[i+2048] = src2[srcW-1]*128;
2ff198c1
MN
2462 }
2463 }
2464 else
2465 {
2466#endif
2467 asm volatile(
2468 "xorl %%eax, %%eax \n\t" // i
2469 "xorl %%ebx, %%ebx \n\t" // xx
2470 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2471 ".balign 16 \n\t"
2ff198c1
MN
2472 "1: \n\t"
2473 "movl %0, %%esi \n\t"
2474 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2475 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2476 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2477 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2478 "shll $16, %%edi \n\t"
2479 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2480 "movl %1, %%edi \n\t"
2481 "shrl $9, %%esi \n\t"
2482 "movw %%si, (%%edi, %%eax, 2) \n\t"
2483
2484 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2485 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2486 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2487 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2488 "shll $16, %%edi \n\t"
2489 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2490 "movl %1, %%edi \n\t"
2491 "shrl $9, %%esi \n\t"
2492 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2493
2494 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2495 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2496 "addl $1, %%eax \n\t"
2497 "cmpl %2, %%eax \n\t"
2498 " jb 1b \n\t"
2499
2500 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2501 "r" (src2)
2502 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2503 );
2504#ifdef HAVE_MMX2
77a416e8 2505 } //if MMX2 can't be used
2ff198c1
MN
2506#endif
2507#else
96034638
MN
// portable C bilinear for both chroma planes, 16.16 fixed point position
2508 int i;
2509 unsigned int xpos=0;
2510 for(i=0;i<dstWidth;i++)
2511 {
2512 register unsigned int xx=xpos>>16;
2513 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2514 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2515 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1faf0867
MN
2516/* slower
2517 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2518 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2519*/
96034638
MN
2520 xpos+=xInc;
2521 }
2ff198c1 2522#endif
077ea8a7
MN
2523 }
2524}
2525
3e499f53
MN
2526static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2527 int srcSliceH, uint8_t* dst[], int dstStride[]){
28bf81c9
MN
2528
2529 /* load a few things into local vars to make the code more readable? and faster */
2530 const int srcW= c->srcW;
2531 const int dstW= c->dstW;
2532 const int dstH= c->dstH;
2533 const int chrDstW= c->chrDstW;
e616aa93 2534 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2535 const int lumXInc= c->lumXInc;
2536 const int chrXInc= c->chrXInc;
fe8054c0 2537 const int dstFormat= c->dstFormat;
44c1035c 2538 const int srcFormat= c->srcFormat;
28bf81c9
MN
2539 const int flags= c->flags;
2540 const int canMMX2BeUsed= c->canMMX2BeUsed;
2541 int16_t *vLumFilterPos= c->vLumFilterPos;
2542 int16_t *vChrFilterPos= c->vChrFilterPos;
2543 int16_t *hLumFilterPos= c->hLumFilterPos;
2544 int16_t *hChrFilterPos= c->hChrFilterPos;
2545 int16_t *vLumFilter= c->vLumFilter;
2546 int16_t *vChrFilter= c->vChrFilter;
2547 int16_t *hLumFilter= c->hLumFilter;
2548 int16_t *hChrFilter= c->hChrFilter;
77a49659
MN
2549 int32_t *lumMmxFilter= c->lumMmxFilter;
2550 int32_t *chrMmxFilter= c->chrMmxFilter;
28bf81c9
MN
2551 const int vLumFilterSize= c->vLumFilterSize;
2552 const int vChrFilterSize= c->vChrFilterSize;
2553 const int hLumFilterSize= c->hLumFilterSize;
2554 const int hChrFilterSize= c->hChrFilterSize;
2555 int16_t **lumPixBuf= c->lumPixBuf;
2556 int16_t **chrPixBuf= c->chrPixBuf;
2557 const int vLumBufSize= c->vLumBufSize;
2558 const int vChrBufSize= c->vChrBufSize;
2559 uint8_t *funnyYCode= c->funnyYCode;
2560 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2561 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2562 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2563 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
d4e24275 2564 int lastDstY;
28bf81c9
MN
2565
2566 /* vars whch will change and which we need to storw back in the context */
2567 int dstY= c->dstY;
2568 int lumBufIndex= c->lumBufIndex;
2569 int chrBufIndex= c->chrBufIndex;
2570 int lastInLumBuf= c->lastInLumBuf;
2571 int lastInChrBuf= c->lastInChrBuf;
5859233b 2572
5859233b 2573 if(isPacked(c->srcFormat)){
1e621b18
MN
2574 src[0]=
2575 src[1]=
3e499f53 2576 src[2]= src[0];
5859233b 2577 srcStride[0]=
1e621b18 2578 srcStride[1]=
3e499f53 2579 srcStride[2]= srcStride[0];
6c7506de 2580 }
5859233b
MN
2581 srcStride[1]<<= c->vChrDrop;
2582 srcStride[2]<<= c->vChrDrop;
6c7506de 2583
c7a810cc
MN
2584// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2585// (int)dst[0], (int)dst[1], (int)dst[2]);
2586
2587#if 0 //self test FIXME move to a vfilter or something
2588{
2589static volatile int i=0;
2590i++;
2591if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2592 selfTest(src, srcStride, c->srcW, c->srcH);
2593i--;
2594}
2595#endif
37079906
MN
2596
2597//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2598//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2599
2600 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2601 {
2602 static int firstTime=1; //FIXME move this into the context perhaps
2603 if(flags & SWS_PRINT_INFO && firstTime)
2604 {
3ec38777 2605 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
2606 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2607 firstTime=0;
2608 }
2609 }
28bf81c9 2610
1e621b18
MN
2611 /* Note the user might start scaling the picture in the middle so this will not get executed
2612 this is not really intended but works currently, so ppl might do it */
28bf81c9
MN
2613 if(srcSliceY ==0){
2614 lumBufIndex=0;
2615 chrBufIndex=0;
1e621b18 2616 dstY=0;
28bf81c9
MN
2617 lastInLumBuf= -1;
2618 lastInChrBuf= -1;
077ea8a7 2619 }
d3f41512 2620
d4e24275
MN
2621 lastDstY= dstY;
2622
c1b0bfb4 2623 for(;dstY < dstH; dstY++){
28bf81c9 2624 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
2625 const int chrDstY= dstY>>c->chrDstVSubSample;
2626 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2627 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 2628
c1b0bfb4
MN
2629 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2630 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2631 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2632 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2633
379a2036
MN
2634//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2635// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
c7f822d9
MN
2636 //handle holes (FAST_BILINEAR & weird filters)
2637 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2638 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2639//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2640 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2641 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2642
c1b0bfb4 2643 // Do we have enough lines in this slice to output the dstY line
e616aa93 2644 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
2645 {
2646 //Do horizontal scaling
2647 while(lastInLumBuf < lastLumSrcY)
d3f41512 2648 {
28bf81c9 2649 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2650 lumBufIndex++;
c7f822d9 2651// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2652 ASSERT(lumBufIndex < 2*vLumBufSize)
2653 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2654 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2655// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2656 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2657 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2658 funnyYCode, c->srcFormat, formatConvBuffer,
2659 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2660 lastInLumBuf++;
2661 }
2662 while(lastInChrBuf < lastChrSrcY)
2663 {
e616aa93
MN
2664 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2665 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2666 chrBufIndex++;
2667 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2668 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2669 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
28bf81c9 2670 //FIXME replace parameters through context struct (some at least)
44c1035c
MN
2671
2672 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2673 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2674 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2675 funnyUVCode, c->srcFormat, formatConvBuffer,
2676 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4 2677 lastInChrBuf++;
d3f41512 2678 }
c1b0bfb4
MN
2679 //wrap buf index around to stay inside the ring buffer
2680 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2681 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2682 }
c1b0bfb4 2683 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2684 {
c1b0bfb4
MN
2685/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2686 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2687 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
e616aa93
MN
2688 vChrBufSize, vLumBufSize);*/
2689
c1b0bfb4
MN
2690 //Do horizontal scaling
2691 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2692 {
28bf81c9 2693 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2694 lumBufIndex++;
2695 ASSERT(lumBufIndex < 2*vLumBufSize)
2696 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2697 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2698 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2699 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2700 funnyYCode, c->srcFormat, formatConvBuffer,
2701 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2702 lastInLumBuf++;
2703 }
e616aa93 2704 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
c1b0bfb4 2705 {
e616aa93
MN
2706 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2707 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2708 chrBufIndex++;
2709 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2710 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2711 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
44c1035c
MN
2712
2713 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2714 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2715 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2716 funnyUVCode, c->srcFormat, formatConvBuffer,
2717 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4
MN
2718 lastInChrBuf++;
2719 }
2720 //wrap buf index around to stay inside the ring buffer
2721 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2722 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
77a416e8 2723 break; //we can't output a dstY line so let's try with the next slice
2ff198c1 2724 }
d3f41512 2725
c1b0bfb4
MN
2726#ifdef HAVE_MMX
2727 b5Dither= dither8[dstY&1];
2728 g6Dither= dither4[dstY&1];
2729 g5Dither= dither8[dstY&1];
2730 r5Dither= dither8[(dstY+1)&1];
2731#endif
28bf81c9 2732 if(dstY < dstH-2)
e3d2500f 2733 {
6542b44e
MN
2734 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2735 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2736#ifdef HAVE_MMX
2737 int i;
2738 for(i=0; i<vLumFilterSize; i++)
2739 {
2740 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2741 lumMmxFilter[4*i+2]=
2742 lumMmxFilter[4*i+3]=
2743 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2744 }
2745 for(i=0; i<vChrFilterSize; i++)
2746 {
2747 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2748 chrMmxFilter[4*i+2]=
2749 chrMmxFilter[4*i+3]=
2750 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2751 }
2752#endif
44c1035c 2753 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
0f25d72b 2754 {
df1b2c14
MN
2755 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2756 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
c1b0bfb4 2757 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2758 {
c1b0bfb4
MN
2759 int16_t *lumBuf = lumPixBuf[0];
2760 int16_t *chrBuf= chrPixBuf[0];
e616aa93 2761 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
c1b0bfb4
MN
2762 }
2763 else //General YV12
2764 {
77a49659 2765 RENAME(yuv2yuvX)(c,
e616aa93
MN
2766 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2767 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
6542b44e 2768 dest, uDest, vDest, dstW, chrDstW);
2ff198c1 2769 }
0f25d72b 2770 }
c1b0bfb4 2771 else
2ff198c1 2772 {
c1b0bfb4
MN
2773 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2774 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2775 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2776 {
2777 int chrAlpha= vChrFilter[2*dstY+1];
25593e29 2778 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2779 dest, dstW, chrAlpha, dstFormat, flags, dstY);
c1b0bfb4
MN
2780 }
2781 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2782 {
2783 int lumAlpha= vLumFilter[2*dstY+1];
2784 int chrAlpha= vChrFilter[2*dstY+1];
25593e29 2785 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2786 dest, dstW, lumAlpha, chrAlpha, dstY);
c1b0bfb4
MN
2787 }
2788 else //General RGB
2789 {
25593e29 2790 RENAME(yuv2packedX)(c,
c1b0bfb4
MN
2791 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2792 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
77a49659 2793 dest, dstW, dstY);
c1b0bfb4
MN
2794 }
2795 }
e3d2500f 2796 }
77a416e8 2797 else // hmm looks like we can't use MMX here without overwriting this array's tail
e3d2500f
MN
2798 {
2799 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2800 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
f5b58629 2801 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
e3d2500f 2802 {
df1b2c14
MN
2803 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2804 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
5859233b 2805 yuv2yuvXinC(
e616aa93
MN
2806 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2807 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
5859233b 2808 dest, uDest, vDest, dstW, chrDstW);
e3d2500f
MN
2809 }
2810 else
2811 {
2812 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2813 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
25593e29 2814 yuv2packedXinC(c,
e3d2500f
MN
2815 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2816 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
cf7d1c1a 2817 dest, dstW, dstY);
e3d2500f
MN
2818 }
2819 }
c1b0bfb4 2820 }
17f715fa
MN
2821
2822#ifdef HAVE_MMX
2823 __asm __volatile(SFENCE:::"memory");
1faf0867 2824 __asm __volatile(EMMS:::"memory");
17f715fa 2825#endif
28bf81c9
MN
2826 /* store changed local vars back in the context */
2827 c->dstY= dstY;
2828 c->lumBufIndex= lumBufIndex;
2829 c->chrBufIndex= chrBufIndex;
2830 c->lastInLumBuf= lastInLumBuf;
2831 c->lastInChrBuf= lastInChrBuf;
d4e24275
MN
2832
2833 return dstY - lastDstY;
627690b5 2834}