cleanup
[libav.git] / postproc / swscale_template.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
541c4eb9 19#undef MOVNTQ
7d7f78b5 20#undef PAVGB
48a05cec
MN
21#undef PREFETCH
22#undef PREFETCHW
23#undef EMMS
24#undef SFENCE
25
26#ifdef HAVE_3DNOW
27/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
28#define EMMS "femms"
29#else
30#define EMMS "emms"
31#endif
32
33#ifdef HAVE_3DNOW
34#define PREFETCH "prefetch"
35#define PREFETCHW "prefetchw"
36#elif defined ( HAVE_MMX2 )
37#define PREFETCH "prefetchnta"
38#define PREFETCHW "prefetcht0"
39#else
40#define PREFETCH "/nop"
41#define PREFETCHW "/nop"
42#endif
43
44#ifdef HAVE_MMX2
45#define SFENCE "sfence"
46#else
47#define SFENCE "/nop"
48#endif
d3f41512 49
d604bab9
MN
50#ifdef HAVE_MMX2
51#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52#elif defined (HAVE_3DNOW)
53#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54#endif
d3f41512 55
d604bab9
MN
56#ifdef HAVE_MMX2
57#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58#else
59#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60#endif
61
77a49659 62#define YSCALEYUV2YV12X(x, offset) \
c1b0bfb4
MN
63 "xorl %%eax, %%eax \n\t"\
64 "pxor %%mm3, %%mm3 \n\t"\
65 "pxor %%mm4, %%mm4 \n\t"\
77a49659
MN
66 "leal " offset "(%0), %%edx \n\t"\
67 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
68 ".balign 16 \n\t" /* FIXME Unroll? */\
69 "1: \n\t"\
77a49659 70 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
c1b0bfb4
MN
71 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
72 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
77a49659
MN
73 "addl $16, %%edx \n\t"\
74 "movl (%%edx), %%esi \n\t"\
75 "testl %%esi, %%esi \n\t"\
c1b0bfb4
MN
76 "pmulhw %%mm0, %%mm2 \n\t"\
77 "pmulhw %%mm0, %%mm5 \n\t"\
78 "paddw %%mm2, %%mm3 \n\t"\
79 "paddw %%mm5, %%mm4 \n\t"\
c1b0bfb4
MN
80 " jnz 1b \n\t"\
81 "psraw $3, %%mm3 \n\t"\
82 "psraw $3, %%mm4 \n\t"\
83 "packuswb %%mm4, %%mm3 \n\t"\
77a49659 84 MOVNTQ(%%mm3, (%1, %%eax))\
c1b0bfb4 85 "addl $8, %%eax \n\t"\
77a49659 86 "cmpl %2, %%eax \n\t"\
c1b0bfb4
MN
87 "pxor %%mm3, %%mm3 \n\t"\
88 "pxor %%mm4, %%mm4 \n\t"\
77a49659
MN
89 "leal " offset "(%0), %%edx \n\t"\
90 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
91 "jb 1b \n\t"
92
93#define YSCALEYUV2YV121 \
94 "movl %2, %%eax \n\t"\
95 ".balign 16 \n\t" /* FIXME Unroll? */\
96 "1: \n\t"\
97 "movq (%0, %%eax, 2), %%mm0 \n\t"\
98 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
99 "psraw $7, %%mm0 \n\t"\
100 "psraw $7, %%mm1 \n\t"\
101 "packuswb %%mm1, %%mm0 \n\t"\
102 MOVNTQ(%%mm0, (%1, %%eax))\
103 "addl $8, %%eax \n\t"\
104 "jnc 1b \n\t"
105
106/*
107 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
108 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
109 "r" (dest), "m" (dstW),
110 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
111 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
112*/
25593e29 113#define YSCALEYUV2PACKEDX \
c1b0bfb4
MN
114 "xorl %%eax, %%eax \n\t"\
115 ".balign 16 \n\t"\
77a49659 116 "nop \n\t"\
c1b0bfb4 117 "1: \n\t"\
77a49659
MN
118 "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
119 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
120 "pxor %%mm3, %%mm3 \n\t"\
121 "pxor %%mm4, %%mm4 \n\t"\
77a49659 122 ".balign 16 \n\t"\
c1b0bfb4 123 "2: \n\t"\
77a49659 124 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
c1b0bfb4
MN
125 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
126 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
77a49659
MN
127 "addl $16, %%edx \n\t"\
128 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
129 "pmulhw %%mm0, %%mm2 \n\t"\
130 "pmulhw %%mm0, %%mm5 \n\t"\
131 "paddw %%mm2, %%mm3 \n\t"\
132 "paddw %%mm5, %%mm4 \n\t"\
77a49659 133 "testl %%esi, %%esi \n\t"\
c1b0bfb4
MN
134 " jnz 2b \n\t"\
135\
77a49659
MN
136 "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
137 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
138 "pxor %%mm1, %%mm1 \n\t"\
139 "pxor %%mm7, %%mm7 \n\t"\
77a49659 140 ".balign 16 \n\t"\
c1b0bfb4 141 "2: \n\t"\
77a49659 142 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
c1b0bfb4
MN
143 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
144 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
77a49659
MN
145 "addl $16, %%edx \n\t"\
146 "movl (%%edx), %%esi \n\t"\
c1b0bfb4
MN
147 "pmulhw %%mm0, %%mm2 \n\t"\
148 "pmulhw %%mm0, %%mm5 \n\t"\
149 "paddw %%mm2, %%mm1 \n\t"\
150 "paddw %%mm5, %%mm7 \n\t"\
77a49659 151 "testl %%esi, %%esi \n\t"\
c1b0bfb4 152 " jnz 2b \n\t"\
25593e29
MN
153
154
155#define YSCALEYUV2RGBX \
156 YSCALEYUV2PACKEDX\
77a49659
MN
157 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
158 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
c1b0bfb4
MN
159 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
160 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
77a49659
MN
161 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
162 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
c1b0bfb4 163 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
77a49659
MN
164 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
165 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
166 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
167 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
168 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
169 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
c1b0bfb4
MN
170 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
171 "paddw %%mm3, %%mm4 \n\t"\
172 "movq %%mm2, %%mm0 \n\t"\
173 "movq %%mm5, %%mm6 \n\t"\
174 "movq %%mm4, %%mm3 \n\t"\
175 "punpcklwd %%mm2, %%mm2 \n\t"\
176 "punpcklwd %%mm5, %%mm5 \n\t"\
177 "punpcklwd %%mm4, %%mm4 \n\t"\
178 "paddw %%mm1, %%mm2 \n\t"\
179 "paddw %%mm1, %%mm5 \n\t"\
180 "paddw %%mm1, %%mm4 \n\t"\
181 "punpckhwd %%mm0, %%mm0 \n\t"\
182 "punpckhwd %%mm6, %%mm6 \n\t"\
183 "punpckhwd %%mm3, %%mm3 \n\t"\
184 "paddw %%mm7, %%mm0 \n\t"\
185 "paddw %%mm7, %%mm6 \n\t"\
186 "paddw %%mm7, %%mm3 \n\t"\
187 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
188 "packuswb %%mm0, %%mm2 \n\t"\
189 "packuswb %%mm6, %%mm5 \n\t"\
190 "packuswb %%mm3, %%mm4 \n\t"\
191 "pxor %%mm7, %%mm7 \n\t"
77a49659 192#if 0
d604bab9
MN
193#define FULL_YSCALEYUV2RGB \
194 "pxor %%mm7, %%mm7 \n\t"\
195 "movd %6, %%mm6 \n\t" /*yalpha1*/\
196 "punpcklwd %%mm6, %%mm6 \n\t"\
197 "punpcklwd %%mm6, %%mm6 \n\t"\
198 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
199 "punpcklwd %%mm5, %%mm5 \n\t"\
200 "punpcklwd %%mm5, %%mm5 \n\t"\
201 "xorl %%eax, %%eax \n\t"\
cff6ecd7 202 ".balign 16 \n\t"\
d604bab9
MN
203 "1: \n\t"\
204 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
205 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
206 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
207 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
208 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
209 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
210 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
211 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
212 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
213 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
214 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
215 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
216 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
217 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
218 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428
FB
219 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
220 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
221 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
222\
223\
224 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
225 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
9b464428 226 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
d604bab9 227 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428 228 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
d604bab9 229 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428 230 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
231\
232\
233 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
9b464428
FB
234 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
235 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
236 "paddw %%mm1, %%mm3 \n\t" /* B*/\
237 "paddw %%mm1, %%mm0 \n\t" /* R*/\
238 "packuswb %%mm3, %%mm3 \n\t"\
239\
240 "packuswb %%mm0, %%mm0 \n\t"\
241 "paddw %%mm4, %%mm2 \n\t"\
242 "paddw %%mm2, %%mm1 \n\t" /* G*/\
243\
244 "packuswb %%mm1, %%mm1 \n\t"
77a49659 245#endif
d604bab9 246
25593e29
MN
247#define YSCALEYUV2PACKED \
248 "movd %6, %%mm6 \n\t" /*yalpha1*/\
249 "punpcklwd %%mm6, %%mm6 \n\t"\
250 "punpcklwd %%mm6, %%mm6 \n\t"\
251 "psraw $3, %%mm6 \n\t"\
252 "movq %%mm6, 3968(%2) \n\t"\
253 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
254 "punpcklwd %%mm5, %%mm5 \n\t"\
255 "punpcklwd %%mm5, %%mm5 \n\t"\
256 "psraw $3, %%mm5 \n\t"\
257 "movq %%mm5, 3976(%2) \n\t"\
258 "xorl %%eax, %%eax \n\t"\
259 ".balign 16 \n\t"\
260 "1: \n\t"\
261 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
262 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
263 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
264 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
265 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
266 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
267 "movq 3976(%2), %%mm0 \n\t"\
268 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
269 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
270 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
271 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
272 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
273 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
274 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
275 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
276 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
277 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
278 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
279 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
280 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
281 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
282 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
283 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
284 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
285 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
286
d604bab9
MN
287#define YSCALEYUV2RGB \
288 "movd %6, %%mm6 \n\t" /*yalpha1*/\
289 "punpcklwd %%mm6, %%mm6 \n\t"\
290 "punpcklwd %%mm6, %%mm6 \n\t"\
5ac80202 291 "movq %%mm6, 3968(%2) \n\t"\
d604bab9
MN
292 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
293 "punpcklwd %%mm5, %%mm5 \n\t"\
294 "punpcklwd %%mm5, %%mm5 \n\t"\
5ac80202 295 "movq %%mm5, 3976(%2) \n\t"\
d604bab9 296 "xorl %%eax, %%eax \n\t"\
cff6ecd7 297 ".balign 16 \n\t"\
d604bab9
MN
298 "1: \n\t"\
299 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
300 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
301 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
302 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
303 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
304 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
5ac80202 305 "movq 3976(%2), %%mm0 \n\t"\
d604bab9
MN
306 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
307 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
308 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
309 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
310 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
311 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428
FB
312 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
313 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
314 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
315 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
316 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
317 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
318 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
319 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
320 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
321 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
322 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
323 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
324 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
5ac80202
MN
325 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
326 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
d604bab9
MN
327 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
328 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
329 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
330 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
9b464428
FB
331 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
332 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
333 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
334 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
335 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
336 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
337 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
338 "paddw %%mm3, %%mm4 \n\t"\
339 "movq %%mm2, %%mm0 \n\t"\
340 "movq %%mm5, %%mm6 \n\t"\
341 "movq %%mm4, %%mm3 \n\t"\
342 "punpcklwd %%mm2, %%mm2 \n\t"\
343 "punpcklwd %%mm5, %%mm5 \n\t"\
344 "punpcklwd %%mm4, %%mm4 \n\t"\
345 "paddw %%mm1, %%mm2 \n\t"\
346 "paddw %%mm1, %%mm5 \n\t"\
347 "paddw %%mm1, %%mm4 \n\t"\
348 "punpckhwd %%mm0, %%mm0 \n\t"\
349 "punpckhwd %%mm6, %%mm6 \n\t"\
350 "punpckhwd %%mm3, %%mm3 \n\t"\
351 "paddw %%mm7, %%mm0 \n\t"\
352 "paddw %%mm7, %%mm6 \n\t"\
353 "paddw %%mm7, %%mm3 \n\t"\
354 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
355 "packuswb %%mm0, %%mm2 \n\t"\
356 "packuswb %%mm6, %%mm5 \n\t"\
357 "packuswb %%mm3, %%mm4 \n\t"\
358 "pxor %%mm7, %%mm7 \n\t"
25593e29
MN
359
360#define YSCALEYUV2PACKED1 \
361 "xorl %%eax, %%eax \n\t"\
362 ".balign 16 \n\t"\
363 "1: \n\t"\
364 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
365 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
366 "psraw $7, %%mm3 \n\t" \
367 "psraw $7, %%mm4 \n\t" \
368 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
369 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
370 "psraw $7, %%mm1 \n\t" \
371 "psraw $7, %%mm7 \n\t" \
372
d604bab9
MN
373#define YSCALEYUV2RGB1 \
374 "xorl %%eax, %%eax \n\t"\
cff6ecd7 375 ".balign 16 \n\t"\
d604bab9
MN
376 "1: \n\t"\
377 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
378 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
379 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
380 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428
FB
381 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
382 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
383 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
384 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
385 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
386 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9 387 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
497d4f99
MN
388 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
389 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
392 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
393 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
394 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
395 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
396 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
397 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
497d4f99
MN
398 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
399 "paddw %%mm3, %%mm4 \n\t"\
400 "movq %%mm2, %%mm0 \n\t"\
401 "movq %%mm5, %%mm6 \n\t"\
402 "movq %%mm4, %%mm3 \n\t"\
403 "punpcklwd %%mm2, %%mm2 \n\t"\
404 "punpcklwd %%mm5, %%mm5 \n\t"\
405 "punpcklwd %%mm4, %%mm4 \n\t"\
406 "paddw %%mm1, %%mm2 \n\t"\
407 "paddw %%mm1, %%mm5 \n\t"\
408 "paddw %%mm1, %%mm4 \n\t"\
409 "punpckhwd %%mm0, %%mm0 \n\t"\
410 "punpckhwd %%mm6, %%mm6 \n\t"\
411 "punpckhwd %%mm3, %%mm3 \n\t"\
412 "paddw %%mm7, %%mm0 \n\t"\
413 "paddw %%mm7, %%mm6 \n\t"\
414 "paddw %%mm7, %%mm3 \n\t"\
415 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
416 "packuswb %%mm0, %%mm2 \n\t"\
417 "packuswb %%mm6, %%mm5 \n\t"\
418 "packuswb %%mm3, %%mm4 \n\t"\
419 "pxor %%mm7, %%mm7 \n\t"
420
25593e29
MN
421#define YSCALEYUV2PACKED1b \
422 "xorl %%eax, %%eax \n\t"\
423 ".balign 16 \n\t"\
424 "1: \n\t"\
425 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437
497d4f99
MN
438// do vertical chrominance interpolation
439#define YSCALEYUV2RGB1b \
440 "xorl %%eax, %%eax \n\t"\
cff6ecd7 441 ".balign 16 \n\t"\
497d4f99
MN
442 "1: \n\t"\
443 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
444 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
445 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
446 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397c035e
MN
447 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
448 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
c1b0bfb4
MN
449 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
450 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
9b464428
FB
451 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
452 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
497d4f99
MN
453 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
454 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
455 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
456 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
497d4f99
MN
457 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
458 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
459 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
d604bab9
MN
460 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
461 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
462 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
463 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
464 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
465 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
466 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
467 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
468 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
469 "paddw %%mm3, %%mm4 \n\t"\
470 "movq %%mm2, %%mm0 \n\t"\
471 "movq %%mm5, %%mm6 \n\t"\
472 "movq %%mm4, %%mm3 \n\t"\
473 "punpcklwd %%mm2, %%mm2 \n\t"\
474 "punpcklwd %%mm5, %%mm5 \n\t"\
475 "punpcklwd %%mm4, %%mm4 \n\t"\
476 "paddw %%mm1, %%mm2 \n\t"\
477 "paddw %%mm1, %%mm5 \n\t"\
478 "paddw %%mm1, %%mm4 \n\t"\
479 "punpckhwd %%mm0, %%mm0 \n\t"\
480 "punpckhwd %%mm6, %%mm6 \n\t"\
481 "punpckhwd %%mm3, %%mm3 \n\t"\
482 "paddw %%mm7, %%mm0 \n\t"\
483 "paddw %%mm7, %%mm6 \n\t"\
484 "paddw %%mm7, %%mm3 \n\t"\
485 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
486 "packuswb %%mm0, %%mm2 \n\t"\
487 "packuswb %%mm6, %%mm5 \n\t"\
488 "packuswb %%mm3, %%mm4 \n\t"\
489 "pxor %%mm7, %%mm7 \n\t"
490
491#define WRITEBGR32 \
492 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
493 "movq %%mm2, %%mm1 \n\t" /* B */\
494 "movq %%mm5, %%mm6 \n\t" /* R */\
495 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
496 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
497 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
498 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
499 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
500 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
501 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
502 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
503 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
504 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
505\
506 MOVNTQ(%%mm0, (%4, %%eax, 4))\
507 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
508 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
509 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
510\
511 "addl $8, %%eax \n\t"\
512 "cmpl %5, %%eax \n\t"\
513 " jb 1b \n\t"
514
515#define WRITEBGR16 \
9b464428
FB
516 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
517 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
518 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb 519 "psrlq $3, %%mm2 \n\t"\
d604bab9 520\
f62255fb
MN
521 "movq %%mm2, %%mm1 \n\t"\
522 "movq %%mm4, %%mm3 \n\t"\
d604bab9 523\
f62255fb
MN
524 "punpcklbw %%mm7, %%mm3 \n\t"\
525 "punpcklbw %%mm5, %%mm2 \n\t"\
526 "punpckhbw %%mm7, %%mm4 \n\t"\
527 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 528\
f62255fb
MN
529 "psllq $3, %%mm3 \n\t"\
530 "psllq $3, %%mm4 \n\t"\
d604bab9
MN
531\
532 "por %%mm3, %%mm2 \n\t"\
d604bab9 533 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
534\
535 MOVNTQ(%%mm2, (%4, %%eax, 2))\
536 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
537\
538 "addl $8, %%eax \n\t"\
539 "cmpl %5, %%eax \n\t"\
540 " jb 1b \n\t"
541
542#define WRITEBGR15 \
9b464428
FB
543 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
544 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
545 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb
MN
546 "psrlq $3, %%mm2 \n\t"\
547 "psrlq $1, %%mm5 \n\t"\
d604bab9 548\
f62255fb
MN
549 "movq %%mm2, %%mm1 \n\t"\
550 "movq %%mm4, %%mm3 \n\t"\
d604bab9 551\
f62255fb
MN
552 "punpcklbw %%mm7, %%mm3 \n\t"\
553 "punpcklbw %%mm5, %%mm2 \n\t"\
554 "punpckhbw %%mm7, %%mm4 \n\t"\
555 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 556\
f62255fb
MN
557 "psllq $2, %%mm3 \n\t"\
558 "psllq $2, %%mm4 \n\t"\
d604bab9
MN
559\
560 "por %%mm3, %%mm2 \n\t"\
d604bab9 561 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
562\
563 MOVNTQ(%%mm2, (%4, %%eax, 2))\
564 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
565\
566 "addl $8, %%eax \n\t"\
567 "cmpl %5, %%eax \n\t"\
568 " jb 1b \n\t"
f62255fb 569
99d2cb72 570#define WRITEBGR24OLD \
d604bab9
MN
571 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
572 "movq %%mm2, %%mm1 \n\t" /* B */\
573 "movq %%mm5, %%mm6 \n\t" /* R */\
574 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
575 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
576 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
577 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
578 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
579 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
a525ce8d
MN
580 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
581 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
582 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
583 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9
MN
584\
585 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
586 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
9b464428
FB
587 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
588 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
d604bab9
MN
589 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
590 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
591 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
592 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
593\
594 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
595 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
596 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
597 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
9b464428 598 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
d604bab9
MN
599 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
600 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
9b464428
FB
601 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
602 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
d604bab9
MN
603 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
604 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
605 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
606 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
607\
608 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
609 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
610 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
9b464428
FB
611 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
612 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
d604bab9
MN
613 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
614 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
615 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
616\
bdc2eb9a
MN
617 MOVNTQ(%%mm0, (%%ebx))\
618 MOVNTQ(%%mm2, 8(%%ebx))\
619 MOVNTQ(%%mm3, 16(%%ebx))\
620 "addl $24, %%ebx \n\t"\
d604bab9
MN
621\
622 "addl $8, %%eax \n\t"\
623 "cmpl %5, %%eax \n\t"\
624 " jb 1b \n\t"
625
99d2cb72
MN
626#define WRITEBGR24MMX \
627 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
628 "movq %%mm2, %%mm1 \n\t" /* B */\
629 "movq %%mm5, %%mm6 \n\t" /* R */\
630 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
631 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
632 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
633 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
634 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
635 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
636 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
637 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
638 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
639 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
640\
641 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
642 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
643 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
644 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
645\
646 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
647 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
648 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
649 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
650\
651 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
652 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
653 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
654 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
655\
656 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
657 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
658 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
659 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
660 MOVNTQ(%%mm0, (%%ebx))\
661\
662 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
663 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
664 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
665 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
666 MOVNTQ(%%mm6, 8(%%ebx))\
667\
668 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
669 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
670 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
671 MOVNTQ(%%mm5, 16(%%ebx))\
672\
673 "addl $24, %%ebx \n\t"\
674\
675 "addl $8, %%eax \n\t"\
676 "cmpl %5, %%eax \n\t"\
677 " jb 1b \n\t"
678
679#define WRITEBGR24MMX2 \
680 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
9b464428
FB
681 "movq "MANGLE(M24A)", %%mm0 \n\t"\
682 "movq "MANGLE(M24C)", %%mm7 \n\t"\
99d2cb72
MN
683 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
684 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
685 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
686\
687 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
688 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
689 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
690\
691 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
692 "por %%mm1, %%mm6 \n\t"\
693 "por %%mm3, %%mm6 \n\t"\
694 MOVNTQ(%%mm6, (%%ebx))\
695\
696 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
697 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
698 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
699 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
700\
9b464428 701 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
99d2cb72
MN
702 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
703 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
704\
705 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
706 "por %%mm3, %%mm6 \n\t"\
707 MOVNTQ(%%mm6, 8(%%ebx))\
708\
709 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
710 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
711 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
712\
713 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
714 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
9b464428 715 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72
MN
716\
717 "por %%mm1, %%mm3 \n\t"\
718 "por %%mm3, %%mm6 \n\t"\
719 MOVNTQ(%%mm6, 16(%%ebx))\
720\
721 "addl $24, %%ebx \n\t"\
722\
723 "addl $8, %%eax \n\t"\
724 "cmpl %5, %%eax \n\t"\
725 " jb 1b \n\t"
726
727#ifdef HAVE_MMX2
7630f2e0 728#undef WRITEBGR24
99d2cb72
MN
729#define WRITEBGR24 WRITEBGR24MMX2
730#else
7630f2e0 731#undef WRITEBGR24
99d2cb72
MN
732#define WRITEBGR24 WRITEBGR24MMX
733#endif
734
25593e29
MN
735#define WRITEYUY2 \
736 "packuswb %%mm3, %%mm3 \n\t"\
737 "packuswb %%mm4, %%mm4 \n\t"\
738 "packuswb %%mm7, %%mm1 \n\t"\
739 "punpcklbw %%mm4, %%mm3 \n\t"\
740 "movq %%mm1, %%mm7 \n\t"\
741 "punpcklbw %%mm3, %%mm1 \n\t"\
742 "punpckhbw %%mm3, %%mm7 \n\t"\
743\
744 MOVNTQ(%%mm1, (%4, %%eax, 2))\
745 MOVNTQ(%%mm7, 8(%4, %%eax, 2))\
746\
747 "addl $8, %%eax \n\t"\
748 "cmpl %5, %%eax \n\t"\
749 " jb 1b \n\t"
750
751
77a49659 752static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
c1b0bfb4 753 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
e616aa93 754 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
77a49659 755 int32_t * lumMmxFilter, int32_t * chrMmxFilter)
38858470 756{
77a49659 757 int dummy=0;
c1b0bfb4
MN
758#ifdef HAVE_MMX
759 if(uDest != NULL)
760 {
761 asm volatile(
77a49659
MN
762 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
763 :: "r" (&c->redDither),
764 "r" (uDest), "m" (chrDstW)
c1b0bfb4
MN
765 : "%eax", "%edx", "%esi"
766 );
767
768 asm volatile(
77a49659
MN
769 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
770 :: "r" (&c->redDither),
771 "r" (vDest), "m" (chrDstW)
c1b0bfb4
MN
772 : "%eax", "%edx", "%esi"
773 );
774 }
775
776 asm volatile(
77a49659
MN
777 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
778 :: "r" (&c->redDither),
779 "r" (dest), "m" (dstW)
c1b0bfb4
MN
780 : "%eax", "%edx", "%esi"
781 );
782#else
5859233b 783yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
e3d2500f 784 chrFilter, chrSrc, chrFilterSize,
5859233b 785 dest, uDest, vDest, dstW, chrDstW);
7630f2e0 786#endif
c1b0bfb4 787}
2add307d 788
c1b0bfb4 789static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
e616aa93 790 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
c1b0bfb4
MN
791{
792#ifdef HAVE_MMX
793 if(uDest != NULL)
38858470 794 {
c1b0bfb4
MN
795 asm volatile(
796 YSCALEYUV2YV121
e616aa93
MN
797 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
798 "g" (-chrDstW)
c1b0bfb4
MN
799 : "%eax"
800 );
801
802 asm volatile(
803 YSCALEYUV2YV121
e616aa93
MN
804 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
805 "g" (-chrDstW)
c1b0bfb4
MN
806 : "%eax"
807 );
38858470
MN
808 }
809
c1b0bfb4
MN
810 asm volatile(
811 YSCALEYUV2YV121
812 :: "r" (lumSrc + dstW), "r" (dest + dstW),
813 "g" (-dstW)
814 : "%eax"
815 );
816#else
c1b0bfb4
MN
817 int i;
818 for(i=0; i<dstW; i++)
38858470 819 {
c1b0bfb4 820 int val= lumSrc[i]>>7;
44c1035c
MN
821
822 if(val&256){
823 if(val<0) val=0;
824 else val=255;
825 }
c1b0bfb4 826
44c1035c 827 dest[i]= val;
c1b0bfb4
MN
828 }
829
830 if(uDest != NULL)
e616aa93 831 for(i=0; i<chrDstW; i++)
38858470 832 {
c1b0bfb4
MN
833 int u=chrSrc[i]>>7;
834 int v=chrSrc[i + 2048]>>7;
835
44c1035c
MN
836 if((u|v)&256){
837 if(u<0) u=0;
838 else if (u>255) u=255;
839 if(v<0) v=0;
840 else if (v>255) v=255;
841 }
842
843 uDest[i]= u;
844 vDest[i]= v;
38858470 845 }
c1b0bfb4 846#endif
38858470
MN
847}
848
c1b0bfb4 849
d604bab9
MN
/**
 * Vertical scale YV12 to a packed RGB/YUY2 format, using the full
 * multi-tap vertical filters (lumFilter/chrFilter) baked into the
 * per-context MMX filter tables (accessed via &c->redDither offsets).
 * Falls back to yuv2packedXinC() for formats without an MMX path.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
			    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
			    uint8_t *dest, int dstW, int dstY)
{
	int dummy=0; /* placeholder operands so asm operand numbering matches the macros */
	switch(c->dstFormat)
	{
#ifdef HAVE_MMX
	case IMGFMT_BGR32:
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32

			:: "r" (&c->redDither),
			"m" (dummy), "m" (dummy), "m" (dummy),
			"r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR24:
		{
			asm volatile(
				YSCALEYUV2RGBX
				"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
				"addl %4, %%ebx			\n\t"
				WRITEBGR24

			:: "r" (&c->redDither),
			"m" (dummy), "m" (dummy), "m" (dummy),
			"r" (dest), "m" (dstW)
			: "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
			);
		}
		break;
	case IMGFMT_BGR15:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15

			:: "r" (&c->redDither),
			"m" (dummy), "m" (dummy), "m" (dummy),
			"r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR16:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16

			:: "r" (&c->redDither),
			"m" (dummy), "m" (dummy), "m" (dummy),
			"r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_YUY2:
		{
			asm volatile(
				YSCALEYUV2PACKEDX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

				"psraw $3, %%mm3		\n\t"
				"psraw $3, %%mm4		\n\t"
				"psraw $3, %%mm1		\n\t"
				"psraw $3, %%mm7		\n\t"
				WRITEYUY2

			:: "r" (&c->redDither),
			"m" (dummy), "m" (dummy), "m" (dummy),
			"r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
#endif
	default:
		/* generic C fallback for every format without an asm path */
		yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
		break;
	}
}
956
c1b0bfb4
MN
/**
 * Vertical bilinear scale YV12 to a packed RGB/YUY2 format.
 * Blends two luma lines (buf0/buf1, weight yalpha) and two chroma lines
 * (uvbuf0/uvbuf1, weight uvalpha), then converts to c->dstFormat.
 * Weights are 0..4095 fixed point; xor with 4095 gives the complement.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

#if 0 // dead code: never compiled (kept for reference, full-chroma path)
	if(flags&SWS_FULL_CHR_H_INT)
	{
		switch(dstFormat)
		{
#ifdef HAVE_MMX
		case IMGFMT_BGR32:
			asm volatile(


FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%eax, 4))
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"


			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
		case IMGFMT_BGR24:
			asm volatile(

FULL_YSCALEYUV2RGB

						// lsb ... msb
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
			"psrlq $8, %%mm3		\n\t" // GR0BGR00
			"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
			"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
			"por %%mm2, %%mm3		\n\t" // BGRBGR00
			"movq %%mm1, %%mm2		\n\t"
			"psllq $48, %%mm1		\n\t" // 000000BG
			"por %%mm1, %%mm3		\n\t" // BGRBGRBG

			"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
			"psrld $16, %%mm2		\n\t" // R000R000
			"psrlq $24, %%mm1		\n\t" // 0BGR0000
			"por %%mm2, %%mm1		\n\t" // RBGRR000

			"movl %4, %%ebx			\n\t"
			"addl %%eax, %%ebx		\n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
			"movd %%mm3, (%%ebx, %%eax, 2)	\n\t"
			"psrlq $32, %%mm3		\n\t"
			"movd %%mm3, 4(%%ebx, %%eax, 2)	\n\t"
			"movd %%mm1, 8(%%ebx, %%eax, 2)	\n\t"
#endif
			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
			break;
		case IMGFMT_BGR15:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $2, %%mm1		\n\t"
			"psllw $7, %%mm0		\n\t"
			"pand "MANGLE(g15Mask)", %%mm1	\n\t"
			"pand "MANGLE(r15Mask)", %%mm0	\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
		case IMGFMT_BGR16:
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $3, %%mm1		\n\t"
			"psllw $8, %%mm0		\n\t"
			"pand "MANGLE(g16Mask)", %%mm1	\n\t"
			"pand "MANGLE(r16Mask)", %%mm0	\n\t"

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			break;
#endif
		case IMGFMT_RGB32:
#ifndef HAVE_MMX
		case IMGFMT_BGR32:
#endif
		if(dstFormat==IMGFMT_BGR32)
		{
			int i;
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 3;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
	}//FULL_UV_IPOL
	else
	{
#endif // if 0
#ifdef HAVE_MMX
	switch(c->dstFormat)
	{
	/* each asm path loops internally over the whole line, hence "return" */
	case IMGFMT_BGR32:
			asm volatile(
				YSCALEYUV2RGB
				WRITEBGR32

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
	case IMGFMT_BGR24:
			asm volatile(
				"movl %4, %%ebx			\n\t"
				YSCALEYUV2RGB
				WRITEBGR24

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
			return;
	case IMGFMT_BGR15:
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
	case IMGFMT_BGR16:
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
	case IMGFMT_YUY2:
			asm volatile(
				YSCALEYUV2PACKED
				WRITEYUY2

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
	default: break;
	}
#endif //HAVE_MMX
/* generic C fallback for formats with no asm path above */
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
1256
/**
 * YV12 to packed RGB/YUY2 without vertical scaling or interpolating:
 * a single luma line (buf0) is used as-is; chroma is either taken from
 * uvbuf0 only (uvalpha < 2048, fast but shifts chroma by 0.5 pixels)
 * or averaged between uvbuf0 and uvbuf1 (the "...1b" asm variants).
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
#ifdef HAVE_MMX
	int uvalpha1=uvalpha^4095;
#endif
	const int yalpha1=0;
	int i;

	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...

	if(flags&SWS_FULL_CHR_H_INT)
	{
		/* full horizontal chroma: delegate to the bilinear path with yalpha=0 */
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
		return;
	}

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				YSCALEYUV2RGB1
				WRITEBGR32
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %4, %%ebx			\n\t"
				YSCALEYUV2RGB1
				WRITEBGR24
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				YSCALEYUV2PACKED1
				WRITEYUY2
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		}
	}
	else
	{
		/* chroma lines are averaged (the "...1b" variants) */
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				YSCALEYUV2RGB1b
				WRITEBGR32
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %4, %%ebx			\n\t"
				YSCALEYUV2RGB1b
				WRITEBGR24
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				YSCALEYUV2RGB1b
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				YSCALEYUV2RGB1b
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				YSCALEYUV2PACKED1b
				WRITEYUY2
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
			return;
	}
	}
#endif
	/* C fallback mirrors the two chroma modes above */
	if( uvalpha < 2048 )
	{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
	}else{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
	}
}
1417
//FIXME the yuy2* functions may read up to 7 samples too many

1419
1e621b18
MN
/**
 * Extract the luma (Y) channel from one line of YUY2 (Y U Y V byte order):
 * copies the even source bytes into dst.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t" /* mask of the low byte of each word */
		"movl %0, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"1:				\n\t"
		"movq (%1, %%eax,2), %%mm0	\n\t"
		"movq 8(%1, %%eax,2), %%mm1	\n\t"
		"pand %%mm2, %%mm0		\n\t"
		"pand %%mm2, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
1444
/**
 * Extract chroma from two adjacent YUY2 lines, averaging them vertically:
 * dstU[i] = avg of the two U samples, dstV[i] = avg of the two V samples.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"movl %0, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"1:				\n\t"
		"movq (%1, %%eax,4), %%mm0	\n\t"
		"movq 8(%1, %%eax,4), %%mm1	\n\t"
		"movq (%2, %%eax,4), %%mm2	\n\t"
		"movq 8(%2, %%eax,4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)                    /* average the two source lines */
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0		\n\t"  /* keep the chroma bytes */
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"  /* mm0 = V samples */
		"pand %%mm4, %%mm1		\n\t"  /* mm1 = U samples */
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%eax)	\n\t"
		"movd %%mm1, (%3, %%eax)	\n\t"
		"addl $4, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
1482
//this is almost identical to the previous function, and exists only because
//yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/**
 * Extract the luma (Y) channel from one line of UYVY (U Y V Y byte order):
 * copies the odd source bytes into dst.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %0, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"1:				\n\t"
		"movq (%1, %%eax,2), %%mm0	\n\t"
		"movq 8(%1, %%eax,2), %%mm1	\n\t"
		"psrlw $8, %%mm0		\n\t"  /* keep the high (luma) byte of each word */
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
1507
/**
 * Extract chroma from two adjacent UYVY lines, averaging them vertically:
 * dstU[i] = avg of the two U samples, dstV[i] = avg of the two V samples.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"movl %0, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"1:				\n\t"
		"movq (%1, %%eax,4), %%mm0	\n\t"
		"movq 8(%1, %%eax,4), %%mm1	\n\t"
		"movq (%2, %%eax,4), %%mm2	\n\t"
		"movq 8(%2, %%eax,4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)                    /* average the two source lines */
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0		\n\t"  /* keep the chroma bytes (low byte of each word) */
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"  /* mm0 = V samples */
		"pand %%mm4, %%mm1		\n\t"  /* mm1 = U samples */
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%eax)	\n\t"
		"movd %%mm1, (%3, %%eax)	\n\t"
		"addl $4, %%eax			\n\t"
		" js 1b				\n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}
1545
1e621b18
MN
1546static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1547{
1548#ifdef HAVE_MMXFIXME
1549#else
1550 int i;
1551 for(i=0; i<width; i++)
1552 {
1553 int b= src[i*4+0];
1554 int g= src[i*4+1];
1555 int r= src[i*4+2];
1556
1557 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1558 }
1559#endif
1560}
1561
1562static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1563{
1564#ifdef HAVE_MMXFIXME
1565#else
1566 int i;
1567 for(i=0; i<width; i++)
1568 {
1569 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1570 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1571 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1572
1573 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1574 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1575 }
1576#endif
1577}
1578
/**
 * Convert one line of BGR24 to 8bit luma.
 * MMX path: processes 8 pixels per iteration; pmaddwd with the packed
 * bgr2YCoeff constants plus a horizontal add via w1111, then adds the
 * luma offset (bgr2YOffset). C path uses RY/GY/BY with a +16 offset.
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %2, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"movq "MANGLE(bgr2YCoeff)", %%mm6	\n\t"
		"movq "MANGLE(w1111)", %%mm5		\n\t"
		"pxor %%mm7, %%mm7		\n\t"
		"leal (%%eax, %%eax, 2), %%ebx	\n\t"  /* ebx = 3*eax (byte offset into BGR24) */
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 64(%0, %%ebx)		\n\t"
		"movd (%0, %%ebx), %%mm0	\n\t"
		"movd 3(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"movd 6(%0, %%ebx), %%mm2	\n\t"
		"movd 9(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm0		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm0		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm0		\n\t"  /* horizontal add of the partial sums */
		"pmaddwd %%mm5, %%mm2		\n\t"
		"packssdw %%mm2, %%mm0		\n\t"
		"psraw $7, %%mm0		\n\t"

		/* second group of 4 pixels */
		"movd 12(%0, %%ebx), %%mm4	\n\t"
		"movd 15(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"movd 18(%0, %%ebx), %%mm2	\n\t"
		"movd 21(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm4		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm4		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm4		\n\t"
		"pmaddwd %%mm5, %%mm2		\n\t"
		"addl $24, %%ebx		\n\t"  /* advance 8 pixels * 3 bytes */
		"packssdw %%mm2, %%mm4		\n\t"
		"psraw $7, %%mm4		\n\t"

		"packuswb %%mm4, %%mm0		\n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"

		"movq %%mm0, (%1, %%eax)	\n\t"
		"addl $8, %%eax			\n\t"
		" js 1b				\n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
#endif
}
1663
/**
 * Convert two lines of BGR24 to one line of subsampled U/V.
 * Each chroma sample is computed from a 2x2 pixel block (horizontal pair
 * in each of the two source lines); the MMX2/3DNow path averages with
 * PAVGB, the plain MMX path sums and shifts. Coefficients come from
 * bgr2UCoeff/bgr2VCoeff, offset from bgr2UVOffset.
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %4, %%eax			\n\t"  /* eax = -width, counts up to 0 */
		"movq "MANGLE(w1111)", %%mm5		\n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6	\n\t"
		"pxor %%mm7, %%mm7		\n\t"
		"leal (%%eax, %%eax, 2), %%ebx	\n\t"
		"addl %%ebx, %%ebx		\n\t"  /* ebx = 6*eax (2 pixels * 3 bytes per output sample) */
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 64(%0, %%ebx)		\n\t"
		PREFETCH" 64(%1, %%ebx)		\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* fast 2x2 averaging via PAVGB */
		"movq (%0, %%ebx), %%mm0	\n\t"
		"movq (%1, %%ebx), %%mm1	\n\t"
		"movq 6(%0, %%ebx), %%mm2	\n\t"
		"movq 6(%1, %%ebx), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1		\n\t"
		"movq %%mm2, %%mm3		\n\t"
		"psrlq $24, %%mm0		\n\t"
		"psrlq $24, %%mm2		\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
#else
		/* plain MMX: widen, sum the four pixels, >>2 */
		"movd (%0, %%ebx), %%mm0	\n\t"
		"movd (%1, %%ebx), %%mm1	\n\t"
		"movd 3(%0, %%ebx), %%mm2	\n\t"
		"movd 3(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm0		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm2, %%mm0		\n\t"
		"movd 6(%0, %%ebx), %%mm4	\n\t"
		"movd 6(%1, %%ebx), %%mm1	\n\t"
		"movd 9(%0, %%ebx), %%mm2	\n\t"
		"movd 9(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm4		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm4, %%mm2		\n\t"
		"psrlw $2, %%mm0		\n\t"
		"psrlw $2, %%mm2		\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1	\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3	\n\t"

		"pmaddwd %%mm0, %%mm1		\n\t"  /* V partial sums */
		"pmaddwd %%mm2, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm0		\n\t"  /* U partial sums */
		"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm2, %%mm0		\n\t"
		"packssdw %%mm3, %%mm1		\n\t"
		"pmaddwd %%mm5, %%mm0		\n\t"
		"pmaddwd %%mm5, %%mm1		\n\t"
		"packssdw %%mm1, %%mm0		\n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0		\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%ebx), %%mm4	\n\t"
		"movq 12(%1, %%ebx), %%mm1	\n\t"
		"movq 18(%0, %%ebx), %%mm2	\n\t"
		"movq 18(%1, %%ebx), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1		\n\t"
		"movq %%mm2, %%mm3		\n\t"
		"psrlq $24, %%mm4		\n\t"
		"psrlq $24, %%mm2		\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
#else
		"movd 12(%0, %%ebx), %%mm4	\n\t"
		"movd 12(%1, %%ebx), %%mm1	\n\t"
		"movd 15(%0, %%ebx), %%mm2	\n\t"
		"movd 15(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm4		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm2, %%mm4		\n\t"
		"movd 18(%0, %%ebx), %%mm5	\n\t"
		"movd 18(%1, %%ebx), %%mm1	\n\t"
		"movd 21(%0, %%ebx), %%mm2	\n\t"
		"movd 21(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm5		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm5		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm5, %%mm2		\n\t"
		"movq "MANGLE(w1111)", %%mm5		\n\t" /* mm5 was clobbered above, reload */
		"psrlw $2, %%mm4		\n\t"
		"psrlw $2, %%mm2		\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1	\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3	\n\t"

		"pmaddwd %%mm4, %%mm1		\n\t"
		"pmaddwd %%mm2, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm4		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm2, %%mm4		\n\t"
		"packssdw %%mm3, %%mm1		\n\t"
		"pmaddwd %%mm5, %%mm4		\n\t"
		"pmaddwd %%mm5, %%mm1		\n\t"
		"addl $24, %%ebx		\n\t"
		"packssdw %%mm1, %%mm4		\n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4		\n\t"

		"movq %%mm0, %%mm1		\n\t"
		"punpckldq %%mm4, %%mm0		\n\t"  /* interleave into U3..U0 / V3..V0 */
		"punpckhdq %%mm4, %%mm1		\n\t"
		"packsswb %%mm1, %%mm0		\n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0	\n\t"

		"movd %%mm0, (%2, %%eax)	\n\t"
		"punpckhdq %%mm0, %%mm0		\n\t"
		"movd %%mm0, (%3, %%eax)	\n\t"
		"addl $4, %%eax			\n\t"
		" js 1b				\n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}
1829
6af250ea
MN
1830static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1831{
1832 int i;
1833 for(i=0; i<width; i++)
1834 {
1835 int d= src[i*2] + (src[i*2+1]<<8);
1836 int b= d&0x1F;
1837 int g= (d>>5)&0x3F;
1838 int r= (d>>11)&0x1F;
1839
1840 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1841 }
1842}
1843
/**
 * Convert two lines of BGR16 (RGB565) to one line of subsampled U/V.
 * The active branch loads two pixels at once as a 32bit word and sums
 * the 2x2 block with masked parallel adds; after the dh2+dl fold,
 * b/g/r each hold the 4-pixel sums (7 bits). The #else branch is the
 * straightforward (dead) reference implementation.
 */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
#if 1
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );

		/* parallel add of the masked R/B and G fields of 2 pixels per line */
		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x3F;
		int r0= (d0>>11)&0x1F;

		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x3F;
		int r1= (d1>>11)&0x1F;

		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x3F;
		int r2= (d2>>11)&0x1F;

		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x3F;
		int r3= (d3>>11)&0x1F;

		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
1891
b72034dd
MN
1892static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1893{
1894 int i;
1895 for(i=0; i<width; i++)
1896 {
1897 int d= src[i*2] + (src[i*2+1]<<8);
1898 int b= d&0x1F;
1899 int g= (d>>5)&0x1F;
1900 int r= (d>>10)&0x1F;
1901
1902 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1903 }
1904}
1905
1906static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1907{
1908 int i;
1909 for(i=0; i<width; i++)
1910 {
1911#if 1
1912 int d0= le2me_32( ((uint32_t*)src1)[i] );
1913 int d1= le2me_32( ((uint32_t*)src2)[i] );
1914
1915 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1916 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1917
1918 int dh2= (dh>>11) + (dh<<21);
1919 int d= dh2 + dl;
1920
1921 int b= d&0x7F;
1922 int r= (d>>10)&0x7F;
1923 int g= d>>21;
1924#else
1925 int d0= src1[i*4] + (src1[i*4+1]<<8);
1926 int b0= d0&0x1F;
1927 int g0= (d0>>5)&0x1F;
1928 int r0= (d0>>10)&0x1F;
1929
1930 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1931 int b1= d1&0x1F;
1932 int g1= (d1>>5)&0x1F;
1933 int r1= (d1>>10)&0x1F;
1934
1935 int d2= src2[i*4] + (src2[i*4+1]<<8);
1936 int b2= d2&0x1F;
1937 int g2= (d2>>5)&0x1F;
1938 int r2= (d2>>10)&0x1F;
1939
1940 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1941 int b3= d3&0x1F;
1942 int g3= (d3>>5)&0x1F;
1943 int r3= (d3>>10)&0x1F;
1944
1945 int b= b0 + b1 + b2 + b3;
1946 int g= g0 + g1 + g2 + g3;
1947 int r= r0 + r1 + r2 + r3;
1948#endif
1949 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1950 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1951 }
1952}
1953
1954
a861d4d7
MN
1955static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1956{
1957 int i;
1958 for(i=0; i<width; i++)
1959 {
1960 int r= src[i*4+0];
1961 int g= src[i*4+1];
1962 int b= src[i*4+2];
1963
1964 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1965 }
1966}
1967
1968static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1969{
1970 int i;
1971 for(i=0; i<width; i++)
1972 {
1973 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1974 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1975 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1976
1977 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1978 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1979 }
1980}
1981
1982static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1983{
1984 int i;
1985 for(i=0; i<width; i++)
1986 {
1987 int r= src[i*3+0];
1988 int g= src[i*3+1];
1989 int b= src[i*3+2];
1990
1991 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1992 }
1993}
1994
1995static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1996{
1997 int i;
1998 for(i=0; i<width; i++)
1999 {
2000 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2001 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2002 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2003
2004 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2005 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2006 }
2007}
2008
1e621b18 2009
077ea8a7
MN
// Bilinear / Bicubic scaling
/*
 * Generic horizontal scaler. For each of the dstW output samples it sums
 * filterSize consecutive source bytes starting at filterPos[i], weighted by
 * the int16 coefficients filter[filterSize*i + j]; the C path stores the
 * result clipped to 0..(1<<15)-1 after >>7.
 * NOTE(review): xInc and srcW are not used in this body -- the filter tables
 * already encode the scaling ratio; presumably kept for a uniform signature.
 * The MMX paths run the loop counter from -2*dstW up to 0 so that the loop
 * test is a free "jnc"; the pointer adjustments before the asm compensate
 * for the negative index. NOTE(review): psrad $8 followed by pmaddwd with
 * the w02 constant appears to reproduce the C path's >>7 -- confirm against
 * the definition of w02.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
	if(filterSize==4) // allways true for upscaling, sometimes for down too
	{
		int counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"pushl %%ebp			\n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movzwl (%2, %%ebp), %%eax	\n\t" // filterPos[i]
			"movzwl 2(%2, %%ebp), %%ebx	\n\t" // filterPos[i+1]
			"movq (%1, %%ebp, 4), %%mm1	\n\t" // 4 coefficients for sample i
			"movq 8(%1, %%ebp, 4), %%mm3	\n\t" // 4 coefficients for sample i+1
			"movd (%3, %%eax), %%mm0	\n\t" // 4 source bytes
			"movd (%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t" // widen bytes to words
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t" // multiply-accumulate pairs
			"pmaddwd %%mm2, %%mm3		\n\t"
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t" // saturating pack to int16
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%ebp)	\n\t" // store dst[i], dst[i+1]
			"addl $4, %%ebp			\n\t"
			" jnc 1b			\n\t" // counter reached 0 -> done

			"popl %%ebp			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else if(filterSize==8)
	{
		int counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"pushl %%ebp			\n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movzwl (%2, %%ebp), %%eax	\n\t" // filterPos[i]
			"movzwl 2(%2, %%ebp), %%ebx	\n\t" // filterPos[i+1]
			"movq (%1, %%ebp, 8), %%mm1	\n\t" // first 4 coefficients
			"movq 16(%1, %%ebp, 8), %%mm3	\n\t"
			"movd (%3, %%eax), %%mm0	\n\t" // first 4 source bytes
			"movd (%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"

			"movq 8(%1, %%ebp, 8), %%mm1	\n\t" // second 4 coefficients
			"movq 24(%1, %%ebp, 8), %%mm5	\n\t"
			"movd 4(%3, %%eax), %%mm4	\n\t" // second 4 source bytes
			"movd 4(%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm4		\n\t"
			"pmaddwd %%mm2, %%mm5		\n\t"
			"paddd %%mm4, %%mm0		\n\t" // combine both halves
			"paddd %%mm5, %%mm3		\n\t"

			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%ebp)	\n\t"
			"addl $4, %%ebp			\n\t"
			" jnc 1b			\n\t"

			"popl %%ebp			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else
	{
		// generic filterSize: inner loop (label 2) walks the filter taps
		int counter= -2*dstW;
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movl %2, %%ecx			\n\t"
			"movzwl (%%ecx, %0), %%eax	\n\t" // filterPos[i]
			"movzwl 2(%%ecx, %0), %%ebx	\n\t" // filterPos[i+1]
			"movl %5, %%ecx			\n\t" // ecx walks the source
			"pxor %%mm4, %%mm4		\n\t" // accumulators
			"pxor %%mm5, %%mm5		\n\t"
			"2:				\n\t"
			"movq (%1), %%mm1		\n\t" // coefficients, sample i
			"movq (%1, %6), %%mm3		\n\t" // coefficients, sample i+1
			"movd (%%ecx, %%eax), %%mm0	\n\t"
			"movd (%%ecx, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"paddd %%mm3, %%mm5		\n\t"
			"paddd %%mm0, %%mm4		\n\t"
			"addl $8, %1			\n\t"
			"addl $4, %%ecx			\n\t"
			"cmpl %4, %%ecx			\n\t" // until src+filterSize
			" jb 2b				\n\t"
			"addl %6, %1			\n\t" // skip sample i+1's coefficients
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm5		\n\t"
			"packssdw %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"packssdw %%mm4, %%mm4		\n\t"
			"movl %3, %%eax			\n\t"
			"movd %%mm4, (%%eax, %0)	\n\t"
			"addl $4, %0			\n\t"
			" jnc 1b			\n\t"

			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(src+filterSize),
			  "m" (src), "r" (filterSize*2)
			: "%ebx", "%eax", "%ecx"
		);
	}
#else
	// plain C reference implementation
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
		for(j=0; j<filterSize; j++)
		{
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
	}
#endif
}
2ff198c1 2171 // *** horizontal scale Y line to temp buffer
28bf81c9
MN
/*
 * Horizontally scale one luma input line into dst (16bit intermediate,
 * dstWidth samples). Packed YUV and RGB/BGR source formats are first
 * converted to an 8bit luma line in formatConvBuffer by the csp helpers.
 * Scaling paths:
 *  - generic filter via RENAME(hScale), chosen unless SWS_FAST_BILINEAR is
 *    set (on plain MMX also chosen whenever the MMX2 path cannot run),
 *  - runtime-generated MMX2 code called through funnyYCode,
 *  - hand written x86 asm or plain C fast-bilinear fallback.
 */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    // input colorspace conversion to 8bit luma, if needed
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
    	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		// call the runtime-generated scaler 8 times; each call processes
		// one chunk, mmx2FilterPos (in ebx) supplies per-chunk offsets
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movl %0, %%ecx			\n\t"
			"movl %1, %%edi			\n\t"
			"movl %2, %%edx			\n\t"
			"movl %3, %%ebx			\n\t"
			"xorl %%eax, %%eax		\n\t" // i
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

#define FUNNY_Y_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		// pad the tail samples that would read past src[srcW-1]
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	//NO MMX just normal asm ... two output samples per iteration,
	// 16.16 fixed point position carried in ebx:cx via the adc trick
	asm volatile(
		"xorl %%eax, %%eax		\n\t" // i
		"xorl %%ebx, %%ebx		\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		".balign 16			\n\t"
		"1:				\n\t"
		"movzbl (%0, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%edi, %%eax, 2)	\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 2(%%edi, %%eax, 2)	\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry


		"addl $2, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
	);
#ifdef HAVE_MMX2
	} //if MMX2 cant be used
#endif
#else
	// plain C fast-bilinear fallback, output scaled by 128 (<<7)
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
    }
}
2327
28bf81c9
MN
/*
 * Horizontally scale one pair of chroma input lines: U goes to dst[0..],
 * V to dst[2048..] (the two chroma planes are 2048 int16 = 4096 bytes
 * apart, see the dst+2048 / 4096(...) addressing below). Packed YUV and
 * RGB/BGR sources are first converted into formatConvBuffer (U) and
 * formatConvBuffer+2048 (V); gray sources produce no chroma at all.
 * Same three scaling paths as hyscale.
 */
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    // input colorspace conversion to planar 8bit U/V, if needed
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
    	return; // gray has no chroma to scale
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
    	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		// 4 runtime-generated-code calls for U, then the same for V
		// after switching to src2 and dst+4096 bytes
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movl %0, %%ecx			\n\t"
			"movl %1, %%edi			\n\t"
			"movl %2, %%edx			\n\t"
			"movl %3, %%ebx			\n\t"
			"xorl %%eax, %%eax		\n\t" // i
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

#define FUNNY_UV_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			"xorl %%eax, %%eax		\n\t" // i
			"movl %5, %%ecx			\n\t" // src
			"movl %1, %%edi			\n\t" // buf1
			"addl $4096, %%edi		\n\t"
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		// pad the tail samples that would read past the last source pixel
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
	// hand written x86 fallback: one U and one V sample per iteration,
	// 16.16 fixed point position carried in ebx:cx via the adc trick
	asm volatile(
		"xorl %%eax, %%eax		\n\t" // i
		"xorl %%ebx, %%ebx		\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		".balign 16			\n\t"
		"1:				\n\t"
		"movl %0, %%esi			\n\t"
		"movzbl (%%esi, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%%esi, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%edi, %%eax, 2)	\n\t"

		"movzbl (%5, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%5, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 4096(%%edi, %%eax, 2)\n\t"

		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry
		"addl $1, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"

		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
		"r" (src2)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
	);
#ifdef HAVE_MMX2
	} //if MMX2 cant be used
#endif
#else
	// plain C fast-bilinear fallback for both chroma planes
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
	dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
    }
}
2511
1e621b18 2512static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
332105e4 2513 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
28bf81c9
MN
2514
2515 /* load a few things into local vars to make the code more readable? and faster */
2516 const int srcW= c->srcW;
2517 const int dstW= c->dstW;
2518 const int dstH= c->dstH;
2519 const int chrDstW= c->chrDstW;
e616aa93 2520 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2521 const int lumXInc= c->lumXInc;
2522 const int chrXInc= c->chrXInc;
fe8054c0 2523 const int dstFormat= c->dstFormat;
44c1035c 2524 const int srcFormat= c->srcFormat;
28bf81c9
MN
2525 const int flags= c->flags;
2526 const int canMMX2BeUsed= c->canMMX2BeUsed;
2527 int16_t *vLumFilterPos= c->vLumFilterPos;
2528 int16_t *vChrFilterPos= c->vChrFilterPos;
2529 int16_t *hLumFilterPos= c->hLumFilterPos;
2530 int16_t *hChrFilterPos= c->hChrFilterPos;
2531 int16_t *vLumFilter= c->vLumFilter;
2532 int16_t *vChrFilter= c->vChrFilter;
2533 int16_t *hLumFilter= c->hLumFilter;
2534 int16_t *hChrFilter= c->hChrFilter;
77a49659
MN
2535 int32_t *lumMmxFilter= c->lumMmxFilter;
2536 int32_t *chrMmxFilter= c->chrMmxFilter;
28bf81c9
MN
2537 const int vLumFilterSize= c->vLumFilterSize;
2538 const int vChrFilterSize= c->vChrFilterSize;
2539 const int hLumFilterSize= c->hLumFilterSize;
2540 const int hChrFilterSize= c->hChrFilterSize;
2541 int16_t **lumPixBuf= c->lumPixBuf;
2542 int16_t **chrPixBuf= c->chrPixBuf;
2543 const int vLumBufSize= c->vLumBufSize;
2544 const int vChrBufSize= c->vChrBufSize;
2545 uint8_t *funnyYCode= c->funnyYCode;
2546 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2547 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2548 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2549 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
28bf81c9
MN
2550
2551 /* vars whch will change and which we need to storw back in the context */
2552 int dstY= c->dstY;
2553 int lumBufIndex= c->lumBufIndex;
2554 int chrBufIndex= c->chrBufIndex;
2555 int lastInLumBuf= c->lastInLumBuf;
2556 int lastInChrBuf= c->lastInChrBuf;
1e621b18 2557 int srcStride[3];
332105e4 2558 int dstStride[3];
6c7506de
MN
2559 uint8_t *src[3];
2560 uint8_t *dst[3];
5859233b
MN
2561
2562 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2563 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
44c1035c 2564
5859233b 2565 if(isPacked(c->srcFormat)){
1e621b18
MN
2566 src[0]=
2567 src[1]=
2568 src[2]= srcParam[0];
5859233b 2569 srcStride[0]=
1e621b18 2570 srcStride[1]=
5859233b 2571 srcStride[2]= srcStrideParam[0];
6c7506de 2572 }
5859233b
MN
2573 srcStride[1]<<= c->vChrDrop;
2574 srcStride[2]<<= c->vChrDrop;
6c7506de 2575
c7a810cc
MN
2576// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2577// (int)dst[0], (int)dst[1], (int)dst[2]);
2578
2579#if 0 //self test FIXME move to a vfilter or something
2580{
2581static volatile int i=0;
2582i++;
2583if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2584 selfTest(src, srcStride, c->srcW, c->srcH);
2585i--;
2586}
2587#endif
37079906
MN
2588
2589//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2590//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2591
2592 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2593 {
2594 static int firstTime=1; //FIXME move this into the context perhaps
2595 if(flags & SWS_PRINT_INFO && firstTime)
2596 {
4a53a912 2597 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
2598 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2599 firstTime=0;
2600 }
2601 }
28bf81c9 2602
1e621b18
MN
2603 /* Note the user might start scaling the picture in the middle so this will not get executed
2604 this is not really intended but works currently, so ppl might do it */
28bf81c9
MN
2605 if(srcSliceY ==0){
2606 lumBufIndex=0;
2607 chrBufIndex=0;
1e621b18 2608 dstY=0;
28bf81c9
MN
2609 lastInLumBuf= -1;
2610 lastInChrBuf= -1;
077ea8a7 2611 }
d3f41512 2612
c1b0bfb4 2613 for(;dstY < dstH; dstY++){
28bf81c9 2614 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
2615 const int chrDstY= dstY>>c->chrDstVSubSample;
2616 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2617 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 2618
c1b0bfb4
MN
2619 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2620 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2621 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2622 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2623
c7f822d9
MN
2624 //handle holes (FAST_BILINEAR & weird filters)
2625 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2626 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2627//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2628 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2629 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2630
c1b0bfb4 2631 // Do we have enough lines in this slice to output the dstY line
e616aa93 2632 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
2633 {
2634 //Do horizontal scaling
2635 while(lastInLumBuf < lastLumSrcY)
d3f41512 2636 {
28bf81c9 2637 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2638 lumBufIndex++;
c7f822d9 2639// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2640 ASSERT(lumBufIndex < 2*vLumBufSize)
2641 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2642 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2643// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2644 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2645 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2646 funnyYCode, c->srcFormat, formatConvBuffer,
2647 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2648 lastInLumBuf++;
2649 }
2650 while(lastInChrBuf < lastChrSrcY)
2651 {
e616aa93
MN
2652 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2653 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2654 chrBufIndex++;
2655 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2656 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2657 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
28bf81c9 2658 //FIXME replace parameters through context struct (some at least)
44c1035c
MN
2659
2660 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2661 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2662 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2663 funnyUVCode, c->srcFormat, formatConvBuffer,
2664 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4 2665 lastInChrBuf++;
d3f41512 2666 }
c1b0bfb4
MN
2667 //wrap buf index around to stay inside the ring buffer
2668 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2669 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2670 }
c1b0bfb4 2671 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2672 {
c1b0bfb4
MN
2673/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2674 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2675 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
e616aa93
MN
2676 vChrBufSize, vLumBufSize);*/
2677
c1b0bfb4
MN
2678 //Do horizontal scaling
2679 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2680 {
28bf81c9 2681 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2682 lumBufIndex++;
2683 ASSERT(lumBufIndex < 2*vLumBufSize)
2684 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2685 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2686 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2687 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2688 funnyYCode, c->srcFormat, formatConvBuffer,
2689 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2690 lastInLumBuf++;
2691 }
e616aa93 2692 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
c1b0bfb4 2693 {
e616aa93
MN
2694 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2695 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2696 chrBufIndex++;
2697 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2698 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2699 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
44c1035c
MN
2700
2701 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2702 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2703 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2704 funnyUVCode, c->srcFormat, formatConvBuffer,
2705 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4
MN
2706 lastInChrBuf++;
2707 }
2708 //wrap buf index around to stay inside the ring buffer
2709 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2710 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2711 break; //we cant output a dstY line so lets try with the next slice
2ff198c1 2712 }
d3f41512 2713
c1b0bfb4
MN
2714#ifdef HAVE_MMX
2715 b5Dither= dither8[dstY&1];
2716 g6Dither= dither4[dstY&1];
2717 g5Dither= dither8[dstY&1];
2718 r5Dither= dither8[(dstY+1)&1];
2719#endif
28bf81c9 2720 if(dstY < dstH-2)
e3d2500f 2721 {
44c1035c 2722 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
0f25d72b 2723 {
df1b2c14
MN
2724 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2725 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
c1b0bfb4 2726 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2727 {
c1b0bfb4
MN
2728 int16_t *lumBuf = lumPixBuf[0];
2729 int16_t *chrBuf= chrPixBuf[0];
e616aa93 2730 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
c1b0bfb4
MN
2731 }
2732 else //General YV12
2733 {
2734 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2735 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
77a49659
MN
2736 int i;
2737#ifdef HAVE_MMX
2738 for(i=0; i<vLumFilterSize; i++)
2739 {
2740 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2741 lumMmxFilter[4*i+2]=
2742 lumMmxFilter[4*i+3]=
2743 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2744 }
2745 for(i=0; i<vChrFilterSize; i++)
2746 {
2747 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2748 chrMmxFilter[4*i+2]=
2749 chrMmxFilter[4*i+3]=
2750 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2751 }
2752#endif
2753 RENAME(yuv2yuvX)(c,
e616aa93
MN
2754 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2755 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2756 dest, uDest, vDest, dstW, chrDstW,
77a49659 2757 lumMmxFilter, chrMmxFilter);
2ff198c1 2758 }
0f25d72b 2759 }
c1b0bfb4 2760 else
2ff198c1 2761 {
c1b0bfb4
MN
2762 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2763 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
d3f41512 2764
c1b0bfb4
MN
2765 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2766 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2767 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2768 {
2769 int chrAlpha= vChrFilter[2*dstY+1];
2ff198c1 2770
25593e29 2771 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2772 dest, dstW, chrAlpha, dstFormat, flags, dstY);
c1b0bfb4
MN
2773 }
2774 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2775 {
2776 int lumAlpha= vLumFilter[2*dstY+1];
2777 int chrAlpha= vChrFilter[2*dstY+1];
2778
25593e29 2779 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
cf7d1c1a 2780 dest, dstW, lumAlpha, chrAlpha, dstY);
c1b0bfb4
MN
2781 }
2782 else //General RGB
2783 {
77a49659
MN
2784 int i;
2785#ifdef HAVE_MMX
2786 for(i=0; i<vLumFilterSize; i++)
2787 {
2788 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2789 lumMmxFilter[4*i+2]=
2790 lumMmxFilter[4*i+3]=
2791 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2792 }
2793 for(i=0; i<vChrFilterSize; i++)
2794 {
2795 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2796 chrMmxFilter[4*i+2]=
2797 chrMmxFilter[4*i+3]=
2798 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2799 }
2800#endif
25593e29 2801 RENAME(yuv2packedX)(c,
c1b0bfb4
MN
2802 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2803 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
77a49659 2804 dest, dstW, dstY);
c1b0bfb4
MN
2805 }
2806 }
e3d2500f
MN
2807 }
2808 else // hmm, looks like we can't use MMX here without overwriting this array's tail
2809 {
2810 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2811 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
f5b58629 2812 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
e3d2500f 2813 {
df1b2c14
MN
2814 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2815 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
5859233b 2816 yuv2yuvXinC(
e616aa93
MN
2817 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2818 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
5859233b 2819 dest, uDest, vDest, dstW, chrDstW);
e3d2500f
MN
2820 }
2821 else
2822 {
2823 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2824 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
25593e29 2825 yuv2packedXinC(c,
e3d2500f
MN
2826 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2827 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
cf7d1c1a 2828 dest, dstW, dstY);
e3d2500f
MN
2829 }
2830 }
c1b0bfb4 2831 }
17f715fa
MN
2832
2833#ifdef HAVE_MMX
2834 __asm __volatile(SFENCE:::"memory");
1faf0867 2835 __asm __volatile(EMMS:::"memory");
17f715fa 2836#endif
28bf81c9
MN
2837 /* store changed local vars back in the context */
2838 c->dstY= dstY;
2839 c->lumBufIndex= lumBufIndex;
2840 c->chrBufIndex= chrBufIndex;
2841 c->lastInLumBuf= lastInLumBuf;
2842 c->lastInChrBuf= lastInChrBuf;
627690b5 2843}