last line messed up in Y800 bugfix
[libav.git] / postproc / swscale_template.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
/* Per-CPU-capability instruction selection.
 * Everything is #undef'd first because this template is compiled more than
 * once with different HAVE_MMX / HAVE_MMX2 / HAVE_3DNOW settings (see the
 * RENAME() macro used on the functions below) -- TODO confirm against the
 * including file. */
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch hints: 3DNow has its own prefetch/prefetchw; MMX2 (SSE) provides
 * the prefetchnta/prefetcht0 forms; plain MMX has none, so expand to a
 * comment-style no-op inside the asm string. */
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* Store fence is only needed/available when the non-temporal movntq is used. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

/* Byte average: pavgb on MMX2, pavgusb on 3DNow (no plain-MMX fallback,
 * so PAVGB must only be used inside HAVE_MMX2/HAVE_3DNOW code paths). */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Quadword store: non-temporal (cache-bypassing) movntq when available,
 * ordinary movq otherwise. */
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
61
c1b0bfb4
MN
/* Multi-tap vertical scale to planar 8-bit output.
 * Operands: %0 = -filterSize, %1 = src pointer array end (src+filterSize),
 * %2 = MMX filter coefficients end, %3 = dest, %4 = width.
 * For each group of 16 output pixels (two mm regs) the inner loop (label 2
 * is not used here; the inner loop reuses label 1 via edx counting up from
 * -filterSize to 0) accumulates pmulhw-scaled source lines, then >>3,
 * packs to unsigned bytes and stores with MOVNTQ.
 * x is a byte offset added to each source line pointer: 0 for luma/U,
 * 4096 for the V samples which live 2048 int16s after U -- TODO confirm
 * the plane layout against the caller. */
#define YSCALEYUV2YV12X(x) \
	"xorl %%eax, %%eax		\n\t"\
	"pxor %%mm3, %%mm3		\n\t"\
	"pxor %%mm4, %%mm4		\n\t"\
	"movl %0, %%edx			\n\t"\
	".balign 16			\n\t" /* FIXME Unroll? */\
	"1:				\n\t"\
	"movl (%1, %%edx, 4), %%esi	\n\t"\
	"movq (%2, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
	"movq " #x "(%%esi, %%eax, 2), %%mm2	\n\t" /* srcData */\
	"movq 8+" #x "(%%esi, %%eax, 2), %%mm5	\n\t" /* srcData */\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	"addl $1, %%edx			\n\t"\
	" jnz 1b			\n\t"\
	"psraw $3, %%mm3		\n\t"\
	"psraw $3, %%mm4		\n\t"\
	"packuswb %%mm4, %%mm3		\n\t"\
	MOVNTQ(%%mm3, (%3, %%eax))\
	"addl $8, %%eax			\n\t"\
	"cmpl %4, %%eax			\n\t"\
	"pxor %%mm3, %%mm3		\n\t"\
	"pxor %%mm4, %%mm4		\n\t"\
	"movl %0, %%edx			\n\t"\
	"jb 1b				\n\t"
89
/* Single-tap (copy) vertical pass: 16-bit intermediate samples >>7 to
 * 8-bit, packed and stored 8 at a time.
 * Operands: %0 = src end, %1 = dest end, %2 = -width; eax counts up from
 * -width toward 0, loop exits when the add carries past zero (jnc). */
#define YSCALEYUV2YV121 \
	"movl %2, %%eax			\n\t"\
	".balign 16			\n\t" /* FIXME Unroll? */\
	"1:				\n\t"\
	"movq (%0, %%eax, 2), %%mm0	\n\t"\
	"movq 8(%0, %%eax, 2), %%mm1	\n\t"\
	"psraw $7, %%mm0		\n\t"\
	"psraw $7, %%mm1		\n\t"\
	"packuswb %%mm1, %%mm0		\n\t"\
	MOVNTQ(%%mm0, (%1, %%eax))\
	"addl $8, %%eax			\n\t"\
	"jnc 1b				\n\t"
102
/*
	:: "m" (-lumFilterSize), "m" (-chrFilterSize),
	   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
	   "r" (dest), "m" (dstW),
	   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
	: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Multi-tap vertical YUV->RGB: runs the chroma filter (loop 1), then the
 * luma filter (loop 2), converts with the yCoeff/ubCoeff/... constants and
 * leaves packed B in mm2, G in mm4, R in mm5 (mm7 zeroed) for a WRITEBGR*
 * macro to store.  Operand layout is documented in the comment above. */
#define YSCALEYUV2RGBX \
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movl %1, %%edx			\n\t" /* -chrFilterSize */\
	"movl %3, %%ebx			\n\t" /* chrMmxFilter+lumFilterSize */\
	"movl %7, %%ecx			\n\t" /* chrSrc+lumFilterSize */\
	"pxor %%mm3, %%mm3		\n\t"\
	"pxor %%mm4, %%mm4		\n\t"\
	"2:				\n\t"\
	"movl (%%ecx, %%edx, 4), %%esi	\n\t"\
	"movq (%%ebx, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
	"movq (%%esi, %%eax), %%mm2	\n\t" /* UsrcData */\
	"movq 4096(%%esi, %%eax), %%mm5	\n\t" /* VsrcData */\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	"addl $1, %%edx			\n\t"\
	" jnz 2b			\n\t"\
\
	"movl %0, %%edx			\n\t" /* -lumFilterSize */\
	"movl %2, %%ebx			\n\t" /* lumMmxFilter+lumFilterSize */\
	"movl %6, %%ecx			\n\t" /* lumSrc+lumFilterSize */\
	"pxor %%mm1, %%mm1		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"\
	"2:				\n\t"\
	"movl (%%ecx, %%edx, 4), %%esi	\n\t"\
	"movq (%%ebx, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
	"movq (%%esi, %%eax, 2), %%mm2	\n\t" /* Y1srcData */\
	"movq 8(%%esi, %%eax, 2), %%mm5	\n\t" /* Y2srcData */\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm1		\n\t"\
	"paddw %%mm5, %%mm7		\n\t"\
	"addl $1, %%edx			\n\t"\
	" jnz 2b			\n\t"\
\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* (U-128)8*/\
	"psubw "MANGLE(w400)", %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
	"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w80)", %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
	"pmulhw "MANGLE(yCoeff)", %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
183
d604bab9
MN
/* Bilinear (2-line) vertical interpolation + YUV->RGB at full horizontal
 * chroma resolution (one chroma sample per output pixel, only 4 pixels per
 * iteration).  %0/%1 = luma lines, %2/%3 = chroma lines (V plane at byte
 * offset 4096), %6 = yalpha1, %7 = uvalpha1 blend factors.
 * Leaves packed B in mm3, G in mm1, R in mm0 (each byte-duplicated) for the
 * caller's store sequence. */
#define FULL_YSCALEYUV2RGB \
	"pxor %%mm7, %%mm7		\n\t"\
	"movd %6, %%mm6			\n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"movd %7, %%mm5			\n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* 8(U-128)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
\
\
	"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
	"psraw $4, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
	"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "MANGLE(w400)", %%mm0	\n\t" /* (V-128)8*/\
\
\
	"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	"paddw %%mm1, %%mm3		\n\t" /* B*/\
	"paddw %%mm1, %%mm0		\n\t" /* R*/\
	"packuswb %%mm3, %%mm3		\n\t"\
\
	"packuswb %%mm0, %%mm0		\n\t"\
	"paddw %%mm4, %%mm2		\n\t"\
	"paddw %%mm2, %%mm1		\n\t" /* G*/\
\
	"packuswb %%mm1, %%mm1		\n\t"
236
/* Bilinear (2-line) vertical interpolation + YUV->RGB, 8 pixels per
 * iteration with 2:1 horizontally subsampled chroma.  The blend factors
 * (%6 yalpha1, %7 uvalpha1) are spilled to scratch space at bytes
 * 3968/3976 past %2 because all 8 mm registers are needed in the loop --
 * NOTE(review): this writes into the uvbuf0 allocation; confirm the caller
 * reserves that scratch area.  Leaves packed B in mm2, G in mm4, R in mm5,
 * mm7=0, for a WRITEBGR* macro. */
#define YSCALEYUV2RGB \
	"movd %6, %%mm6			\n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"movq %%mm6, 3968(%2)		\n\t"\
	"movd %7, %%mm5			\n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"movq %%mm5, 3976(%2)		\n\t"\
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, %%eax), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, %%eax), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, %%eax), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq 3976(%2), %%mm0		\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* (U-128)8*/\
	"psubw "MANGLE(w400)", %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm6	\n\t" /*buf0[eax]*/\
	"movq 8(%1, %%eax, 2), %%mm7	\n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw 3968(%2), %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw 3968(%2), %%mm6		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
	"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w80)", %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
	"pmulhw "MANGLE(yCoeff)", %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
309
/* Single-source-line YUV->RGB (no vertical interpolation at all): reads
 * only buf0/uvbuf0 (%0/%2), shifts the 16-bit intermediates into range and
 * converts.  Same register contract as YSCALEYUV2RGB on exit:
 * mm2=B, mm4=G, mm5=R packed, mm7=0. */
#define YSCALEYUV2RGB1 \
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, %%eax), %%mm3	\n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, %%eax), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* (U-128)8*/\
	"psubw "MANGLE(w400)", %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
	"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w80)", %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
	"pmulhw "MANGLE(yCoeff)", %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
357
// do vertical chrominance interpolation
/* Like YSCALEYUV2RGB1 but averages the two chroma lines (uvbuf0+uvbuf1)
 * with a simple add and >>5 instead of the weighted pmulhw blend.
 * Same exit contract: mm2=B, mm4=G, mm5=R packed, mm7=0. */
#define YSCALEYUV2RGB1b \
	"xorl %%eax, %%eax		\n\t"\
	".balign 16			\n\t"\
	"1:				\n\t"\
	"movq (%2, %%eax), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, %%eax), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, %%eax), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $5, %%mm3		\n\t" /*FIXME might overflow*/\
	"psrlw $5, %%mm4		\n\t" /*FIXME might overflow*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* (U-128)8*/\
	"psubw "MANGLE(w400)", %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
	"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w80)", %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
	"pmulhw "MANGLE(yCoeff)", %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
410
/* Interleave packed B/G/R (mm2/mm4/mm5, mm7=0) into four BGR0 quadwords
 * and store 8 pixels at dest+eax*4; advances eax and loops to label 1
 * until %5 (width) is reached. */
#define WRITEBGR32 \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	MOVNTQ(%%mm0, (%4, %%eax, 4))\
	MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
	MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
	MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
434
/* Pack B/G/R (mm2/mm4/mm5, mm7=0) to RGB565 and store 8 pixels at
 * dest+eax*2; loops to label 1 until %5 is reached.  Masks bF8/bFC keep
 * the top 5/6/5 bits of each channel before repositioning. */
#define WRITEBGR16 \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $3, %%mm3		\n\t"\
	"psllq $3, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
461
/* Pack B/G/R (mm2/mm4/mm5, mm7=0) to RGB555 and store 8 pixels at
 * dest+eax*2; loops to label 1 until %5 is reached.  Same scheme as
 * WRITEBGR16 but 5 bits of green (extra psrlq $1 on R, psllq $2 shifts). */
#define WRITEBGR15 \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
	"psrlq $1, %%mm5		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $2, %%mm3		\n\t"\
	"psllq $2, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
f62255fb 489
/* Legacy 24-bit BGR writer: expands to 0RGB quadwords then shifts/masks
 * the padding bytes out to produce three packed 8-byte stores per 8 pixels
 * at %%ebx (dest pointer, advanced by 24 each iteration).  Superseded by
 * WRITEBGR24MMX/WRITEBGR24MMX2 below; kept for reference. */
#define WRITEBGR24OLD \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
	"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
\
	"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
	"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
	"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
\
	"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
	"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
	"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
\
	MOVNTQ(%%mm0, (%%ebx))\
	MOVNTQ(%%mm2, 8(%%ebx))\
	MOVNTQ(%%mm3, 16(%%ebx))\
	"addl $24, %%ebx		\n\t"\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
545
99d2cb72
MN
/* Plain-MMX 24-bit BGR writer: builds four 0RGBRGB0 quadwords via
 * punpckhdq then shifts/ORs them into three contiguous 8-byte stores at
 * %%ebx (advanced by 24); loops to label 1 until %5 is reached. */
#define WRITEBGR24MMX \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
	"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (%%ebx))\
\
	"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
	MOVNTQ(%%mm5, 16(%%ebx))\
\
	"addl $24, %%ebx		\n\t"\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
598
/* MMX2 24-bit BGR writer: uses pshufw to replicate channel bytes and the
 * M24A/M24B/M24C masks to select/merge them, producing three packed 8-byte
 * stores at %%ebx per 8 pixels; loops to label 1 until %5 is reached. */
#define WRITEBGR24MMX2 \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq "MANGLE(M24A)", %%mm0	\n\t"\
	"movq "MANGLE(M24C)", %%mm7	\n\t"\
	"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1		\n\t" /*    B2        B1       B0 */\
	"pand %%mm0, %%mm3		\n\t" /*    G2        G1       G0 */\
	"pand %%mm7, %%mm6		\n\t" /*       R1        R0       */\
\
	"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */\
	"por %%mm1, %%mm6		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, (%%ebx))\
\
	"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
	"pand "MANGLE(M24B)", %%mm1	\n\t" /* B5       B4        B3    */\
	"pand %%mm7, %%mm3		\n\t" /*       G4        G3       */\
	"pand %%mm0, %%mm6		\n\t" /*    R4        R3       R2 */\
\
	"por %%mm1, %%mm3		\n\t" /* B5 G4 B4     G3 B3       */\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
	"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1		\n\t" /*       B7        B6       */\
	"pand %%mm0, %%mm3		\n\t" /*    G7        G6       G5 */\
	"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7       R6        R5    */\
\
	"por %%mm1, %%mm3		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 16(%%ebx))\
\
	"addl $24, %%ebx		\n\t"\
\
	"addl $8, %%eax			\n\t"\
	"cmpl %5, %%eax			\n\t"\
	" jb 1b				\n\t"
646
/* Select the BGR24 writer for the current CPU-capability build. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
654
c1b0bfb4
MN
/**
 * Multi-tap vertical scale, planar YUV output.
 * With MMX: runs YSCALEYUV2YV12X once per plane (U with offset 0, V with
 * byte offset 4096, then luma); the asm is skipped for chroma when uDest
 * is NULL (grayscale / Y800 output).  Without MMX it defers to the C
 * implementation yuv2yuvXinC().
 * Note the operand layout passes the *negated* filter size and pointers to
 * the *end* of the filter/source arrays, matching YSCALEYUV2YV12X's
 * count-up-to-zero inner loop.
 */
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
				    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV12X(0)
				:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
				"r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);

		asm volatile(
				YSCALEYUV2YV12X(4096)
				:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
				"r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);
	}

	asm volatile(
			YSCALEYUV2YV12X(0)
			:: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
			"r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
		);
#else
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
	    chrFilter, chrSrc, chrFilterSize,
	    dest, uDest, vDest, dstW, chrDstW);
#endif
}
2add307d 690
c1b0bfb4 691static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
e616aa93 692 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
c1b0bfb4
MN
693{
694#ifdef HAVE_MMX
695 if(uDest != NULL)
38858470 696 {
c1b0bfb4
MN
697 asm volatile(
698 YSCALEYUV2YV121
e616aa93
MN
699 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
700 "g" (-chrDstW)
c1b0bfb4
MN
701 : "%eax"
702 );
703
704 asm volatile(
705 YSCALEYUV2YV121
e616aa93
MN
706 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
707 "g" (-chrDstW)
c1b0bfb4
MN
708 : "%eax"
709 );
38858470
MN
710 }
711
c1b0bfb4
MN
712 asm volatile(
713 YSCALEYUV2YV121
714 :: "r" (lumSrc + dstW), "r" (dest + dstW),
715 "g" (-dstW)
716 : "%eax"
717 );
718#else
c1b0bfb4
MN
719 int i;
720 for(i=0; i<dstW; i++)
38858470 721 {
c1b0bfb4 722 int val= lumSrc[i]>>7;
44c1035c
MN
723
724 if(val&256){
725 if(val<0) val=0;
726 else val=255;
727 }
c1b0bfb4 728
44c1035c 729 dest[i]= val;
c1b0bfb4
MN
730 }
731
732 if(uDest != NULL)
e616aa93 733 for(i=0; i<chrDstW; i++)
38858470 734 {
c1b0bfb4
MN
735 int u=chrSrc[i]>>7;
736 int v=chrSrc[i + 2048]>>7;
737
44c1035c
MN
738 if((u|v)&256){
739 if(u<0) u=0;
740 else if (u>255) u=255;
741 if(v<0) v=0;
742 else if (v>255) v=255;
743 }
744
745 uDest[i]= u;
746 vDest[i]= v;
38858470 747 }
c1b0bfb4 748#endif
38858470
MN
749}
750
c1b0bfb4 751
d604bab9
MN
/**
 * vertical scale YV12 to RGB
 * Dispatches on c->dstFormat: the MMX paths pair YSCALEYUV2RGBX with the
 * matching WRITEBGR* store macro (BGR24 additionally computes the byte
 * destination dest+3*eax into %%ebx; BGR15/16 add per-channel dither bias
 * when DITHER1XBPP is defined).  Any other format falls back to the C
 * implementation yuv2rgbXinC().
 * NOTE(review): %%ebx is clobbered by these asm blocks, which conflicts
 * with the PIC register on x86 ELF shared builds -- confirm build flags.
 */
static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
{
	switch(c->dstFormat)
	{
#ifdef HAVE_MMX
	case IMGFMT_BGR32:
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32

				:: "m" (-lumFilterSize), "m" (-chrFilterSize),
				   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
				   "r" (dest), "m" (dstW),
				   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
				: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR24:
		{
			asm volatile(
				YSCALEYUV2RGBX
				"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
				"addl %4, %%ebx			\n\t"
				WRITEBGR24

				:: "m" (-lumFilterSize), "m" (-chrFilterSize),
				   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
				   "r" (dest), "m" (dstW),
				   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
				: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR15:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15

				:: "m" (-lumFilterSize), "m" (-chrFilterSize),
				   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
				   "r" (dest), "m" (dstW),
				   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
				: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR16:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16

				:: "m" (-lumFilterSize), "m" (-chrFilterSize),
				   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
				   "r" (dest), "m" (dstW),
				   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
				: "%eax", "%ebx", "%ecx", "%edx", "%esi"
			);
		}
		break;
#endif
	default:
		yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize,
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
		break;
	}
}
842
c1b0bfb4
MN
843/**
844 * vertical bilinear scale YV12 to RGB
845 */
cf7d1c1a
MN
846static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
847 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9
MN
848{
849 int yalpha1=yalpha^4095;
850 int uvalpha1=uvalpha^4095;
cf7d1c1a 851 int i;
d604bab9 852
cf7d1c1a 853#if 0 //isnt used
1e621b18 854 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 855 {
cf7d1c1a 856 switch(dstFormat)
d604bab9 857 {
cf7d1c1a
MN
858#ifdef HAVE_MMX
859 case IMGFMT_BGR32:
d604bab9
MN
860 asm volatile(
861
862
863FULL_YSCALEYUV2RGB
864 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
865 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
866
867 "movq %%mm3, %%mm1 \n\t"
868 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
869 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
870
871 MOVNTQ(%%mm3, (%4, %%eax, 4))
872 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
873
874 "addl $4, %%eax \n\t"
875 "cmpl %5, %%eax \n\t"
876 " jb 1b \n\t"
877
878
d1fac6cf 879 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
880 "m" (yalpha1), "m" (uvalpha1)
881 : "%eax"
882 );
cf7d1c1a
MN
883 break;
884 case IMGFMT_BGR24:
d604bab9
MN
885 asm volatile(
886
887FULL_YSCALEYUV2RGB
888
889 // lsb ... msb
890 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
891 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
892
893 "movq %%mm3, %%mm1 \n\t"
894 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
895 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
896
897 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
898 "psrlq $8, %%mm3 \n\t" // GR0BGR00
9b464428
FB
899 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
900 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
d604bab9
MN
901 "por %%mm2, %%mm3 \n\t" // BGRBGR00
902 "movq %%mm1, %%mm2 \n\t"
903 "psllq $48, %%mm1 \n\t" // 000000BG
904 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
905
906 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
907 "psrld $16, %%mm2 \n\t" // R000R000
908 "psrlq $24, %%mm1 \n\t" // 0BGR0000
909 "por %%mm2, %%mm1 \n\t" // RBGRR000
910
911 "movl %4, %%ebx \n\t"
912 "addl %%eax, %%ebx \n\t"
913
914#ifdef HAVE_MMX2
915 //FIXME Alignment
916 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
917 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
918#else
919 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
920 "psrlq $32, %%mm3 \n\t"
921 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
922 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
923#endif
924 "addl $4, %%eax \n\t"
925 "cmpl %5, %%eax \n\t"
926 " jb 1b \n\t"
927
d1fac6cf 928 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
929 "m" (yalpha1), "m" (uvalpha1)
930 : "%eax", "%ebx"
931 );
cf7d1c1a
MN
932 break;
933 case IMGFMT_BGR15:
d604bab9
MN
934 asm volatile(
935
936FULL_YSCALEYUV2RGB
937#ifdef DITHER1XBPP
9b464428
FB
938 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
939 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
940 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
941#endif
942 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
943 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
944 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
945
946 "psrlw $3, %%mm3 \n\t"
947 "psllw $2, %%mm1 \n\t"
948 "psllw $7, %%mm0 \n\t"
9b464428
FB
949 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
950 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9
MN
951
952 "por %%mm3, %%mm1 \n\t"
953 "por %%mm1, %%mm0 \n\t"
954
955 MOVNTQ(%%mm0, (%4, %%eax, 2))
956
957 "addl $4, %%eax \n\t"
958 "cmpl %5, %%eax \n\t"
959 " jb 1b \n\t"
960
d1fac6cf 961 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
962 "m" (yalpha1), "m" (uvalpha1)
963 : "%eax"
964 );
cf7d1c1a
MN
965 break;
966 case IMGFMT_BGR16:
d604bab9
MN
967 asm volatile(
968
969FULL_YSCALEYUV2RGB
970#ifdef DITHER1XBPP
9b464428
FB
971 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
972 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
973 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
974#endif
975 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
976 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
977 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
978
979 "psrlw $3, %%mm3 \n\t"
980 "psllw $3, %%mm1 \n\t"
981 "psllw $8, %%mm0 \n\t"
9b464428
FB
982 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
983 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9
MN
984
985 "por %%mm3, %%mm1 \n\t"
986 "por %%mm1, %%mm0 \n\t"
987
988 MOVNTQ(%%mm0, (%4, %%eax, 2))
989
990 "addl $4, %%eax \n\t"
991 "cmpl %5, %%eax \n\t"
992 " jb 1b \n\t"
993
d1fac6cf 994 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
995 "m" (yalpha1), "m" (uvalpha1)
996 : "%eax"
997 );
cf7d1c1a
MN
998 break;
999#endif
1000 case IMGFMT_RGB32:
1001#ifndef HAVE_MMX
1002 case IMGFMT_BGR32:
1003#endif
28bf81c9
MN
1004 if(dstFormat==IMGFMT_BGR32)
1005 {
2ba1bff0 1006 int i;
df3c183a
MN
1007#ifdef WORDS_BIGENDIAN
1008 dest++;
1009#endif
28bf81c9
MN
1010 for(i=0;i<dstW;i++){
1011 // vertical linear interpolation && yuv2rgb in a single step:
1012 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1015 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1018 dest+= 4;
1019 }
1020 }
1021 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1022 {
96034638 1023 int i;
d1fac6cf 1024 for(i=0;i<dstW;i++){
d604bab9
MN
1025 // vertical linear interpolation && yuv2rgb in a single step:
1026 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
390b20a6
MN
1029 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1030 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1031 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
28bf81c9 1032 dest+= 3;
d604bab9
MN
1033 }
1034 }
28bf81c9 1035 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1036 {
96034638 1037 int i;
d1fac6cf 1038 for(i=0;i<dstW;i++){
d604bab9
MN
1039 // vertical linear interpolation && yuv2rgb in a single step:
1040 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1041 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1042 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1043
d022ce5c 1044 ((uint16_t*)dest)[i] =
b18ea156
MN
1045 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1046 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1047 clip_table16r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1048 }
1049 }
28bf81c9 1050 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1051 {
96034638 1052 int i;
d1fac6cf 1053 for(i=0;i<dstW;i++){
d604bab9
MN
1054 // vertical linear interpolation && yuv2rgb in a single step:
1055 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1056 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1057 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1058
d022ce5c 1059 ((uint16_t*)dest)[i] =
b18ea156
MN
1060 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1061 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1062 clip_table15r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1063 }
1064 }
d604bab9
MN
1065 }//FULL_UV_IPOL
1066 else
1067 {
cf7d1c1a 1068#endif // if 0
d604bab9 1069#ifdef HAVE_MMX
cf7d1c1a
MN
1070 switch(c->dstFormat)
1071 {
1072 case IMGFMT_BGR32:
d604bab9
MN
1073 asm volatile(
1074 YSCALEYUV2RGB
1075 WRITEBGR32
1076
d1fac6cf 1077 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1078 "m" (yalpha1), "m" (uvalpha1)
1079 : "%eax"
1080 );
cf7d1c1a
MN
1081 return;
1082 case IMGFMT_BGR24:
d604bab9 1083 asm volatile(
bdc2eb9a 1084 "movl %4, %%ebx \n\t"
d604bab9
MN
1085 YSCALEYUV2RGB
1086 WRITEBGR24
1087
d1fac6cf 1088 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1089 "m" (yalpha1), "m" (uvalpha1)
1090 : "%eax", "%ebx"
1091 );
cf7d1c1a
MN
1092 return;
1093 case IMGFMT_BGR15:
d604bab9
MN
1094 asm volatile(
1095 YSCALEYUV2RGB
1096 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097#ifdef DITHER1XBPP
9b464428
FB
1098 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1099 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1100 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1101#endif
1102
1103 WRITEBGR15
1104
d1fac6cf 1105 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1106 "m" (yalpha1), "m" (uvalpha1)
1107 : "%eax"
1108 );
cf7d1c1a
MN
1109 return;
1110 case IMGFMT_BGR16:
d604bab9
MN
1111 asm volatile(
1112 YSCALEYUV2RGB
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114#ifdef DITHER1XBPP
9b464428
FB
1115 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1116 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1117 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1118#endif
1119
1120 WRITEBGR16
1121
d1fac6cf 1122 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1123 "m" (yalpha1), "m" (uvalpha1)
1124 : "%eax"
1125 );
cf7d1c1a
MN
1126 return;
1127 default: break;
1128 }
1129#endif //HAVE_MMX
1130YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C)
d604bab9
MN
1131}
1132
1133/**
1134 * YV12 to RGB without scaling or interpolating
1135 */
cf7d1c1a
MN
1136static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1137 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 1138{
d604bab9 1139 int uvalpha1=uvalpha^4095;
c1b0bfb4 1140 const int yalpha1=0;
cf7d1c1a
MN
1141 int i;
1142
1143 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1144 const int yalpha= 4096; //FIXME ...
96034638 1145
1e621b18 1146 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1147 {
cf7d1c1a 1148 RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
d604bab9
MN
1149 return;
1150 }
397c035e
MN
1151
1152#ifdef HAVE_MMX
497d4f99
MN
1153 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1154 {
cf7d1c1a 1155 switch(dstFormat)
d604bab9 1156 {
cf7d1c1a 1157 case IMGFMT_BGR32:
d604bab9
MN
1158 asm volatile(
1159 YSCALEYUV2RGB1
1160 WRITEBGR32
c1b0bfb4 1161 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1162 "m" (yalpha1), "m" (uvalpha1)
1163 : "%eax"
1164 );
cf7d1c1a
MN
1165 return;
1166 case IMGFMT_BGR24:
d604bab9 1167 asm volatile(
bdc2eb9a 1168 "movl %4, %%ebx \n\t"
d604bab9
MN
1169 YSCALEYUV2RGB1
1170 WRITEBGR24
c1b0bfb4 1171 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1172 "m" (yalpha1), "m" (uvalpha1)
1173 : "%eax", "%ebx"
1174 );
cf7d1c1a
MN
1175 return;
1176 case IMGFMT_BGR15:
d604bab9
MN
1177 asm volatile(
1178 YSCALEYUV2RGB1
1179 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1180#ifdef DITHER1XBPP
9b464428
FB
1181 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1182 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1183 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1184#endif
1185 WRITEBGR15
c1b0bfb4 1186 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1187 "m" (yalpha1), "m" (uvalpha1)
1188 : "%eax"
1189 );
cf7d1c1a
MN
1190 return;
1191 case IMGFMT_BGR16:
d604bab9
MN
1192 asm volatile(
1193 YSCALEYUV2RGB1
1194 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1195#ifdef DITHER1XBPP
9b464428
FB
1196 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1197 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1198 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1199#endif
1200
1201 WRITEBGR16
c1b0bfb4 1202 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1203 "m" (yalpha1), "m" (uvalpha1)
1204 : "%eax"
1205 );
cf7d1c1a 1206 return;
d604bab9 1207 }
497d4f99
MN
1208 }
1209 else
1210 {
cf7d1c1a 1211 switch(dstFormat)
d604bab9 1212 {
cf7d1c1a 1213 case IMGFMT_BGR32:
497d4f99
MN
1214 asm volatile(
1215 YSCALEYUV2RGB1b
1216 WRITEBGR32
c1b0bfb4 1217 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1218 "m" (yalpha1), "m" (uvalpha1)
1219 : "%eax"
1220 );
cf7d1c1a
MN
1221 return;
1222 case IMGFMT_BGR24:
497d4f99 1223 asm volatile(
bdc2eb9a 1224 "movl %4, %%ebx \n\t"
497d4f99
MN
1225 YSCALEYUV2RGB1b
1226 WRITEBGR24
c1b0bfb4 1227 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
497d4f99
MN
1228 "m" (yalpha1), "m" (uvalpha1)
1229 : "%eax", "%ebx"
1230 );
cf7d1c1a
MN
1231 return;
1232 case IMGFMT_BGR15:
497d4f99
MN
1233 asm volatile(
1234 YSCALEYUV2RGB1b
1235 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1236#ifdef DITHER1XBPP
9b464428
FB
1237 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1238 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1239 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99
MN
1240#endif
1241 WRITEBGR15
c1b0bfb4 1242 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1243 "m" (yalpha1), "m" (uvalpha1)
1244 : "%eax"
1245 );
cf7d1c1a
MN
1246 return;
1247 case IMGFMT_BGR16:
497d4f99
MN
1248 asm volatile(
1249 YSCALEYUV2RGB1b
1250 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1251#ifdef DITHER1XBPP
9b464428
FB
1252 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1253 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1254 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1255#endif
d604bab9 1256
497d4f99 1257 WRITEBGR16
c1b0bfb4 1258 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1259 "m" (yalpha1), "m" (uvalpha1)
1260 : "%eax"
1261 );
cf7d1c1a 1262 return;
d604bab9 1263 }
497d4f99 1264 }
df3c183a 1265#endif
cf7d1c1a 1266 if( uvalpha < 2048 )
497d4f99 1267 {
cf7d1c1a
MN
1268 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C)
1269 }else{
1270 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C)
497d4f99 1271 }
d604bab9
MN
1272}
1273
6ff0ad6b
MN
1274//FIXME yuy2* can read upto 7 samples to much
1275
1e621b18
MN
1276static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1277{
6ff0ad6b
MN
1278#ifdef HAVE_MMX
1279 asm volatile(
1280 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1281 "movl %0, %%eax \n\t"
1282 "1: \n\t"
1283 "movq (%1, %%eax,2), %%mm0 \n\t"
1284 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1285 "pand %%mm2, %%mm0 \n\t"
1286 "pand %%mm2, %%mm1 \n\t"
1287 "packuswb %%mm1, %%mm0 \n\t"
1288 "movq %%mm0, (%2, %%eax) \n\t"
1289 "addl $8, %%eax \n\t"
1290 " js 1b \n\t"
1291 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1292 : "%eax"
1293 );
1e621b18
MN
1294#else
1295 int i;
1296 for(i=0; i<width; i++)
1297 dst[i]= src[2*i];
1298#endif
1299}
1300
1301static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1302{
6ff0ad6b
MN
1303#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1304 asm volatile(
1305 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1306 "movl %0, %%eax \n\t"
1307 "1: \n\t"
1308 "movq (%1, %%eax,4), %%mm0 \n\t"
1309 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1310 "movq (%2, %%eax,4), %%mm2 \n\t"
1311 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1312 PAVGB(%%mm2, %%mm0)
1313 PAVGB(%%mm3, %%mm1)
1314 "psrlw $8, %%mm0 \n\t"
1315 "psrlw $8, %%mm1 \n\t"
1316 "packuswb %%mm1, %%mm0 \n\t"
1317 "movq %%mm0, %%mm1 \n\t"
1318 "psrlw $8, %%mm0 \n\t"
1319 "pand %%mm4, %%mm1 \n\t"
1320 "packuswb %%mm0, %%mm0 \n\t"
1321 "packuswb %%mm1, %%mm1 \n\t"
1322 "movd %%mm0, (%4, %%eax) \n\t"
1323 "movd %%mm1, (%3, %%eax) \n\t"
1324 "addl $4, %%eax \n\t"
1325 " js 1b \n\t"
1326 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1327 : "%eax"
1328 );
1e621b18
MN
1329#else
1330 int i;
1331 for(i=0; i<width; i++)
1332 {
1333 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1334 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1335 }
1336#endif
1337}
1338
1339static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1340{
1341#ifdef HAVE_MMXFIXME
1342#else
1343 int i;
1344 for(i=0; i<width; i++)
1345 {
1346 int b= src[i*4+0];
1347 int g= src[i*4+1];
1348 int r= src[i*4+2];
1349
1350 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1351 }
1352#endif
1353}
1354
1355static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1356{
1357#ifdef HAVE_MMXFIXME
1358#else
1359 int i;
1360 for(i=0; i<width; i++)
1361 {
1362 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1363 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1364 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1365
1366 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1367 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1368 }
1369#endif
1370}
1371
1372static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1373{
ac6a2e45
MN
1374#ifdef HAVE_MMX
1375 asm volatile(
1376 "movl %2, %%eax \n\t"
854288bb
FB
1377 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1378 "movq "MANGLE(w1111)", %%mm5 \n\t"
ac6a2e45
MN
1379 "pxor %%mm7, %%mm7 \n\t"
1380 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1381 ".balign 16 \n\t"
1382 "1: \n\t"
1383 PREFETCH" 64(%0, %%ebx) \n\t"
1384 "movd (%0, %%ebx), %%mm0 \n\t"
1385 "movd 3(%0, %%ebx), %%mm1 \n\t"
1386 "punpcklbw %%mm7, %%mm0 \n\t"
1387 "punpcklbw %%mm7, %%mm1 \n\t"
1388 "movd 6(%0, %%ebx), %%mm2 \n\t"
1389 "movd 9(%0, %%ebx), %%mm3 \n\t"
1390 "punpcklbw %%mm7, %%mm2 \n\t"
1391 "punpcklbw %%mm7, %%mm3 \n\t"
1392 "pmaddwd %%mm6, %%mm0 \n\t"
1393 "pmaddwd %%mm6, %%mm1 \n\t"
1394 "pmaddwd %%mm6, %%mm2 \n\t"
1395 "pmaddwd %%mm6, %%mm3 \n\t"
1396#ifndef FAST_BGR2YV12
1397 "psrad $8, %%mm0 \n\t"
1398 "psrad $8, %%mm1 \n\t"
1399 "psrad $8, %%mm2 \n\t"
1400 "psrad $8, %%mm3 \n\t"
1401#endif
1402 "packssdw %%mm1, %%mm0 \n\t"
1403 "packssdw %%mm3, %%mm2 \n\t"
1404 "pmaddwd %%mm5, %%mm0 \n\t"
1405 "pmaddwd %%mm5, %%mm2 \n\t"
1406 "packssdw %%mm2, %%mm0 \n\t"
1407 "psraw $7, %%mm0 \n\t"
1408
1409 "movd 12(%0, %%ebx), %%mm4 \n\t"
1410 "movd 15(%0, %%ebx), %%mm1 \n\t"
1411 "punpcklbw %%mm7, %%mm4 \n\t"
1412 "punpcklbw %%mm7, %%mm1 \n\t"
1413 "movd 18(%0, %%ebx), %%mm2 \n\t"
1414 "movd 21(%0, %%ebx), %%mm3 \n\t"
1415 "punpcklbw %%mm7, %%mm2 \n\t"
1416 "punpcklbw %%mm7, %%mm3 \n\t"
1417 "pmaddwd %%mm6, %%mm4 \n\t"
1418 "pmaddwd %%mm6, %%mm1 \n\t"
1419 "pmaddwd %%mm6, %%mm2 \n\t"
1420 "pmaddwd %%mm6, %%mm3 \n\t"
1421#ifndef FAST_BGR2YV12
1422 "psrad $8, %%mm4 \n\t"
1423 "psrad $8, %%mm1 \n\t"
1424 "psrad $8, %%mm2 \n\t"
1425 "psrad $8, %%mm3 \n\t"
1426#endif
1427 "packssdw %%mm1, %%mm4 \n\t"
1428 "packssdw %%mm3, %%mm2 \n\t"
1429 "pmaddwd %%mm5, %%mm4 \n\t"
1430 "pmaddwd %%mm5, %%mm2 \n\t"
1431 "addl $24, %%ebx \n\t"
1432 "packssdw %%mm2, %%mm4 \n\t"
1433 "psraw $7, %%mm4 \n\t"
1434
1435 "packuswb %%mm4, %%mm0 \n\t"
854288bb 1436 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
ac6a2e45 1437
4342fc14 1438 "movq %%mm0, (%1, %%eax) \n\t"
ac6a2e45
MN
1439 "addl $8, %%eax \n\t"
1440 " js 1b \n\t"
1441 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1442 : "%eax", "%ebx"
1443 );
1e621b18
MN
1444#else
1445 int i;
1446 for(i=0; i<width; i++)
1447 {
1448 int b= src[i*3+0];
1449 int g= src[i*3+1];
1450 int r= src[i*3+2];
1451
1452 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1453 }
1454#endif
1455}
1456
1457static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1458{
4342fc14
MN
1459#ifdef HAVE_MMX
1460 asm volatile(
1461 "movl %4, %%eax \n\t"
854288bb
FB
1462 "movq "MANGLE(w1111)", %%mm5 \n\t"
1463 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
4342fc14
MN
1464 "pxor %%mm7, %%mm7 \n\t"
1465 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1466 "addl %%ebx, %%ebx \n\t"
1467 ".balign 16 \n\t"
1468 "1: \n\t"
1469 PREFETCH" 64(%0, %%ebx) \n\t"
1470 PREFETCH" 64(%1, %%ebx) \n\t"
1471#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1472 "movq (%0, %%ebx), %%mm0 \n\t"
1473 "movq (%1, %%ebx), %%mm1 \n\t"
1474 "movq 6(%0, %%ebx), %%mm2 \n\t"
1475 "movq 6(%1, %%ebx), %%mm3 \n\t"
1476 PAVGB(%%mm1, %%mm0)
1477 PAVGB(%%mm3, %%mm2)
1478 "movq %%mm0, %%mm1 \n\t"
1479 "movq %%mm2, %%mm3 \n\t"
1480 "psrlq $24, %%mm0 \n\t"
1481 "psrlq $24, %%mm2 \n\t"
1482 PAVGB(%%mm1, %%mm0)
1483 PAVGB(%%mm3, %%mm2)
1484 "punpcklbw %%mm7, %%mm0 \n\t"
1485 "punpcklbw %%mm7, %%mm2 \n\t"
1486#else
1487 "movd (%0, %%ebx), %%mm0 \n\t"
1488 "movd (%1, %%ebx), %%mm1 \n\t"
1489 "movd 3(%0, %%ebx), %%mm2 \n\t"
1490 "movd 3(%1, %%ebx), %%mm3 \n\t"
1491 "punpcklbw %%mm7, %%mm0 \n\t"
1492 "punpcklbw %%mm7, %%mm1 \n\t"
1493 "punpcklbw %%mm7, %%mm2 \n\t"
1494 "punpcklbw %%mm7, %%mm3 \n\t"
1495 "paddw %%mm1, %%mm0 \n\t"
1496 "paddw %%mm3, %%mm2 \n\t"
1497 "paddw %%mm2, %%mm0 \n\t"
1498 "movd 6(%0, %%ebx), %%mm4 \n\t"
1499 "movd 6(%1, %%ebx), %%mm1 \n\t"
1500 "movd 9(%0, %%ebx), %%mm2 \n\t"
1501 "movd 9(%1, %%ebx), %%mm3 \n\t"
1502 "punpcklbw %%mm7, %%mm4 \n\t"
1503 "punpcklbw %%mm7, %%mm1 \n\t"
1504 "punpcklbw %%mm7, %%mm2 \n\t"
1505 "punpcklbw %%mm7, %%mm3 \n\t"
1506 "paddw %%mm1, %%mm4 \n\t"
1507 "paddw %%mm3, %%mm2 \n\t"
1508 "paddw %%mm4, %%mm2 \n\t"
1509 "psrlw $2, %%mm0 \n\t"
1510 "psrlw $2, %%mm2 \n\t"
1511#endif
854288bb
FB
1512 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1513 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1514
1515 "pmaddwd %%mm0, %%mm1 \n\t"
1516 "pmaddwd %%mm2, %%mm3 \n\t"
1517 "pmaddwd %%mm6, %%mm0 \n\t"
1518 "pmaddwd %%mm6, %%mm2 \n\t"
1519#ifndef FAST_BGR2YV12
1520 "psrad $8, %%mm0 \n\t"
1521 "psrad $8, %%mm1 \n\t"
1522 "psrad $8, %%mm2 \n\t"
1523 "psrad $8, %%mm3 \n\t"
1524#endif
1525 "packssdw %%mm2, %%mm0 \n\t"
1526 "packssdw %%mm3, %%mm1 \n\t"
1527 "pmaddwd %%mm5, %%mm0 \n\t"
1528 "pmaddwd %%mm5, %%mm1 \n\t"
1529 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1530 "psraw $7, %%mm0 \n\t"
1531
1532#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1533 "movq 12(%0, %%ebx), %%mm4 \n\t"
1534 "movq 12(%1, %%ebx), %%mm1 \n\t"
1535 "movq 18(%0, %%ebx), %%mm2 \n\t"
1536 "movq 18(%1, %%ebx), %%mm3 \n\t"
1537 PAVGB(%%mm1, %%mm4)
1538 PAVGB(%%mm3, %%mm2)
1539 "movq %%mm4, %%mm1 \n\t"
1540 "movq %%mm2, %%mm3 \n\t"
1541 "psrlq $24, %%mm4 \n\t"
1542 "psrlq $24, %%mm2 \n\t"
1543 PAVGB(%%mm1, %%mm4)
1544 PAVGB(%%mm3, %%mm2)
1545 "punpcklbw %%mm7, %%mm4 \n\t"
1546 "punpcklbw %%mm7, %%mm2 \n\t"
1547#else
1548 "movd 12(%0, %%ebx), %%mm4 \n\t"
1549 "movd 12(%1, %%ebx), %%mm1 \n\t"
1550 "movd 15(%0, %%ebx), %%mm2 \n\t"
1551 "movd 15(%1, %%ebx), %%mm3 \n\t"
1552 "punpcklbw %%mm7, %%mm4 \n\t"
1553 "punpcklbw %%mm7, %%mm1 \n\t"
1554 "punpcklbw %%mm7, %%mm2 \n\t"
1555 "punpcklbw %%mm7, %%mm3 \n\t"
1556 "paddw %%mm1, %%mm4 \n\t"
1557 "paddw %%mm3, %%mm2 \n\t"
1558 "paddw %%mm2, %%mm4 \n\t"
1559 "movd 18(%0, %%ebx), %%mm5 \n\t"
1560 "movd 18(%1, %%ebx), %%mm1 \n\t"
1561 "movd 21(%0, %%ebx), %%mm2 \n\t"
1562 "movd 21(%1, %%ebx), %%mm3 \n\t"
1563 "punpcklbw %%mm7, %%mm5 \n\t"
1564 "punpcklbw %%mm7, %%mm1 \n\t"
1565 "punpcklbw %%mm7, %%mm2 \n\t"
1566 "punpcklbw %%mm7, %%mm3 \n\t"
1567 "paddw %%mm1, %%mm5 \n\t"
1568 "paddw %%mm3, %%mm2 \n\t"
1569 "paddw %%mm5, %%mm2 \n\t"
854288bb 1570 "movq "MANGLE(w1111)", %%mm5 \n\t"
4342fc14
MN
1571 "psrlw $2, %%mm4 \n\t"
1572 "psrlw $2, %%mm2 \n\t"
1573#endif
854288bb
FB
1574 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1575 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1576
1577 "pmaddwd %%mm4, %%mm1 \n\t"
1578 "pmaddwd %%mm2, %%mm3 \n\t"
1579 "pmaddwd %%mm6, %%mm4 \n\t"
1580 "pmaddwd %%mm6, %%mm2 \n\t"
1581#ifndef FAST_BGR2YV12
1582 "psrad $8, %%mm4 \n\t"
1583 "psrad $8, %%mm1 \n\t"
1584 "psrad $8, %%mm2 \n\t"
1585 "psrad $8, %%mm3 \n\t"
1586#endif
1587 "packssdw %%mm2, %%mm4 \n\t"
1588 "packssdw %%mm3, %%mm1 \n\t"
1589 "pmaddwd %%mm5, %%mm4 \n\t"
1590 "pmaddwd %%mm5, %%mm1 \n\t"
1591 "addl $24, %%ebx \n\t"
1592 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1593 "psraw $7, %%mm4 \n\t"
1594
1595 "movq %%mm0, %%mm1 \n\t"
1596 "punpckldq %%mm4, %%mm0 \n\t"
1597 "punpckhdq %%mm4, %%mm1 \n\t"
1598 "packsswb %%mm1, %%mm0 \n\t"
854288bb 1599 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
4342fc14
MN
1600
1601 "movd %%mm0, (%2, %%eax) \n\t"
1602 "punpckhdq %%mm0, %%mm0 \n\t"
1603 "movd %%mm0, (%3, %%eax) \n\t"
1604 "addl $4, %%eax \n\t"
1605 " js 1b \n\t"
1606 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1607 : "%eax", "%ebx"
1608 );
1e621b18
MN
1609#else
1610 int i;
1611 for(i=0; i<width; i++)
1612 {
1613 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1614 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1615 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1616
1617 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1618 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1619 }
1620#endif
1621}
1622
6af250ea
MN
1623static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1624{
1625 int i;
1626 for(i=0; i<width; i++)
1627 {
1628 int d= src[i*2] + (src[i*2+1]<<8);
1629 int b= d&0x1F;
1630 int g= (d>>5)&0x3F;
1631 int r= (d>>11)&0x1F;
1632
1633 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1634 }
1635}
1636
1637static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1638{
1639 int i;
1640 for(i=0; i<width; i++)
1641 {
5bb9d9d8
MN
1642#if 1
1643 int d0= le2me_32( ((uint32_t*)src1)[i] );
1644 int d1= le2me_32( ((uint32_t*)src2)[i] );
1645
1646 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1647 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1648
1649 int dh2= (dh>>11) + (dh<<21);
1650 int d= dh2 + dl;
1651
1652 int b= d&0x7F;
1653 int r= (d>>11)&0x7F;
1654 int g= d>>21;
1655#else
6af250ea
MN
1656 int d0= src1[i*4] + (src1[i*4+1]<<8);
1657 int b0= d0&0x1F;
1658 int g0= (d0>>5)&0x3F;
1659 int r0= (d0>>11)&0x1F;
1660
1661 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1662 int b1= d1&0x1F;
1663 int g1= (d1>>5)&0x3F;
1664 int r1= (d1>>11)&0x1F;
1665
1666 int d2= src2[i*4] + (src2[i*4+1]<<8);
1667 int b2= d2&0x1F;
1668 int g2= (d2>>5)&0x3F;
1669 int r2= (d2>>11)&0x1F;
1670
1671 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1672 int b3= d3&0x1F;
1673 int g3= (d3>>5)&0x3F;
1674 int r3= (d3>>11)&0x1F;
1675
1676 int b= b0 + b1 + b2 + b3;
1677 int g= g0 + g1 + g2 + g3;
1678 int r= r0 + r1 + r2 + r3;
5bb9d9d8 1679#endif
6af250ea
MN
1680 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1681 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1682 }
1683}
1684
b72034dd
MN
1685static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1686{
1687 int i;
1688 for(i=0; i<width; i++)
1689 {
1690 int d= src[i*2] + (src[i*2+1]<<8);
1691 int b= d&0x1F;
1692 int g= (d>>5)&0x1F;
1693 int r= (d>>10)&0x1F;
1694
1695 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1696 }
1697}
1698
1699static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1700{
1701 int i;
1702 for(i=0; i<width; i++)
1703 {
1704#if 1
1705 int d0= le2me_32( ((uint32_t*)src1)[i] );
1706 int d1= le2me_32( ((uint32_t*)src2)[i] );
1707
1708 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1709 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1710
1711 int dh2= (dh>>11) + (dh<<21);
1712 int d= dh2 + dl;
1713
1714 int b= d&0x7F;
1715 int r= (d>>10)&0x7F;
1716 int g= d>>21;
1717#else
1718 int d0= src1[i*4] + (src1[i*4+1]<<8);
1719 int b0= d0&0x1F;
1720 int g0= (d0>>5)&0x1F;
1721 int r0= (d0>>10)&0x1F;
1722
1723 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1724 int b1= d1&0x1F;
1725 int g1= (d1>>5)&0x1F;
1726 int r1= (d1>>10)&0x1F;
1727
1728 int d2= src2[i*4] + (src2[i*4+1]<<8);
1729 int b2= d2&0x1F;
1730 int g2= (d2>>5)&0x1F;
1731 int r2= (d2>>10)&0x1F;
1732
1733 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1734 int b3= d3&0x1F;
1735 int g3= (d3>>5)&0x1F;
1736 int r3= (d3>>10)&0x1F;
1737
1738 int b= b0 + b1 + b2 + b3;
1739 int g= g0 + g1 + g2 + g3;
1740 int r= r0 + r1 + r2 + r3;
1741#endif
1742 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1743 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1744 }
1745}
1746
1747
a861d4d7
MN
1748static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1749{
1750 int i;
1751 for(i=0; i<width; i++)
1752 {
1753 int r= src[i*4+0];
1754 int g= src[i*4+1];
1755 int b= src[i*4+2];
1756
1757 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1758 }
1759}
1760
1761static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1762{
1763 int i;
1764 for(i=0; i<width; i++)
1765 {
1766 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1767 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1768 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1769
1770 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1771 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1772 }
1773}
1774
1775static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1776{
1777 int i;
1778 for(i=0; i<width; i++)
1779 {
1780 int r= src[i*3+0];
1781 int g= src[i*3+1];
1782 int b= src[i*3+2];
1783
1784 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1785 }
1786}
1787
1788static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1789{
1790 int i;
1791 for(i=0; i<width; i++)
1792 {
1793 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1794 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1795 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1796
1797 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1798 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1799 }
1800}
1801
1e621b18 1802
077ea8a7
MN
1803// Bilinear / Bicubic scaling
1804static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1805 int16_t *filter, int16_t *filterPos, int filterSize)
2ff198c1 1806{
077ea8a7
MN
1807#ifdef HAVE_MMX
1808 if(filterSize==4) // allways true for upscaling, sometimes for down too
1809 {
1810 int counter= -2*dstW;
1811 filter-= counter*2;
1812 filterPos-= counter/2;
1813 dst-= counter/2;
1814 asm volatile(
1815 "pxor %%mm7, %%mm7 \n\t"
9b464428 1816 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
1817 "pushl %%ebp \n\t" // we use 7 regs here ...
1818 "movl %%eax, %%ebp \n\t"
1819 ".balign 16 \n\t"
1820 "1: \n\t"
1821 "movzwl (%2, %%ebp), %%eax \n\t"
1822 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1823 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1824 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1825 "movd (%3, %%eax), %%mm0 \n\t"
1826 "movd (%3, %%ebx), %%mm2 \n\t"
1827 "punpcklbw %%mm7, %%mm0 \n\t"
1828 "punpcklbw %%mm7, %%mm2 \n\t"
1829 "pmaddwd %%mm1, %%mm0 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "psrad $8, %%mm0 \n\t"
1832 "psrad $8, %%mm3 \n\t"
1833 "packssdw %%mm3, %%mm0 \n\t"
1834 "pmaddwd %%mm6, %%mm0 \n\t"
1835 "packssdw %%mm0, %%mm0 \n\t"
1836 "movd %%mm0, (%4, %%ebp) \n\t"
1837 "addl $4, %%ebp \n\t"
1838 " jnc 1b \n\t"
e3d2500f 1839
077ea8a7
MN
1840 "popl %%ebp \n\t"
1841 : "+a" (counter)
1842 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1843 : "%ebx"
1844 );
1845 }
1846 else if(filterSize==8)
1847 {
1848 int counter= -2*dstW;
1849 filter-= counter*4;
1850 filterPos-= counter/2;
1851 dst-= counter/2;
1852 asm volatile(
1853 "pxor %%mm7, %%mm7 \n\t"
9b464428 1854 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
1855 "pushl %%ebp \n\t" // we use 7 regs here ...
1856 "movl %%eax, %%ebp \n\t"
1857 ".balign 16 \n\t"
1858 "1: \n\t"
1859 "movzwl (%2, %%ebp), %%eax \n\t"
1860 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1861 "movq (%1, %%ebp, 8), %%mm1 \n\t"
1862 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
1863 "movd (%3, %%eax), %%mm0 \n\t"
1864 "movd (%3, %%ebx), %%mm2 \n\t"
1865 "punpcklbw %%mm7, %%mm0 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "pmaddwd %%mm1, %%mm0 \n\t"
1868 "pmaddwd %%mm2, %%mm3 \n\t"
1869
1870 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
1871 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
1872 "movd 4(%3, %%eax), %%mm4 \n\t"
1873 "movd 4(%3, %%ebx), %%mm2 \n\t"
1874 "punpcklbw %%mm7, %%mm4 \n\t"
1875 "punpcklbw %%mm7, %%mm2 \n\t"
1876 "pmaddwd %%mm1, %%mm4 \n\t"
1877 "pmaddwd %%mm2, %%mm5 \n\t"
1878 "paddd %%mm4, %%mm0 \n\t"
1879 "paddd %%mm5, %%mm3 \n\t"
1880
1881 "psrad $8, %%mm0 \n\t"
1882 "psrad $8, %%mm3 \n\t"
1883 "packssdw %%mm3, %%mm0 \n\t"
1884 "pmaddwd %%mm6, %%mm0 \n\t"
1885 "packssdw %%mm0, %%mm0 \n\t"
1886 "movd %%mm0, (%4, %%ebp) \n\t"
1887 "addl $4, %%ebp \n\t"
1888 " jnc 1b \n\t"
c1b0bfb4 1889
077ea8a7
MN
1890 "popl %%ebp \n\t"
1891 : "+a" (counter)
1892 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1893 : "%ebx"
1894 );
1895 }
1896 else
1897 {
1898 int counter= -2*dstW;
1899// filter-= counter*filterSize/2;
1900 filterPos-= counter/2;
1901 dst-= counter/2;
1902 asm volatile(
1903 "pxor %%mm7, %%mm7 \n\t"
9b464428 1904 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
1905 ".balign 16 \n\t"
1906 "1: \n\t"
1907 "movl %2, %%ecx \n\t"
1908 "movzwl (%%ecx, %0), %%eax \n\t"
1909 "movzwl 2(%%ecx, %0), %%ebx \n\t"
1910 "movl %5, %%ecx \n\t"
1911 "pxor %%mm4, %%mm4 \n\t"
1912 "pxor %%mm5, %%mm5 \n\t"
1913 "2: \n\t"
1914 "movq (%1), %%mm1 \n\t"
1915 "movq (%1, %6), %%mm3 \n\t"
1916 "movd (%%ecx, %%eax), %%mm0 \n\t"
1917 "movd (%%ecx, %%ebx), %%mm2 \n\t"
1918 "punpcklbw %%mm7, %%mm0 \n\t"
1919 "punpcklbw %%mm7, %%mm2 \n\t"
1920 "pmaddwd %%mm1, %%mm0 \n\t"
1921 "pmaddwd %%mm2, %%mm3 \n\t"
1922 "paddd %%mm3, %%mm5 \n\t"
1923 "paddd %%mm0, %%mm4 \n\t"
1924 "addl $8, %1 \n\t"
1925 "addl $4, %%ecx \n\t"
1926 "cmpl %4, %%ecx \n\t"
1927 " jb 2b \n\t"
1928 "addl %6, %1 \n\t"
1929 "psrad $8, %%mm4 \n\t"
1930 "psrad $8, %%mm5 \n\t"
1931 "packssdw %%mm5, %%mm4 \n\t"
1932 "pmaddwd %%mm6, %%mm4 \n\t"
1933 "packssdw %%mm4, %%mm4 \n\t"
1934 "movl %3, %%eax \n\t"
1935 "movd %%mm4, (%%eax, %0) \n\t"
1936 "addl $4, %0 \n\t"
1937 " jnc 1b \n\t"
c1b0bfb4 1938
627690b5
MN
1939 : "+r" (counter), "+r" (filter)
1940 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
077ea8a7 1941 "m" (src), "r" (filterSize*2)
e2f5a2a9 1942 : "%ebx", "%eax", "%ecx"
077ea8a7
MN
1943 );
1944 }
1945#else
1946 int i;
1947 for(i=0; i<dstW; i++)
1948 {
1949 int j;
1950 int srcPos= filterPos[i];
1951 int val=0;
c1b0bfb4 1952// printf("filterPos: %d\n", filterPos[i]);
077ea8a7
MN
1953 for(j=0; j<filterSize; j++)
1954 {
1955// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1956 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1957 }
1958// filter += hFilterSize;
1959 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
1960// dst[i] = val>>7;
1961 }
1962#endif
1963}
2ff198c1 1964 // *** horizontal scale Y line to temp buffer
28bf81c9
MN
1965static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
1966 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1e621b18 1967 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
b7dc6f66
MN
1968 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
1969 int32_t *mmx2FilterPos)
077ea8a7 1970{
1e621b18
MN
1971 if(srcFormat==IMGFMT_YUY2)
1972 {
1973 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
1974 src= formatConvBuffer;
1975 }
1976 else if(srcFormat==IMGFMT_BGR32)
1977 {
1978 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
1979 src= formatConvBuffer;
1980 }
1981 else if(srcFormat==IMGFMT_BGR24)
1982 {
1983 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
1984 src= formatConvBuffer;
1985 }
6af250ea
MN
1986 else if(srcFormat==IMGFMT_BGR16)
1987 {
1988 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
1989 src= formatConvBuffer;
1990 }
b72034dd
MN
1991 else if(srcFormat==IMGFMT_BGR15)
1992 {
1993 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
1994 src= formatConvBuffer;
1995 }
a861d4d7
MN
1996 else if(srcFormat==IMGFMT_RGB32)
1997 {
1998 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
1999 src= formatConvBuffer;
2000 }
2001 else if(srcFormat==IMGFMT_RGB24)
2002 {
2003 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2004 src= formatConvBuffer;
2005 }
1e621b18 2006
e3d2500f
MN
2007#ifdef HAVE_MMX
2008 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
28bf81c9 2009 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2010#else
28bf81c9 2011 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2012#endif
077ea8a7
MN
2013 {
2014 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2015 }
2016 else // Fast Bilinear upscale / crap downscale
2017 {
2ff198c1 2018#ifdef ARCH_X86
2ff198c1 2019#ifdef HAVE_MMX2
96034638 2020 int i;
2ff198c1
MN
2021 if(canMMX2BeUsed)
2022 {
2023 asm volatile(
2024 "pxor %%mm7, %%mm7 \n\t"
b7dc6f66
MN
2025 "movl %0, %%ecx \n\t"
2026 "movl %1, %%edi \n\t"
2027 "movl %2, %%edx \n\t"
2028 "movl %3, %%ebx \n\t"
2ff198c1 2029 "xorl %%eax, %%eax \n\t" // i
b7dc6f66
MN
2030 PREFETCH" (%%ecx) \n\t"
2031 PREFETCH" 32(%%ecx) \n\t"
2032 PREFETCH" 64(%%ecx) \n\t"
99cefd0b 2033
2ff198c1 2034#define FUNNY_Y_CODE \
b7dc6f66
MN
2035 "movl (%%ebx), %%esi \n\t"\
2036 "call *%4 \n\t"\
2037 "addl (%%ebx, %%eax), %%ecx \n\t"\
2038 "addl %%eax, %%edi \n\t"\
2039 "xorl %%eax, %%eax \n\t"\
99cefd0b 2040
2ff198c1
MN
2041FUNNY_Y_CODE
2042FUNNY_Y_CODE
2043FUNNY_Y_CODE
2044FUNNY_Y_CODE
2045FUNNY_Y_CODE
2046FUNNY_Y_CODE
2047FUNNY_Y_CODE
2048FUNNY_Y_CODE
2049
b7dc6f66
MN
2050 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2051 "m" (funnyYCode)
2ff198c1
MN
2052 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2053 );
af91b8b3 2054 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2ff198c1
MN
2055 }
2056 else
2057 {
2058#endif
2059 //NO MMX just normal asm ...
2060 asm volatile(
2061 "xorl %%eax, %%eax \n\t" // i
2062 "xorl %%ebx, %%ebx \n\t" // xx
2063 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2064 ".balign 16 \n\t"
2ff198c1
MN
2065 "1: \n\t"
2066 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2067 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2068 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2069 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2070 "shll $16, %%edi \n\t"
2071 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2072 "movl %1, %%edi \n\t"
2073 "shrl $9, %%esi \n\t"
2074 "movw %%si, (%%edi, %%eax, 2) \n\t"
2075 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2076 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2077
2078 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2079 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2080 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2081 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2082 "shll $16, %%edi \n\t"
2083 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2084 "movl %1, %%edi \n\t"
2085 "shrl $9, %%esi \n\t"
2086 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
2087 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2088 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2089
2090
2091 "addl $2, %%eax \n\t"
2092 "cmpl %2, %%eax \n\t"
2093 " jb 1b \n\t"
2094
2095
2096 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2097 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2098 );
2099#ifdef HAVE_MMX2
2100 } //if MMX2 cant be used
2101#endif
2102#else
96034638
MN
2103 int i;
2104 unsigned int xpos=0;
2105 for(i=0;i<dstWidth;i++)
2106 {
2107 register unsigned int xx=xpos>>16;
2108 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2109 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2110 xpos+=xInc;
2111 }
2ff198c1 2112#endif
077ea8a7 2113 }
2ff198c1
MN
2114}
2115
28bf81c9
MN
2116inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2117 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
1e621b18 2118 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
b7dc6f66
MN
2119 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2120 int32_t *mmx2FilterPos)
2ff198c1 2121{
1e621b18
MN
2122 if(srcFormat==IMGFMT_YUY2)
2123 {
2124 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2125 src1= formatConvBuffer;
2126 src2= formatConvBuffer+2048;
2127 }
2128 else if(srcFormat==IMGFMT_BGR32)
2129 {
2130 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2131 src1= formatConvBuffer;
2132 src2= formatConvBuffer+2048;
2133 }
2134 else if(srcFormat==IMGFMT_BGR24)
2135 {
2136 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2137 src1= formatConvBuffer;
2138 src2= formatConvBuffer+2048;
2139 }
6af250ea
MN
2140 else if(srcFormat==IMGFMT_BGR16)
2141 {
2142 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2143 src1= formatConvBuffer;
2144 src2= formatConvBuffer+2048;
2145 }
b72034dd
MN
2146 else if(srcFormat==IMGFMT_BGR15)
2147 {
2148 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2149 src1= formatConvBuffer;
2150 src2= formatConvBuffer+2048;
2151 }
a861d4d7
MN
2152 else if(srcFormat==IMGFMT_RGB32)
2153 {
2154 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2155 src1= formatConvBuffer;
2156 src2= formatConvBuffer+2048;
2157 }
2158 else if(srcFormat==IMGFMT_RGB24)
2159 {
2160 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2161 src1= formatConvBuffer;
2162 src2= formatConvBuffer+2048;
2163 }
6ff0ad6b
MN
2164 else if(isGray(srcFormat))
2165 {
2166 return;
2167 }
1e621b18 2168
e3d2500f
MN
2169#ifdef HAVE_MMX
2170 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
28bf81c9 2171 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2172#else
28bf81c9 2173 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2174#endif
077ea8a7
MN
2175 {
2176 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2177 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2178 }
2179 else // Fast Bilinear upscale / crap downscale
2180 {
2ff198c1
MN
2181#ifdef ARCH_X86
2182#ifdef HAVE_MMX2
96034638 2183 int i;
2ff198c1
MN
2184 if(canMMX2BeUsed)
2185 {
2186 asm volatile(
b7dc6f66
MN
2187 "pxor %%mm7, %%mm7 \n\t"
2188 "movl %0, %%ecx \n\t"
2189 "movl %1, %%edi \n\t"
2190 "movl %2, %%edx \n\t"
2191 "movl %3, %%ebx \n\t"
2192 "xorl %%eax, %%eax \n\t" // i
2193 PREFETCH" (%%ecx) \n\t"
2194 PREFETCH" 32(%%ecx) \n\t"
2195 PREFETCH" 64(%%ecx) \n\t"
2196
2197#define FUNNY_UV_CODE \
2198 "movl (%%ebx), %%esi \n\t"\
2199 "call *%4 \n\t"\
2200 "addl (%%ebx, %%eax), %%ecx \n\t"\
2201 "addl %%eax, %%edi \n\t"\
2202 "xorl %%eax, %%eax \n\t"\
2203
2204FUNNY_UV_CODE
2205FUNNY_UV_CODE
2206FUNNY_UV_CODE
2207FUNNY_UV_CODE
2208 "xorl %%eax, %%eax \n\t" // i
2209 "movl %5, %%ecx \n\t" // src
2210 "movl %1, %%edi \n\t" // buf1
2211 "addl $4096, %%edi \n\t"
2212 PREFETCH" (%%ecx) \n\t"
2213 PREFETCH" 32(%%ecx) \n\t"
2214 PREFETCH" 64(%%ecx) \n\t"
2215
2216FUNNY_UV_CODE
2217FUNNY_UV_CODE
2218FUNNY_UV_CODE
2219FUNNY_UV_CODE
2220
2221 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2222 "m" (funnyUVCode), "m" (src2)
2223 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2224 );
c1b0bfb4 2225 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2ff198c1 2226 {
c1b0bfb4
MN
2227// printf("%d %d %d\n", dstWidth, i, srcW);
2228 dst[i] = src1[srcW-1]*128;
2229 dst[i+2048] = src2[srcW-1]*128;
2ff198c1
MN
2230 }
2231 }
2232 else
2233 {
2234#endif
2235 asm volatile(
2236 "xorl %%eax, %%eax \n\t" // i
2237 "xorl %%ebx, %%ebx \n\t" // xx
2238 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2239 ".balign 16 \n\t"
2ff198c1
MN
2240 "1: \n\t"
2241 "movl %0, %%esi \n\t"
2242 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2243 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2244 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2245 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2246 "shll $16, %%edi \n\t"
2247 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2248 "movl %1, %%edi \n\t"
2249 "shrl $9, %%esi \n\t"
2250 "movw %%si, (%%edi, %%eax, 2) \n\t"
2251
2252 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2253 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2254 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2255 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2256 "shll $16, %%edi \n\t"
2257 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2258 "movl %1, %%edi \n\t"
2259 "shrl $9, %%esi \n\t"
2260 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2261
2262 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2263 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
2264 "addl $1, %%eax \n\t"
2265 "cmpl %2, %%eax \n\t"
2266 " jb 1b \n\t"
2267
2268 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2269 "r" (src2)
2270 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2271 );
2272#ifdef HAVE_MMX2
2273 } //if MMX2 cant be used
2274#endif
2275#else
96034638
MN
2276 int i;
2277 unsigned int xpos=0;
2278 for(i=0;i<dstWidth;i++)
2279 {
2280 register unsigned int xx=xpos>>16;
2281 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2282 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2283 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1faf0867
MN
2284/* slower
2285 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2286 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2287*/
96034638
MN
2288 xpos+=xInc;
2289 }
2ff198c1 2290#endif
077ea8a7
MN
2291 }
2292}
2293
1e621b18 2294static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
332105e4 2295 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
28bf81c9
MN
2296
2297 /* load a few things into local vars to make the code more readable? and faster */
2298 const int srcW= c->srcW;
2299 const int dstW= c->dstW;
2300 const int dstH= c->dstH;
2301 const int chrDstW= c->chrDstW;
e616aa93 2302 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2303 const int lumXInc= c->lumXInc;
2304 const int chrXInc= c->chrXInc;
fe8054c0 2305 const int dstFormat= c->dstFormat;
44c1035c 2306 const int srcFormat= c->srcFormat;
28bf81c9
MN
2307 const int flags= c->flags;
2308 const int canMMX2BeUsed= c->canMMX2BeUsed;
2309 int16_t *vLumFilterPos= c->vLumFilterPos;
2310 int16_t *vChrFilterPos= c->vChrFilterPos;
2311 int16_t *hLumFilterPos= c->hLumFilterPos;
2312 int16_t *hChrFilterPos= c->hChrFilterPos;
2313 int16_t *vLumFilter= c->vLumFilter;
2314 int16_t *vChrFilter= c->vChrFilter;
2315 int16_t *hLumFilter= c->hLumFilter;
2316 int16_t *hChrFilter= c->hChrFilter;
2317 int16_t *lumMmxFilter= c->lumMmxFilter;
2318 int16_t *chrMmxFilter= c->chrMmxFilter;
2319 const int vLumFilterSize= c->vLumFilterSize;
2320 const int vChrFilterSize= c->vChrFilterSize;
2321 const int hLumFilterSize= c->hLumFilterSize;
2322 const int hChrFilterSize= c->hChrFilterSize;
2323 int16_t **lumPixBuf= c->lumPixBuf;
2324 int16_t **chrPixBuf= c->chrPixBuf;
2325 const int vLumBufSize= c->vLumBufSize;
2326 const int vChrBufSize= c->vChrBufSize;
2327 uint8_t *funnyYCode= c->funnyYCode;
2328 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2329 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2330 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2331 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
28bf81c9
MN
2332
2333 /* vars whch will change and which we need to storw back in the context */
2334 int dstY= c->dstY;
2335 int lumBufIndex= c->lumBufIndex;
2336 int chrBufIndex= c->chrBufIndex;
2337 int lastInLumBuf= c->lastInLumBuf;
2338 int lastInChrBuf= c->lastInChrBuf;
1e621b18 2339 int srcStride[3];
332105e4 2340 int dstStride[3];
6c7506de
MN
2341 uint8_t *src[3];
2342 uint8_t *dst[3];
5859233b
MN
2343
2344 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2345 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
44c1035c 2346
5859233b 2347 if(isPacked(c->srcFormat)){
1e621b18
MN
2348 src[0]=
2349 src[1]=
2350 src[2]= srcParam[0];
5859233b 2351 srcStride[0]=
1e621b18 2352 srcStride[1]=
5859233b 2353 srcStride[2]= srcStrideParam[0];
6c7506de 2354 }
5859233b
MN
2355 srcStride[1]<<= c->vChrDrop;
2356 srcStride[2]<<= c->vChrDrop;
6c7506de 2357
c7a810cc
MN
2358// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2359// (int)dst[0], (int)dst[1], (int)dst[2]);
2360
2361#if 0 //self test FIXME move to a vfilter or something
2362{
2363static volatile int i=0;
2364i++;
2365if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2366 selfTest(src, srcStride, c->srcW, c->srcH);
2367i--;
2368}
2369#endif
37079906
MN
2370
2371//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2372//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2373
2374 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2375 {
2376 static int firstTime=1; //FIXME move this into the context perhaps
2377 if(flags & SWS_PRINT_INFO && firstTime)
2378 {
4a53a912 2379 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
2380 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2381 firstTime=0;
2382 }
2383 }
28bf81c9 2384
1e621b18
MN
2385 /* Note the user might start scaling the picture in the middle so this will not get executed
2386 this is not really intended but works currently, so ppl might do it */
28bf81c9
MN
2387 if(srcSliceY ==0){
2388 lumBufIndex=0;
2389 chrBufIndex=0;
1e621b18 2390 dstY=0;
28bf81c9
MN
2391 lastInLumBuf= -1;
2392 lastInChrBuf= -1;
077ea8a7 2393 }
d3f41512 2394
c1b0bfb4 2395 for(;dstY < dstH; dstY++){
28bf81c9 2396 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
2397 const int chrDstY= dstY>>c->chrDstVSubSample;
2398 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2399 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 2400
c1b0bfb4
MN
2401 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2402 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2403 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2404 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2405
c7f822d9
MN
2406 //handle holes (FAST_BILINEAR & weird filters)
2407 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2408 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2409//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2410 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2411 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2412
c1b0bfb4 2413 // Do we have enough lines in this slice to output the dstY line
e616aa93 2414 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
2415 {
2416 //Do horizontal scaling
2417 while(lastInLumBuf < lastLumSrcY)
d3f41512 2418 {
28bf81c9 2419 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2420 lumBufIndex++;
c7f822d9 2421// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2422 ASSERT(lumBufIndex < 2*vLumBufSize)
2423 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2424 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2425// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2426 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2427 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2428 funnyYCode, c->srcFormat, formatConvBuffer,
2429 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2430 lastInLumBuf++;
2431 }
2432 while(lastInChrBuf < lastChrSrcY)
2433 {
e616aa93
MN
2434 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2435 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2436 chrBufIndex++;
2437 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2438 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2439 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
28bf81c9 2440 //FIXME replace parameters through context struct (some at least)
44c1035c
MN
2441
2442 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2443 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2444 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2445 funnyUVCode, c->srcFormat, formatConvBuffer,
2446 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4 2447 lastInChrBuf++;
d3f41512 2448 }
c1b0bfb4
MN
2449 //wrap buf index around to stay inside the ring buffer
2450 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2451 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2452 }
c1b0bfb4 2453 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2454 {
c1b0bfb4
MN
2455/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2456 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2457 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
e616aa93
MN
2458 vChrBufSize, vLumBufSize);*/
2459
c1b0bfb4
MN
2460 //Do horizontal scaling
2461 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2462 {
28bf81c9 2463 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2464 lumBufIndex++;
2465 ASSERT(lumBufIndex < 2*vLumBufSize)
2466 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2467 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2468 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2469 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2470 funnyYCode, c->srcFormat, formatConvBuffer,
2471 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2472 lastInLumBuf++;
2473 }
e616aa93 2474 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
c1b0bfb4 2475 {
e616aa93
MN
2476 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2477 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2478 chrBufIndex++;
2479 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2480 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2481 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
44c1035c
MN
2482
2483 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2484 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2485 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2486 funnyUVCode, c->srcFormat, formatConvBuffer,
2487 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4
MN
2488 lastInChrBuf++;
2489 }
2490 //wrap buf index around to stay inside the ring buffer
2491 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2492 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2493 break; //we cant output a dstY line so lets try with the next slice
2ff198c1 2494 }
d3f41512 2495
c1b0bfb4
MN
2496#ifdef HAVE_MMX
2497 b5Dither= dither8[dstY&1];
2498 g6Dither= dither4[dstY&1];
2499 g5Dither= dither8[dstY&1];
2500 r5Dither= dither8[(dstY+1)&1];
2501#endif
28bf81c9 2502 if(dstY < dstH-2)
e3d2500f 2503 {
44c1035c 2504 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
0f25d72b 2505 {
44c1035c 2506 if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
c1b0bfb4 2507 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2508 {
c1b0bfb4
MN
2509 int16_t *lumBuf = lumPixBuf[0];
2510 int16_t *chrBuf= chrPixBuf[0];
e616aa93 2511 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
c1b0bfb4
MN
2512 }
2513 else //General YV12
2514 {
2515 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2516 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2517 RENAME(yuv2yuvX)(
e616aa93
MN
2518 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2519 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2520 dest, uDest, vDest, dstW, chrDstW,
2521 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
2ff198c1 2522 }
0f25d72b 2523 }
c1b0bfb4 2524 else
2ff198c1 2525 {
c1b0bfb4
MN
2526 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2527 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
d3f41512 2528
c1b0bfb4
MN
2529 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2530 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2531 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2532 {
2533 int chrAlpha= vChrFilter[2*dstY+1];
2ff198c1 2534
cf7d1c1a
MN
2535 RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2536 dest, dstW, chrAlpha, dstFormat, flags, dstY);
c1b0bfb4
MN
2537 }
2538 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2539 {
2540 int lumAlpha= vLumFilter[2*dstY+1];
2541 int chrAlpha= vChrFilter[2*dstY+1];
2542
cf7d1c1a
MN
2543 RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2544 dest, dstW, lumAlpha, chrAlpha, dstY);
c1b0bfb4
MN
2545 }
2546 else //General RGB
2547 {
cf7d1c1a 2548 RENAME(yuv2rgbX)(c,
c1b0bfb4
MN
2549 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2550 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
cf7d1c1a
MN
2551 dest, dstW,
2552 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
c1b0bfb4
MN
2553 }
2554 }
e3d2500f
MN
2555 }
2556 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2557 {
2558 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2559 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
f5b58629 2560 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
e3d2500f 2561 {
f5b58629 2562 if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL;
5859233b 2563 yuv2yuvXinC(
e616aa93
MN
2564 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2565 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
5859233b 2566 dest, uDest, vDest, dstW, chrDstW);
e3d2500f
MN
2567 }
2568 else
2569 {
2570 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2571 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
cf7d1c1a 2572 yuv2rgbXinC(c,
e3d2500f
MN
2573 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2574 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
cf7d1c1a 2575 dest, dstW, dstY);
e3d2500f
MN
2576 }
2577 }
c1b0bfb4 2578 }
17f715fa
MN
2579
2580#ifdef HAVE_MMX
2581 __asm __volatile(SFENCE:::"memory");
1faf0867 2582 __asm __volatile(EMMS:::"memory");
17f715fa 2583#endif
28bf81c9
MN
2584 /* store changed local vars back in the context */
2585 c->dstY= dstY;
2586 c->lumBufIndex= lumBufIndex;
2587 c->chrBufIndex= chrBufIndex;
2588 c->lastInLumBuf= lastInLumBuf;
2589 c->lastInChrBuf= lastInChrBuf;
627690b5 2590}