bgr24toY in MMX
[libav.git] / postproc / swscale_template.c
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
541c4eb9 19#undef MOVNTQ
7d7f78b5 20#undef PAVGB
48a05cec
MN
21#undef PREFETCH
22#undef PREFETCHW
23#undef EMMS
24#undef SFENCE
25
26#ifdef HAVE_3DNOW
27/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
28#define EMMS "femms"
29#else
30#define EMMS "emms"
31#endif
32
33#ifdef HAVE_3DNOW
34#define PREFETCH "prefetch"
35#define PREFETCHW "prefetchw"
36#elif defined ( HAVE_MMX2 )
37#define PREFETCH "prefetchnta"
38#define PREFETCHW "prefetcht0"
39#else
40#define PREFETCH "/nop"
41#define PREFETCHW "/nop"
42#endif
43
44#ifdef HAVE_MMX2
45#define SFENCE "sfence"
46#else
47#define SFENCE "/nop"
48#endif
d3f41512 49
d604bab9
MN
50#ifdef HAVE_MMX2
51#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52#elif defined (HAVE_3DNOW)
53#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54#endif
d3f41512 55
d604bab9
MN
56#ifdef HAVE_MMX2
57#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58#else
59#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60#endif
61
c1b0bfb4
MN
62
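/* Vertical scaler for one plane: walks the (negatively indexed) filter,
   accumulating the pmulhw products of each 16bit source line with its
   coefficient, then shifts down and packs with unsigned saturation.
   Roughly, per output byte (a sketch, ignoring exact rounding):
	acc   = sum_j (src[j][i] * coeff[j]) >> 16;	// pmulhw per tap
	dst[i]= clip(acc >> 3, 0, 255);			// psraw $3 + packuswb */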
63#define YSCALEYUV2YV12X(x) \
64 "xorl %%eax, %%eax \n\t"\
65 "pxor %%mm3, %%mm3 \n\t"\
66 "pxor %%mm4, %%mm4 \n\t"\
67 "movl %0, %%edx \n\t"\
68 ".balign 16 \n\t" /* FIXME Unroll? */\
69 "1: \n\t"\
70 "movl (%1, %%edx, 4), %%esi \n\t"\
71 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
72 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
73 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
74 "pmulhw %%mm0, %%mm2 \n\t"\
75 "pmulhw %%mm0, %%mm5 \n\t"\
76 "paddw %%mm2, %%mm3 \n\t"\
77 "paddw %%mm5, %%mm4 \n\t"\
78 "addl $1, %%edx \n\t"\
79 " jnz 1b \n\t"\
80 "psraw $3, %%mm3 \n\t"\
81 "psraw $3, %%mm4 \n\t"\
82 "packuswb %%mm4, %%mm3 \n\t"\
83 MOVNTQ(%%mm3, (%3, %%eax))\
84 "addl $8, %%eax \n\t"\
85 "cmpl %4, %%eax \n\t"\
86 "pxor %%mm3, %%mm3 \n\t"\
87 "pxor %%mm4, %%mm4 \n\t"\
88 "movl %0, %%edx \n\t"\
89 "jb 1b \n\t"
90
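/* Unscaled vertical pass (single source line, no filtering): the 16bit
   intermediate samples are simply shifted back down (>>7) and packed to
   bytes with unsigned saturation. */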
91#define YSCALEYUV2YV121 \
92 "movl %2, %%eax \n\t"\
93 ".balign 16 \n\t" /* FIXME Unroll? */\
94 "1: \n\t"\
95 "movq (%0, %%eax, 2), %%mm0 \n\t"\
96 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
97 "psraw $7, %%mm0 \n\t"\
98 "psraw $7, %%mm1 \n\t"\
99 "packuswb %%mm1, %%mm0 \n\t"\
100 MOVNTQ(%%mm0, (%1, %%eax))\
101 "addl $8, %%eax \n\t"\
102 "jnc 1b \n\t"
103
104/*
105 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
106 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
107 "r" (dest), "m" (dstW),
108 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
109 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
110*/
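/* All YSCALEYUV2RGB* variants below use the same fixed point conversion;
   per pixel, roughly (a sketch; the multiplies are pmulhw, i.e. >>16):
	Y' = yCoeff*(Y-16)
	B  = Y' + ubCoeff*(U-128)
	G  = Y' + ugCoeff*(U-128) + vgCoeff*(V-128)
	R  = Y' + vrCoeff*(V-128)
   with B/G/R finally packed to bytes with unsigned saturation. */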
111#define YSCALEYUV2RGBX \
112 "xorl %%eax, %%eax \n\t"\
113 ".balign 16 \n\t"\
114 "1: \n\t"\
115 "movl %1, %%edx \n\t" /* -chrFilterSize */\
116 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
117 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
118 "pxor %%mm3, %%mm3 \n\t"\
119 "pxor %%mm4, %%mm4 \n\t"\
120 "2: \n\t"\
121 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
122 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
123 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
124 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
125 "pmulhw %%mm0, %%mm2 \n\t"\
126 "pmulhw %%mm0, %%mm5 \n\t"\
127 "paddw %%mm2, %%mm3 \n\t"\
128 "paddw %%mm5, %%mm4 \n\t"\
129 "addl $1, %%edx \n\t"\
130 " jnz 2b \n\t"\
131\
132 "movl %0, %%edx \n\t" /* -lumFilterSize */\
133 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
134 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
135 "pxor %%mm1, %%mm1 \n\t"\
136 "pxor %%mm7, %%mm7 \n\t"\
137 "2: \n\t"\
138 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
139 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
140 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
141 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
142 "pmulhw %%mm0, %%mm2 \n\t"\
143 "pmulhw %%mm0, %%mm5 \n\t"\
144 "paddw %%mm2, %%mm1 \n\t"\
145 "paddw %%mm5, %%mm7 \n\t"\
146 "addl $1, %%edx \n\t"\
147 " jnz 2b \n\t"\
148\
9b464428
FB
149 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
150 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
c1b0bfb4
MN
151 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
152 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
153 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
154 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
c1b0bfb4 155 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
9b464428
FB
156 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
157 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
158 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
159 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
160 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
161 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
c1b0bfb4
MN
162 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
163 "paddw %%mm3, %%mm4 \n\t"\
164 "movq %%mm2, %%mm0 \n\t"\
165 "movq %%mm5, %%mm6 \n\t"\
166 "movq %%mm4, %%mm3 \n\t"\
167 "punpcklwd %%mm2, %%mm2 \n\t"\
168 "punpcklwd %%mm5, %%mm5 \n\t"\
169 "punpcklwd %%mm4, %%mm4 \n\t"\
170 "paddw %%mm1, %%mm2 \n\t"\
171 "paddw %%mm1, %%mm5 \n\t"\
172 "paddw %%mm1, %%mm4 \n\t"\
173 "punpckhwd %%mm0, %%mm0 \n\t"\
174 "punpckhwd %%mm6, %%mm6 \n\t"\
175 "punpckhwd %%mm3, %%mm3 \n\t"\
176 "paddw %%mm7, %%mm0 \n\t"\
177 "paddw %%mm7, %%mm6 \n\t"\
178 "paddw %%mm7, %%mm3 \n\t"\
179 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
180 "packuswb %%mm0, %%mm2 \n\t"\
181 "packuswb %%mm6, %%mm5 \n\t"\
182 "packuswb %%mm3, %%mm4 \n\t"\
183 "pxor %%mm7, %%mm7 \n\t"
184
d604bab9
MN
185#define FULL_YSCALEYUV2RGB \
186 "pxor %%mm7, %%mm7 \n\t"\
187 "movd %6, %%mm6 \n\t" /*yalpha1*/\
188 "punpcklwd %%mm6, %%mm6 \n\t"\
189 "punpcklwd %%mm6, %%mm6 \n\t"\
190 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
191 "punpcklwd %%mm5, %%mm5 \n\t"\
192 "punpcklwd %%mm5, %%mm5 \n\t"\
193 "xorl %%eax, %%eax \n\t"\
cff6ecd7 194 ".balign 16 \n\t"\
d604bab9
MN
195 "1: \n\t"\
196 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
197 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
198 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
199 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
200 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
201 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
202 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
203 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
204 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
205 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
206 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
207 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
208 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
209 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
210 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428
FB
211 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
212 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
213 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
214\
215\
216 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
217 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
9b464428 218 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
d604bab9 219 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428 220 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
d604bab9 221 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428 222 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
223\
224\
225 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
9b464428
FB
226 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
227 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
228 "paddw %%mm1, %%mm3 \n\t" /* B*/\
229 "paddw %%mm1, %%mm0 \n\t" /* R*/\
230 "packuswb %%mm3, %%mm3 \n\t"\
231\
232 "packuswb %%mm0, %%mm0 \n\t"\
233 "paddw %%mm4, %%mm2 \n\t"\
234 "paddw %%mm2, %%mm1 \n\t" /* G*/\
235\
236 "packuswb %%mm1, %%mm1 \n\t"
237
238#define YSCALEYUV2RGB \
239 "movd %6, %%mm6 \n\t" /*yalpha1*/\
240 "punpcklwd %%mm6, %%mm6 \n\t"\
241 "punpcklwd %%mm6, %%mm6 \n\t"\
9b464428 242 "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
d604bab9
MN
243 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
244 "punpcklwd %%mm5, %%mm5 \n\t"\
245 "punpcklwd %%mm5, %%mm5 \n\t"\
9b464428 246 "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
d604bab9 247 "xorl %%eax, %%eax \n\t"\
cff6ecd7 248 ".balign 16 \n\t"\
d604bab9
MN
249 "1: \n\t"\
250 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
251 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
252 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
253 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
254 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
255 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428 256 "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
d604bab9
MN
257 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
258 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
259 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
260 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
261 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
262 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428
FB
263 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
264 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
265 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
266 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
267 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
268 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
269 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
270 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
271 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
272 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
273 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
274 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
275 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
9b464428
FB
276 "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
277 "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
d604bab9
MN
278 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
279 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
280 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
281 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
9b464428
FB
282 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
283 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
284 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
285 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
286 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
287 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
288 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
289 "paddw %%mm3, %%mm4 \n\t"\
290 "movq %%mm2, %%mm0 \n\t"\
291 "movq %%mm5, %%mm6 \n\t"\
292 "movq %%mm4, %%mm3 \n\t"\
293 "punpcklwd %%mm2, %%mm2 \n\t"\
294 "punpcklwd %%mm5, %%mm5 \n\t"\
295 "punpcklwd %%mm4, %%mm4 \n\t"\
296 "paddw %%mm1, %%mm2 \n\t"\
297 "paddw %%mm1, %%mm5 \n\t"\
298 "paddw %%mm1, %%mm4 \n\t"\
299 "punpckhwd %%mm0, %%mm0 \n\t"\
300 "punpckhwd %%mm6, %%mm6 \n\t"\
301 "punpckhwd %%mm3, %%mm3 \n\t"\
302 "paddw %%mm7, %%mm0 \n\t"\
303 "paddw %%mm7, %%mm6 \n\t"\
304 "paddw %%mm7, %%mm3 \n\t"\
305 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
306 "packuswb %%mm0, %%mm2 \n\t"\
307 "packuswb %%mm6, %%mm5 \n\t"\
308 "packuswb %%mm3, %%mm4 \n\t"\
309 "pxor %%mm7, %%mm7 \n\t"
310
311#define YSCALEYUV2RGB1 \
312 "xorl %%eax, %%eax \n\t"\
cff6ecd7 313 ".balign 16 \n\t"\
d604bab9
MN
314 "1: \n\t"\
315 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
316 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
317 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
318 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428
FB
319 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
320 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
321 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
322 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
323 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
324 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9 325 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
497d4f99
MN
326 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
327 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
328 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
329 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
330 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
331 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
332 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
333 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
334 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
335 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
497d4f99
MN
336 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
337 "paddw %%mm3, %%mm4 \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "movq %%mm5, %%mm6 \n\t"\
340 "movq %%mm4, %%mm3 \n\t"\
341 "punpcklwd %%mm2, %%mm2 \n\t"\
342 "punpcklwd %%mm5, %%mm5 \n\t"\
343 "punpcklwd %%mm4, %%mm4 \n\t"\
344 "paddw %%mm1, %%mm2 \n\t"\
345 "paddw %%mm1, %%mm5 \n\t"\
346 "paddw %%mm1, %%mm4 \n\t"\
347 "punpckhwd %%mm0, %%mm0 \n\t"\
348 "punpckhwd %%mm6, %%mm6 \n\t"\
349 "punpckhwd %%mm3, %%mm3 \n\t"\
350 "paddw %%mm7, %%mm0 \n\t"\
351 "paddw %%mm7, %%mm6 \n\t"\
352 "paddw %%mm7, %%mm3 \n\t"\
353 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
354 "packuswb %%mm0, %%mm2 \n\t"\
355 "packuswb %%mm6, %%mm5 \n\t"\
356 "packuswb %%mm3, %%mm4 \n\t"\
357 "pxor %%mm7, %%mm7 \n\t"
358
359// do vertical chrominance interpolation
360#define YSCALEYUV2RGB1b \
361 "xorl %%eax, %%eax \n\t"\
cff6ecd7 362 ".balign 16 \n\t"\
497d4f99
MN
363 "1: \n\t"\
364 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
365 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
366 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
367 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397c035e
MN
368 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
369 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
c1b0bfb4
MN
370 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
371 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
9b464428
FB
372 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
373 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
497d4f99
MN
374 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
375 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
376 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
377 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
497d4f99
MN
378 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
379 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
380 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
d604bab9
MN
381 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
382 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
383 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
384 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
385 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
386 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
387 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
388 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
389 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
390 "paddw %%mm3, %%mm4 \n\t"\
391 "movq %%mm2, %%mm0 \n\t"\
392 "movq %%mm5, %%mm6 \n\t"\
393 "movq %%mm4, %%mm3 \n\t"\
394 "punpcklwd %%mm2, %%mm2 \n\t"\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm4, %%mm4 \n\t"\
397 "paddw %%mm1, %%mm2 \n\t"\
398 "paddw %%mm1, %%mm5 \n\t"\
399 "paddw %%mm1, %%mm4 \n\t"\
400 "punpckhwd %%mm0, %%mm0 \n\t"\
401 "punpckhwd %%mm6, %%mm6 \n\t"\
402 "punpckhwd %%mm3, %%mm3 \n\t"\
403 "paddw %%mm7, %%mm0 \n\t"\
404 "paddw %%mm7, %%mm6 \n\t"\
405 "paddw %%mm7, %%mm3 \n\t"\
406 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
407 "packuswb %%mm0, %%mm2 \n\t"\
408 "packuswb %%mm6, %%mm5 \n\t"\
409 "packuswb %%mm3, %%mm4 \n\t"\
410 "pxor %%mm7, %%mm7 \n\t"
411
412#define WRITEBGR32 \
413 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
414 "movq %%mm2, %%mm1 \n\t" /* B */\
415 "movq %%mm5, %%mm6 \n\t" /* R */\
416 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
417 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
418 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
419 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
420 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
421 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
422 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
423 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
424 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
425 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
426\
427 MOVNTQ(%%mm0, (%4, %%eax, 4))\
428 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
429 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
430 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
431\
432 "addl $8, %%eax \n\t"\
433 "cmpl %5, %%eax \n\t"\
434 " jb 1b \n\t"
435
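/* Pack the B/G/R bytes in mm2/mm4/mm5 into 16bpp 5-6-5 pixels:
	pixel = ((R&0xF8)<<8) | ((G&0xFC)<<3) | (B>>3) */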
436#define WRITEBGR16 \
9b464428
FB
437 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
438 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
439 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb 440 "psrlq $3, %%mm2 \n\t"\
d604bab9 441\
f62255fb
MN
442 "movq %%mm2, %%mm1 \n\t"\
443 "movq %%mm4, %%mm3 \n\t"\
d604bab9 444\
f62255fb
MN
445 "punpcklbw %%mm7, %%mm3 \n\t"\
446 "punpcklbw %%mm5, %%mm2 \n\t"\
447 "punpckhbw %%mm7, %%mm4 \n\t"\
448 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 449\
f62255fb
MN
450 "psllq $3, %%mm3 \n\t"\
451 "psllq $3, %%mm4 \n\t"\
d604bab9
MN
452\
453 "por %%mm3, %%mm2 \n\t"\
d604bab9 454 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
455\
456 MOVNTQ(%%mm2, (%4, %%eax, 2))\
457 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
458\
459 "addl $8, %%eax \n\t"\
460 "cmpl %5, %%eax \n\t"\
461 " jb 1b \n\t"
462
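/* Like WRITEBGR16 but 15bpp 5-5-5:
	pixel = ((R&0xF8)<<7) | ((G&0xF8)<<2) | (B>>3) */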
463#define WRITEBGR15 \
9b464428
FB
464 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
465 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
466 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb
MN
467 "psrlq $3, %%mm2 \n\t"\
468 "psrlq $1, %%mm5 \n\t"\
d604bab9 469\
f62255fb
MN
470 "movq %%mm2, %%mm1 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
d604bab9 472\
f62255fb
MN
473 "punpcklbw %%mm7, %%mm3 \n\t"\
474 "punpcklbw %%mm5, %%mm2 \n\t"\
475 "punpckhbw %%mm7, %%mm4 \n\t"\
476 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 477\
f62255fb
MN
478 "psllq $2, %%mm3 \n\t"\
479 "psllq $2, %%mm4 \n\t"\
d604bab9
MN
480\
481 "por %%mm3, %%mm2 \n\t"\
d604bab9 482 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
483\
484 MOVNTQ(%%mm2, (%4, %%eax, 2))\
485 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
486\
487 "addl $8, %%eax \n\t"\
488 "cmpl %5, %%eax \n\t"\
489 " jb 1b \n\t"
f62255fb 490
99d2cb72 491#define WRITEBGR24OLD \
d604bab9
MN
492 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
493 "movq %%mm2, %%mm1 \n\t" /* B */\
494 "movq %%mm5, %%mm6 \n\t" /* R */\
495 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
496 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
497 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
498 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
499 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
500 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
a525ce8d
MN
501 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
502 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
503 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
504 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9
MN
505\
506 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
507 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
9b464428
FB
508 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
509 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
d604bab9
MN
510 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
511 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
512 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
513 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
514\
515 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
516 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
517 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
518 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
9b464428 519 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
d604bab9
MN
520 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
521 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
9b464428
FB
522 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
523 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
d604bab9
MN
524 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
525 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
526 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
527 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
528\
529 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
530 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
531 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
9b464428
FB
532 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
533 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
d604bab9
MN
534 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
535 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
536 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
537\
bdc2eb9a
MN
538 MOVNTQ(%%mm0, (%%ebx))\
539 MOVNTQ(%%mm2, 8(%%ebx))\
540 MOVNTQ(%%mm3, 16(%%ebx))\
541 "addl $24, %%ebx \n\t"\
d604bab9
MN
542\
543 "addl $8, %%eax \n\t"\
544 "cmpl %5, %%eax \n\t"\
545 " jb 1b \n\t"
546
99d2cb72
MN
547#define WRITEBGR24MMX \
548 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
549 "movq %%mm2, %%mm1 \n\t" /* B */\
550 "movq %%mm5, %%mm6 \n\t" /* R */\
551 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
552 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
553 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
554 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
555 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
556 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
557 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
558 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
559 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
560 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
561\
562 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
563 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
564 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
565 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
566\
567 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
568 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
569 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
570 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
571\
572 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
573 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
574 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
575 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
576\
577 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
578 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
579 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
580 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
581 MOVNTQ(%%mm0, (%%ebx))\
582\
583 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
584 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
585 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
586 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
587 MOVNTQ(%%mm6, 8(%%ebx))\
588\
589 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
590 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
591 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
592 MOVNTQ(%%mm5, 16(%%ebx))\
593\
594 "addl $24, %%ebx \n\t"\
595\
596 "addl $8, %%eax \n\t"\
597 "cmpl %5, %%eax \n\t"\
598 " jb 1b \n\t"
599
600#define WRITEBGR24MMX2 \
601 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
9b464428
FB
602 "movq "MANGLE(M24A)", %%mm0 \n\t"\
603 "movq "MANGLE(M24C)", %%mm7 \n\t"\
99d2cb72
MN
604 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
605 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
606 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
607\
608 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
609 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
610 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
611\
612 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
613 "por %%mm1, %%mm6 \n\t"\
614 "por %%mm3, %%mm6 \n\t"\
615 MOVNTQ(%%mm6, (%%ebx))\
616\
617 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
618 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
619 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
620 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
621\
9b464428 622 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
99d2cb72
MN
623 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
624 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
625\
626 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
627 "por %%mm3, %%mm6 \n\t"\
628 MOVNTQ(%%mm6, 8(%%ebx))\
629\
630 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
631 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
632 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
633\
634 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
635 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
9b464428 636 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72
MN
637\
638 "por %%mm1, %%mm3 \n\t"\
639 "por %%mm3, %%mm6 \n\t"\
640 MOVNTQ(%%mm6, 16(%%ebx))\
641\
642 "addl $24, %%ebx \n\t"\
643\
644 "addl $8, %%eax \n\t"\
645 "cmpl %5, %%eax \n\t"\
646 " jb 1b \n\t"
647
648#ifdef HAVE_MMX2
7630f2e0 649#undef WRITEBGR24
99d2cb72
MN
650#define WRITEBGR24 WRITEBGR24MMX2
651#else
7630f2e0 652#undef WRITEBGR24
99d2cb72
MN
653#define WRITEBGR24 WRITEBGR24MMX
654#endif
655
c1b0bfb4
MN
656static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
657 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
658 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
659 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
38858470 660{
c1b0bfb4
MN
661#ifdef HAVE_MMX
662 if(uDest != NULL)
663 {
664 asm volatile(
665 YSCALEYUV2YV12X(0)
666 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
667 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
668 : "%eax", "%edx", "%esi"
669 );
670
671 asm volatile(
672 YSCALEYUV2YV12X(4096)
673 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
674 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
675 : "%eax", "%edx", "%esi"
676 );
677 }
678
679 asm volatile(
680 YSCALEYUV2YV12X(0)
681 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
682 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
683 : "%eax", "%edx", "%esi"
684 );
685#else
e3d2500f
MN
686yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
687 chrFilter, chrSrc, chrFilterSize,
688 dest, uDest, vDest, dstW);
7630f2e0 689#endif
c1b0bfb4 690}
2add307d 691
c1b0bfb4
MN
692static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
693 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
694{
695#ifdef HAVE_MMX
696 if(uDest != NULL)
38858470 697 {
c1b0bfb4
MN
698 asm volatile(
699 YSCALEYUV2YV121
700 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
701 "g" (-(dstW>>1))
702 : "%eax"
703 );
704
705 asm volatile(
706 YSCALEYUV2YV121
707 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
708 "g" (-(dstW>>1))
709 : "%eax"
710 );
38858470
MN
711 }
712
c1b0bfb4
MN
713 asm volatile(
714 YSCALEYUV2YV121
715 :: "r" (lumSrc + dstW), "r" (dest + dstW),
716 "g" (-dstW)
717 : "%eax"
718 );
719#else
720 //FIXME Optimize (just quickly written, not optimized)
721 //FIXME replace MINMAX with LUTs
722 int i;
723 for(i=0; i<dstW; i++)
38858470 724 {
c1b0bfb4
MN
725 int val= lumSrc[i]>>7;
726
727 dest[i]= MIN(MAX(val, 0), 255);
728 }
729
730 if(uDest != NULL)
d1fac6cf 731 for(i=0; i<(dstW>>1); i++)
38858470 732 {
c1b0bfb4
MN
733 int u=chrSrc[i]>>7;
734 int v=chrSrc[i + 2048]>>7;
735
736 uDest[i]= MIN(MAX(u, 0), 255);
737 vDest[i]= MIN(MAX(v, 0), 255);
38858470 738 }
c1b0bfb4 739#endif
38858470
MN
740}
741
c1b0bfb4 742
d604bab9
MN
743/**
744 * vertical scale YV12 to RGB
745 */
c1b0bfb4
MN
746static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
747 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
28bf81c9 748 uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
c1b0bfb4 749{
28bf81c9 750/* if(flags&SWS_FULL_UV_IPOL)
c1b0bfb4
MN
751 {
752//FIXME
753 }//FULL_UV_IPOL
28bf81c9 754 else*/
c1b0bfb4
MN
755 {
756#ifdef HAVE_MMX
28bf81c9 757 if(dstFormat == IMGFMT_BGR32) //FIXME untested
c1b0bfb4
MN
758 {
759 asm volatile(
760 YSCALEYUV2RGBX
761 WRITEBGR32
762
763 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
764 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
765 "r" (dest), "m" (dstW),
766 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
767 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
768 );
769 }
28bf81c9 770 else if(dstFormat == IMGFMT_BGR24) //FIXME untested
c1b0bfb4
MN
771 {
772 asm volatile(
773 YSCALEYUV2RGBX
774 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
775 "addl %4, %%ebx \n\t"
776 WRITEBGR24
777
778 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
779 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
780 "r" (dest), "m" (dstW),
781 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
782 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
783 );
784 }
28bf81c9 785 else if(dstFormat==IMGFMT_BGR15)
c1b0bfb4
MN
786 {
787 asm volatile(
788 YSCALEYUV2RGBX
789 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
790#ifdef DITHER1XBPP
9b464428
FB
791 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
792 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
793 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
794#endif
795
796 WRITEBGR15
797
798 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
799 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
800 "r" (dest), "m" (dstW),
801 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
802 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
803 );
804 }
28bf81c9 805 else if(dstFormat==IMGFMT_BGR16)
c1b0bfb4
MN
806 {
807 asm volatile(
808 YSCALEYUV2RGBX
809 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
810#ifdef DITHER1XBPP
9b464428
FB
811 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
812 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
813 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
814#endif
815
816 WRITEBGR16
817
818 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
819 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
820 "r" (dest), "m" (dstW),
821 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
822 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
823 );
824 }
825#else
e3d2500f
MN
826yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
827 chrFilter, chrSrc, chrFilterSize,
28bf81c9 828 dest, dstW, dstFormat);
c1b0bfb4 829
c1b0bfb4
MN
830#endif
831 } //!FULL_UV_IPOL
832}
833
834
835/**
836 * vertical bilinear scale YV12 to RGB
837 */
838static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
28bf81c9 839 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
d604bab9
MN
840{
841 int yalpha1=yalpha^4095;
842 int uvalpha1=uvalpha^4095;
d604bab9 843
1e621b18 844 if(flags&SWS_FULL_CHR_H_INT)
d604bab9
MN
845 {
846
847#ifdef HAVE_MMX
28bf81c9 848 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
849 {
850 asm volatile(
851
852
853FULL_YSCALEYUV2RGB
854 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
855 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
856
857 "movq %%mm3, %%mm1 \n\t"
858 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
859 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
860
861 MOVNTQ(%%mm3, (%4, %%eax, 4))
862 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
863
864 "addl $4, %%eax \n\t"
865 "cmpl %5, %%eax \n\t"
866 " jb 1b \n\t"
867
868
d1fac6cf 869 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
870 "m" (yalpha1), "m" (uvalpha1)
871 : "%eax"
872 );
873 }
28bf81c9 874 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
875 {
876 asm volatile(
877
878FULL_YSCALEYUV2RGB
879
880 // lsb ... msb
881 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
882 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
883
884 "movq %%mm3, %%mm1 \n\t"
885 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
886 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
887
888 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
889 "psrlq $8, %%mm3 \n\t" // GR0BGR00
9b464428
FB
890 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
891 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
d604bab9
MN
892 "por %%mm2, %%mm3 \n\t" // BGRBGR00
893 "movq %%mm1, %%mm2 \n\t"
894 "psllq $48, %%mm1 \n\t" // 000000BG
895 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
896
897 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
898 "psrld $16, %%mm2 \n\t" // R000R000
899 "psrlq $24, %%mm1 \n\t" // 0BGR0000
900 "por %%mm2, %%mm1 \n\t" // RBGRR000
901
902 "movl %4, %%ebx \n\t"
903 "addl %%eax, %%ebx \n\t"
904
905#ifdef HAVE_MMX2
906 //FIXME Alignment
907 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
908 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
909#else
910 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
911 "psrlq $32, %%mm3 \n\t"
912 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
913 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
914#endif
915 "addl $4, %%eax \n\t"
916 "cmpl %5, %%eax \n\t"
917 " jb 1b \n\t"
918
d1fac6cf 919 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
920 "m" (yalpha1), "m" (uvalpha1)
921 : "%eax", "%ebx"
922 );
923 }
28bf81c9 924 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
925 {
926 asm volatile(
927
928FULL_YSCALEYUV2RGB
929#ifdef DITHER1XBPP
9b464428
FB
930 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
931 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
932 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
933#endif
934 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
935 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
936 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
937
938 "psrlw $3, %%mm3 \n\t"
939 "psllw $2, %%mm1 \n\t"
940 "psllw $7, %%mm0 \n\t"
9b464428
FB
941 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
942 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9
MN
943
944 "por %%mm3, %%mm1 \n\t"
945 "por %%mm1, %%mm0 \n\t"
946
947 MOVNTQ(%%mm0, (%4, %%eax, 2))
948
949 "addl $4, %%eax \n\t"
950 "cmpl %5, %%eax \n\t"
951 " jb 1b \n\t"
952
d1fac6cf 953 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
954 "m" (yalpha1), "m" (uvalpha1)
955 : "%eax"
956 );
957 }
28bf81c9 958 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
959 {
960 asm volatile(
961
962FULL_YSCALEYUV2RGB
963#ifdef DITHER1XBPP
9b464428
FB
964 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
965 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
966 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
967#endif
968 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
969 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
970 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
971
972 "psrlw $3, %%mm3 \n\t"
973 "psllw $3, %%mm1 \n\t"
974 "psllw $8, %%mm0 \n\t"
9b464428
FB
975 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
976 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9
MN
977
978 "por %%mm3, %%mm1 \n\t"
979 "por %%mm1, %%mm0 \n\t"
980
981 MOVNTQ(%%mm0, (%4, %%eax, 2))
982
983 "addl $4, %%eax \n\t"
984 "cmpl %5, %%eax \n\t"
985 " jb 1b \n\t"
986
d1fac6cf 987 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
988 "m" (yalpha1), "m" (uvalpha1)
989 : "%eax"
990 );
991 }
992#else
28bf81c9
MN
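	/* C fallback; the yuvtab_* tables hold the per component contributions
	   in <<13 fixed point (the hex suffix appears to be the coefficient),
	   summed and then clipped through clip_table below. */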
993 if(dstFormat==IMGFMT_BGR32)
994 {
995 int i;
996 for(i=0;i<dstW;i++){
997 // vertical linear interpolation && yuv2rgb in a single step:
998 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
999 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1000 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1001 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1002 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1003 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1004 dest+= 4;
1005 }
1006 }
1007 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1008 {
96034638 1009 int i;
d1fac6cf 1010 for(i=0;i<dstW;i++){
d604bab9
MN
1011 // vertical linear interpolation && yuv2rgb in a single step:
1012 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
390b20a6
MN
1015 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
28bf81c9 1018 dest+= 3;
d604bab9
MN
1019 }
1020 }
28bf81c9 1021 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1022 {
96034638 1023 int i;
d1fac6cf 1024 for(i=0;i<dstW;i++){
d604bab9
MN
1025 // vertical linear interpolation && yuv2rgb in a single step:
1026 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1029
d022ce5c 1030 ((uint16_t*)dest)[i] =
b18ea156
MN
1031 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1032 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1033 clip_table16r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1034 }
1035 }
28bf81c9 1036 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1037 {
96034638 1038 int i;
d1fac6cf 1039 for(i=0;i<dstW;i++){
d604bab9
MN
1040 // vertical linear interpolation && yuv2rgb in a single step:
1041 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1042 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1043 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1044
d022ce5c 1045 ((uint16_t*)dest)[i] =
b18ea156
MN
1046 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1047 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1048 clip_table15r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1049 }
1050 }
1051#endif
1052 }//FULL_UV_IPOL
1053 else
1054 {
1055#ifdef HAVE_MMX
28bf81c9 1056 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
1057 {
1058 asm volatile(
1059 YSCALEYUV2RGB
1060 WRITEBGR32
1061
d1fac6cf 1062 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1063 "m" (yalpha1), "m" (uvalpha1)
1064 : "%eax"
1065 );
1066 }
28bf81c9 1067 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
1068 {
1069 asm volatile(
bdc2eb9a 1070 "movl %4, %%ebx \n\t"
d604bab9
MN
1071 YSCALEYUV2RGB
1072 WRITEBGR24
1073
d1fac6cf 1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1075 "m" (yalpha1), "m" (uvalpha1)
1076 : "%eax", "%ebx"
1077 );
1078 }
28bf81c9 1079 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
1080 {
1081 asm volatile(
1082 YSCALEYUV2RGB
1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084#ifdef DITHER1XBPP
9b464428
FB
1085 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1086 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1087 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1088#endif
1089
1090 WRITEBGR15
1091
d1fac6cf 1092 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1093 "m" (yalpha1), "m" (uvalpha1)
1094 : "%eax"
1095 );
1096 }
28bf81c9 1097 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
1098 {
1099 asm volatile(
1100 YSCALEYUV2RGB
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102#ifdef DITHER1XBPP
9b464428
FB
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1106#endif
1107
1108 WRITEBGR16
1109
d1fac6cf 1110 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1111 "m" (yalpha1), "m" (uvalpha1)
1112 : "%eax"
1113 );
1114 }
1115#else
28bf81c9 1116 if(dstFormat==IMGFMT_BGR32)
d604bab9 1117 {
96034638 1118 int i;
d1fac6cf 1119 for(i=0; i<dstW-1; i+=2){
d604bab9 1120 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1121 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1122 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1123 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1124 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d9fc1cfe
MN
1125
1126 int Cb= yuvtab_40cf[U];
1127 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1128 int Cr= yuvtab_3343[V];
1129
1130 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1131 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1132 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1133
1134 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1135 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1136 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1137 }
1138 }
28bf81c9 1139 else if(dstFormat==IMGFMT_BGR24)
d9fc1cfe 1140 {
96034638 1141 int i;
d1fac6cf 1142 for(i=0; i<dstW-1; i+=2){
d9fc1cfe
MN
1143 // vertical linear interpolation && yuv2rgb in a single step:
1144 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1145 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1146 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1147 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d9fc1cfe
MN
1148
1149 int Cb= yuvtab_40cf[U];
1150 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1151 int Cr= yuvtab_3343[V];
1152
1153 dest[0]=clip_table[((Y1 + Cb) >>13)];
1154 dest[1]=clip_table[((Y1 + Cg) >>13)];
1155 dest[2]=clip_table[((Y1 + Cr) >>13)];
1156
1157 dest[3]=clip_table[((Y2 + Cb) >>13)];
1158 dest[4]=clip_table[((Y2 + Cg) >>13)];
1159 dest[5]=clip_table[((Y2 + Cr) >>13)];
1160 dest+=6;
d604bab9
MN
1161 }
1162 }
28bf81c9 1163 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1164 {
96034638 1165 int i;
5521b193
MN
1166#ifdef DITHER1XBPP
1167 static int ditherb1=1<<14;
1168 static int ditherg1=1<<13;
1169 static int ditherr1=2<<14;
1170 static int ditherb2=3<<14;
1171 static int ditherg2=3<<13;
1172 static int ditherr2=0<<14;
1173
1174 ditherb1 ^= (1^2)<<14;
1175 ditherg1 ^= (1^2)<<13;
1176 ditherr1 ^= (1^2)<<14;
1177 ditherb2 ^= (3^0)<<14;
1178 ditherg2 ^= (3^0)<<13;
1179 ditherr2 ^= (3^0)<<14;
1180#else
1181 const int ditherb1=0;
1182 const int ditherg1=0;
1183 const int ditherr1=0;
1184 const int ditherb2=0;
1185 const int ditherg2=0;
1186 const int ditherr2=0;
1187#endif
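	/* With DITHER1XBPP the XORs above toggle the offsets on every call, so
	   successive output lines use alternating halves of a 2x2 ordered
	   dither pattern before the >>13 truncation below. */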
d1fac6cf 1188 for(i=0; i<dstW-1; i+=2){
d604bab9 1189 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1190 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1191 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1192 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1193 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d604bab9 1194
d9fc1cfe
MN
1195 int Cb= yuvtab_40cf[U];
1196 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1197 int Cr= yuvtab_3343[V];
1198
d022ce5c 1199 ((uint16_t*)dest)[i] =
5521b193
MN
1200 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1201 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1202 clip_table16r[(Y1 + Cr + ditherr1) >>13];
d9fc1cfe
MN
1203
1204 ((uint16_t*)dest)[i+1] =
5521b193
MN
1205 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1206 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1207 clip_table16r[(Y2 + Cr + ditherr2) >>13];
d604bab9
MN
1208 }
1209 }
28bf81c9 1210 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1211 {
96034638 1212 int i;
5521b193
MN
1213#ifdef DITHER1XBPP
1214 static int ditherb1=1<<14;
1215 static int ditherg1=1<<14;
1216 static int ditherr1=2<<14;
1217 static int ditherb2=3<<14;
1218 static int ditherg2=3<<14;
1219 static int ditherr2=0<<14;
1220
1221 ditherb1 ^= (1^2)<<14;
1222 ditherg1 ^= (1^2)<<14;
1223 ditherr1 ^= (1^2)<<14;
1224 ditherb2 ^= (3^0)<<14;
1225 ditherg2 ^= (3^0)<<14;
1226 ditherr2 ^= (3^0)<<14;
1227#else
1228 const int ditherb1=0;
1229 const int ditherg1=0;
1230 const int ditherr1=0;
1231 const int ditherb2=0;
1232 const int ditherg2=0;
1233 const int ditherr2=0;
1234#endif
d1fac6cf 1235 for(i=0; i<dstW-1; i+=2){
d604bab9 1236 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1237 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1238 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1239 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1240 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d604bab9 1241
d9fc1cfe
MN
1242 int Cb= yuvtab_40cf[U];
1243 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1244 int Cr= yuvtab_3343[V];
1245
d022ce5c 1246 ((uint16_t*)dest)[i] =
5521b193
MN
1247 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1248 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1249 clip_table15r[(Y1 + Cr + ditherr1) >>13];
b18ea156 1250
d9fc1cfe 1251 ((uint16_t*)dest)[i+1] =
5521b193
MN
1252 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1253 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1254 clip_table15r[(Y2 + Cr + ditherr2) >>13];
d604bab9
MN
1255 }
1256 }
1257#endif
1258 } //!FULL_UV_IPOL
1259}
1260
1261/**
1262 * YV12 to RGB without scaling or interpolating
1263 */
c1b0bfb4 1264static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
28bf81c9 1265 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
d604bab9 1266{
d604bab9 1267 int uvalpha1=uvalpha^4095;
c1b0bfb4 1268 const int yalpha1=0;
96034638 1269
1e621b18 1270 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1271 {
28bf81c9 1272 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
d604bab9
MN
1273 return;
1274 }
397c035e
MN
1275
1276#ifdef HAVE_MMX
497d4f99
MN
1277 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
1278 {
28bf81c9 1279 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
1280 {
1281 asm volatile(
1282 YSCALEYUV2RGB1
1283 WRITEBGR32
c1b0bfb4 1284 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1285 "m" (yalpha1), "m" (uvalpha1)
1286 : "%eax"
1287 );
1288 }
28bf81c9 1289 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
1290 {
1291 asm volatile(
bdc2eb9a 1292 "movl %4, %%ebx \n\t"
d604bab9
MN
1293 YSCALEYUV2RGB1
1294 WRITEBGR24
c1b0bfb4 1295 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1296 "m" (yalpha1), "m" (uvalpha1)
1297 : "%eax", "%ebx"
1298 );
1299 }
28bf81c9 1300 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
1301 {
1302 asm volatile(
1303 YSCALEYUV2RGB1
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1305#ifdef DITHER1XBPP
9b464428
FB
1306 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1307 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1308 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1309#endif
1310 WRITEBGR15
c1b0bfb4 1311 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1312 "m" (yalpha1), "m" (uvalpha1)
1313 : "%eax"
1314 );
1315 }
28bf81c9 1316 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
1317 {
1318 asm volatile(
1319 YSCALEYUV2RGB1
1320 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321#ifdef DITHER1XBPP
9b464428
FB
1322 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1323 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1324 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1325#endif
1326
1327 WRITEBGR16
c1b0bfb4 1328 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1329 "m" (yalpha1), "m" (uvalpha1)
1330 : "%eax"
1331 );
1332 }
497d4f99
MN
1333 }
1334 else
1335 {
28bf81c9 1336 if(dstFormat==IMGFMT_BGR32)
d604bab9 1337 {
497d4f99
MN
1338 asm volatile(
1339 YSCALEYUV2RGB1b
1340 WRITEBGR32
c1b0bfb4 1341 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1342 "m" (yalpha1), "m" (uvalpha1)
1343 : "%eax"
1344 );
d604bab9 1345 }
28bf81c9 1346 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1347 {
497d4f99 1348 asm volatile(
bdc2eb9a 1349 "movl %4, %%ebx \n\t"
497d4f99
MN
1350 YSCALEYUV2RGB1b
1351 WRITEBGR24
c1b0bfb4 1352 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
497d4f99
MN
1353 "m" (yalpha1), "m" (uvalpha1)
1354 : "%eax", "%ebx"
1355 );
d604bab9 1356 }
28bf81c9 1357 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1358 {
497d4f99
MN
1359 asm volatile(
1360 YSCALEYUV2RGB1b
1361 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1362#ifdef DITHER1XBPP
9b464428
FB
1363 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1364 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1365 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99
MN
1366#endif
1367 WRITEBGR15
c1b0bfb4 1368 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1369 "m" (yalpha1), "m" (uvalpha1)
1370 : "%eax"
1371 );
1372 }
28bf81c9 1373 else if(dstFormat==IMGFMT_BGR16)
497d4f99
MN
1374 {
1375 asm volatile(
1376 YSCALEYUV2RGB1b
1377 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1378#ifdef DITHER1XBPP
9b464428
FB
1379 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1380 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1381 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1382#endif
d604bab9 1383
497d4f99 1384 WRITEBGR16
c1b0bfb4 1385 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1386 "m" (yalpha1), "m" (uvalpha1)
1387 : "%eax"
1388 );
d604bab9 1389 }
497d4f99
MN
1390 }
1391#else
397c035e 1392//FIXME write 2 versions (for even & odd lines)
497d4f99 1393
28bf81c9 1394 if(dstFormat==IMGFMT_BGR32)
497d4f99 1395 {
96034638 1396 int i;
d1fac6cf 1397 for(i=0; i<dstW-1; i+=2){
497d4f99 1398 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1399 int Y1=yuvtab_2568[buf0[i]>>7];
1400 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1401 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1402 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
397c035e
MN
1403
1404 int Cb= yuvtab_40cf[U];
1405 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1406 int Cr= yuvtab_3343[V];
1407
1408 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1409 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1410 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1411
1412 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1413 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1414 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1415 }
1416 }
28bf81c9 1417 else if(dstFormat==IMGFMT_BGR24)
397c035e 1418 {
96034638 1419 int i;
d1fac6cf 1420 for(i=0; i<dstW-1; i+=2){
397c035e
MN
1421 // vertical linear interpolation && yuv2rgb in a single step:
1422 int Y1=yuvtab_2568[buf0[i]>>7];
1423 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1424 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1425 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
397c035e
MN
1426
1427 int Cb= yuvtab_40cf[U];
1428 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1429 int Cr= yuvtab_3343[V];
1430
1431 dest[0]=clip_table[((Y1 + Cb) >>13)];
1432 dest[1]=clip_table[((Y1 + Cg) >>13)];
1433 dest[2]=clip_table[((Y1 + Cr) >>13)];
1434
1435 dest[3]=clip_table[((Y2 + Cb) >>13)];
1436 dest[4]=clip_table[((Y2 + Cg) >>13)];
1437 dest[5]=clip_table[((Y2 + Cr) >>13)];
1438 dest+=6;
497d4f99
MN
1439 }
1440 }
28bf81c9 1441 else if(dstFormat==IMGFMT_BGR16)
497d4f99 1442 {
96034638 1443 int i;
5521b193
MN
1444#ifdef DITHER1XBPP
1445 static int ditherb1=1<<14;
1446 static int ditherg1=1<<13;
1447 static int ditherr1=2<<14;
1448 static int ditherb2=3<<14;
1449 static int ditherg2=3<<13;
1450 static int ditherr2=0<<14;
1451
1452 ditherb1 ^= (1^2)<<14;
1453 ditherg1 ^= (1^2)<<13;
1454 ditherr1 ^= (1^2)<<14;
1455 ditherb2 ^= (3^0)<<14;
1456 ditherg2 ^= (3^0)<<13;
1457 ditherr2 ^= (3^0)<<14;
1458#else
1459 const int ditherb1=0;
1460 const int ditherg1=0;
1461 const int ditherr1=0;
1462 const int ditherb2=0;
1463 const int ditherg2=0;
1464 const int ditherr2=0;
1465#endif
d1fac6cf 1466 for(i=0; i<dstW-1; i+=2){
497d4f99 1467 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1468 int Y1=yuvtab_2568[buf0[i]>>7];
1469 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1470 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1471 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
497d4f99 1472
397c035e
MN
1473 int Cb= yuvtab_40cf[U];
1474 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1475 int Cr= yuvtab_3343[V];
1476
d022ce5c 1477 ((uint16_t*)dest)[i] =
5521b193
MN
1478 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1479 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1480 clip_table16r[(Y1 + Cr + ditherr1) >>13];
397c035e
MN
1481
1482 ((uint16_t*)dest)[i+1] =
5521b193
MN
1483 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1484 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1485 clip_table16r[(Y2 + Cr + ditherr2) >>13];
497d4f99
MN
1486 }
1487 }
28bf81c9 1488 else if(dstFormat==IMGFMT_BGR15)
497d4f99 1489 {
96034638 1490 int i;
5521b193
MN
1491#ifdef DITHER1XBPP
1492 static int ditherb1=1<<14;
1493 static int ditherg1=1<<14;
1494 static int ditherr1=2<<14;
1495 static int ditherb2=3<<14;
1496 static int ditherg2=3<<14;
1497 static int ditherr2=0<<14;
1498
1499 ditherb1 ^= (1^2)<<14;
1500 ditherg1 ^= (1^2)<<14;
1501 ditherr1 ^= (1^2)<<14;
1502 ditherb2 ^= (3^0)<<14;
1503 ditherg2 ^= (3^0)<<14;
1504 ditherr2 ^= (3^0)<<14;
1505#else
1506 const int ditherb1=0;
1507 const int ditherg1=0;
1508 const int ditherr1=0;
1509 const int ditherb2=0;
1510 const int ditherg2=0;
1511 const int ditherr2=0;
1512#endif
d1fac6cf 1513 for(i=0; i<dstW-1; i+=2){
497d4f99 1514 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1515 int Y1=yuvtab_2568[buf0[i]>>7];
1516 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1517 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1518 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
497d4f99 1519
397c035e
MN
1520 int Cb= yuvtab_40cf[U];
1521 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1522 int Cr= yuvtab_3343[V];
1523
d022ce5c 1524 ((uint16_t*)dest)[i] =
5521b193
MN
1525 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1526 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1527 clip_table15r[(Y1 + Cr + ditherr1) >>13];
b18ea156 1528
397c035e 1529 ((uint16_t*)dest)[i+1] =
5521b193
MN
1530 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1531 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1532 clip_table15r[(Y2 + Cr + ditherr2) >>13];
497d4f99
MN
1533 }
1534 }
d604bab9
MN
1535#endif
1536}
1537
6ff0ad6b
MN
1538//FIXME yuy2* can read up to 7 samples too many
1539
1e621b18
MN
1540static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1541{
6ff0ad6b
MN
1542#ifdef HAVE_MMX
1543 asm volatile(
1544 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1545 "movl %0, %%eax \n\t"
1546 "1: \n\t"
1547 "movq (%1, %%eax,2), %%mm0 \n\t"
1548 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1549 "pand %%mm2, %%mm0 \n\t"
1550 "pand %%mm2, %%mm1 \n\t"
1551 "packuswb %%mm1, %%mm0 \n\t"
1552 "movq %%mm0, (%2, %%eax) \n\t"
1553 "addl $8, %%eax \n\t"
1554 " js 1b \n\t"
1555 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1556 : "%eax"
1557 );
1e621b18
MN
1558#else
1559 int i;
1560 for(i=0; i<width; i++)
1561 dst[i]= src[2*i];
1562#endif
1563}
1564
1565static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1566{
6ff0ad6b
MN
1567#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1568 asm volatile(
1569 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1570 "movl %0, %%eax \n\t"
1571 "1: \n\t"
1572 "movq (%1, %%eax,4), %%mm0 \n\t"
1573 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1574 "movq (%2, %%eax,4), %%mm2 \n\t"
1575 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1576 PAVGB(%%mm2, %%mm0)
1577 PAVGB(%%mm3, %%mm1)
1578 "psrlw $8, %%mm0 \n\t"
1579 "psrlw $8, %%mm1 \n\t"
1580 "packuswb %%mm1, %%mm0 \n\t"
1581 "movq %%mm0, %%mm1 \n\t"
1582 "psrlw $8, %%mm0 \n\t"
1583 "pand %%mm4, %%mm1 \n\t"
1584 "packuswb %%mm0, %%mm0 \n\t"
1585 "packuswb %%mm1, %%mm1 \n\t"
1586 "movd %%mm0, (%4, %%eax) \n\t"
1587 "movd %%mm1, (%3, %%eax) \n\t"
1588 "addl $4, %%eax \n\t"
1589 " js 1b \n\t"
1590 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1591 : "%eax"
1592 );
1e621b18
MN
1593#else
1594 int i;
1595 for(i=0; i<width; i++)
1596 {
1597 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1598 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1599 }
1600#endif
1601}
1602
1603static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1604{
1605#ifdef HAVE_MMXFIXME
1606#else
1607 int i;
1608 for(i=0; i<width; i++)
1609 {
1610 int b= src[i*4+0];
1611 int g= src[i*4+1];
1612 int r= src[i*4+2];
1613
1614 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1615 }
1616#endif
1617}
1618
1619static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1620{
1621#ifdef HAVE_MMXFIXME
1622#else
1623 int i;
1624 for(i=0; i<width; i++)
1625 {
1626 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1627 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1628 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1629
1630 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1631 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1632 }
1633#endif
1634}
1635
1636static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1637{
ac6a2e45
MN
1638#ifdef HAVE_MMX
1639 asm volatile(
1640 "movl %2, %%eax \n\t"
1641 "movq bgr2YCoeff, %%mm6 \n\t"
1642 "movq w1111, %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
1644 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1645 ".balign 16 \n\t"
1646 "1: \n\t"
1647 PREFETCH" 64(%0, %%ebx) \n\t"
1648 "movd (%0, %%ebx), %%mm0 \n\t"
1649 "movd 3(%0, %%ebx), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%ebx), %%mm2 \n\t"
1653 "movd 9(%0, %%ebx), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660#ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1665#endif
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
1672
1673 "movd 12(%0, %%ebx), %%mm4 \n\t"
1674 "movd 15(%0, %%ebx), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%ebx), %%mm2 \n\t"
1678 "movd 21(%0, %%ebx), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685#ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1690#endif
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "addl $24, %%ebx \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
1698
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb bgr2YOffset, %%mm0 \n\t"
1701
1702 MOVNTQ(%%mm0, (%1, %%eax))
1703 "addl $8, %%eax \n\t"
1704 " js 1b \n\t"
1705 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1706 : "%eax", "%ebx"
1707 );
1e621b18
MN
1708#else
1709 int i;
1710 for(i=0; i<width; i++)
1711 {
1712 int b= src[i*3+0];
1713 int g= src[i*3+1];
1714 int r= src[i*3+2];
1715
1716 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1717 }
1718#endif
1719}
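/* The MMX loop above converts 8 pixels (24 input bytes) per iteration: each
   movd pulls in one BGR pixel (plus a stray byte of the next one), punpcklbw
   widens the bytes to words, and pmaddwd against the packed coefficients in
   bgr2YCoeff forms the weighted sums; pairs of pixels are then folded
   together via packssdw and a second pmaddwd with w1111 (presumably a vector
   of ones), shifted down with psraw $7, packed to bytes, offset with paddusb
   bgr2YOffset and streamed out with MOVNTQ.  %%eax counts up from -width to 0
   (hence the "js 1b"), while %%ebx holds the matching 3-bytes-per-pixel
   source offset. */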
1720
1721static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1722{
1723#ifdef HAVE_MMXFIXME
1724#else
1725 int i;
1726 for(i=0; i<width; i++)
1727 {
1728 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1729 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1730 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1731
1732 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1733 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1734 }
1735#endif
1736}
1737
6af250ea
MN
1738static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1739{
1740 int i;
1741 for(i=0; i<width; i++)
1742 {
1743 int d= src[i*2] + (src[i*2+1]<<8);
1744 int b= d&0x1F;
1745 int g= (d>>5)&0x3F;
1746 int r= (d>>11)&0x1F;
1747
1748 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1749 }
1750}
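/* For RGB565 the 5-bit R/B and 6-bit G fields would need <<3 and <<2
   respectively to reach 8-bit range; instead the code folds that into the
   coefficients, using 2*RY / 2*BY and dropping 2 from the shift, which is the
   same scaling done with fewer operations:
       2*RY*r >> (RGB2YUV_SHIFT-2)  ==  RY*(r<<3) >> RGB2YUV_SHIFT
       GY*g   >> (RGB2YUV_SHIFT-2)  ==  GY*(g<<2) >> RGB2YUV_SHIFT   */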
1751
1752static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1753{
1754 int i;
1755 for(i=0; i<width; i++)
1756 {
5bb9d9d8
MN
1757#if 1
1758 int d0= le2me_32( ((uint32_t*)src1)[i] );
1759 int d1= le2me_32( ((uint32_t*)src2)[i] );
1760
1761 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1762 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1763
1764 int dh2= (dh>>11) + (dh<<21);
1765 int d= dh2 + dl;
1766
1767 int b= d&0x7F;
1768 int r= (d>>11)&0x7F;
1769 int g= d>>21;
1770#else
6af250ea
MN
1771 int d0= src1[i*4] + (src1[i*4+1]<<8);
1772 int b0= d0&0x1F;
1773 int g0= (d0>>5)&0x3F;
1774 int r0= (d0>>11)&0x1F;
1775
1776 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1777 int b1= d1&0x1F;
1778 int g1= (d1>>5)&0x3F;
1779 int r1= (d1>>11)&0x1F;
1780
1781 int d2= src2[i*4] + (src2[i*4+1]<<8);
1782 int b2= d2&0x1F;
1783 int g2= (d2>>5)&0x3F;
1784 int r2= (d2>>11)&0x1F;
1785
1786 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1787 int b3= d3&0x1F;
1788 int g3= (d3>>5)&0x3F;
1789 int r3= (d3>>11)&0x1F;
1790
1791 int b= b0 + b1 + b2 + b3;
1792 int g= g0 + g1 + g2 + g3;
1793 int r= r0 + r1 + r2 + r3;
5bb9d9d8 1794#endif
6af250ea
MN
1795 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1796 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1797 }
1798}
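/* The #if 1 path above sums a 2x2 block of RGB565 pixels per channel without
   unpacking every pixel: each 32-bit load holds two pixels, d&0x07E0F81F
   keeps B0, R0 and G1 while (d>>5)&0x07C0F83F collects G0, B1 and R1, and the
   fields are spaced far enough apart that adding four samples never carries
   into a neighbouring field.  The shuffle dh2= (dh>>11)+(dh<<21) lines the
   second set of fields up with the first, so the final d= dh2+dl carries the
   four-sample sums with B in bits 0..6, R in bits 11..17 and G from bit 21
   upwards, which is exactly what the d&0x7F, (d>>11)&0x7F and d>>21
   extractions read out.  bgr15ToUV below uses the same trick with RGB555
   masks. */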
1799
b72034dd
MN
1800static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1801{
1802 int i;
1803 for(i=0; i<width; i++)
1804 {
1805 int d= src[i*2] + (src[i*2+1]<<8);
1806 int b= d&0x1F;
1807 int g= (d>>5)&0x1F;
1808 int r= (d>>10)&0x1F;
1809
1810 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1811 }
1812}
1813
1814static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1815{
1816 int i;
1817 for(i=0; i<width; i++)
1818 {
1819#if 1
1820 int d0= le2me_32( ((uint32_t*)src1)[i] );
1821 int d1= le2me_32( ((uint32_t*)src2)[i] );
1822
1823 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1824 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1825
1826 int dh2= (dh>>11) + (dh<<21);
1827 int d= dh2 + dl;
1828
1829 int b= d&0x7F;
1830 int r= (d>>10)&0x7F;
1831 int g= d>>21;
1832#else
1833 int d0= src1[i*4] + (src1[i*4+1]<<8);
1834 int b0= d0&0x1F;
1835 int g0= (d0>>5)&0x1F;
1836 int r0= (d0>>10)&0x1F;
1837
1838 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1839 int b1= d1&0x1F;
1840 int g1= (d1>>5)&0x1F;
1841 int r1= (d1>>10)&0x1F;
1842
1843 int d2= src2[i*4] + (src2[i*4+1]<<8);
1844 int b2= d2&0x1F;
1845 int g2= (d2>>5)&0x1F;
1846 int r2= (d2>>10)&0x1F;
1847
1848 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1849 int b3= d3&0x1F;
1850 int g3= (d3>>5)&0x1F;
1851 int r3= (d3>>10)&0x1F;
1852
1853 int b= b0 + b1 + b2 + b3;
1854 int g= g0 + g1 + g2 + g3;
1855 int r= r0 + r1 + r2 + r3;
1856#endif
1857 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1858 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1859 }
1860}
1861
1862
a861d4d7
MN
1863static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1864{
1865 int i;
1866 for(i=0; i<width; i++)
1867 {
1868 int r= src[i*4+0];
1869 int g= src[i*4+1];
1870 int b= src[i*4+2];
1871
1872 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1873 }
1874}
1875
1876static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1877{
1878 int i;
1879 for(i=0; i<width; i++)
1880 {
1881 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1882 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1883 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1884
1885 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1886 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1887 }
1888}
1889
1890static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1891{
1892 int i;
1893 for(i=0; i<width; i++)
1894 {
1895 int r= src[i*3+0];
1896 int g= src[i*3+1];
1897 int b= src[i*3+2];
1898
1899 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1900 }
1901}
1902
1903static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1904{
1905 int i;
1906 for(i=0; i<width; i++)
1907 {
1908 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1909 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1910 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1911
1912 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1913 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1914 }
1915}
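/* The rgb32/rgb24 input converters are identical to their bgr counterparts
   above with the r and b byte offsets swapped; only C versions exist for
   them, so these formats take the scalar path even when MMX is available. */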
1916
1e621b18 1917
077ea8a7
MN
1918// Bilinear / Bicubic scaling
1919static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1920 int16_t *filter, int16_t *filterPos, int filterSize)
2ff198c1 1921{
077ea8a7
MN
1922#ifdef HAVE_MMX
 1923 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
1924 {
1925 int counter= -2*dstW;
1926 filter-= counter*2;
1927 filterPos-= counter/2;
1928 dst-= counter/2;
1929 asm volatile(
1930 "pxor %%mm7, %%mm7 \n\t"
9b464428 1931 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
1932 "pushl %%ebp \n\t" // we use 7 regs here ...
1933 "movl %%eax, %%ebp \n\t"
1934 ".balign 16 \n\t"
1935 "1: \n\t"
1936 "movzwl (%2, %%ebp), %%eax \n\t"
1937 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1938 "movq (%1, %%ebp, 4), %%mm1 \n\t"
1939 "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
1940 "movd (%3, %%eax), %%mm0 \n\t"
1941 "movd (%3, %%ebx), %%mm2 \n\t"
1942 "punpcklbw %%mm7, %%mm0 \n\t"
1943 "punpcklbw %%mm7, %%mm2 \n\t"
1944 "pmaddwd %%mm1, %%mm0 \n\t"
1945 "pmaddwd %%mm2, %%mm3 \n\t"
1946 "psrad $8, %%mm0 \n\t"
1947 "psrad $8, %%mm3 \n\t"
1948 "packssdw %%mm3, %%mm0 \n\t"
1949 "pmaddwd %%mm6, %%mm0 \n\t"
1950 "packssdw %%mm0, %%mm0 \n\t"
1951 "movd %%mm0, (%4, %%ebp) \n\t"
1952 "addl $4, %%ebp \n\t"
1953 " jnc 1b \n\t"
e3d2500f 1954
077ea8a7
MN
1955 "popl %%ebp \n\t"
1956 : "+a" (counter)
1957 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1958 : "%ebx"
1959 );
1960 }
1961 else if(filterSize==8)
1962 {
1963 int counter= -2*dstW;
1964 filter-= counter*4;
1965 filterPos-= counter/2;
1966 dst-= counter/2;
1967 asm volatile(
1968 "pxor %%mm7, %%mm7 \n\t"
9b464428 1969 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
1970 "pushl %%ebp \n\t" // we use 7 regs here ...
1971 "movl %%eax, %%ebp \n\t"
1972 ".balign 16 \n\t"
1973 "1: \n\t"
1974 "movzwl (%2, %%ebp), %%eax \n\t"
1975 "movzwl 2(%2, %%ebp), %%ebx \n\t"
1976 "movq (%1, %%ebp, 8), %%mm1 \n\t"
1977 "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
1978 "movd (%3, %%eax), %%mm0 \n\t"
1979 "movd (%3, %%ebx), %%mm2 \n\t"
1980 "punpcklbw %%mm7, %%mm0 \n\t"
1981 "punpcklbw %%mm7, %%mm2 \n\t"
1982 "pmaddwd %%mm1, %%mm0 \n\t"
1983 "pmaddwd %%mm2, %%mm3 \n\t"
1984
1985 "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
1986 "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
1987 "movd 4(%3, %%eax), %%mm4 \n\t"
1988 "movd 4(%3, %%ebx), %%mm2 \n\t"
1989 "punpcklbw %%mm7, %%mm4 \n\t"
1990 "punpcklbw %%mm7, %%mm2 \n\t"
1991 "pmaddwd %%mm1, %%mm4 \n\t"
1992 "pmaddwd %%mm2, %%mm5 \n\t"
1993 "paddd %%mm4, %%mm0 \n\t"
1994 "paddd %%mm5, %%mm3 \n\t"
1995
1996 "psrad $8, %%mm0 \n\t"
1997 "psrad $8, %%mm3 \n\t"
1998 "packssdw %%mm3, %%mm0 \n\t"
1999 "pmaddwd %%mm6, %%mm0 \n\t"
2000 "packssdw %%mm0, %%mm0 \n\t"
2001 "movd %%mm0, (%4, %%ebp) \n\t"
2002 "addl $4, %%ebp \n\t"
2003 " jnc 1b \n\t"
c1b0bfb4 2004
077ea8a7
MN
2005 "popl %%ebp \n\t"
2006 : "+a" (counter)
2007 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2008 : "%ebx"
2009 );
2010 }
2011 else
2012 {
2013 int counter= -2*dstW;
2014// filter-= counter*filterSize/2;
2015 filterPos-= counter/2;
2016 dst-= counter/2;
2017 asm volatile(
2018 "pxor %%mm7, %%mm7 \n\t"
9b464428 2019 "movq "MANGLE(w02)", %%mm6 \n\t"
077ea8a7
MN
2020 ".balign 16 \n\t"
2021 "1: \n\t"
2022 "movl %2, %%ecx \n\t"
2023 "movzwl (%%ecx, %0), %%eax \n\t"
2024 "movzwl 2(%%ecx, %0), %%ebx \n\t"
2025 "movl %5, %%ecx \n\t"
2026 "pxor %%mm4, %%mm4 \n\t"
2027 "pxor %%mm5, %%mm5 \n\t"
2028 "2: \n\t"
2029 "movq (%1), %%mm1 \n\t"
2030 "movq (%1, %6), %%mm3 \n\t"
2031 "movd (%%ecx, %%eax), %%mm0 \n\t"
2032 "movd (%%ecx, %%ebx), %%mm2 \n\t"
2033 "punpcklbw %%mm7, %%mm0 \n\t"
2034 "punpcklbw %%mm7, %%mm2 \n\t"
2035 "pmaddwd %%mm1, %%mm0 \n\t"
2036 "pmaddwd %%mm2, %%mm3 \n\t"
2037 "paddd %%mm3, %%mm5 \n\t"
2038 "paddd %%mm0, %%mm4 \n\t"
2039 "addl $8, %1 \n\t"
2040 "addl $4, %%ecx \n\t"
2041 "cmpl %4, %%ecx \n\t"
2042 " jb 2b \n\t"
2043 "addl %6, %1 \n\t"
2044 "psrad $8, %%mm4 \n\t"
2045 "psrad $8, %%mm5 \n\t"
2046 "packssdw %%mm5, %%mm4 \n\t"
2047 "pmaddwd %%mm6, %%mm4 \n\t"
2048 "packssdw %%mm4, %%mm4 \n\t"
2049 "movl %3, %%eax \n\t"
2050 "movd %%mm4, (%%eax, %0) \n\t"
2051 "addl $4, %0 \n\t"
2052 " jnc 1b \n\t"
c1b0bfb4 2053
627690b5
MN
2054 : "+r" (counter), "+r" (filter)
2055 : "m" (filterPos), "m" (dst), "m"(src+filterSize),
077ea8a7 2056 "m" (src), "r" (filterSize*2)
e2f5a2a9 2057 : "%ebx", "%eax", "%ecx"
077ea8a7
MN
2058 );
2059 }
2060#else
2061 int i;
2062 for(i=0; i<dstW; i++)
2063 {
2064 int j;
2065 int srcPos= filterPos[i];
2066 int val=0;
c1b0bfb4 2067// printf("filterPos: %d\n", filterPos[i]);
077ea8a7
MN
2068 for(j=0; j<filterSize; j++)
2069 {
2070// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2071 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2072 }
2073// filter += hFilterSize;
2074 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2075// dst[i] = val>>7;
2076 }
2077#endif
2078}
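/* hScale is a straight FIR filter per output sample, as the C fallback spells
   out: dst[i]= clip( sum_j src[filterPos[i]+j] * filter[i*filterSize+j] >> 7 ),
   clipped to 15 bits because the destination is int16.  The MMX code
   special-cases filterSize 4 and 8, producing two output samples per
   iteration (with %%ebp pressed into service as the loop counter, hence the
   push/pop), and falls back to a generic inner loop over the taps for any
   other filter size. */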
2ff198c1 2079 // *** horizontal scale Y line to temp buffer
28bf81c9
MN
2080static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2081 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
1e621b18
MN
2082 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2083 int srcFormat, uint8_t *formatConvBuffer)
077ea8a7 2084{
1e621b18
MN
2085 if(srcFormat==IMGFMT_YUY2)
2086 {
2087 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2088 src= formatConvBuffer;
2089 }
2090 else if(srcFormat==IMGFMT_BGR32)
2091 {
2092 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2093 src= formatConvBuffer;
2094 }
2095 else if(srcFormat==IMGFMT_BGR24)
2096 {
2097 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2098 src= formatConvBuffer;
2099 }
6af250ea
MN
2100 else if(srcFormat==IMGFMT_BGR16)
2101 {
2102 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2103 src= formatConvBuffer;
2104 }
b72034dd
MN
2105 else if(srcFormat==IMGFMT_BGR15)
2106 {
2107 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2108 src= formatConvBuffer;
2109 }
a861d4d7
MN
2110 else if(srcFormat==IMGFMT_RGB32)
2111 {
2112 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2113 src= formatConvBuffer;
2114 }
2115 else if(srcFormat==IMGFMT_RGB24)
2116 {
2117 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2118 src= formatConvBuffer;
2119 }
1e621b18 2120
e3d2500f
MN
2121#ifdef HAVE_MMX
 2122 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
28bf81c9 2123 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2124#else
28bf81c9 2125 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2126#endif
077ea8a7
MN
2127 {
2128 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2129 }
2130 else // Fast Bilinear upscale / crap downscale
2131 {
2ff198c1 2132#ifdef ARCH_X86
2ff198c1 2133#ifdef HAVE_MMX2
96034638 2134 int i;
2ff198c1
MN
2135 if(canMMX2BeUsed)
2136 {
2137 asm volatile(
2138 "pxor %%mm7, %%mm7 \n\t"
2139 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
2140 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
2141 "punpcklwd %%mm6, %%mm6 \n\t"
2142 "punpcklwd %%mm6, %%mm6 \n\t"
2143 "movq %%mm6, %%mm2 \n\t"
2144 "psllq $16, %%mm2 \n\t"
2145 "paddw %%mm6, %%mm2 \n\t"
2146 "psllq $16, %%mm2 \n\t"
2147 "paddw %%mm6, %%mm2 \n\t"
 2148 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
28bf81c9 2149 "movq %%mm2, %%mm4 \n\t"
2ff198c1
MN
2150 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
2151 "punpcklwd %%mm6, %%mm6 \n\t"
2152 "punpcklwd %%mm6, %%mm6 \n\t"
2153 "xorl %%eax, %%eax \n\t" // i
2154 "movl %0, %%esi \n\t" // src
2155 "movl %1, %%edi \n\t" // buf1
2156 "movl %3, %%edx \n\t" // (xInc*4)>>16
2157 "xorl %%ecx, %%ecx \n\t"
2158 "xorl %%ebx, %%ebx \n\t"
2159 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
99cefd0b 2160
2ff198c1 2161#define FUNNY_Y_CODE \
99cefd0b
MN
2162 PREFETCH" 1024(%%esi) \n\t"\
2163 PREFETCH" 1056(%%esi) \n\t"\
2164 PREFETCH" 1088(%%esi) \n\t"\
28bf81c9
MN
2165 "call *%6 \n\t"\
2166 "movq %%mm4, %%mm2 \n\t"\
2ff198c1 2167 "xorl %%ecx, %%ecx \n\t"
99cefd0b 2168
2ff198c1
MN
2169FUNNY_Y_CODE
2170FUNNY_Y_CODE
2171FUNNY_Y_CODE
2172FUNNY_Y_CODE
2173FUNNY_Y_CODE
2174FUNNY_Y_CODE
2175FUNNY_Y_CODE
2176FUNNY_Y_CODE
2177
2178 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
28bf81c9 2179 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
2ff198c1
MN
2180 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2181 );
af91b8b3 2182 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2ff198c1
MN
2183 }
2184 else
2185 {
2186#endif
2187 //NO MMX just normal asm ...
2188 asm volatile(
2189 "xorl %%eax, %%eax \n\t" // i
2190 "xorl %%ebx, %%ebx \n\t" // xx
2191 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2192 ".balign 16 \n\t"
2ff198c1
MN
2193 "1: \n\t"
2194 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2195 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2196 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2197 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2198 "shll $16, %%edi \n\t"
2199 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2200 "movl %1, %%edi \n\t"
2201 "shrl $9, %%esi \n\t"
2202 "movw %%si, (%%edi, %%eax, 2) \n\t"
 2203 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
 2204 "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
2205
2206 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
2207 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
2208 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2209 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2210 "shll $16, %%edi \n\t"
2211 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2212 "movl %1, %%edi \n\t"
2213 "shrl $9, %%esi \n\t"
2214 "movw %%si, 2(%%edi, %%eax, 2) \n\t"
 2215 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
 2216 "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
2217
2218
2219 "addl $2, %%eax \n\t"
2220 "cmpl %2, %%eax \n\t"
2221 " jb 1b \n\t"
2222
2223
2224 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2225 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2226 );
2227#ifdef HAVE_MMX2
2228 } //if MMX2 cant be used
2229#endif
2230#else
96034638
MN
2231 int i;
2232 unsigned int xpos=0;
2233 for(i=0;i<dstWidth;i++)
2234 {
2235 register unsigned int xx=xpos>>16;
2236 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2237 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2238 xpos+=xInc;
2239 }
2ff198c1 2240#endif
077ea8a7 2241 }
2ff198c1
MN
2242}
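/* The "fast bilinear" path above keeps the source position in 16.16 fixed
   point; the C fallback makes the arithmetic explicit: xx= xpos>>16 picks the
   left sample, xalpha= (xpos&0xFFFF)>>9 is a 7-bit blend weight, and the
   result src[xx]*128 + (src[xx+1]-src[xx])*xalpha is a 15-bit value.  The
   plain-asm version does the same position update in two halves: "addw"
   accumulates the low 16 bits of xInc and "adcl" adds xInc>>16 plus the carry
   to the integer index.  The MMX2 variant instead calls into funnyYCode,
   scaler code presumably generated at init time with the per-pixel steps
   baked in, and the small loop after the asm rewrites the last few output
   pixels that would otherwise have read past src[srcW-1]. */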
2243
28bf81c9
MN
2244inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2245 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
1e621b18
MN
2246 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2247 int srcFormat, uint8_t *formatConvBuffer)
2ff198c1 2248{
1e621b18
MN
2249 if(srcFormat==IMGFMT_YUY2)
2250 {
2251 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2252 src1= formatConvBuffer;
2253 src2= formatConvBuffer+2048;
2254 }
2255 else if(srcFormat==IMGFMT_BGR32)
2256 {
2257 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2258 src1= formatConvBuffer;
2259 src2= formatConvBuffer+2048;
2260 }
2261 else if(srcFormat==IMGFMT_BGR24)
2262 {
2263 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2264 src1= formatConvBuffer;
2265 src2= formatConvBuffer+2048;
2266 }
6af250ea
MN
2267 else if(srcFormat==IMGFMT_BGR16)
2268 {
2269 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2270 src1= formatConvBuffer;
2271 src2= formatConvBuffer+2048;
2272 }
b72034dd
MN
2273 else if(srcFormat==IMGFMT_BGR15)
2274 {
2275 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2276 src1= formatConvBuffer;
2277 src2= formatConvBuffer+2048;
2278 }
a861d4d7
MN
2279 else if(srcFormat==IMGFMT_RGB32)
2280 {
2281 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2282 src1= formatConvBuffer;
2283 src2= formatConvBuffer+2048;
2284 }
2285 else if(srcFormat==IMGFMT_RGB24)
2286 {
2287 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2288 src1= formatConvBuffer;
2289 src2= formatConvBuffer+2048;
2290 }
6ff0ad6b
MN
2291 else if(isGray(srcFormat))
2292 {
2293 return;
2294 }
1e621b18 2295
e3d2500f
MN
2296#ifdef HAVE_MMX
 2297 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
28bf81c9 2298 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f 2299#else
28bf81c9 2300 if(!(flags&SWS_FAST_BILINEAR))
e3d2500f 2301#endif
077ea8a7
MN
2302 {
2303 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2304 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2305 }
2306 else // Fast Bilinear upscale / crap downscale
2307 {
2ff198c1
MN
2308#ifdef ARCH_X86
2309#ifdef HAVE_MMX2
96034638 2310 int i;
2ff198c1
MN
2311 if(canMMX2BeUsed)
2312 {
2313 asm volatile(
2314 "pxor %%mm7, %%mm7 \n\t"
2315 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
2316 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
2317 "punpcklwd %%mm6, %%mm6 \n\t"
2318 "punpcklwd %%mm6, %%mm6 \n\t"
2319 "movq %%mm6, %%mm2 \n\t"
2320 "psllq $16, %%mm2 \n\t"
2321 "paddw %%mm6, %%mm2 \n\t"
2322 "psllq $16, %%mm2 \n\t"
2323 "paddw %%mm6, %%mm2 \n\t"
2324 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
28bf81c9 2325 "movq %%mm2, %%mm4 \n\t"
2ff198c1
MN
2326 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
2327 "punpcklwd %%mm6, %%mm6 \n\t"
2328 "punpcklwd %%mm6, %%mm6 \n\t"
2329 "xorl %%eax, %%eax \n\t" // i
2330 "movl %0, %%esi \n\t" // src
2331 "movl %1, %%edi \n\t" // buf1
2332 "movl %3, %%edx \n\t" // (xInc*4)>>16
2333 "xorl %%ecx, %%ecx \n\t"
2334 "xorl %%ebx, %%ebx \n\t"
2335 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
2336
2ff198c1 2337#define FUNNYUVCODE \
99cefd0b
MN
2338 PREFETCH" 1024(%%esi) \n\t"\
2339 PREFETCH" 1056(%%esi) \n\t"\
2340 PREFETCH" 1088(%%esi) \n\t"\
28bf81c9
MN
2341 "call *%7 \n\t"\
2342 "movq %%mm4, %%mm2 \n\t"\
2ff198c1 2343 "xorl %%ecx, %%ecx \n\t"
2ff198c1
MN
2344
2345FUNNYUVCODE
2346FUNNYUVCODE
2347FUNNYUVCODE
2348FUNNYUVCODE
2349
2350FUNNYUVCODE
2351FUNNYUVCODE
2352FUNNYUVCODE
2353FUNNYUVCODE
2ff198c1
MN
2354 "xorl %%eax, %%eax \n\t" // i
2355 "movl %6, %%esi \n\t" // src
2356 "movl %1, %%edi \n\t" // buf1
2357 "addl $4096, %%edi \n\t"
2358
2359FUNNYUVCODE
2360FUNNYUVCODE
2361FUNNYUVCODE
2362FUNNYUVCODE
2363
2364FUNNYUVCODE
2365FUNNYUVCODE
2366FUNNYUVCODE
2367FUNNYUVCODE
2368
2369 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
28bf81c9 2370 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
2ff198c1
MN
2371 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2372 );
c1b0bfb4 2373 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2ff198c1 2374 {
c1b0bfb4
MN
2375// printf("%d %d %d\n", dstWidth, i, srcW);
2376 dst[i] = src1[srcW-1]*128;
2377 dst[i+2048] = src2[srcW-1]*128;
2ff198c1
MN
2378 }
2379 }
2380 else
2381 {
2382#endif
2383 asm volatile(
2384 "xorl %%eax, %%eax \n\t" // i
2385 "xorl %%ebx, %%ebx \n\t" // xx
2386 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
cff6ecd7 2387 ".balign 16 \n\t"
2ff198c1
MN
2388 "1: \n\t"
2389 "movl %0, %%esi \n\t"
2390 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
2391 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
2392 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2393 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2394 "shll $16, %%edi \n\t"
2395 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2396 "movl %1, %%edi \n\t"
2397 "shrl $9, %%esi \n\t"
2398 "movw %%si, (%%edi, %%eax, 2) \n\t"
2399
2400 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
2401 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
2402 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2403 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2404 "shll $16, %%edi \n\t"
2405 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2406 "movl %1, %%edi \n\t"
2407 "shrl $9, %%esi \n\t"
2408 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2409
 2410 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
 2411 "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
2412 "addl $1, %%eax \n\t"
2413 "cmpl %2, %%eax \n\t"
2414 " jb 1b \n\t"
2415
2416 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2417 "r" (src2)
2418 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2419 );
2420#ifdef HAVE_MMX2
2421 } //if MMX2 cant be used
2422#endif
2423#else
96034638
MN
2424 int i;
2425 unsigned int xpos=0;
2426 for(i=0;i<dstWidth;i++)
2427 {
2428 register unsigned int xx=xpos>>16;
2429 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2430 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2431 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1faf0867
MN
2432/* slower
2433 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2434 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2435*/
96034638
MN
2436 xpos+=xInc;
2437 }
2ff198c1 2438#endif
077ea8a7
MN
2439 }
2440}
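/* hcscale is the chroma twin of hyscale: it scales both chroma planes in one
   call and stores V at a fixed offset of 2048 int16 elements after U (the
   dst[i+2048] in the C fallback, the 4096-byte displacement in the asm), so a
   single temporary line buffer holds both planes. */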
2441
1e621b18 2442static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
6c7506de 2443 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
28bf81c9
MN
2444
 2445 /* load a few things into local vars to make the code more readable and faster */
2446 const int srcW= c->srcW;
2447 const int dstW= c->dstW;
2448 const int dstH= c->dstH;
2449 const int chrDstW= c->chrDstW;
2450 const int lumXInc= c->lumXInc;
2451 const int chrXInc= c->chrXInc;
fe8054c0 2452 const int dstFormat= c->dstFormat;
28bf81c9
MN
2453 const int flags= c->flags;
2454 const int canMMX2BeUsed= c->canMMX2BeUsed;
2455 int16_t *vLumFilterPos= c->vLumFilterPos;
2456 int16_t *vChrFilterPos= c->vChrFilterPos;
2457 int16_t *hLumFilterPos= c->hLumFilterPos;
2458 int16_t *hChrFilterPos= c->hChrFilterPos;
2459 int16_t *vLumFilter= c->vLumFilter;
2460 int16_t *vChrFilter= c->vChrFilter;
2461 int16_t *hLumFilter= c->hLumFilter;
2462 int16_t *hChrFilter= c->hChrFilter;
2463 int16_t *lumMmxFilter= c->lumMmxFilter;
2464 int16_t *chrMmxFilter= c->chrMmxFilter;
2465 const int vLumFilterSize= c->vLumFilterSize;
2466 const int vChrFilterSize= c->vChrFilterSize;
2467 const int hLumFilterSize= c->hLumFilterSize;
2468 const int hChrFilterSize= c->hChrFilterSize;
2469 int16_t **lumPixBuf= c->lumPixBuf;
2470 int16_t **chrPixBuf= c->chrPixBuf;
2471 const int vLumBufSize= c->vLumBufSize;
2472 const int vChrBufSize= c->vChrBufSize;
2473 uint8_t *funnyYCode= c->funnyYCode;
2474 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2475 uint8_t *formatConvBuffer= c->formatConvBuffer;
28bf81c9
MN
2476
 2477 /* vars which will change and which we need to store back in the context */
2478 int dstY= c->dstY;
2479 int lumBufIndex= c->lumBufIndex;
2480 int chrBufIndex= c->chrBufIndex;
2481 int lastInLumBuf= c->lastInLumBuf;
2482 int lastInChrBuf= c->lastInChrBuf;
1e621b18 2483 int srcStride[3];
6c7506de
MN
2484 uint8_t *src[3];
2485 uint8_t *dst[3];
2486
37079906 2487 if(c->srcFormat == IMGFMT_I420){
6c7506de
MN
2488 src[0]= srcParam[0];
2489 src[1]= srcParam[2];
2490 src[2]= srcParam[1];
1e621b18
MN
2491 srcStride[0]= srcStrideParam[0];
2492 srcStride[1]= srcStrideParam[2];
2493 srcStride[2]= srcStrideParam[1];
2494 }
2495 else if(c->srcFormat==IMGFMT_YV12){
6c7506de
MN
2496 src[0]= srcParam[0];
2497 src[1]= srcParam[1];
2498 src[2]= srcParam[2];
1e621b18
MN
2499 srcStride[0]= srcStrideParam[0];
2500 srcStride[1]= srcStrideParam[1];
2501 srcStride[2]= srcStrideParam[2];
2502 }
2503 else if(isPacked(c->srcFormat)){
2504 src[0]=
2505 src[1]=
2506 src[2]= srcParam[0];
2507 srcStride[0]= srcStrideParam[0];
2508 srcStride[1]=
2509 srcStride[2]= srcStrideParam[0]<<1;
2510 }
6ff0ad6b 2511 else if(isGray(c->srcFormat)){
1e621b18
MN
2512 src[0]= srcParam[0];
2513 src[1]=
2514 src[2]= NULL;
2515 srcStride[0]= srcStrideParam[0];
2516 srcStride[1]=
2517 srcStride[2]= 0;
6c7506de
MN
2518 }
2519
37079906 2520 if(c->dstFormat == IMGFMT_I420){
6c7506de
MN
2521 dst[0]= dstParam[0];
2522 dst[1]= dstParam[2];
2523 dst[2]= dstParam[1];
2524
2525 }else{
2526 dst[0]= dstParam[0];
2527 dst[1]= dstParam[1];
2528 dst[2]= dstParam[2];
2529 }
37079906
MN
2530
2531//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2532//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2533
2534 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2535 {
2536 static int firstTime=1; //FIXME move this into the context perhaps
2537 if(flags & SWS_PRINT_INFO && firstTime)
2538 {
2539 fprintf(stderr, "SwScaler: Warning: dstStride is not aligned!\n"
 2540 "SwScaler: ->cannot do aligned memory accesses anymore\n");
2541 firstTime=0;
2542 }
2543 }
28bf81c9 2544
1e621b18
MN
 2545 /* Note: the user might start scaling in the middle of the picture, so this will not get executed;
 2546 that is not really intended, but it works currently, so people might rely on it */
28bf81c9
MN
2547 if(srcSliceY ==0){
2548 lumBufIndex=0;
2549 chrBufIndex=0;
1e621b18 2550 dstY=0;
28bf81c9
MN
2551 lastInLumBuf= -1;
2552 lastInChrBuf= -1;
077ea8a7 2553 }
d3f41512 2554
c1b0bfb4 2555 for(;dstY < dstH; dstY++){
28bf81c9
MN
2556 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2557 unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
2558 unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
6c7506de 2559 const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;
d3f41512 2560
c1b0bfb4
MN
2561 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2562 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2563 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2564 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2565
c7f822d9
MN
2566 //handle holes (FAST_BILINEAR & weird filters)
2567 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2568 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2569//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2570 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2571 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2572
c1b0bfb4
MN
2573 // Do we have enough lines in this slice to output the dstY line
2574 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
2575 {
2576 //Do horizontal scaling
2577 while(lastInLumBuf < lastLumSrcY)
d3f41512 2578 {
28bf81c9 2579 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2580 lumBufIndex++;
c7f822d9 2581// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2582 ASSERT(lumBufIndex < 2*vLumBufSize)
2583 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2584 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2585// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2586 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2587 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
1e621b18 2588 funnyYCode, c->srcFormat, formatConvBuffer);
c1b0bfb4
MN
2589 lastInLumBuf++;
2590 }
2591 while(lastInChrBuf < lastChrSrcY)
2592 {
28bf81c9
MN
2593 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2594 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
c1b0bfb4
MN
2595 chrBufIndex++;
2596 ASSERT(chrBufIndex < 2*vChrBufSize)
2597 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2598 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
28bf81c9
MN
 2599 //FIXME pass these parameters through the context struct (at least some of them)
2600 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2601 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
1e621b18 2602 funnyUVCode, c->srcFormat, formatConvBuffer);
c1b0bfb4 2603 lastInChrBuf++;
d3f41512 2604 }
c1b0bfb4
MN
2605 //wrap buf index around to stay inside the ring buffer
2606 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2607 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2608 }
c1b0bfb4 2609 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2610 {
c1b0bfb4
MN
2611/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2612 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2613 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2614 vChrBufSize, vLumBufSize);
2615*/
2616 //Do horizontal scaling
2617 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2618 {
28bf81c9 2619 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2620 lumBufIndex++;
2621 ASSERT(lumBufIndex < 2*vLumBufSize)
2622 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2623 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2624 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2625 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
1e621b18 2626 funnyYCode, c->srcFormat, formatConvBuffer);
c1b0bfb4
MN
2627 lastInLumBuf++;
2628 }
2629 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2630 {
28bf81c9
MN
2631 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2632 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
c1b0bfb4
MN
2633 chrBufIndex++;
2634 ASSERT(chrBufIndex < 2*vChrBufSize)
2635 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2636 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
28bf81c9
MN
2637 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2638 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
1e621b18 2639 funnyUVCode, c->srcFormat, formatConvBuffer);
c1b0bfb4
MN
2640 lastInChrBuf++;
2641 }
2642 //wrap buf index around to stay inside the ring buffer
2643 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2644 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
 2645 break; //we can't output a dstY line, so let's try again with the next slice
2ff198c1 2646 }
d3f41512 2647
c1b0bfb4
MN
2648#ifdef HAVE_MMX
2649 b5Dither= dither8[dstY&1];
2650 g6Dither= dither4[dstY&1];
2651 g5Dither= dither8[dstY&1];
2652 r5Dither= dither8[(dstY+1)&1];
2653#endif
28bf81c9 2654 if(dstY < dstH-2)
e3d2500f 2655 {
6c7506de 2656 if(isPlanarYUV(dstFormat)) //YV12 like
0f25d72b 2657 {
c1b0bfb4
MN
2658 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2659 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2660 {
c1b0bfb4
MN
2661 int16_t *lumBuf = lumPixBuf[0];
2662 int16_t *chrBuf= chrPixBuf[0];
2663 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2664 }
2665 else //General YV12
2666 {
2667 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2668 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2669 RENAME(yuv2yuvX)(
2670 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2671 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2672 dest, uDest, vDest, dstW,
2673 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2ff198c1 2674 }
0f25d72b 2675 }
c1b0bfb4 2676 else
2ff198c1 2677 {
c1b0bfb4
MN
2678 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2679 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
d3f41512 2680
c1b0bfb4
MN
2681 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2682 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2683 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2684 {
2685 int chrAlpha= vChrFilter[2*dstY+1];
2ff198c1 2686
c1b0bfb4 2687 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
28bf81c9 2688 dest, dstW, chrAlpha, dstFormat, flags);
c1b0bfb4
MN
2689 }
2690 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2691 {
2692 int lumAlpha= vLumFilter[2*dstY+1];
2693 int chrAlpha= vChrFilter[2*dstY+1];
2694
2695 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
28bf81c9 2696 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
c1b0bfb4
MN
2697 }
2698 else //General RGB
2699 {
2700 RENAME(yuv2rgbX)(
2701 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2702 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
28bf81c9 2703 dest, dstW, dstFormat,
c1b0bfb4
MN
2704 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2705 }
2706 }
e3d2500f
MN
2707 }
 2708 else // hmm, looks like we can't use MMX here without overwriting this array's tail
2709 {
2710 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2711 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
6c7506de 2712 if(isPlanarYUV(dstFormat)) //YV12
e3d2500f
MN
2713 {
2714 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2715 yuv2yuvXinC(
2716 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2717 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2718 dest, uDest, vDest, dstW);
2719 }
2720 else
2721 {
2722 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2723 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2724 yuv2rgbXinC(
2725 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2726 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
28bf81c9 2727 dest, dstW, dstFormat);
e3d2500f
MN
2728 }
2729 }
c1b0bfb4 2730 }
17f715fa
MN
2731
2732#ifdef HAVE_MMX
2733 __asm __volatile(SFENCE:::"memory");
1faf0867 2734 __asm __volatile(EMMS:::"memory");
17f715fa 2735#endif
28bf81c9
MN
2736 /* store changed local vars back in the context */
2737 c->dstY= dstY;
2738 c->lumBufIndex= lumBufIndex;
2739 c->chrBufIndex= chrBufIndex;
2740 c->lastInLumBuf= lastInLumBuf;
2741 c->lastInChrBuf= lastInChrBuf;
627690b5 2742}
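/* Overview of the slice logic above: swScale consumes the source in
   horizontal slices.  For each output line it first horizontally scales
   whatever source lines are still missing (hyscale/hcscale) into the
   lumPixBuf/chrPixBuf ring buffers, tracked by lastInLumBuf/lastInChrBuf and
   the wrapping lumBufIndex/chrBufIndex.  If the current slice does not yet
   contain the last source line needed for dstY, the remaining slice lines are
   buffered and the loop breaks until the caller delivers the next slice;
   otherwise the buffered lines are handed to the vertical scaler
   (yuv2yuv1/yuv2yuvX for planar output, yuv2rgb*/yuv2rgbX for packed RGB),
   with the plain C variants used for the last two output lines so the MMX
   code cannot write past the end of the destination arrays. */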