fixing scaling/contrast for 1/4/8 bpp
[libav.git] / postproc / swscale_template.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
/* CPU-capability instruction-name macros.
 * The same asm templates below are compiled several times (via RENAME);
 * these #defines pick the best available instruction spelling per target:
 * HAVE_3DNOW -> femms/prefetch/prefetchw/pavgusb, HAVE_MMX2 ->
 * prefetchnta/prefetcht0/sfence/pavgb/movntq, otherwise plain-MMX
 * fallbacks ("/nop" expands to an assembler comment, i.e. no code). */
541c4eb9 19#undef MOVNTQ
7d7f78b5 20#undef PAVGB
48a05cec
MN
21#undef PREFETCH
22#undef PREFETCHW
23#undef EMMS
24#undef SFENCE
25
26#ifdef HAVE_3DNOW
27/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
28#define EMMS "femms"
29#else
30#define EMMS "emms"
31#endif
32
33#ifdef HAVE_3DNOW
34#define PREFETCH "prefetch"
35#define PREFETCHW "prefetchw"
36#elif defined ( HAVE_MMX2 )
37#define PREFETCH "prefetchnta"
38#define PREFETCHW "prefetcht0"
39#else
40#define PREFETCH "/nop"
41#define PREFETCHW "/nop"
42#endif
43
44#ifdef HAVE_MMX2
45#define SFENCE "sfence"
46#else
47#define SFENCE "/nop"
48#endif
d3f41512 49
d604bab9
MN
50#ifdef HAVE_MMX2
51#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52#elif defined (HAVE_3DNOW)
53#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54#endif
d3f41512 55
d604bab9
MN
56#ifdef HAVE_MMX2
57#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58#else
59#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60#endif
61
61
c1b0bfb4
MN
/* Multi-tap vertical scaler to a planar YV12 plane.
 * Inner loop (label 2 is absent here; label 1 doubles as both loops):
 * edx counts up from -filterSize to 0, accumulating pmulhw-weighted
 * 16-bit rows into mm3/mm4; the sums are >>3, packed with unsigned
 * saturation and streamed out 8 pixels at a time via MOVNTQ.
 * `x` is a byte offset into each source row (used as 0 for luma/U and
 * 4096 for the V half of the chroma buffer by the callers below).
 * Operands: %0=-filterSize, %1=src pointer array, %2=mmx filter coeffs,
 * %3=dest, %4=width — NOTE(review): inferred from the yuv2yuvX call
 * sites below; confirm against the asm constraint lists there. */
62#define YSCALEYUV2YV12X(x) \
63 "xorl %%eax, %%eax \n\t"\
64 "pxor %%mm3, %%mm3 \n\t"\
65 "pxor %%mm4, %%mm4 \n\t"\
66 "movl %0, %%edx \n\t"\
67 ".balign 16 \n\t" /* FIXME Unroll? */\
68 "1: \n\t"\
69 "movl (%1, %%edx, 4), %%esi \n\t"\
70 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
71 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
72 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 "addl $1, %%edx \n\t"\
78 " jnz 1b \n\t"\
79 "psraw $3, %%mm3 \n\t"\
80 "psraw $3, %%mm4 \n\t"\
81 "packuswb %%mm4, %%mm3 \n\t"\
82 MOVNTQ(%%mm3, (%3, %%eax))\
83 "addl $8, %%eax \n\t"\
84 "cmpl %4, %%eax \n\t"\
85 "pxor %%mm3, %%mm3 \n\t"\
86 "pxor %%mm4, %%mm4 \n\t"\
87 "movl %0, %%edx \n\t"\
88 "jb 1b \n\t"
89
/* 1-tap (unscaled) vertical pass: converts one row of 16-bit
 * intermediate samples to 8-bit by >>7 with unsigned-saturating pack,
 * 8 pixels per iteration.  Operands: %0=src end, %1=dest end,
 * %2=-width; eax counts up toward 0, `jnc` exits once it wraps. */
90#define YSCALEYUV2YV121 \
91 "movl %2, %%eax \n\t"\
92 ".balign 16 \n\t" /* FIXME Unroll? */\
93 "1: \n\t"\
94 "movq (%0, %%eax, 2), %%mm0 \n\t"\
95 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
96 "psraw $7, %%mm0 \n\t"\
97 "psraw $7, %%mm1 \n\t"\
98 "packuswb %%mm1, %%mm0 \n\t"\
99 MOVNTQ(%%mm0, (%1, %%eax))\
100 "addl $8, %%eax \n\t"\
101 "jnc 1b \n\t"
102
/* Operand layout used by every caller of YSCALEYUV2RGBX (kept here as
 * the original author's reference): */
103/*
104 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
105 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
106 "r" (dest), "m" (dstW),
107 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
108 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
109*/
/* Multi-tap vertical scale + YUV->RGB conversion front-end.
 * First inner loop accumulates the chroma taps (U in mm3, V in mm4,
 * V rows live 4096 bytes after U rows), second accumulates the luma
 * taps (Y1 in mm1, Y2 in mm7).  Then the standard fixed-point matrix
 * (MANGLE'd constants w400/w80, y/ub/ug/vg/vr coefficients) produces
 * packed B/G/R bytes in mm2/mm4/mm5 with mm7 zeroed, ready for one of
 * the WRITEBGR* stores below. */
110#define YSCALEYUV2RGBX \
111 "xorl %%eax, %%eax \n\t"\
112 ".balign 16 \n\t"\
113 "1: \n\t"\
114 "movl %1, %%edx \n\t" /* -chrFilterSize */\
115 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
116 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
117 "pxor %%mm3, %%mm3 \n\t"\
118 "pxor %%mm4, %%mm4 \n\t"\
119 "2: \n\t"\
120 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
121 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
122 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
123 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
124 "pmulhw %%mm0, %%mm2 \n\t"\
125 "pmulhw %%mm0, %%mm5 \n\t"\
126 "paddw %%mm2, %%mm3 \n\t"\
127 "paddw %%mm5, %%mm4 \n\t"\
128 "addl $1, %%edx \n\t"\
129 " jnz 2b \n\t"\
130\
131 "movl %0, %%edx \n\t" /* -lumFilterSize */\
132 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
133 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
134 "pxor %%mm1, %%mm1 \n\t"\
135 "pxor %%mm7, %%mm7 \n\t"\
136 "2: \n\t"\
137 "movl (%%ecx, %%edx, 4), %%esi \n\t"\
138 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
139 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
140 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
141 "pmulhw %%mm0, %%mm2 \n\t"\
142 "pmulhw %%mm0, %%mm5 \n\t"\
143 "paddw %%mm2, %%mm1 \n\t"\
144 "paddw %%mm5, %%mm7 \n\t"\
145 "addl $1, %%edx \n\t"\
146 " jnz 2b \n\t"\
147\
9b464428
FB
148 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
149 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
c1b0bfb4
MN
150 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
151 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
152 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
153 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
c1b0bfb4 154 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
9b464428
FB
155 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
156 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
157 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
158 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
159 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
160 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
c1b0bfb4
MN
161 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
162 "paddw %%mm3, %%mm4 \n\t"\
163 "movq %%mm2, %%mm0 \n\t"\
164 "movq %%mm5, %%mm6 \n\t"\
165 "movq %%mm4, %%mm3 \n\t"\
166 "punpcklwd %%mm2, %%mm2 \n\t"\
167 "punpcklwd %%mm5, %%mm5 \n\t"\
168 "punpcklwd %%mm4, %%mm4 \n\t"\
169 "paddw %%mm1, %%mm2 \n\t"\
170 "paddw %%mm1, %%mm5 \n\t"\
171 "paddw %%mm1, %%mm4 \n\t"\
172 "punpckhwd %%mm0, %%mm0 \n\t"\
173 "punpckhwd %%mm6, %%mm6 \n\t"\
174 "punpckhwd %%mm3, %%mm3 \n\t"\
175 "paddw %%mm7, %%mm0 \n\t"\
176 "paddw %%mm7, %%mm6 \n\t"\
177 "paddw %%mm7, %%mm3 \n\t"\
178 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
179 "packuswb %%mm0, %%mm2 \n\t"\
180 "packuswb %%mm6, %%mm5 \n\t"\
181 "packuswb %%mm3, %%mm4 \n\t"\
182 "pxor %%mm7, %%mm7 \n\t"
183
d604bab9
MN
/* 2-tap bilinear vertical interpolation + YUV->RGB for the
 * "full chroma horizontal interpolation" path: blends buf0/buf1 (luma)
 * and uvbuf0/uvbuf1 (chroma, V at +4096 bytes) with yalpha1/uvalpha1
 * weights (%6/%7), then applies the fixed-point color matrix.
 * Leaves packed B in mm3, G in mm1, R in mm0 (one byte per pixel,
 * duplicated by packuswb) for the per-format store code that follows
 * the macro at each call site. */
184#define FULL_YSCALEYUV2RGB \
185 "pxor %%mm7, %%mm7 \n\t"\
186 "movd %6, %%mm6 \n\t" /*yalpha1*/\
187 "punpcklwd %%mm6, %%mm6 \n\t"\
188 "punpcklwd %%mm6, %%mm6 \n\t"\
189 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
190 "punpcklwd %%mm5, %%mm5 \n\t"\
191 "punpcklwd %%mm5, %%mm5 \n\t"\
192 "xorl %%eax, %%eax \n\t"\
cff6ecd7 193 ".balign 16 \n\t"\
d604bab9
MN
194 "1: \n\t"\
195 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
196 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
197 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
198 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
199 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
200 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
201 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
202 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
203 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
204 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
205 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
206 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
207 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
208 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
209 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
9b464428
FB
210 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
211 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
212 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
d604bab9
MN
213\
214\
215 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
216 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
9b464428 217 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
d604bab9 218 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428 219 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
d604bab9 220 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428 221 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
d604bab9
MN
222\
223\
224 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
9b464428
FB
225 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
226 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
227 "paddw %%mm1, %%mm3 \n\t" /* B*/\
228 "paddw %%mm1, %%mm0 \n\t" /* R*/\
229 "packuswb %%mm3, %%mm3 \n\t"\
230\
231 "packuswb %%mm0, %%mm0 \n\t"\
232 "paddw %%mm4, %%mm2 \n\t"\
233 "paddw %%mm2, %%mm1 \n\t" /* G*/\
234\
235 "packuswb %%mm1, %%mm1 \n\t"
236
/* 2-tap bilinear vertical interpolation + YUV->RGB, 2x2-subsampled
 * chroma path.  Spills the broadcast yalpha1/uvalpha1 weights to
 * scratch slots at (%2)+3968 / (%2)+3976 (past the live chroma data)
 * because all eight MMX registers are needed in the loop.  Produces
 * packed B/G/R in mm2/mm4/mm5 with mm7=0 for the WRITEBGR* stores. */
237#define YSCALEYUV2RGB \
238 "movd %6, %%mm6 \n\t" /*yalpha1*/\
239 "punpcklwd %%mm6, %%mm6 \n\t"\
240 "punpcklwd %%mm6, %%mm6 \n\t"\
5ac80202 241 "movq %%mm6, 3968(%2) \n\t"\
d604bab9
MN
242 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
243 "punpcklwd %%mm5, %%mm5 \n\t"\
244 "punpcklwd %%mm5, %%mm5 \n\t"\
5ac80202 245 "movq %%mm5, 3976(%2) \n\t"\
d604bab9 246 "xorl %%eax, %%eax \n\t"\
cff6ecd7 247 ".balign 16 \n\t"\
d604bab9
MN
248 "1: \n\t"\
249 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
250 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
251 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
252 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
253 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
254 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
5ac80202 255 "movq 3976(%2), %%mm0 \n\t"\
d604bab9
MN
256 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
257 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
258 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
259 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
260 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
261 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
9b464428
FB
262 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
263 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
264 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
265 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
266 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
267 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9
MN
268 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
269 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
270 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
271 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
272 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
273 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
274 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
5ac80202
MN
275 "pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
276 "pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
d604bab9
MN
277 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
278 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
279 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
280 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
9b464428
FB
281 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
282 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
283 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
284 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
285 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
286 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
287 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
288 "paddw %%mm3, %%mm4 \n\t"\
289 "movq %%mm2, %%mm0 \n\t"\
290 "movq %%mm5, %%mm6 \n\t"\
291 "movq %%mm4, %%mm3 \n\t"\
292 "punpcklwd %%mm2, %%mm2 \n\t"\
293 "punpcklwd %%mm5, %%mm5 \n\t"\
294 "punpcklwd %%mm4, %%mm4 \n\t"\
295 "paddw %%mm1, %%mm2 \n\t"\
296 "paddw %%mm1, %%mm5 \n\t"\
297 "paddw %%mm1, %%mm4 \n\t"\
298 "punpckhwd %%mm0, %%mm0 \n\t"\
299 "punpckhwd %%mm6, %%mm6 \n\t"\
300 "punpckhwd %%mm3, %%mm3 \n\t"\
301 "paddw %%mm7, %%mm0 \n\t"\
302 "paddw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm7, %%mm3 \n\t"\
304 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
305 "packuswb %%mm0, %%mm2 \n\t"\
306 "packuswb %%mm6, %%mm5 \n\t"\
307 "packuswb %%mm3, %%mm4 \n\t"\
308 "pxor %%mm7, %%mm7 \n\t"
309
/* 1-tap variant of YSCALEYUV2RGB: no vertical blending at all, reads
 * only buf0/uvbuf0 (exact-scanline case).  Same register contract on
 * exit: packed B/G/R in mm2/mm4/mm5, mm7=0. */
310#define YSCALEYUV2RGB1 \
311 "xorl %%eax, %%eax \n\t"\
cff6ecd7 312 ".balign 16 \n\t"\
d604bab9
MN
313 "1: \n\t"\
314 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
315 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
316 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
317 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
9b464428
FB
318 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
319 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
d604bab9
MN
320 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
321 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
322 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
323 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
d604bab9 324 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
497d4f99
MN
325 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
326 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
327 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
328 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
329 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
330 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
331 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
332 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
333 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
334 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
497d4f99
MN
335 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336 "paddw %%mm3, %%mm4 \n\t"\
337 "movq %%mm2, %%mm0 \n\t"\
338 "movq %%mm5, %%mm6 \n\t"\
339 "movq %%mm4, %%mm3 \n\t"\
340 "punpcklwd %%mm2, %%mm2 \n\t"\
341 "punpcklwd %%mm5, %%mm5 \n\t"\
342 "punpcklwd %%mm4, %%mm4 \n\t"\
343 "paddw %%mm1, %%mm2 \n\t"\
344 "paddw %%mm1, %%mm5 \n\t"\
345 "paddw %%mm1, %%mm4 \n\t"\
346 "punpckhwd %%mm0, %%mm0 \n\t"\
347 "punpckhwd %%mm6, %%mm6 \n\t"\
348 "punpckhwd %%mm3, %%mm3 \n\t"\
349 "paddw %%mm7, %%mm0 \n\t"\
350 "paddw %%mm7, %%mm6 \n\t"\
351 "paddw %%mm7, %%mm3 \n\t"\
352 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353 "packuswb %%mm0, %%mm2 \n\t"\
354 "packuswb %%mm6, %%mm5 \n\t"\
355 "packuswb %%mm3, %%mm4 \n\t"\
356 "pxor %%mm7, %%mm7 \n\t"
357
358// do vertical chrominance interpolation
/* Like YSCALEYUV2RGB1 but averages the two chroma lines
 * (uvbuf0+uvbuf1)>>1 via paddw + psrlw 5 (the original author flags
 * a possible overflow below); luma is still taken from buf0 only.
 * Exit contract identical to the other YSCALEYUV2RGB* macros. */
359#define YSCALEYUV2RGB1b \
360 "xorl %%eax, %%eax \n\t"\
cff6ecd7 361 ".balign 16 \n\t"\
497d4f99
MN
362 "1: \n\t"\
363 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
364 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
365 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
366 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397c035e
MN
367 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
368 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
c1b0bfb4
MN
369 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
370 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
9b464428
FB
371 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
372 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
497d4f99
MN
373 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
374 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
9b464428
FB
375 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
376 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
497d4f99
MN
377 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
378 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
379 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
d604bab9
MN
380 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
381 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
9b464428
FB
382 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
383 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
384 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
385 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
386 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
387 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
d604bab9
MN
388 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
389 "paddw %%mm3, %%mm4 \n\t"\
390 "movq %%mm2, %%mm0 \n\t"\
391 "movq %%mm5, %%mm6 \n\t"\
392 "movq %%mm4, %%mm3 \n\t"\
393 "punpcklwd %%mm2, %%mm2 \n\t"\
394 "punpcklwd %%mm5, %%mm5 \n\t"\
395 "punpcklwd %%mm4, %%mm4 \n\t"\
396 "paddw %%mm1, %%mm2 \n\t"\
397 "paddw %%mm1, %%mm5 \n\t"\
398 "paddw %%mm1, %%mm4 \n\t"\
399 "punpckhwd %%mm0, %%mm0 \n\t"\
400 "punpckhwd %%mm6, %%mm6 \n\t"\
401 "punpckhwd %%mm3, %%mm3 \n\t"\
402 "paddw %%mm7, %%mm0 \n\t"\
403 "paddw %%mm7, %%mm6 \n\t"\
404 "paddw %%mm7, %%mm3 \n\t"\
405 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
406 "packuswb %%mm0, %%mm2 \n\t"\
407 "packuswb %%mm6, %%mm5 \n\t"\
408 "packuswb %%mm3, %%mm4 \n\t"\
409 "pxor %%mm7, %%mm7 \n\t"
410
/* Store stage: interleaves the packed B/G/R bytes (mm2/mm4/mm5,
 * mm7=0) into 8 BGR0 dwords at %4 + eax*4, then advances eax by 8
 * pixels and loops back to label 1 while eax < width (%5). */
411#define WRITEBGR32 \
412 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
413 "movq %%mm2, %%mm1 \n\t" /* B */\
414 "movq %%mm5, %%mm6 \n\t" /* R */\
415 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
416 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
417 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
418 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
419 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
420 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
421 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
422 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
423 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
424 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
425\
426 MOVNTQ(%%mm0, (%4, %%eax, 4))\
427 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
428 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
429 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
430\
431 "addl $8, %%eax \n\t"\
432 "cmpl %5, %%eax \n\t"\
433 " jb 1b \n\t"
434
/* Store stage: packs B/G/R bytes down to RGB565 (5-6-5 bit masks via
 * bF8/bFC) and streams 8 pixels as 16-bit words to %4 + eax*2. */
435#define WRITEBGR16 \
9b464428
FB
436 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
437 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
438 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb 439 "psrlq $3, %%mm2 \n\t"\
d604bab9 440\
f62255fb
MN
441 "movq %%mm2, %%mm1 \n\t"\
442 "movq %%mm4, %%mm3 \n\t"\
d604bab9 443\
f62255fb
MN
444 "punpcklbw %%mm7, %%mm3 \n\t"\
445 "punpcklbw %%mm5, %%mm2 \n\t"\
446 "punpckhbw %%mm7, %%mm4 \n\t"\
447 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 448\
f62255fb
MN
449 "psllq $3, %%mm3 \n\t"\
450 "psllq $3, %%mm4 \n\t"\
d604bab9
MN
451\
452 "por %%mm3, %%mm2 \n\t"\
d604bab9 453 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
454\
455 MOVNTQ(%%mm2, (%4, %%eax, 2))\
456 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
457\
458 "addl $8, %%eax \n\t"\
459 "cmpl %5, %%eax \n\t"\
460 " jb 1b \n\t"
461
/* Store stage: packs B/G/R bytes down to RGB555 (5-5-5, all channels
 * masked with bF8, R additionally >>1) and streams 8 pixels as 16-bit
 * words to %4 + eax*2.  Same loop/exit shape as WRITEBGR16. */
462#define WRITEBGR15 \
9b464428
FB
463 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
464 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
465 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
f62255fb
MN
466 "psrlq $3, %%mm2 \n\t"\
467 "psrlq $1, %%mm5 \n\t"\
d604bab9 468\
f62255fb
MN
469 "movq %%mm2, %%mm1 \n\t"\
470 "movq %%mm4, %%mm3 \n\t"\
d604bab9 471\
f62255fb
MN
472 "punpcklbw %%mm7, %%mm3 \n\t"\
473 "punpcklbw %%mm5, %%mm2 \n\t"\
474 "punpckhbw %%mm7, %%mm4 \n\t"\
475 "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 476\
f62255fb
MN
477 "psllq $2, %%mm3 \n\t"\
478 "psllq $2, %%mm4 \n\t"\
d604bab9
MN
479\
480 "por %%mm3, %%mm2 \n\t"\
d604bab9 481 "por %%mm4, %%mm1 \n\t"\
d604bab9
MN
482\
483 MOVNTQ(%%mm2, (%4, %%eax, 2))\
484 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
485\
486 "addl $8, %%eax \n\t"\
487 "cmpl %5, %%eax \n\t"\
488 " jb 1b \n\t"
f62255fb 489
/* Legacy 24-bit packed-BGR store: builds three 8-byte groups from the
 * four 0RGB dwords with shift/mask/por sequences (kept for reference;
 * superseded by WRITEBGR24MMX / WRITEBGR24MMX2 below).  Writes via
 * ebx which the caller must point at the BGR24 destination. */
99d2cb72 490#define WRITEBGR24OLD \
d604bab9
MN
491 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
492 "movq %%mm2, %%mm1 \n\t" /* B */\
493 "movq %%mm5, %%mm6 \n\t" /* R */\
494 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
495 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
496 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
497 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
498 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
499 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
a525ce8d
MN
500 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
501 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
502 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
503 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9
MN
504\
505 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
506 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
9b464428
FB
507 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
508 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
d604bab9
MN
509 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
510 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
511 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
512 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
513\
514 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
515 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
516 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
517 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
9b464428 518 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
d604bab9
MN
519 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
520 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
9b464428
FB
521 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
522 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
d604bab9
MN
523 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
524 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
525 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
526 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
527\
528 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
529 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
530 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
9b464428
FB
531 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
532 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
d604bab9
MN
533 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
534 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
535 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
536\
bdc2eb9a
MN
537 MOVNTQ(%%mm0, (%%ebx))\
538 MOVNTQ(%%mm2, 8(%%ebx))\
539 MOVNTQ(%%mm3, 16(%%ebx))\
540 "addl $24, %%ebx \n\t"\
d604bab9
MN
541\
542 "addl $8, %%eax \n\t"\
543 "cmpl %5, %%eax \n\t"\
544 " jb 1b \n\t"
545
99d2cb72
MN
/* Plain-MMX 24-bit BGR store: rotates each 0RGB dword into RGBRGB0
 * form with psllq/punpckhdq, then splices three 8-byte stores per
 * 8 pixels through ebx (caller sets ebx to the BGR24 destination). */
546#define WRITEBGR24MMX \
547 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
548 "movq %%mm2, %%mm1 \n\t" /* B */\
549 "movq %%mm5, %%mm6 \n\t" /* R */\
550 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
551 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
552 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
553 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
554 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
555 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
556 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
557 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
558 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
559 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
560\
561 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
562 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
563 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
564 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
565\
566 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
567 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
568 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
569 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
570\
571 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
572 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
573 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
574 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
575\
576 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
577 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
578 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
579 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
580 MOVNTQ(%%mm0, (%%ebx))\
581\
582 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
583 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
584 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
585 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
586 MOVNTQ(%%mm6, 8(%%ebx))\
587\
588 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
589 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
590 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
591 MOVNTQ(%%mm5, 16(%%ebx))\
592\
593 "addl $24, %%ebx \n\t"\
594\
595 "addl $8, %%eax \n\t"\
596 "cmpl %5, %%eax \n\t"\
597 " jb 1b \n\t"
598
/* MMX2 24-bit BGR store: uses pshufw to replicate channel bytes and
 * the M24A/M24B/M24C byte masks to assemble three 8-byte BGR groups
 * directly — fewer shifts than WRITEBGR24MMX.  Same ebx destination
 * and loop-exit contract as the other WRITEBGR24 variants. */
599#define WRITEBGR24MMX2 \
600 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
9b464428
FB
601 "movq "MANGLE(M24A)", %%mm0 \n\t"\
602 "movq "MANGLE(M24C)", %%mm7 \n\t"\
99d2cb72
MN
603 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
604 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
605 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
606\
607 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
608 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
609 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
610\
611 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
612 "por %%mm1, %%mm6 \n\t"\
613 "por %%mm3, %%mm6 \n\t"\
614 MOVNTQ(%%mm6, (%%ebx))\
615\
616 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
617 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
618 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
619 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
620\
9b464428 621 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
99d2cb72
MN
622 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
623 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
624\
625 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
626 "por %%mm3, %%mm6 \n\t"\
627 MOVNTQ(%%mm6, 8(%%ebx))\
628\
629 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
630 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
631 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
632\
633 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
634 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
9b464428 635 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72
MN
636\
637 "por %%mm1, %%mm3 \n\t"\
638 "por %%mm3, %%mm6 \n\t"\
639 MOVNTQ(%%mm6, 16(%%ebx))\
640\
641 "addl $24, %%ebx \n\t"\
642\
643 "addl $8, %%eax \n\t"\
644 "cmpl %5, %%eax \n\t"\
645 " jb 1b \n\t"
646
/* Select the fastest available BGR24 writer for this build target. */
647#ifdef HAVE_MMX2
7630f2e0 648#undef WRITEBGR24
99d2cb72
MN
649#define WRITEBGR24 WRITEBGR24MMX2
650#else
7630f2e0 651#undef WRITEBGR24
99d2cb72
MN
652#define WRITEBGR24 WRITEBGR24MMX
653#endif
654
654
c1b0bfb4
MN
/*
 * Vertically scale multi-tap filtered rows into planar YV12 output.
 * lumSrc/chrSrc are arrays of pointers to 16-bit intermediate rows;
 * lumFilterSize/chrFilterSize taps are combined using the pre-expanded
 * MMX coefficient arrays (lumMmxFilter/chrMmxFilter).  dest/uDest/vDest
 * receive dstW luma and chrDstW chroma bytes; when uDest is NULL the
 * chroma planes are skipped (e.g. grayscale output).  The V samples of
 * each chroma row live 4096 bytes (2048 int16s) after the U samples —
 * hence the YSCALEYUV2YV12X(4096) invocation.  Non-MMX builds defer to
 * the generic C implementation yuv2yuvXinC (defined elsewhere).
 */
655static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
656 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
e616aa93 657 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
c1b0bfb4 658 int16_t * lumMmxFilter, int16_t * chrMmxFilter)
38858470 659{
c1b0bfb4
MN
660#ifdef HAVE_MMX
661 if(uDest != NULL)
662 {
663 asm volatile(
664 YSCALEYUV2YV12X(0)
665 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
e616aa93 666 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
c1b0bfb4
MN
667 : "%eax", "%edx", "%esi"
668 );
669
670 asm volatile(
671 YSCALEYUV2YV12X(4096)
672 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
e616aa93 673 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
c1b0bfb4
MN
674 : "%eax", "%edx", "%esi"
675 );
676 }
677
678 asm volatile(
679 YSCALEYUV2YV12X(0)
680 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
681 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
682 : "%eax", "%edx", "%esi"
683 );
684#else
5859233b 685yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
e3d2500f 686 chrFilter, chrSrc, chrFilterSize,
5859233b 687 dest, uDest, vDest, dstW, chrDstW);
7630f2e0 688#endif
c1b0bfb4 689}
2add307d 690
c1b0bfb4 691static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
e616aa93 692 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
c1b0bfb4
MN
693{
694#ifdef HAVE_MMX
695 if(uDest != NULL)
38858470 696 {
c1b0bfb4
MN
697 asm volatile(
698 YSCALEYUV2YV121
e616aa93
MN
699 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
700 "g" (-chrDstW)
c1b0bfb4
MN
701 : "%eax"
702 );
703
704 asm volatile(
705 YSCALEYUV2YV121
e616aa93
MN
706 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
707 "g" (-chrDstW)
c1b0bfb4
MN
708 : "%eax"
709 );
38858470
MN
710 }
711
c1b0bfb4
MN
712 asm volatile(
713 YSCALEYUV2YV121
714 :: "r" (lumSrc + dstW), "r" (dest + dstW),
715 "g" (-dstW)
716 : "%eax"
717 );
718#else
c1b0bfb4
MN
719 int i;
720 for(i=0; i<dstW; i++)
38858470 721 {
c1b0bfb4 722 int val= lumSrc[i]>>7;
44c1035c
MN
723
724 if(val&256){
725 if(val<0) val=0;
726 else val=255;
727 }
c1b0bfb4 728
44c1035c 729 dest[i]= val;
c1b0bfb4
MN
730 }
731
732 if(uDest != NULL)
e616aa93 733 for(i=0; i<chrDstW; i++)
38858470 734 {
c1b0bfb4
MN
735 int u=chrSrc[i]>>7;
736 int v=chrSrc[i + 2048]>>7;
737
44c1035c
MN
738 if((u|v)&256){
739 if(u<0) u=0;
740 else if (u>255) u=255;
741 if(v<0) v=0;
742 else if (v>255) v=255;
743 }
744
745 uDest[i]= u;
746 vDest[i]= v;
38858470 747 }
c1b0bfb4 748#endif
38858470
MN
749}
750
c1b0bfb4 751
d604bab9
MN
752/**
753 * vertical scale YV12 to RGB
 *
 * Combines lumFilterSize/chrFilterSize vertically filtered 16-bit rows
 * (YSCALEYUV2RGBX) and stores dstW pixels in the requested dstFormat:
 * BGR32, BGR24 (ebx pre-set to dest + 3*eax), BGR15 or BGR16 — the two
 * 16-bit paths optionally add ordered-dither constants under
 * DITHER1XBPP before packing.  Non-MMX builds call the generic
 * yuv2rgbXinC (defined elsewhere).  The SWS_FULL_UV_IPOL branch is
 * deliberately commented out (unimplemented for the X-tap case).
754 */
c1b0bfb4
MN
755static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
756 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
28bf81c9 757 uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
c1b0bfb4 758{
28bf81c9 759/* if(flags&SWS_FULL_UV_IPOL)
c1b0bfb4
MN
760 {
761//FIXME
762 }//FULL_UV_IPOL
28bf81c9 763 else*/
c1b0bfb4
MN
764 {
765#ifdef HAVE_MMX
28bf81c9 766 if(dstFormat == IMGFMT_BGR32) //FIXME untested
c1b0bfb4
MN
767 {
768 asm volatile(
769 YSCALEYUV2RGBX
770 WRITEBGR32
771
772 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
773 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
774 "r" (dest), "m" (dstW),
775 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
776 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
777 );
778 }
28bf81c9 779 else if(dstFormat == IMGFMT_BGR24) //FIXME untested
c1b0bfb4
MN
780 {
781 asm volatile(
782 YSCALEYUV2RGBX
783 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
784 "addl %4, %%ebx \n\t"
785 WRITEBGR24
786
787 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
788 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
789 "r" (dest), "m" (dstW),
790 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
791 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
792 );
793 }
28bf81c9 794 else if(dstFormat==IMGFMT_BGR15)
c1b0bfb4
MN
795 {
796 asm volatile(
797 YSCALEYUV2RGBX
798 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
799#ifdef DITHER1XBPP
9b464428
FB
800 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
801 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
802 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
803#endif
804
805 WRITEBGR15
806
807 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
808 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
809 "r" (dest), "m" (dstW),
810 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
811 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
812 );
813 }
28bf81c9 814 else if(dstFormat==IMGFMT_BGR16)
c1b0bfb4
MN
815 {
816 asm volatile(
817 YSCALEYUV2RGBX
818 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
819#ifdef DITHER1XBPP
9b464428
FB
820 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
821 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
822 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
c1b0bfb4
MN
823#endif
824
825 WRITEBGR16
826
827 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
828 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
829 "r" (dest), "m" (dstW),
830 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
831 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
832 );
833 }
834#else
e3d2500f
MN
835yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
836 chrFilter, chrSrc, chrFilterSize,
28bf81c9 837 dest, dstW, dstFormat);
c1b0bfb4 838
c1b0bfb4
MN
839#endif
840 } //!FULL_UV_IPOL
841}
842
843
844/**
845 * vertical bilinear scale YV12 to RGB
846 */
847static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
28bf81c9 848 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
d604bab9
MN
849{
850 int yalpha1=yalpha^4095;
851 int uvalpha1=uvalpha^4095;
d604bab9 852
1e621b18 853 if(flags&SWS_FULL_CHR_H_INT)
d604bab9
MN
854 {
855
856#ifdef HAVE_MMX
28bf81c9 857 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
858 {
859 asm volatile(
860
861
862FULL_YSCALEYUV2RGB
863 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
864 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
865
866 "movq %%mm3, %%mm1 \n\t"
867 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
868 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
869
870 MOVNTQ(%%mm3, (%4, %%eax, 4))
871 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
872
873 "addl $4, %%eax \n\t"
874 "cmpl %5, %%eax \n\t"
875 " jb 1b \n\t"
876
877
d1fac6cf 878 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
879 "m" (yalpha1), "m" (uvalpha1)
880 : "%eax"
881 );
882 }
28bf81c9 883 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
884 {
885 asm volatile(
886
887FULL_YSCALEYUV2RGB
888
889 // lsb ... msb
890 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
891 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
892
893 "movq %%mm3, %%mm1 \n\t"
894 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
895 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
896
897 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
898 "psrlq $8, %%mm3 \n\t" // GR0BGR00
9b464428
FB
899 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
900 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
d604bab9
MN
901 "por %%mm2, %%mm3 \n\t" // BGRBGR00
902 "movq %%mm1, %%mm2 \n\t"
903 "psllq $48, %%mm1 \n\t" // 000000BG
904 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
905
906 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
907 "psrld $16, %%mm2 \n\t" // R000R000
908 "psrlq $24, %%mm1 \n\t" // 0BGR0000
909 "por %%mm2, %%mm1 \n\t" // RBGRR000
910
911 "movl %4, %%ebx \n\t"
912 "addl %%eax, %%ebx \n\t"
913
914#ifdef HAVE_MMX2
915 //FIXME Alignment
916 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
917 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
918#else
919 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
920 "psrlq $32, %%mm3 \n\t"
921 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
922 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
923#endif
924 "addl $4, %%eax \n\t"
925 "cmpl %5, %%eax \n\t"
926 " jb 1b \n\t"
927
d1fac6cf 928 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
929 "m" (yalpha1), "m" (uvalpha1)
930 : "%eax", "%ebx"
931 );
932 }
28bf81c9 933 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
934 {
935 asm volatile(
936
937FULL_YSCALEYUV2RGB
938#ifdef DITHER1XBPP
9b464428
FB
939 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
940 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
941 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
942#endif
943 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
944 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
945 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
946
947 "psrlw $3, %%mm3 \n\t"
948 "psllw $2, %%mm1 \n\t"
949 "psllw $7, %%mm0 \n\t"
9b464428
FB
950 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
951 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
d604bab9
MN
952
953 "por %%mm3, %%mm1 \n\t"
954 "por %%mm1, %%mm0 \n\t"
955
956 MOVNTQ(%%mm0, (%4, %%eax, 2))
957
958 "addl $4, %%eax \n\t"
959 "cmpl %5, %%eax \n\t"
960 " jb 1b \n\t"
961
d1fac6cf 962 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
963 "m" (yalpha1), "m" (uvalpha1)
964 : "%eax"
965 );
966 }
28bf81c9 967 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
968 {
969 asm volatile(
970
971FULL_YSCALEYUV2RGB
972#ifdef DITHER1XBPP
9b464428
FB
973 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
974 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
975 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
d604bab9
MN
976#endif
977 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
978 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
979 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
980
981 "psrlw $3, %%mm3 \n\t"
982 "psllw $3, %%mm1 \n\t"
983 "psllw $8, %%mm0 \n\t"
9b464428
FB
984 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
985 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
d604bab9
MN
986
987 "por %%mm3, %%mm1 \n\t"
988 "por %%mm1, %%mm0 \n\t"
989
990 MOVNTQ(%%mm0, (%4, %%eax, 2))
991
992 "addl $4, %%eax \n\t"
993 "cmpl %5, %%eax \n\t"
994 " jb 1b \n\t"
995
d1fac6cf 996 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
997 "m" (yalpha1), "m" (uvalpha1)
998 : "%eax"
999 );
1000 }
1001#else
28bf81c9
MN
1002 if(dstFormat==IMGFMT_BGR32)
1003 {
2ba1bff0 1004 int i;
df3c183a
MN
1005#ifdef WORDS_BIGENDIAN
1006 dest++;
1007#endif
28bf81c9
MN
1008 for(i=0;i<dstW;i++){
1009 // vertical linear interpolation && yuv2rgb in a single step:
1010 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1011 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1012 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1013 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1014 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1015 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1016 dest+= 4;
1017 }
1018 }
1019 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1020 {
96034638 1021 int i;
d1fac6cf 1022 for(i=0;i<dstW;i++){
d604bab9
MN
1023 // vertical linear interpolation && yuv2rgb in a single step:
1024 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1025 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1026 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
390b20a6
MN
1027 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1028 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1029 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
28bf81c9 1030 dest+= 3;
d604bab9
MN
1031 }
1032 }
28bf81c9 1033 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1034 {
96034638 1035 int i;
d1fac6cf 1036 for(i=0;i<dstW;i++){
d604bab9
MN
1037 // vertical linear interpolation && yuv2rgb in a single step:
1038 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1039 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1040 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1041
d022ce5c 1042 ((uint16_t*)dest)[i] =
b18ea156
MN
1043 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1044 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1045 clip_table16r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1046 }
1047 }
28bf81c9 1048 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1049 {
96034638 1050 int i;
d1fac6cf 1051 for(i=0;i<dstW;i++){
d604bab9
MN
1052 // vertical linear interpolation && yuv2rgb in a single step:
1053 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1054 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1055 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1056
d022ce5c 1057 ((uint16_t*)dest)[i] =
b18ea156
MN
1058 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1059 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1060 clip_table15r[(Y + yuvtab_3343[V]) >>13];
d604bab9
MN
1061 }
1062 }
1063#endif
1064 }//FULL_UV_IPOL
1065 else
1066 {
1067#ifdef HAVE_MMX
28bf81c9 1068 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
1069 {
1070 asm volatile(
1071 YSCALEYUV2RGB
1072 WRITEBGR32
1073
d1fac6cf 1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1075 "m" (yalpha1), "m" (uvalpha1)
1076 : "%eax"
1077 );
1078 }
28bf81c9 1079 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
1080 {
1081 asm volatile(
bdc2eb9a 1082 "movl %4, %%ebx \n\t"
d604bab9
MN
1083 YSCALEYUV2RGB
1084 WRITEBGR24
1085
d1fac6cf 1086 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1087 "m" (yalpha1), "m" (uvalpha1)
1088 : "%eax", "%ebx"
1089 );
1090 }
28bf81c9 1091 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
1092 {
1093 asm volatile(
1094 YSCALEYUV2RGB
1095 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1096#ifdef DITHER1XBPP
9b464428
FB
1097 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1098 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1099 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1100#endif
1101
1102 WRITEBGR15
1103
d1fac6cf 1104 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1105 "m" (yalpha1), "m" (uvalpha1)
1106 : "%eax"
1107 );
1108 }
28bf81c9 1109 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
1110 {
1111 asm volatile(
1112 YSCALEYUV2RGB
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114#ifdef DITHER1XBPP
9b464428
FB
1115 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1116 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1117 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1118#endif
1119
1120 WRITEBGR16
1121
d1fac6cf 1122 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1123 "m" (yalpha1), "m" (uvalpha1)
1124 : "%eax"
1125 );
1126 }
1127#else
28bf81c9 1128 if(dstFormat==IMGFMT_BGR32)
d604bab9 1129 {
2ba1bff0 1130 int i;
df3c183a
MN
1131#ifdef WORDS_BIGENDIAN
1132 dest++;
1133#endif
d1fac6cf 1134 for(i=0; i<dstW-1; i+=2){
d604bab9 1135 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1136 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1137 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1138 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1139 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d9fc1cfe
MN
1140
1141 int Cb= yuvtab_40cf[U];
1142 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1143 int Cr= yuvtab_3343[V];
1144
1145 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1146 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1147 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1148
1149 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1150 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1151 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1152 }
1153 }
28bf81c9 1154 else if(dstFormat==IMGFMT_BGR24)
d9fc1cfe 1155 {
96034638 1156 int i;
d1fac6cf 1157 for(i=0; i<dstW-1; i+=2){
d9fc1cfe
MN
1158 // vertical linear interpolation && yuv2rgb in a single step:
1159 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1160 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1161 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1162 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d9fc1cfe
MN
1163
1164 int Cb= yuvtab_40cf[U];
1165 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1166 int Cr= yuvtab_3343[V];
1167
1168 dest[0]=clip_table[((Y1 + Cb) >>13)];
1169 dest[1]=clip_table[((Y1 + Cg) >>13)];
1170 dest[2]=clip_table[((Y1 + Cr) >>13)];
1171
1172 dest[3]=clip_table[((Y2 + Cb) >>13)];
1173 dest[4]=clip_table[((Y2 + Cg) >>13)];
1174 dest[5]=clip_table[((Y2 + Cr) >>13)];
1175 dest+=6;
d604bab9
MN
1176 }
1177 }
28bf81c9 1178 else if(dstFormat==IMGFMT_BGR16)
d604bab9 1179 {
96034638 1180 int i;
5521b193
MN
1181#ifdef DITHER1XBPP
1182 static int ditherb1=1<<14;
1183 static int ditherg1=1<<13;
1184 static int ditherr1=2<<14;
1185 static int ditherb2=3<<14;
1186 static int ditherg2=3<<13;
1187 static int ditherr2=0<<14;
1188
1189 ditherb1 ^= (1^2)<<14;
1190 ditherg1 ^= (1^2)<<13;
1191 ditherr1 ^= (1^2)<<14;
1192 ditherb2 ^= (3^0)<<14;
1193 ditherg2 ^= (3^0)<<13;
1194 ditherr2 ^= (3^0)<<14;
1195#else
1196 const int ditherb1=0;
1197 const int ditherg1=0;
1198 const int ditherr1=0;
1199 const int ditherb2=0;
1200 const int ditherg2=0;
1201 const int ditherr2=0;
1202#endif
d1fac6cf 1203 for(i=0; i<dstW-1; i+=2){
d604bab9 1204 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1205 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1206 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1207 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1208 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d604bab9 1209
d9fc1cfe
MN
1210 int Cb= yuvtab_40cf[U];
1211 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1212 int Cr= yuvtab_3343[V];
1213
d022ce5c 1214 ((uint16_t*)dest)[i] =
5521b193
MN
1215 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1216 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1217 clip_table16r[(Y1 + Cr + ditherr1) >>13];
d9fc1cfe
MN
1218
1219 ((uint16_t*)dest)[i+1] =
5521b193
MN
1220 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1221 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1222 clip_table16r[(Y2 + Cr + ditherr2) >>13];
d604bab9
MN
1223 }
1224 }
28bf81c9 1225 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1226 {
96034638 1227 int i;
5521b193
MN
1228#ifdef DITHER1XBPP
1229 static int ditherb1=1<<14;
1230 static int ditherg1=1<<14;
1231 static int ditherr1=2<<14;
1232 static int ditherb2=3<<14;
1233 static int ditherg2=3<<14;
1234 static int ditherr2=0<<14;
1235
1236 ditherb1 ^= (1^2)<<14;
1237 ditherg1 ^= (1^2)<<14;
1238 ditherr1 ^= (1^2)<<14;
1239 ditherb2 ^= (3^0)<<14;
1240 ditherg2 ^= (3^0)<<14;
1241 ditherr2 ^= (3^0)<<14;
1242#else
1243 const int ditherb1=0;
1244 const int ditherg1=0;
1245 const int ditherr1=0;
1246 const int ditherb2=0;
1247 const int ditherg2=0;
1248 const int ditherr2=0;
1249#endif
d1fac6cf 1250 for(i=0; i<dstW-1; i+=2){
d604bab9 1251 // vertical linear interpolation && yuv2rgb in a single step:
d9fc1cfe
MN
1252 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1253 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
02a0a992
MN
1254 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1255 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
d604bab9 1256
d9fc1cfe
MN
1257 int Cb= yuvtab_40cf[U];
1258 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1259 int Cr= yuvtab_3343[V];
1260
d022ce5c 1261 ((uint16_t*)dest)[i] =
5521b193
MN
1262 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1263 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1264 clip_table15r[(Y1 + Cr + ditherr1) >>13];
b18ea156 1265
d9fc1cfe 1266 ((uint16_t*)dest)[i+1] =
5521b193
MN
1267 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1268 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1269 clip_table15r[(Y2 + Cr + ditherr2) >>13];
d604bab9
MN
1270 }
1271 }
1272#endif
1273 } //!FULL_UV_IPOL
1274}
1275
1276/**
1277 * YV12 to RGB without scaling or interpolating
1278 */
c1b0bfb4 1279static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
28bf81c9 1280 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
d604bab9 1281{
d604bab9 1282 int uvalpha1=uvalpha^4095;
c1b0bfb4 1283 const int yalpha1=0;
96034638 1284
1e621b18 1285 if(flags&SWS_FULL_CHR_H_INT)
d604bab9 1286 {
28bf81c9 1287 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
d604bab9
MN
1288 return;
1289 }
397c035e
MN
1290
1291#ifdef HAVE_MMX
497d4f99
MN
1292 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1293 {
28bf81c9 1294 if(dstFormat==IMGFMT_BGR32)
d604bab9
MN
1295 {
1296 asm volatile(
1297 YSCALEYUV2RGB1
1298 WRITEBGR32
c1b0bfb4 1299 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1300 "m" (yalpha1), "m" (uvalpha1)
1301 : "%eax"
1302 );
1303 }
28bf81c9 1304 else if(dstFormat==IMGFMT_BGR24)
d604bab9
MN
1305 {
1306 asm volatile(
bdc2eb9a 1307 "movl %4, %%ebx \n\t"
d604bab9
MN
1308 YSCALEYUV2RGB1
1309 WRITEBGR24
c1b0bfb4 1310 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1311 "m" (yalpha1), "m" (uvalpha1)
1312 : "%eax", "%ebx"
1313 );
1314 }
28bf81c9 1315 else if(dstFormat==IMGFMT_BGR15)
d604bab9
MN
1316 {
1317 asm volatile(
1318 YSCALEYUV2RGB1
1319 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1320#ifdef DITHER1XBPP
9b464428
FB
1321 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1322 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1323 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1324#endif
1325 WRITEBGR15
c1b0bfb4 1326 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1327 "m" (yalpha1), "m" (uvalpha1)
1328 : "%eax"
1329 );
1330 }
28bf81c9 1331 else if(dstFormat==IMGFMT_BGR16)
d604bab9
MN
1332 {
1333 asm volatile(
1334 YSCALEYUV2RGB1
1335 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1336#ifdef DITHER1XBPP
9b464428
FB
1337 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1338 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1339 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
d604bab9
MN
1340#endif
1341
1342 WRITEBGR16
c1b0bfb4 1343 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1344 "m" (yalpha1), "m" (uvalpha1)
1345 : "%eax"
1346 );
1347 }
497d4f99
MN
1348 }
1349 else
1350 {
28bf81c9 1351 if(dstFormat==IMGFMT_BGR32)
d604bab9 1352 {
497d4f99
MN
1353 asm volatile(
1354 YSCALEYUV2RGB1b
1355 WRITEBGR32
c1b0bfb4 1356 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1357 "m" (yalpha1), "m" (uvalpha1)
1358 : "%eax"
1359 );
d604bab9 1360 }
28bf81c9 1361 else if(dstFormat==IMGFMT_BGR24)
d604bab9 1362 {
497d4f99 1363 asm volatile(
bdc2eb9a 1364 "movl %4, %%ebx \n\t"
497d4f99
MN
1365 YSCALEYUV2RGB1b
1366 WRITEBGR24
c1b0bfb4 1367 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
497d4f99
MN
1368 "m" (yalpha1), "m" (uvalpha1)
1369 : "%eax", "%ebx"
1370 );
d604bab9 1371 }
28bf81c9 1372 else if(dstFormat==IMGFMT_BGR15)
d604bab9 1373 {
497d4f99
MN
1374 asm volatile(
1375 YSCALEYUV2RGB1b
1376 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1377#ifdef DITHER1XBPP
9b464428
FB
1378 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1379 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1380 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99
MN
1381#endif
1382 WRITEBGR15
c1b0bfb4 1383 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1384 "m" (yalpha1), "m" (uvalpha1)
1385 : "%eax"
1386 );
1387 }
28bf81c9 1388 else if(dstFormat==IMGFMT_BGR16)
497d4f99
MN
1389 {
1390 asm volatile(
1391 YSCALEYUV2RGB1b
1392 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1393#ifdef DITHER1XBPP
9b464428
FB
1394 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1395 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1396 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
497d4f99 1397#endif
d604bab9 1398
497d4f99 1399 WRITEBGR16
c1b0bfb4 1400 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
497d4f99
MN
1401 "m" (yalpha1), "m" (uvalpha1)
1402 : "%eax"
1403 );
d604bab9 1404 }
497d4f99
MN
1405 }
1406#else
397c035e 1407//FIXME write 2 versions (for even & odd lines)
497d4f99 1408
28bf81c9 1409 if(dstFormat==IMGFMT_BGR32)
497d4f99 1410 {
2ba1bff0 1411 int i;
df3c183a
MN
1412#ifdef WORDS_BIGENDIAN
1413 dest++;
1414#endif
d1fac6cf 1415 for(i=0; i<dstW-1; i+=2){
497d4f99 1416 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1417 int Y1=yuvtab_2568[buf0[i]>>7];
1418 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1419 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1420 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
397c035e
MN
1421
1422 int Cb= yuvtab_40cf[U];
1423 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1424 int Cr= yuvtab_3343[V];
1425
1426 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1427 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1428 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1429
1430 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1431 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1432 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1433 }
1434 }
28bf81c9 1435 else if(dstFormat==IMGFMT_BGR24)
397c035e 1436 {
96034638 1437 int i;
d1fac6cf 1438 for(i=0; i<dstW-1; i+=2){
397c035e
MN
1439 // vertical linear interpolation && yuv2rgb in a single step:
1440 int Y1=yuvtab_2568[buf0[i]>>7];
1441 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1442 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1443 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
397c035e
MN
1444
1445 int Cb= yuvtab_40cf[U];
1446 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1447 int Cr= yuvtab_3343[V];
1448
1449 dest[0]=clip_table[((Y1 + Cb) >>13)];
1450 dest[1]=clip_table[((Y1 + Cg) >>13)];
1451 dest[2]=clip_table[((Y1 + Cr) >>13)];
1452
1453 dest[3]=clip_table[((Y2 + Cb) >>13)];
1454 dest[4]=clip_table[((Y2 + Cg) >>13)];
1455 dest[5]=clip_table[((Y2 + Cr) >>13)];
1456 dest+=6;
497d4f99
MN
1457 }
1458 }
28bf81c9 1459 else if(dstFormat==IMGFMT_BGR16)
497d4f99 1460 {
96034638 1461 int i;
5521b193
MN
1462#ifdef DITHER1XBPP
1463 static int ditherb1=1<<14;
1464 static int ditherg1=1<<13;
1465 static int ditherr1=2<<14;
1466 static int ditherb2=3<<14;
1467 static int ditherg2=3<<13;
1468 static int ditherr2=0<<14;
1469
1470 ditherb1 ^= (1^2)<<14;
1471 ditherg1 ^= (1^2)<<13;
1472 ditherr1 ^= (1^2)<<14;
1473 ditherb2 ^= (3^0)<<14;
1474 ditherg2 ^= (3^0)<<13;
1475 ditherr2 ^= (3^0)<<14;
1476#else
1477 const int ditherb1=0;
1478 const int ditherg1=0;
1479 const int ditherr1=0;
1480 const int ditherb2=0;
1481 const int ditherg2=0;
1482 const int ditherr2=0;
1483#endif
d1fac6cf 1484 for(i=0; i<dstW-1; i+=2){
497d4f99 1485 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1486 int Y1=yuvtab_2568[buf0[i]>>7];
1487 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1488 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1489 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
497d4f99 1490
397c035e
MN
1491 int Cb= yuvtab_40cf[U];
1492 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1493 int Cr= yuvtab_3343[V];
1494
d022ce5c 1495 ((uint16_t*)dest)[i] =
5521b193
MN
1496 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1497 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1498 clip_table16r[(Y1 + Cr + ditherr1) >>13];
397c035e
MN
1499
1500 ((uint16_t*)dest)[i+1] =
5521b193
MN
1501 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1502 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1503 clip_table16r[(Y2 + Cr + ditherr2) >>13];
497d4f99
MN
1504 }
1505 }
28bf81c9 1506 else if(dstFormat==IMGFMT_BGR15)
497d4f99 1507 {
96034638 1508 int i;
5521b193
MN
1509#ifdef DITHER1XBPP
1510 static int ditherb1=1<<14;
1511 static int ditherg1=1<<14;
1512 static int ditherr1=2<<14;
1513 static int ditherb2=3<<14;
1514 static int ditherg2=3<<14;
1515 static int ditherr2=0<<14;
1516
1517 ditherb1 ^= (1^2)<<14;
1518 ditherg1 ^= (1^2)<<14;
1519 ditherr1 ^= (1^2)<<14;
1520 ditherb2 ^= (3^0)<<14;
1521 ditherg2 ^= (3^0)<<14;
1522 ditherr2 ^= (3^0)<<14;
1523#else
1524 const int ditherb1=0;
1525 const int ditherg1=0;
1526 const int ditherr1=0;
1527 const int ditherb2=0;
1528 const int ditherg2=0;
1529 const int ditherr2=0;
1530#endif
d1fac6cf 1531 for(i=0; i<dstW-1; i+=2){
497d4f99 1532 // vertical linear interpolation && yuv2rgb in a single step:
397c035e
MN
1533 int Y1=yuvtab_2568[buf0[i]>>7];
1534 int Y2=yuvtab_2568[buf0[i+1]>>7];
02a0a992
MN
1535 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1536 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
497d4f99 1537
397c035e
MN
1538 int Cb= yuvtab_40cf[U];
1539 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1540 int Cr= yuvtab_3343[V];
1541
d022ce5c 1542 ((uint16_t*)dest)[i] =
5521b193
MN
1543 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1544 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1545 clip_table15r[(Y1 + Cr + ditherr1) >>13];
b18ea156 1546
397c035e 1547 ((uint16_t*)dest)[i+1] =
5521b193
MN
1548 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1549 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1550 clip_table15r[(Y2 + Cr + ditherr2) >>13];
497d4f99
MN
1551 }
1552 }
d604bab9
MN
1553#endif
1554}
1555
6ff0ad6b
MN
//FIXME the yuy2* input functions can read up to 7 samples too many

1e621b18
MN
1558static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1559{
6ff0ad6b
MN
1560#ifdef HAVE_MMX
1561 asm volatile(
1562 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1563 "movl %0, %%eax \n\t"
1564 "1: \n\t"
1565 "movq (%1, %%eax,2), %%mm0 \n\t"
1566 "movq 8(%1, %%eax,2), %%mm1 \n\t"
1567 "pand %%mm2, %%mm0 \n\t"
1568 "pand %%mm2, %%mm1 \n\t"
1569 "packuswb %%mm1, %%mm0 \n\t"
1570 "movq %%mm0, (%2, %%eax) \n\t"
1571 "addl $8, %%eax \n\t"
1572 " js 1b \n\t"
1573 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1574 : "%eax"
1575 );
1e621b18
MN
1576#else
1577 int i;
1578 for(i=0; i<width; i++)
1579 dst[i]= src[2*i];
1580#endif
1581}
1582
1583static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1584{
6ff0ad6b
MN
1585#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1586 asm volatile(
1587 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1588 "movl %0, %%eax \n\t"
1589 "1: \n\t"
1590 "movq (%1, %%eax,4), %%mm0 \n\t"
1591 "movq 8(%1, %%eax,4), %%mm1 \n\t"
1592 "movq (%2, %%eax,4), %%mm2 \n\t"
1593 "movq 8(%2, %%eax,4), %%mm3 \n\t"
1594 PAVGB(%%mm2, %%mm0)
1595 PAVGB(%%mm3, %%mm1)
1596 "psrlw $8, %%mm0 \n\t"
1597 "psrlw $8, %%mm1 \n\t"
1598 "packuswb %%mm1, %%mm0 \n\t"
1599 "movq %%mm0, %%mm1 \n\t"
1600 "psrlw $8, %%mm0 \n\t"
1601 "pand %%mm4, %%mm1 \n\t"
1602 "packuswb %%mm0, %%mm0 \n\t"
1603 "packuswb %%mm1, %%mm1 \n\t"
1604 "movd %%mm0, (%4, %%eax) \n\t"
1605 "movd %%mm1, (%3, %%eax) \n\t"
1606 "addl $4, %%eax \n\t"
1607 " js 1b \n\t"
1608 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1609 : "%eax"
1610 );
1e621b18
MN
1611#else
1612 int i;
1613 for(i=0; i<width; i++)
1614 {
1615 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1616 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1617 }
1618#endif
1619}
1620
1621static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1622{
1623#ifdef HAVE_MMXFIXME
1624#else
1625 int i;
1626 for(i=0; i<width; i++)
1627 {
1628 int b= src[i*4+0];
1629 int g= src[i*4+1];
1630 int r= src[i*4+2];
1631
1632 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1633 }
1634#endif
1635}
1636
1637static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1638{
1639#ifdef HAVE_MMXFIXME
1640#else
1641 int i;
1642 for(i=0; i<width; i++)
1643 {
1644 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1645 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1646 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1647
1648 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1649 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1650 }
1651#endif
1652}
1653
1654static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1655{
ac6a2e45
MN
1656#ifdef HAVE_MMX
1657 asm volatile(
1658 "movl %2, %%eax \n\t"
854288bb
FB
1659 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1660 "movq "MANGLE(w1111)", %%mm5 \n\t"
ac6a2e45
MN
1661 "pxor %%mm7, %%mm7 \n\t"
1662 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1663 ".balign 16 \n\t"
1664 "1: \n\t"
1665 PREFETCH" 64(%0, %%ebx) \n\t"
1666 "movd (%0, %%ebx), %%mm0 \n\t"
1667 "movd 3(%0, %%ebx), %%mm1 \n\t"
1668 "punpcklbw %%mm7, %%mm0 \n\t"
1669 "punpcklbw %%mm7, %%mm1 \n\t"
1670 "movd 6(%0, %%ebx), %%mm2 \n\t"
1671 "movd 9(%0, %%ebx), %%mm3 \n\t"
1672 "punpcklbw %%mm7, %%mm2 \n\t"
1673 "punpcklbw %%mm7, %%mm3 \n\t"
1674 "pmaddwd %%mm6, %%mm0 \n\t"
1675 "pmaddwd %%mm6, %%mm1 \n\t"
1676 "pmaddwd %%mm6, %%mm2 \n\t"
1677 "pmaddwd %%mm6, %%mm3 \n\t"
1678#ifndef FAST_BGR2YV12
1679 "psrad $8, %%mm0 \n\t"
1680 "psrad $8, %%mm1 \n\t"
1681 "psrad $8, %%mm2 \n\t"
1682 "psrad $8, %%mm3 \n\t"
1683#endif
1684 "packssdw %%mm1, %%mm0 \n\t"
1685 "packssdw %%mm3, %%mm2 \n\t"
1686 "pmaddwd %%mm5, %%mm0 \n\t"
1687 "pmaddwd %%mm5, %%mm2 \n\t"
1688 "packssdw %%mm2, %%mm0 \n\t"
1689 "psraw $7, %%mm0 \n\t"
1690
1691 "movd 12(%0, %%ebx), %%mm4 \n\t"
1692 "movd 15(%0, %%ebx), %%mm1 \n\t"
1693 "punpcklbw %%mm7, %%mm4 \n\t"
1694 "punpcklbw %%mm7, %%mm1 \n\t"
1695 "movd 18(%0, %%ebx), %%mm2 \n\t"
1696 "movd 21(%0, %%ebx), %%mm3 \n\t"
1697 "punpcklbw %%mm7, %%mm2 \n\t"
1698 "punpcklbw %%mm7, %%mm3 \n\t"
1699 "pmaddwd %%mm6, %%mm4 \n\t"
1700 "pmaddwd %%mm6, %%mm1 \n\t"
1701 "pmaddwd %%mm6, %%mm2 \n\t"
1702 "pmaddwd %%mm6, %%mm3 \n\t"
1703#ifndef FAST_BGR2YV12
1704 "psrad $8, %%mm4 \n\t"
1705 "psrad $8, %%mm1 \n\t"
1706 "psrad $8, %%mm2 \n\t"
1707 "psrad $8, %%mm3 \n\t"
1708#endif
1709 "packssdw %%mm1, %%mm4 \n\t"
1710 "packssdw %%mm3, %%mm2 \n\t"
1711 "pmaddwd %%mm5, %%mm4 \n\t"
1712 "pmaddwd %%mm5, %%mm2 \n\t"
1713 "addl $24, %%ebx \n\t"
1714 "packssdw %%mm2, %%mm4 \n\t"
1715 "psraw $7, %%mm4 \n\t"
1716
1717 "packuswb %%mm4, %%mm0 \n\t"
854288bb 1718 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
ac6a2e45 1719
4342fc14 1720 "movq %%mm0, (%1, %%eax) \n\t"
ac6a2e45
MN
1721 "addl $8, %%eax \n\t"
1722 " js 1b \n\t"
1723 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1724 : "%eax", "%ebx"
1725 );
1e621b18
MN
1726#else
1727 int i;
1728 for(i=0; i<width; i++)
1729 {
1730 int b= src[i*3+0];
1731 int g= src[i*3+1];
1732 int r= src[i*3+2];
1733
1734 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1735 }
1736#endif
1737}
1738
1739static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1740{
4342fc14
MN
1741#ifdef HAVE_MMX
1742 asm volatile(
1743 "movl %4, %%eax \n\t"
854288bb
FB
1744 "movq "MANGLE(w1111)", %%mm5 \n\t"
1745 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
4342fc14
MN
1746 "pxor %%mm7, %%mm7 \n\t"
1747 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1748 "addl %%ebx, %%ebx \n\t"
1749 ".balign 16 \n\t"
1750 "1: \n\t"
1751 PREFETCH" 64(%0, %%ebx) \n\t"
1752 PREFETCH" 64(%1, %%ebx) \n\t"
1753#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1754 "movq (%0, %%ebx), %%mm0 \n\t"
1755 "movq (%1, %%ebx), %%mm1 \n\t"
1756 "movq 6(%0, %%ebx), %%mm2 \n\t"
1757 "movq 6(%1, %%ebx), %%mm3 \n\t"
1758 PAVGB(%%mm1, %%mm0)
1759 PAVGB(%%mm3, %%mm2)
1760 "movq %%mm0, %%mm1 \n\t"
1761 "movq %%mm2, %%mm3 \n\t"
1762 "psrlq $24, %%mm0 \n\t"
1763 "psrlq $24, %%mm2 \n\t"
1764 PAVGB(%%mm1, %%mm0)
1765 PAVGB(%%mm3, %%mm2)
1766 "punpcklbw %%mm7, %%mm0 \n\t"
1767 "punpcklbw %%mm7, %%mm2 \n\t"
1768#else
1769 "movd (%0, %%ebx), %%mm0 \n\t"
1770 "movd (%1, %%ebx), %%mm1 \n\t"
1771 "movd 3(%0, %%ebx), %%mm2 \n\t"
1772 "movd 3(%1, %%ebx), %%mm3 \n\t"
1773 "punpcklbw %%mm7, %%mm0 \n\t"
1774 "punpcklbw %%mm7, %%mm1 \n\t"
1775 "punpcklbw %%mm7, %%mm2 \n\t"
1776 "punpcklbw %%mm7, %%mm3 \n\t"
1777 "paddw %%mm1, %%mm0 \n\t"
1778 "paddw %%mm3, %%mm2 \n\t"
1779 "paddw %%mm2, %%mm0 \n\t"
1780 "movd 6(%0, %%ebx), %%mm4 \n\t"
1781 "movd 6(%1, %%ebx), %%mm1 \n\t"
1782 "movd 9(%0, %%ebx), %%mm2 \n\t"
1783 "movd 9(%1, %%ebx), %%mm3 \n\t"
1784 "punpcklbw %%mm7, %%mm4 \n\t"
1785 "punpcklbw %%mm7, %%mm1 \n\t"
1786 "punpcklbw %%mm7, %%mm2 \n\t"
1787 "punpcklbw %%mm7, %%mm3 \n\t"
1788 "paddw %%mm1, %%mm4 \n\t"
1789 "paddw %%mm3, %%mm2 \n\t"
1790 "paddw %%mm4, %%mm2 \n\t"
1791 "psrlw $2, %%mm0 \n\t"
1792 "psrlw $2, %%mm2 \n\t"
1793#endif
854288bb
FB
1794 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1795 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1796
1797 "pmaddwd %%mm0, %%mm1 \n\t"
1798 "pmaddwd %%mm2, %%mm3 \n\t"
1799 "pmaddwd %%mm6, %%mm0 \n\t"
1800 "pmaddwd %%mm6, %%mm2 \n\t"
1801#ifndef FAST_BGR2YV12
1802 "psrad $8, %%mm0 \n\t"
1803 "psrad $8, %%mm1 \n\t"
1804 "psrad $8, %%mm2 \n\t"
1805 "psrad $8, %%mm3 \n\t"
1806#endif
1807 "packssdw %%mm2, %%mm0 \n\t"
1808 "packssdw %%mm3, %%mm1 \n\t"
1809 "pmaddwd %%mm5, %%mm0 \n\t"
1810 "pmaddwd %%mm5, %%mm1 \n\t"
1811 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1812 "psraw $7, %%mm0 \n\t"
1813
1814#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1815 "movq 12(%0, %%ebx), %%mm4 \n\t"
1816 "movq 12(%1, %%ebx), %%mm1 \n\t"
1817 "movq 18(%0, %%ebx), %%mm2 \n\t"
1818 "movq 18(%1, %%ebx), %%mm3 \n\t"
1819 PAVGB(%%mm1, %%mm4)
1820 PAVGB(%%mm3, %%mm2)
1821 "movq %%mm4, %%mm1 \n\t"
1822 "movq %%mm2, %%mm3 \n\t"
1823 "psrlq $24, %%mm4 \n\t"
1824 "psrlq $24, %%mm2 \n\t"
1825 PAVGB(%%mm1, %%mm4)
1826 PAVGB(%%mm3, %%mm2)
1827 "punpcklbw %%mm7, %%mm4 \n\t"
1828 "punpcklbw %%mm7, %%mm2 \n\t"
1829#else
1830 "movd 12(%0, %%ebx), %%mm4 \n\t"
1831 "movd 12(%1, %%ebx), %%mm1 \n\t"
1832 "movd 15(%0, %%ebx), %%mm2 \n\t"
1833 "movd 15(%1, %%ebx), %%mm3 \n\t"
1834 "punpcklbw %%mm7, %%mm4 \n\t"
1835 "punpcklbw %%mm7, %%mm1 \n\t"
1836 "punpcklbw %%mm7, %%mm2 \n\t"
1837 "punpcklbw %%mm7, %%mm3 \n\t"
1838 "paddw %%mm1, %%mm4 \n\t"
1839 "paddw %%mm3, %%mm2 \n\t"
1840 "paddw %%mm2, %%mm4 \n\t"
1841 "movd 18(%0, %%ebx), %%mm5 \n\t"
1842 "movd 18(%1, %%ebx), %%mm1 \n\t"
1843 "movd 21(%0, %%ebx), %%mm2 \n\t"
1844 "movd 21(%1, %%ebx), %%mm3 \n\t"
1845 "punpcklbw %%mm7, %%mm5 \n\t"
1846 "punpcklbw %%mm7, %%mm1 \n\t"
1847 "punpcklbw %%mm7, %%mm2 \n\t"
1848 "punpcklbw %%mm7, %%mm3 \n\t"
1849 "paddw %%mm1, %%mm5 \n\t"
1850 "paddw %%mm3, %%mm2 \n\t"
1851 "paddw %%mm5, %%mm2 \n\t"
854288bb 1852 "movq "MANGLE(w1111)", %%mm5 \n\t"
4342fc14
MN
1853 "psrlw $2, %%mm4 \n\t"
1854 "psrlw $2, %%mm2 \n\t"
1855#endif
854288bb
FB
1856 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1857 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
4342fc14
MN
1858
1859 "pmaddwd %%mm4, %%mm1 \n\t"
1860 "pmaddwd %%mm2, %%mm3 \n\t"
1861 "pmaddwd %%mm6, %%mm4 \n\t"
1862 "pmaddwd %%mm6, %%mm2 \n\t"
1863#ifndef FAST_BGR2YV12
1864 "psrad $8, %%mm4 \n\t"
1865 "psrad $8, %%mm1 \n\t"
1866 "psrad $8, %%mm2 \n\t"
1867 "psrad $8, %%mm3 \n\t"
1868#endif
1869 "packssdw %%mm2, %%mm4 \n\t"
1870 "packssdw %%mm3, %%mm1 \n\t"
1871 "pmaddwd %%mm5, %%mm4 \n\t"
1872 "pmaddwd %%mm5, %%mm1 \n\t"
1873 "addl $24, %%ebx \n\t"
1874 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1875 "psraw $7, %%mm4 \n\t"
1876
1877 "movq %%mm0, %%mm1 \n\t"
1878 "punpckldq %%mm4, %%mm0 \n\t"
1879 "punpckhdq %%mm4, %%mm1 \n\t"
1880 "packsswb %%mm1, %%mm0 \n\t"
854288bb 1881 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
4342fc14
MN
1882
1883 "movd %%mm0, (%2, %%eax) \n\t"
1884 "punpckhdq %%mm0, %%mm0 \n\t"
1885 "movd %%mm0, (%3, %%eax) \n\t"
1886 "addl $4, %%eax \n\t"
1887 " js 1b \n\t"
1888 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1889 : "%eax", "%ebx"
1890 );
1e621b18
MN
1891#else
1892 int i;
1893 for(i=0; i<width; i++)
1894 {
1895 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1896 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1897 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1898
1899 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1900 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1901 }
1902#endif
1903}
1904
6af250ea
MN
1905static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1906{
1907 int i;
1908 for(i=0; i<width; i++)
1909 {
1910 int d= src[i*2] + (src[i*2+1]<<8);
1911 int b= d&0x1F;
1912 int g= (d>>5)&0x3F;
1913 int r= (d>>11)&0x1F;
1914
1915 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1916 }
1917}
1918
/* Convert two adjacent BGR565 lines to one line of 2x2-subsampled U and V.
 * Each output chroma sample averages a 2x2 pixel block (2 pixels from src1,
 * 2 from src2).  The active branch packs both pixels of a line into one
 * 32bit word and sums the colour fields SWAR-style with non-overlapping
 * masks, so one addition handles all three components at once.
 * NOTE(review): the disabled #else branch is the straightforward scalar
 * reference version of the same computation. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
#if 1
		/* one 32bit load = two horizontally adjacent 16bit pixels */
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );

		/* dl sums the even fields (B, G-high, ...), dh the odd ones;
		   the masks keep the per-field sums from carrying into each other */
		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		/* fold the high half around so b/r/g land in separate bit ranges */
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		/* extract the 2x2 sums: b,r are 5bit*4 (7bit), g is 6bit*4 */
		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x3F;
		int r0= (d0>>11)&0x1F;

		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x3F;
		int r1= (d1>>11)&0x1F;

		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x3F;
		int r2= (d2>>11)&0x1F;

		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x3F;
		int r3= (d3>>11)&0x1F;

		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		/* +2 for the 4-pixel sum, -2 because r/b are 5bit and doubled */
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
1966
b72034dd
MN
1967static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1968{
1969 int i;
1970 for(i=0; i<width; i++)
1971 {
1972 int d= src[i*2] + (src[i*2+1]<<8);
1973 int b= d&0x1F;
1974 int g= (d>>5)&0x1F;
1975 int r= (d>>10)&0x1F;
1976
1977 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1978 }
1979}
1980
/* Convert two adjacent BGR555 lines to one line of 2x2-subsampled U and V.
 * Same SWAR averaging scheme as the 565 variant, with masks adjusted for
 * the 5/5/5 field layout.  The disabled #else branch is the scalar
 * reference version. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
#if 1
		/* one 32bit load = two horizontally adjacent 15bit pixels */
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );

		/* masked parallel add of the colour fields of both lines */
		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		/* fold high half so b/r/g sums end up in disjoint bit ranges */
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		/* 2x2 sums: each component is 5bit*4 = up to 7 bits */
		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x1F;
		int r0= (d0>>10)&0x1F;

		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x1F;
		int r1= (d1>>10)&0x1F;

		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x1F;
		int r2= (d2>>10)&0x1F;

		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x1F;
		int r3= (d3>>10)&0x1F;

		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		/* +2 for the 4-pixel sum, -3 because components are 5bit */
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}
2028
2029
a861d4d7
MN
2030static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2031{
2032 int i;
2033 for(i=0; i<width; i++)
2034 {
2035 int r= src[i*4+0];
2036 int g= src[i*4+1];
2037 int b= src[i*4+2];
2038
2039 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2040 }
2041}
2042
2043static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2044{
2045 int i;
2046 for(i=0; i<width; i++)
2047 {
2048 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
2049 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
2050 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
2051
2052 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2053 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2054 }
2055}
2056
2057static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2058{
2059 int i;
2060 for(i=0; i<width; i++)
2061 {
2062 int r= src[i*3+0];
2063 int g= src[i*3+1];
2064 int b= src[i*3+2];
2065
2066 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2067 }
2068}
2069
2070static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2071{
2072 int i;
2073 for(i=0; i<width; i++)
2074 {
2075 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2076 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2077 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2078
2079 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2080 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2081 }
2082}
2083
1e621b18 2084
// Bilinear / Bicubic scaling
/* Horizontally scale one 8bit input line into a 16bit (14.2-ish fixed point)
 * destination line using an arbitrary FIR filter.
 * dst/dstW   : output samples / count
 * src/srcW   : input line / width (srcW unused by the inner loops; the
 *              filterPos table already encodes the source positions)
 * filter     : filterSize coefficients per output sample
 * filterPos  : first source pixel for each output sample
 * On MMX there are three specialised paths: filterSize 4, filterSize 8,
 * and a generic inner-loop version; all process two output samples per
 * iteration and count a negative index register up to zero (jnc). */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
	if(filterSize==4) // allways true for upscaling, sometimes for down too
	{
		/* bias the pointers so the loop can run the counter from
		   -2*dstW up to 0 and address with it directly */
		int counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"pushl %%ebp			\n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movzwl (%2, %%ebp), %%eax	\n\t"
			"movzwl 2(%2, %%ebp), %%ebx	\n\t"
			"movq (%1, %%ebp, 4), %%mm1	\n\t"
			"movq 8(%1, %%ebp, 4), %%mm3	\n\t"
			"movd (%3, %%eax), %%mm0	\n\t"
			"movd (%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%ebp)	\n\t"
			"addl $4, %%ebp			\n\t"
			" jnc 1b			\n\t"

			"popl %%ebp			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else if(filterSize==8)
	{
		int counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"pushl %%ebp			\n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movzwl (%2, %%ebp), %%eax	\n\t"
			"movzwl 2(%2, %%ebp), %%ebx	\n\t"
			"movq (%1, %%ebp, 8), %%mm1	\n\t"
			"movq 16(%1, %%ebp, 8), %%mm3	\n\t"
			"movd (%3, %%eax), %%mm0	\n\t"
			"movd (%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"

			"movq 8(%1, %%ebp, 8), %%mm1	\n\t"
			"movq 24(%1, %%ebp, 8), %%mm5	\n\t"
			"movd 4(%3, %%eax), %%mm4	\n\t"
			"movd 4(%3, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm4		\n\t"
			"pmaddwd %%mm2, %%mm5		\n\t"
			"paddd %%mm4, %%mm0		\n\t"
			"paddd %%mm5, %%mm3		\n\t"

			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%ebp)	\n\t"
			"addl $4, %%ebp			\n\t"
			" jnc 1b			\n\t"

			"popl %%ebp			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else
	{
		/* generic filterSize: inner loop (label 2) walks the taps
		   8 bytes at a time for two output samples in parallel */
		int counter= -2*dstW;
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			"movl %2, %%ecx			\n\t"
			"movzwl (%%ecx, %0), %%eax	\n\t"
			"movzwl 2(%%ecx, %0), %%ebx	\n\t"
			"movl %5, %%ecx			\n\t"
			"pxor %%mm4, %%mm4		\n\t"
			"pxor %%mm5, %%mm5		\n\t"
			"2:				\n\t"
			"movq (%1), %%mm1		\n\t"
			"movq (%1, %6), %%mm3		\n\t"
			"movd (%%ecx, %%eax), %%mm0	\n\t"
			"movd (%%ecx, %%ebx), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"paddd %%mm3, %%mm5		\n\t"
			"paddd %%mm0, %%mm4		\n\t"
			"addl $8, %1			\n\t"
			"addl $4, %%ecx			\n\t"
			"cmpl %4, %%ecx			\n\t"
			" jb 2b				\n\t"
			"addl %6, %1			\n\t"
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm5		\n\t"
			"packssdw %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm4	 	\n\t"
			"packssdw %%mm4, %%mm4		\n\t"
			"movl %3, %%eax			\n\t"
			"movd %%mm4, (%%eax, %0)	\n\t"
			"addl $4, %0			\n\t"
			" jnc 1b			\n\t"

			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(src+filterSize),
			  "m" (src), "r" (filterSize*2)
			: "%ebx", "%eax", "%ecx"
		);
	}
#else
	/* plain C reference version */
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
//		printf("filterPos: %d\n", filterPos[i]);
		for(j=0; j<filterSize; j++)
		{
//			printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
//		filter += hFilterSize;
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//		dst[i] = val>>7;
	}
#endif
}
2ff198c1 2246 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into the 16bit temp buffer dst.
 * Packed/RGB source formats are first converted to 8bit luma into
 * formatConvBuffer, then scaled.  Three scaling paths exist:
 *  - generic FIR via hScale() (always when not SWS_FAST_BILINEAR, and on
 *    MMX also whenever the runtime-generated MMX2 code cannot be used),
 *  - runtime-generated MMX2 "funny" code driven by mmx2Filter/mmx2FilterPos,
 *  - a plain x86 asm bilinear loop, with a C fallback on non-x86. */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert packed input formats to plain 8bit luma first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		/* run the runtime-generated scaler in 8 chunks; each chunk
		   re-reads its per-chunk source advance from mmx2FilterPos */
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movl %0, %%ecx			\n\t"
			"movl %1, %%edi			\n\t"
			"movl %2, %%edx			\n\t"
			"movl %3, %%ebx			\n\t"
			"xorl %%eax, %%eax		\n\t" // i
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

#define FUNNY_Y_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		/* the generated code may overshoot at the right edge:
		   repair with the last source pixel (<<7 scale) */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	//NO MMX just normal asm ...
	/* 16.16 fixed point DDA, two output pixels per iteration;
	   add/adc propagates the fractional carry into the source index */
	asm volatile(
		"xorl %%eax, %%eax		\n\t" // i
		"xorl %%ebx, %%ebx		\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		".balign 16			\n\t"
		"1:				\n\t"
		"movzbl  (%0, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%edi, %%eax, 2)	\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 2(%%edi, %%eax, 2)	\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry


		"addl $2, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 cant be used
#endif
#else
	/* portable C bilinear: output is in <<7 scale like the asm paths */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
    }
}
2397
/* Horizontally scale one pair of chroma lines (U into dst, V into dst+2048).
 * Mirrors hyscale(): packed/RGB sources are first converted into
 * formatConvBuffer (U) and formatConvBuffer+2048 (V); grayscale input has
 * no chroma and returns early.  Same three scaling paths as the luma case;
 * the MMX2 path runs the generated code twice, once per plane (the second
 * plane lives 4096 bytes = 2048 uint16 into dst). */
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
	/* no chroma planes to produce for grayscale input */
    	return;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		/* generated scaler: 4 chunks for U, then 4 for V at dst+4096 bytes */
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movl %0, %%ecx			\n\t"
			"movl %1, %%edi			\n\t"
			"movl %2, %%edx			\n\t"
			"movl %3, %%ebx			\n\t"
			"xorl %%eax, %%eax		\n\t" // i
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

#define FUNNY_UV_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			"xorl %%eax, %%eax		\n\t" // i
			"movl %5, %%ecx			\n\t" // src
			"movl %1, %%edi			\n\t" // buf1
			"addl $4096, %%edi		\n\t"
			PREFETCH" (%%ecx)		\n\t"
			PREFETCH" 32(%%ecx)		\n\t"
			PREFETCH" 64(%%ecx)		\n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		/* right-edge repair, same as the luma case, for both planes */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
	/* 16.16 fixed point DDA; one U and one V sample per iteration,
	   V goes to the second plane at byte offset 4096 */
	asm volatile(
		"xorl %%eax, %%eax		\n\t" // i
		"xorl %%ebx, %%ebx		\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		".balign 16			\n\t"
		"1:				\n\t"
		"movl %0, %%esi			\n\t"
		"movzbl  (%%esi, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%%esi, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%edi, %%eax, 2)	\n\t"

		"movzbl  (%5, %%ebx), %%edi	\n\t" //src[xx]
		"movzbl 1(%5, %%ebx), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi			\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 4096(%%edi, %%eax, 2)\n\t"

		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx			\n\t" //xx+= xInc>>8 + carry
		"addl $1, %%eax			\n\t"
		"cmpl %2, %%eax			\n\t"
		" jb 1b				\n\t"

		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
		"r" (src2)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 cant be used
#endif
#else
	/* portable C bilinear; NOTE(review): xalpha^127 is used as a cheap
	   approximation of (128-xalpha) here — presumably intentional for
	   speed, matches the "slower" exact variant kept below */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
	dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
    }
}
2575
1e621b18 2576static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
332105e4 2577 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
28bf81c9
MN
2578
2579 /* load a few things into local vars to make the code more readable? and faster */
2580 const int srcW= c->srcW;
2581 const int dstW= c->dstW;
2582 const int dstH= c->dstH;
2583 const int chrDstW= c->chrDstW;
e616aa93 2584 const int chrSrcW= c->chrSrcW;
28bf81c9
MN
2585 const int lumXInc= c->lumXInc;
2586 const int chrXInc= c->chrXInc;
fe8054c0 2587 const int dstFormat= c->dstFormat;
44c1035c 2588 const int srcFormat= c->srcFormat;
28bf81c9
MN
2589 const int flags= c->flags;
2590 const int canMMX2BeUsed= c->canMMX2BeUsed;
2591 int16_t *vLumFilterPos= c->vLumFilterPos;
2592 int16_t *vChrFilterPos= c->vChrFilterPos;
2593 int16_t *hLumFilterPos= c->hLumFilterPos;
2594 int16_t *hChrFilterPos= c->hChrFilterPos;
2595 int16_t *vLumFilter= c->vLumFilter;
2596 int16_t *vChrFilter= c->vChrFilter;
2597 int16_t *hLumFilter= c->hLumFilter;
2598 int16_t *hChrFilter= c->hChrFilter;
2599 int16_t *lumMmxFilter= c->lumMmxFilter;
2600 int16_t *chrMmxFilter= c->chrMmxFilter;
2601 const int vLumFilterSize= c->vLumFilterSize;
2602 const int vChrFilterSize= c->vChrFilterSize;
2603 const int hLumFilterSize= c->hLumFilterSize;
2604 const int hChrFilterSize= c->hChrFilterSize;
2605 int16_t **lumPixBuf= c->lumPixBuf;
2606 int16_t **chrPixBuf= c->chrPixBuf;
2607 const int vLumBufSize= c->vLumBufSize;
2608 const int vChrBufSize= c->vChrBufSize;
2609 uint8_t *funnyYCode= c->funnyYCode;
2610 uint8_t *funnyUVCode= c->funnyUVCode;
1e621b18 2611 uint8_t *formatConvBuffer= c->formatConvBuffer;
e616aa93
MN
2612 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2613 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
28bf81c9
MN
2614
2615 /* vars whch will change and which we need to storw back in the context */
2616 int dstY= c->dstY;
2617 int lumBufIndex= c->lumBufIndex;
2618 int chrBufIndex= c->chrBufIndex;
2619 int lastInLumBuf= c->lastInLumBuf;
2620 int lastInChrBuf= c->lastInChrBuf;
1e621b18 2621 int srcStride[3];
332105e4 2622 int dstStride[3];
6c7506de
MN
2623 uint8_t *src[3];
2624 uint8_t *dst[3];
5859233b
MN
2625
2626 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
2627 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
44c1035c 2628
5859233b 2629 if(isPacked(c->srcFormat)){
1e621b18
MN
2630 src[0]=
2631 src[1]=
2632 src[2]= srcParam[0];
5859233b 2633 srcStride[0]=
1e621b18 2634 srcStride[1]=
5859233b 2635 srcStride[2]= srcStrideParam[0];
6c7506de 2636 }
5859233b
MN
2637 srcStride[1]<<= c->vChrDrop;
2638 srcStride[2]<<= c->vChrDrop;
6c7506de 2639
c7a810cc
MN
2640// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2641// (int)dst[0], (int)dst[1], (int)dst[2]);
2642
2643#if 0 //self test FIXME move to a vfilter or something
2644{
2645static volatile int i=0;
2646i++;
2647if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2648 selfTest(src, srcStride, c->srcW, c->srcH);
2649i--;
2650}
2651#endif
37079906
MN
2652
2653//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2654//dstStride[0],dstStride[1],dstStride[2]);
6c7506de
MN
2655
2656 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2657 {
2658 static int firstTime=1; //FIXME move this into the context perhaps
2659 if(flags & SWS_PRINT_INFO && firstTime)
2660 {
4a53a912 2661 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
6c7506de
MN
2662 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2663 firstTime=0;
2664 }
2665 }
28bf81c9 2666
1e621b18
MN
2667 /* Note the user might start scaling the picture in the middle so this will not get executed
2668 this is not really intended but works currently, so ppl might do it */
28bf81c9
MN
2669 if(srcSliceY ==0){
2670 lumBufIndex=0;
2671 chrBufIndex=0;
1e621b18 2672 dstY=0;
28bf81c9
MN
2673 lastInLumBuf= -1;
2674 lastInChrBuf= -1;
077ea8a7 2675 }
d3f41512 2676
c1b0bfb4 2677 for(;dstY < dstH; dstY++){
28bf81c9 2678 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3f7bb50c
MN
2679 const int chrDstY= dstY>>c->chrDstVSubSample;
2680 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2681 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
d3f41512 2682
c1b0bfb4
MN
2683 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2684 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2685 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2686 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
d604bab9 2687
c7f822d9
MN
2688 //handle holes (FAST_BILINEAR & weird filters)
2689 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2690 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2691//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
c1b0bfb4
MN
2692 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2693 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
d3f41512 2694
c1b0bfb4 2695 // Do we have enough lines in this slice to output the dstY line
e616aa93 2696 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
c1b0bfb4
MN
2697 {
2698 //Do horizontal scaling
2699 while(lastInLumBuf < lastLumSrcY)
d3f41512 2700 {
28bf81c9 2701 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4 2702 lumBufIndex++;
c7f822d9 2703// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
c1b0bfb4
MN
2704 ASSERT(lumBufIndex < 2*vLumBufSize)
2705 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2706 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2707// printf("%d %d\n", lumBufIndex, vLumBufSize);
28bf81c9
MN
2708 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2709 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2710 funnyYCode, c->srcFormat, formatConvBuffer,
2711 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2712 lastInLumBuf++;
2713 }
2714 while(lastInChrBuf < lastChrSrcY)
2715 {
e616aa93
MN
2716 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2717 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2718 chrBufIndex++;
2719 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2720 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2721 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
28bf81c9 2722 //FIXME replace parameters through context struct (some at least)
44c1035c
MN
2723
2724 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2725 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2726 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2727 funnyUVCode, c->srcFormat, formatConvBuffer,
2728 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4 2729 lastInChrBuf++;
d3f41512 2730 }
c1b0bfb4
MN
2731 //wrap buf index around to stay inside the ring buffer
2732 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2733 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
d3f41512 2734 }
c1b0bfb4 2735 else // not enough lines left in this slice -> load the rest in the buffer
2ff198c1 2736 {
c1b0bfb4
MN
2737/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2738 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2739 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
e616aa93
MN
2740 vChrBufSize, vLumBufSize);*/
2741
c1b0bfb4
MN
2742 //Do horizontal scaling
2743 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2744 {
28bf81c9 2745 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
c1b0bfb4
MN
2746 lumBufIndex++;
2747 ASSERT(lumBufIndex < 2*vLumBufSize)
2748 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2749 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
28bf81c9
MN
2750 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2751 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
b7dc6f66
MN
2752 funnyYCode, c->srcFormat, formatConvBuffer,
2753 c->lumMmx2Filter, c->lumMmx2FilterPos);
c1b0bfb4
MN
2754 lastInLumBuf++;
2755 }
e616aa93 2756 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
c1b0bfb4 2757 {
e616aa93
MN
2758 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2759 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
c1b0bfb4
MN
2760 chrBufIndex++;
2761 ASSERT(chrBufIndex < 2*vChrBufSize)
e616aa93
MN
2762 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2763 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
44c1035c
MN
2764
2765 if(!(isGray(srcFormat) || isGray(dstFormat)))
e616aa93 2766 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
28bf81c9 2767 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
b7dc6f66
MN
2768 funnyUVCode, c->srcFormat, formatConvBuffer,
2769 c->chrMmx2Filter, c->chrMmx2FilterPos);
c1b0bfb4
MN
2770 lastInChrBuf++;
2771 }
2772 //wrap buf index around to stay inside the ring buffer
2773 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2774 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2775 break; //we cant output a dstY line so lets try with the next slice
2ff198c1 2776 }
d3f41512 2777
c1b0bfb4
MN
2778#ifdef HAVE_MMX
2779 b5Dither= dither8[dstY&1];
2780 g6Dither= dither4[dstY&1];
2781 g5Dither= dither8[dstY&1];
2782 r5Dither= dither8[(dstY+1)&1];
2783#endif
28bf81c9 2784 if(dstY < dstH-2)
e3d2500f 2785 {
44c1035c 2786 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
0f25d72b 2787 {
44c1035c 2788 if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
c1b0bfb4 2789 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2ff198c1 2790 {
c1b0bfb4
MN
2791 int16_t *lumBuf = lumPixBuf[0];
2792 int16_t *chrBuf= chrPixBuf[0];
e616aa93 2793 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
c1b0bfb4
MN
2794 }
2795 else //General YV12
2796 {
2797 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2798 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2799 RENAME(yuv2yuvX)(
e616aa93
MN
2800 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2801 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2802 dest, uDest, vDest, dstW, chrDstW,
2803 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
2ff198c1 2804 }
0f25d72b 2805 }
c1b0bfb4 2806 else
2ff198c1 2807 {
c1b0bfb4
MN
2808 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2809 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
d3f41512 2810
c1b0bfb4
MN
2811 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2812 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2813 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2814 {
2815 int chrAlpha= vChrFilter[2*dstY+1];
2ff198c1 2816
c1b0bfb4 2817 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
28bf81c9 2818 dest, dstW, chrAlpha, dstFormat, flags);
c1b0bfb4
MN
2819 }
2820 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2821 {
2822 int lumAlpha= vLumFilter[2*dstY+1];
2823 int chrAlpha= vChrFilter[2*dstY+1];
2824
2825 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
28bf81c9 2826 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
c1b0bfb4
MN
2827 }
2828 else //General RGB
2829 {
2830 RENAME(yuv2rgbX)(
2831 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2832 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
28bf81c9 2833 dest, dstW, dstFormat,
c1b0bfb4
MN
2834 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2835 }
2836 }
e3d2500f
MN
2837 }
2838 else // hmm looks like we cant use MMX here without overwriting this arrays tail
2839 {
2840 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2841 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
6c7506de 2842 if(isPlanarYUV(dstFormat)) //YV12
e3d2500f 2843 {
e616aa93 2844 if(dstY&1) uDest=vDest= NULL;
5859233b 2845 yuv2yuvXinC(
e616aa93
MN
2846 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2847 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
5859233b 2848 dest, uDest, vDest, dstW, chrDstW);
e3d2500f
MN
2849 }
2850 else
2851 {
2852 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2853 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2854 yuv2rgbXinC(
2855 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2856 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
28bf81c9 2857 dest, dstW, dstFormat);
e3d2500f
MN
2858 }
2859 }
c1b0bfb4 2860 }
17f715fa
MN
2861
2862#ifdef HAVE_MMX
2863 __asm __volatile(SFENCE:::"memory");
1faf0867 2864 __asm __volatile(EMMS:::"memory");
17f715fa 2865#endif
28bf81c9
MN
2866 /* store changed local vars back in the context */
2867 c->dstY= dstY;
2868 c->lumBufIndex= lumBufIndex;
2869 c->chrBufIndex= chrBufIndex;
2870 c->lastInLumBuf= lastInLumBuf;
2871 c->lastInChrBuf= lastInChrBuf;
627690b5 2872}