// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif

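/* These wrappers let one asm body serve several CPU flavours. For example
   MOVNTQ(%%mm3, (%3, %%eax)) expands to the non-temporal store
   "movntq %%mm3, (%3, %%eax)" on MMX2 (bypassing the cache, which suits
   write-once output buffers) and to a plain "movq" elsewhere. */
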
#define YSCALEYUV2YV12X(x) \
    "xorl %%eax, %%eax              \n\t"\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "movl %0, %%edx                 \n\t"\
    ".balign 16                     \n\t" /* FIXME Unroll? */\
    "1:                             \n\t"\
    "movl (%1, %%edx, 4), %%esi     \n\t"\
    "movq (%2, %%edx, 8), %%mm0     \n\t" /* filterCoeff */\
    "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
    "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm3             \n\t"\
    "paddw %%mm5, %%mm4             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 1b                        \n\t"\
    "psraw $3, %%mm3                \n\t"\
    "psraw $3, %%mm4                \n\t"\
    "packuswb %%mm4, %%mm3          \n\t"\
    MOVNTQ(%%mm3, (%3, %%eax))\
    "addl $8, %%eax                 \n\t"\
    "cmpl %4, %%eax                 \n\t"\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "movl %0, %%edx                 \n\t"\
    "jb 1b                          \n\t"

#define YSCALEYUV2YV121 \
    "movl %2, %%eax                 \n\t"\
    ".balign 16                     \n\t" /* FIXME Unroll? */\
    "1:                             \n\t"\
    "movq (%0, %%eax, 2), %%mm0     \n\t"\
    "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
    "psraw $7, %%mm0                \n\t"\
    "psraw $7, %%mm1                \n\t"\
    "packuswb %%mm1, %%mm0          \n\t"\
    MOVNTQ(%%mm0, (%1, %%eax))\
    "addl $8, %%eax                 \n\t"\
    "jnc 1b                         \n\t"

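/* YSCALEYUV2YV121 is the unfiltered 1:1 case: the horizontally scaled input
   carries 7 fractional bits, so it is just dest[i]= clip(src[i]>>7, 0, 255)
   (psraw $7 + packuswb), 8 pixels per iteration. */
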
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
    "movl %3, %%ebx                 \n\t" /* chrMmxFilter+chrFilterSize */\
    "movl %7, %%ecx                 \n\t" /* chrSrc+chrFilterSize */\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "2:                             \n\t"\
    "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
    "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
    "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
    "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm3             \n\t"\
    "paddw %%mm5, %%mm4             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 2b                        \n\t"\
\
    "movl %0, %%edx                 \n\t" /* -lumFilterSize */\
    "movl %2, %%ebx                 \n\t" /* lumMmxFilter+lumFilterSize */\
    "movl %6, %%ecx                 \n\t" /* lumSrc+lumFilterSize */\
    "pxor %%mm1, %%mm1              \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "2:                             \n\t"\
    "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
    "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
    "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
    "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm1             \n\t"\
    "paddw %%mm5, %%mm7             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 2b                        \n\t"\
\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

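/* YSCALEYUV2RGBX runs the same multiply-accumulate twice per 8-pixel group,
   first over the chroma sources (U into mm3, V into mm4; the V plane sits
   4096 bytes after U), then over the luma sources (mm1/mm7), and finally
   applies the fixed-point conversion
       B = yCoeff*(Y-16) + ubCoeff*(U-128)
       G = yCoeff*(Y-16) + ugCoeff*(U-128) + vgCoeff*(V-128)
       R = yCoeff*(Y-16) + vrCoeff*(V-128)
   via pmulhw against the w80/w400 bias constants. */
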
#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %6, %%mm6                 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
    "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w400, %%mm3              \n\t" /* 8(U-128)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
\
\
    "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "pmulhw ubCoeff, %%mm3          \n\t"\
    "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
    "pmulhw ugCoeff, %%mm2          \n\t"\
    "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm0              \n\t" /* (V-128)8*/\
\
\
    "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
    "pmulhw vrCoeff, %%mm0          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    "paddw %%mm1, %%mm3             \n\t" /* B*/\
    "paddw %%mm1, %%mm0             \n\t" /* R*/\
    "packuswb %%mm3, %%mm3          \n\t"\
\
    "packuswb %%mm0, %%mm0          \n\t"\
    "paddw %%mm4, %%mm2             \n\t"\
    "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
    "packuswb %%mm1, %%mm1          \n\t"

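/* FULL_YSCALEYUV2RGB keeps chroma at full horizontal resolution (U and V are
   indexed with %%eax*2 exactly like luma, one sample per output pixel); this
   is the fullUVIpol path used in yuv2rgb2 below. */
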
#define YSCALEYUV2RGB \
    "movd %6, %%mm6                 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "movq %%mm6, asm_yalpha1        \n\t"\
    "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "movq %%mm5, asm_uvalpha1       \n\t"\
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq asm_uvalpha1, %%mm0       \n\t"\
    "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax+4]*/\
    "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax+4]*/\
    "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6             \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw asm_yalpha1, %%mm0      \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw asm_yalpha1, %%mm6      \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7             \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

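/* The 2-tap vertical blend above is computed as buf1 + ((buf0-buf1)*alpha>>16)
   instead of buf0*alpha + buf1*(1-alpha), saving one multiply per pixel; the
   C fallback further down spells out the equivalent
   (buf0[i]*yalpha1 + buf1[i]*yalpha)>>19. */
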
#define YSCALEYUV2RGB1 \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6        	\n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

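/* YSCALEYUV2RGB1 is the single-line variant used when no vertical blending is
   wanted: luma and chroma come from buf0/uvbuf0 only (chroma is therefore
   shifted by 0.5 pixels, see the note in yuv2rgb1); YSCALEYUV2RGB1b below
   averages uvbuf0 and uvbuf1 instead. */
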
// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

#define WRITEBGR32 \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (%4, %%eax, 4))\
    MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
    MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
    MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR16 \
    "pand bF8, %%mm2                \n\t" /* B */\
    "pand bFC, %%mm4                \n\t" /* G */\
    "pand bF8, %%mm5                \n\t" /* R */\
    "psrlq $3, %%mm2                \n\t"\
\
    "movq %%mm2, %%mm1              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
\
    "punpcklbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm5, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm5, %%mm1         \n\t"\
\
    "psllq $3, %%mm3                \n\t"\
    "psllq $3, %%mm4                \n\t"\
\
    "por %%mm3, %%mm2               \n\t"\
    "por %%mm4, %%mm1               \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

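/* WRITEBGR16 emits RGB565: the bF8/bFC masks keep the 5/6/5 significant bits
   of B, G and R, so each 16-bit pixel ends up as
       ((R>>3)<<11) | ((G>>2)<<5) | (B>>3). */
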
#define WRITEBGR15 \
    "pand bF8, %%mm2                \n\t" /* B */\
    "pand bF8, %%mm4                \n\t" /* G */\
    "pand bF8, %%mm5                \n\t" /* R */\
    "psrlq $3, %%mm2                \n\t"\
    "psrlq $1, %%mm5                \n\t"\
\
    "movq %%mm2, %%mm1              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
\
    "punpcklbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm5, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm5, %%mm1         \n\t"\
\
    "psllq $2, %%mm3                \n\t"\
    "psllq $2, %%mm4                \n\t"\
\
    "por %%mm3, %%mm2               \n\t"\
    "por %%mm4, %%mm1               \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

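/* WRITEBGR15 packs RGB555 the same way, all channels masked to 5 bits:
       ((R>>3)<<10) | ((G>>3)<<5) | (B>>3). */
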
#define WRITEBGR24OLD \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
    "pand bm00000111, %%mm4         \n\t" /* 00000RGB 0 */\
    "pand bm11111000, %%mm0         \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
    "pand bm00001111, %%mm2         \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
    "pand bm00000111, %%mm4         \n\t" /* 00000RGB 2 */\
    "pand bm11111000, %%mm1         \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
    "pand bm00000111, %%mm5         \n\t" /* 00000RGB 3 */\
    "pand bm11111000, %%mm3         \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (%%ebx))\
    MOVNTQ(%%mm2, 8(%%ebx))\
    MOVNTQ(%%mm3, 16(%%ebx))\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR24MMX \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (%%ebx))\
\
    "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(%%ebx))\
\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR24MMX2 \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq M24A, %%mm0               \n\t"\
    "movq M24C, %%mm7               \n\t"\
    "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1              \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3              \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6              \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3                \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6               \n\t"\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, (%%ebx))\
\
    "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand M24B, %%mm1               \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3              \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6              \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3               \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1              \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3              \n\t" /* G7 G6 G5 */\
    "pand M24B, %%mm6               \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3               \n\t"\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, 16(%%ebx))\
\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif

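/* Three 24-bit writers: WRITEBGR24OLD is the original shift/mask version,
   WRITEBGR24MMX packs the four 0RGBRGB0 qwords into three output qwords with
   plain MMX, and WRITEBGR24MMX2 builds each output qword directly with
   pshufw, which only exists on MMX2. */
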
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV12X(0)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );

        asm volatile(
            YSCALEYUV2YV12X(4096)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );
    }

    asm volatile(
        YSCALEYUV2YV12X(0)
        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
        : "%eax", "%edx", "%esi"
    );
#else
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW);
#endif
}

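/* Operand trick used with YSCALEYUV2YV12X above: the filter size is passed
   negated and the src/filter pointers are pre-advanced past their ends, so
   the inner loop walks a negative index up to zero and "addl $1 / jnz"
   replaces an explicit compare per tap. */
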
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
           "g" (-dstW)
        : "%eax"
    );
#else
    //FIXME Optimize (it was just quickly written, not optimized)
    //FIXME replace MINMAX with LUTs
    int i;
    for(i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7; // input has 7 fractional bits, like the MMX psraw $7 path

        dest[i]= MIN(MAX(val, 0), 255);
    }

    if(uDest != NULL)
        for(i=0; i<(dstW>>1); i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            uDest[i]= MIN(MAX(u, 0), 255);
            vDest[i]= MIN(MAX(v, 0), 255);
        }
#endif
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
    if(fullUVIpol)
    {
//FIXME
    }//FULL_UV_IPOL
    else
    {
#ifdef HAVE_MMX
        if(dstbpp == 32) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                WRITEBGR32

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==24) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize
                "addl %4, %%ebx                 \n\t"
                WRITEBGR24

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR15

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
#else
        yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    dest, dstW, dstbpp);

#endif
    } //!FULL_UV_IPOL
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;

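    /* yalpha/uvalpha are 12-bit blend weights (0..4095); XOR with 4095 gives
       the complementary weight 4095-alpha without a subtract. */
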
    if(fullUVIpol)
    {

#ifdef HAVE_MMX
        if(dstbpp == 32)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%eax, 4))
                MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
                "psrlq $8, %%mm3                \n\t" // GR0BGR00
                "pand bm00000111, %%mm2         \n\t" // BGR00000
                "pand bm11111000, %%mm3         \n\t" // 000BGR00
                "por %%mm2, %%mm3               \n\t" // BGRBGR00
                "movq %%mm1, %%mm2              \n\t"
                "psllq $48, %%mm1               \n\t" // 000000BG
                "por %%mm1, %%mm3               \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
                "psrld $16, %%mm2               \n\t" // R000R000
                "psrlq $24, %%mm1               \n\t" // 0BGR0000
                "por %%mm2, %%mm1               \n\t" // RBGRR000

                "movl %4, %%ebx                 \n\t"
                "addl %%eax, %%ebx              \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
                "psrlq $32, %%mm3               \n\t"
                "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
                "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb g5Dither, %%mm1        \n\t"
                "paddusb r5Dither, %%mm0        \n\t"
                "paddusb b5Dither, %%mm3        \n\t"
#endif
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $2, %%mm1                \n\t"
                "psllw $7, %%mm0                \n\t"
                "pand g15Mask, %%mm1            \n\t"
                "pand r15Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"

                MOVNTQ(%%mm0, (%4, %%eax, 2))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb g6Dither, %%mm1        \n\t"
                "paddusb r5Dither, %%mm0        \n\t"
                "paddusb b5Dither, %%mm3        \n\t"
#endif
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $3, %%mm1                \n\t"
                "psllw $8, %%mm0                \n\t"
                "pand g16Mask, %%mm1            \n\t"
                "pand r16Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"

                MOVNTQ(%%mm0, (%4, %%eax, 2))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
#else
        /* The yuvtab_* lookups hold the conversion coefficients premultiplied
           by the channel value; the summed terms are >>13 and clipped through
           clip_table (or the per-channel clip_table1x tables for 15/16 bpp). */
        if(dstbpp==32 || dstbpp==24)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+=dstbpp>>3;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table16r[(Y + yuvtab_3343[V]) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table15r[(Y + yuvtab_3343[V]) >>13];
            }
        }
#endif
    }//FULL_UV_IPOL
    else
    {
1031 if(dstbpp == 32)
1032 {
1033 asm volatile(
1034 YSCALEYUV2RGB
1035 WRITEBGR32
1036
d1fac6cf 1037 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1038 "m" (yalpha1), "m" (uvalpha1)
1039 : "%eax"
1040 );
1041 }
1042 else if(dstbpp==24)
1043 {
1044 asm volatile(
bdc2eb9a 1045 "movl %4, %%ebx \n\t"
d604bab9
MN
1046 YSCALEYUV2RGB
1047 WRITEBGR24
1048
d1fac6cf 1049 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
d604bab9
MN
1050 "m" (yalpha1), "m" (uvalpha1)
1051 : "%eax", "%ebx"
1052 );
1053 }
1054 else if(dstbpp==15)
1055 {
1056 asm volatile(
1057 YSCALEYUV2RGB
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1059#ifdef DITHER1XBPP
d8fa3c54
MN
1060 "paddusb b5Dither, %%mm2 \n\t"
1061 "paddusb g5Dither, %%mm4 \n\t"
1062 "paddusb r5Dither, %%mm5 \n\t"
d604bab9
MN
1063#endif
1064
1065 WRITEBGR15
1066
d1fac6cf 1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1068 "m" (yalpha1), "m" (uvalpha1)
1069 : "%eax"
1070 );
1071 }
1072 else if(dstbpp==16)
1073 {
1074 asm volatile(
1075 YSCALEYUV2RGB
1076 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1077#ifdef DITHER1XBPP
d8fa3c54
MN
1078 "paddusb b5Dither, %%mm2 \n\t"
1079 "paddusb g6Dither, %%mm4 \n\t"
1080 "paddusb r5Dither, %%mm5 \n\t"
d604bab9
MN
1081#endif
1082
1083 WRITEBGR16
1084
d1fac6cf 1085 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
d604bab9
MN
1086 "m" (yalpha1), "m" (uvalpha1)
1087 : "%eax"
1088 );
1089 }
#else
        if(dstbpp==32)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
            }
        }
        else if(dstbpp==24)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[0]=clip_table[((Y1 + Cb) >>13)];
                dest[1]=clip_table[((Y1 + Cg) >>13)];
                dest[2]=clip_table[((Y1 + Cr) >>13)];

                dest[3]=clip_table[((Y2 + Cb) >>13)];
                dest[4]=clip_table[((Y2 + Cg) >>13)];
                dest[5]=clip_table[((Y2 + Cr) >>13)];
                dest+=6;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y1 + Cb) >>13] |
                    clip_table16g[(Y1 + Cg) >>13] |
                    clip_table16r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table16b[(Y2 + Cb) >>13] |
                    clip_table16g[(Y2 + Cg) >>13] |
                    clip_table16r[(Y2 + Cr) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y1 + Cb) >>13] |
                    clip_table15g[(Y1 + Cg) >>13] |
                    clip_table15r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table15b[(Y2 + Cb) >>13] |
                    clip_table15g[(Y2 + Cg) >>13] |
                    clip_table15r[(Y2 + Cr) >>13];
            }
        }
#endif
    } //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int uvalpha, int dstbpp)
{
    int uvalpha1=uvalpha^4095;
    const int yalpha1=0;

    if(fullUVIpol || allwaysIpol)
    {
        RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
        return;
    }

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx                 \n\t"
                YSCALEYUV2RGB1
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
    else
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx                 \n\t"
                YSCALEYUV2RGB1b
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
#else
//FIXME write 2 versions (for even & odd lines)

    if(dstbpp==32)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // chroma-only vertical interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

            dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    else if(dstbpp==24)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // chroma-only vertical interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];

            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstbpp==16)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // chroma-only vertical interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table16b[(Y1 + Cb) >>13] |
                clip_table16g[(Y1 + Cg) >>13] |
                clip_table16r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table16b[(Y2 + Cb) >>13] |
                clip_table16g[(Y2 + Cg) >>13] |
                clip_table16r[(Y2 + Cr) >>13];
        }
    }
    else if(dstbpp==15)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // chroma-only vertical interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table15b[(Y1 + Cb) >>13] |
                clip_table15g[(Y1 + Cg) >>13] |
                clip_table15r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table15b[(Y2 + Cb) >>13] |
                clip_table15g[(Y2 + Cg) >>13] |
                clip_table15r[(Y2 + Cr) >>13];
        }
    }
#endif
}

// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
    if(filterSize==4) // always true for upscaling, sometimes for downscaling too
    {
        int counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
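        /* counter runs from -2*dstW up to 0: dst, filter and filterPos were
           biased by it above so one register indexes all three arrays, and
           the loop ends on the carry of "addl $4" (jnc). Each pmaddwd below
           multiplies two adjacent 16-bit taps and sums the pair. */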
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 4), %%mm1     \n\t"
            "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"

            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else if(filterSize==8)
    {
        int counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 8), %%mm1     \n\t"
            "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"

            "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
            "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
            "movd 4(%3, %%eax), %%mm4       \n\t"
            "movd 4(%3, %%ebx), %%mm2       \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"

            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"

            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else
    {
        int counter= -2*dstW;
//      filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movl %2, %%ecx                 \n\t"
            "movzwl (%%ecx, %0), %%eax      \n\t"
            "movzwl 2(%%ecx, %0), %%ebx     \n\t"
            "movl %5, %%ecx                 \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%ecx, %%eax), %%mm0     \n\t"
            "movd (%%ecx, %%ebx), %%mm2     \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "addl $8, %1                    \n\t"
            "addl $4, %%ecx                 \n\t"
            "cmpl %4, %%ecx                 \n\t"
            " jb 2b                         \n\t"
            "addl %6, %1                    \n\t"
            "psrad $8, %%mm4                \n\t"
            "psrad $8, %%mm5                \n\t"
            "packssdw %%mm5, %%mm4          \n\t"
            "pmaddwd %%mm6, %%mm4           \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "movl %3, %%eax                 \n\t"
            "movd %%mm4, (%%eax, %0)        \n\t"
            "addl $4, %0                    \n\t"
            " jnc 1b                        \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(src+filterSize),
              "m" (src), "r" (filterSize*2)
            : "%ebx", "%eax", "%ecx"
        );
    }
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
//      printf("filterPos: %d\n", filterPos[i]);
        for(j=0; j<filterSize; j++)
        {
//          printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
//      filter += hFilterSize;
        dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//      dst[i] = val>>7;
    }
#endif
}
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
{
#ifdef HAVE_MMX
    // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
#else
    if(sws_flags != SWS_FAST_BILINEAR)
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
            asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "movq %%mm6, %%mm2              \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t" //0,t,2t,3t   t=xInc&0xFFFF
                "movq %%mm2, temp0              \n\t"
                "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                "movl %0, %%esi                 \n\t" // src
                "movl %1, %%edi                 \n\t" // buf1
                "movl %3, %%edx                 \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx              \n\t"
                "xorl %%ebx, %%ebx              \n\t"
                "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                PREFETCH" 1024(%%esi)           \n\t"\
                PREFETCH" 1056(%%esi)           \n\t"\
                PREFETCH" 1088(%%esi)           \n\t"\
                "call funnyYCode                \n\t"\
                "movq temp0, %%mm2              \n\t"\
                "xorl %%ecx, %%ecx              \n\t"

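/* funnyYCode (and funnyUVCode below) are horizontal-scaler code blocks
   generated at runtime by the scaler's init code (outside this template);
   each call consumes a stretch of the prefetched source line, hence the
   eight back-to-back invocations. */
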
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                   "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
            );
            for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
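        /* Plain x86 bilinear DDA: %%ecx carries the 16-bit fractional source
           position (2*xalpha), %%ebx the integer index; "addw" steps the
           fraction and "adcl" folds its carry into the integer step, i.e. a
           16.16 fixed-point walk across the source line. */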
            asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
            );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif
    }
}

7630f2e0 1712inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
af91b8b3 1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc)
2ff198c1 1714{
#ifdef HAVE_MMX
	// use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
	if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
#else
	if(sws_flags != SWS_FAST_BILINEAR)
#endif
	{
		RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
		RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
	}
	else // Fast Bilinear upscale / crap downscale
	{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
		"pxor %%mm7, %%mm7		\n\t"
		"pxor %%mm2, %%mm2		\n\t" // 2*xalpha
		"movd %5, %%mm6			\n\t" // xInc&0xFFFF
		"punpcklwd %%mm6, %%mm6		\n\t"
		"punpcklwd %%mm6, %%mm6		\n\t"
		"movq %%mm6, %%mm2		\n\t"
		"psllq $16, %%mm2		\n\t"
		"paddw %%mm6, %%mm2		\n\t"
		"psllq $16, %%mm2		\n\t"
		"paddw %%mm6, %%mm2		\n\t"
		"psllq $16, %%mm2		\n\t" // 0,t,2t,3t	t=xInc&0xFFFF
		"movq %%mm2, temp0		\n\t"
		"movd %4, %%mm6			\n\t" //(xInc*4)&0xFFFF
		"punpcklwd %%mm6, %%mm6		\n\t"
		"punpcklwd %%mm6, %%mm6		\n\t"
		"xorl %%eax, %%eax		\n\t" // i
		"movl %0, %%esi			\n\t" // src
		"movl %1, %%edi			\n\t" // buf1
		"movl %3, %%edx			\n\t" // (xInc*4)>>16
		"xorl %%ecx, %%ecx		\n\t"
		"xorl %%ebx, %%ebx		\n\t"
		"movw %4, %%bx			\n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
	PREFETCH" 1024(%%esi)		\n\t"\
	PREFETCH" 1056(%%esi)		\n\t"\
	PREFETCH" 1088(%%esi)		\n\t"\
	"call funnyUVCode		\n\t"\
	"movq temp0, %%mm2		\n\t"\
	"xorl %%ecx, %%ecx		\n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

		"xorl %%eax, %%eax		\n\t" // i
		"movl %6, %%esi			\n\t" // src
		"movl %1, %%edi			\n\t" // buf1
		"addl $4096, %%edi		\n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

	:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
	   "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
	: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
	);
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
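		/* dst[i] and dst[i+2048] are the U and V rows: the V samples
		   live 2048 int16_t's (the 4096 byte offset used in the asm
		   above) after the U samples in the same buffer */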
	}
	else
	{
#endif
	asm volatile(
		"xorl %%eax, %%eax		\n\t" // i
		"xorl %%ebx, %%ebx		\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		".balign 16			\n\t"
1807 "1: \n\t"
1808 "movl %0, %%esi \n\t"
1809 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
1810 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
1811 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1812 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1813 "shll $16, %%edi \n\t"
1814 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1815 "movl %1, %%edi \n\t"
1816 "shrl $9, %%esi \n\t"
1817 "movw %%si, (%%edi, %%eax, 2) \n\t"
1818
1819 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
1820 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
1821 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1822 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1823 "shll $16, %%edi \n\t"
1824 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1825 "movl %1, %%edi \n\t"
1826 "shrl $9, %%esi \n\t"
1827 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1828
1829 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1830 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1831 "addl $1, %%eax \n\t"
1832 "cmpl %2, %%eax \n\t"
1833 " jb 1b \n\t"
1834
1835 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1836 "r" (src2)
1837 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1838 );
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
		dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
		dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
	}
}

static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
				      int srcW, int dstW, int filterAlign, int one)
{
	int i;
	double filter[8000];
#ifdef HAVE_MMX
	asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non-MMX versions)
#endif

	if(ABS(xInc - 0x10000) <10) // unscaled
	{
		int i;
		*filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normally
		for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;

		for(i=0; i<dstW; i++)
		{
			filter[i*(*filterSize)]=1;
			filterPos[i]=i;
		}

	}
	else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
	{
		int i;
		int xDstInSrc;
		if(sws_flags==SWS_BICUBIC) *filterSize= 4;
		else			   *filterSize= 2;
//		printf("%d %d %d\n", filterSize, srcW, dstW);
		*filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));

		xDstInSrc= xInc - 0x8000;
		for(i=0; i<dstW; i++)
		{
			int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
			int j;

			filterPos[i]= xx;
			if(sws_flags == SWS_BICUBIC)
			{
				double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
				double y1,y2,y3,y4;
				double A= -0.75;
				// Equation is from VirtualDub
				y1 = (    + A*d -       2.0*A*d*d +       A*d*d*d);
				y2 = (+ 1.0     -     (A+3.0)*d*d + (A+2.0)*d*d*d);
				y3 = (    - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
				y4 = (          +         A*d*d -       A*d*d*d);
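				/* y1..y4 are the cubic convolution kernel k(x)
				   (with A= -0.75) evaluated at the four tap
				   distances 1+d, d, 1-d and 2-d:
				     k(x)= (A+2)|x|^3 - (A+3)|x|^2 + 1     for |x|<=1
				     k(x)= A|x|^3 - 5A|x|^2 + 8A|x| - 4A   for 1<|x|<2 */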

//				printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
				filter[i*(*filterSize) + 0]= y1;
				filter[i*(*filterSize) + 1]= y2;
				filter[i*(*filterSize) + 2]= y3;
				filter[i*(*filterSize) + 3]= y4;
//				printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
			}
			else
			{
				for(j=0; j<*filterSize; j++)
				{
					double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
					double coeff= 1.0 - d;
					if(coeff<0) coeff=0;
//					printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
					filter[i*(*filterSize) + j]= coeff;
					xx++;
				}
			}
			xDstInSrc+= xInc;
		}
	}
	else // downscale
	{
		int xDstInSrc;
		if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
		else			   *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
//		printf("%d %d %d\n", *filterSize, srcW, dstW);
		*filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));

		xDstInSrc= xInc - 0x8000;
		for(i=0; i<dstW; i++)
		{
			int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5);
			int j;

			filterPos[i]= xx;
			for(j=0; j<*filterSize; j++)
			{
				double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
				double coeff;
				if(sws_flags == SWS_BICUBIC)
				{
					double A= -0.75;
//					d*=2;
					// Equation is from VirtualDub
					if(d<1.0)
						coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
					else if(d<2.0)
						coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
					else
						coeff=0.0;
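					/* same cubic kernel as the upscale path,
					   written piecewise in d; here d is the tap
					   distance normalized by xInc instead of
					   1<<16, so the kernel stretches to cover
					   the larger source footprint of a downscale */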
				}
				else
				{
					coeff= 1.0 - d;
					if(coeff<0) coeff=0;
				}
//				if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
				filter[i*(*filterSize) + j]= coeff;
				xx++;
			}
			xDstInSrc+= xInc;
		}
	}

	//fix borders
	for(i=0; i<dstW; i++)
	{
		int j;
		if(filterPos[i] < 0)
		{
			// Move filter coeffs left to compensate for filterPos
			for(j=1; j<*filterSize; j++)
			{
				int left= MAX(j + filterPos[i], 0);
				filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
				filter[i*(*filterSize) + j]=0;
			}
			filterPos[i]= 0;
		}

		if(filterPos[i] + (*filterSize) > srcW)
		{
			int shift= filterPos[i] + (*filterSize) - srcW;
			// Move filter coeffs right to compensate for filterPos
			for(j=(*filterSize)-2; j>=0; j--)
			{
				int right= MIN(j + shift, (*filterSize)-1);
				filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
				filter[i*(*filterSize) +j]=0;
			}
			filterPos[i]= srcW - (*filterSize);
		}
	}
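	/* after this loop every filterPos[i] lies in [0, srcW - *filterSize],
	   so the scaling loops can read *filterSize source pixels without
	   bounds checks; out-of-range taps were folded into the nearest valid
	   tap, which leaves each row's coefficient sum unchanged */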

	//FIXME try to align filterPos if possible / try to shift filterPos to put zeros at the end
	//      and skip those later

	//Normalize
	for(i=0; i<dstW; i++)
	{
		int j;
		double sum=0;
		double scale= one;
		for(j=0; j<*filterSize; j++)
		{
			sum+= filter[i*(*filterSize) + j];
		}
		scale/= sum;
		for(j=0; j<*filterSize; j++)
		{
			dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
		}
	}
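	/* each row of coefficients is rescaled so that it sums to the fixed
	   point constant "one"; the callers pass 1<<14 for the horizontal
	   filters and (1<<12)-4 for the vertical ones, keeping the summed
	   products within the 16 bit intermediates used by the filter loops */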
}

#ifdef HAVE_MMX2
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
{
	uint8_t *fragment;
	int imm8OfPShufW1;
	int imm8OfPShufW2;
	int fragmentLength;

	int xpos, i;

	// create an optimized horizontal scaling routine

	//code fragment

	asm volatile(
2042 "jmp 9f \n\t"
2043 // Begin
2044 "0: \n\t"
2045 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
2046 "movq %%mm0, %%mm1 \n\t"
2047 "psrlq $8, %%mm0 \n\t"
2048 "punpcklbw %%mm7, %%mm1 \n\t"
2049 "movq %%mm2, %%mm3 \n\t"
2050 "punpcklbw %%mm7, %%mm0 \n\t"
2051 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
2052 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
2053 "1: \n\t"
2054 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
2055 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
2056 "2: \n\t"
2057 "psrlw $9, %%mm3 \n\t"
2058 "psubw %%mm1, %%mm0 \n\t"
2059 "pmullw %%mm3, %%mm0 \n\t"
2060 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
2061 "psllw $7, %%mm1 \n\t"
2062 "paddw %%mm1, %%mm0 \n\t"
2063
2064 "movq %%mm0, (%%edi, %%eax) \n\t"
2065
2066 "addl $8, %%eax \n\t"
2067 // End
2068 "9: \n\t"
2069// "int $3\n\t"
2070 "leal 0b, %0 \n\t"
2071 "leal 1b, %1 \n\t"
2072 "leal 2b, %2 \n\t"
2073 "decl %1 \n\t"
2074 "decl %2 \n\t"
2075 "subl %0, %1 \n\t"
2076 "subl %0, %2 \n\t"
2077 "leal 9b, %3 \n\t"
2078 "subl %0, %3 \n\t"
2079 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
2080 "=r" (fragmentLength)
2081 );
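	/* Nothing is scaled here: the jmp skips the fragment body, and the
	   leal/subl arithmetic on the local labels returns the fragment's
	   start address, its total length, and the offsets of the two pshufw
	   immediate bytes (each decl steps back from the label following the
	   instruction to its trailing imm8). The fragment is copied and
	   patched into funnyCode below, building the scaler at runtime. */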

	xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers

	for(i=0; i<dstW/8; i++)
	{
		int xx=xpos>>16;

		if((i&3) == 0)
		{
			int a=0;
			int b=((xpos+xInc)>>16) - xx;
			int c=((xpos+xInc*2)>>16) - xx;
			int d=((xpos+xInc*3)>>16) - xx;

			memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);

			funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
			funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
				a | (b<<2) | (c<<4) | (d<<6);

			// if we don't need to read 8 bytes then don't :), reduces the chance of
			// crossing a cache line
			if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;

			funnyCode[fragmentLength*(i+4)/4]= RET;
		}
		xpos+=xInc;
	}
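	/* each patched imm8 a|(b<<2)|(c<<4)|(d<<6) tells the two pshufw's
	   which of the four loaded source pixels feeds each of the four
	   output pixels of that fragment; writing 0x6E over byte 1 of the
	   leading load turns the movq (opcode 0F 6F) into a movd (0F 6E),
	   reading only 4 source bytes when the taps stay below index 3 */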
/*
	xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples
	for(i=0; i<dstUVw/8; i++)
	{
		int xx=xpos>>16;

		if((i&3) == 0)
		{
			int a=0;
			int b=((xpos+chrXInc)>>16) - xx;
			int c=((xpos+chrXInc*2)>>16) - xx;
			int d=((xpos+chrXInc*3)>>16) - xx;

			memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

			funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
			funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
				a | (b<<2) | (c<<4) | (d<<6);

			// if we don't need to read 8 bytes then don't :), reduces the chance of
			// crossing a cache line
			if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

			funnyUVCode[fragmentLength*(i+4)/4]= RET;
		}
		xpos+=chrXInc;
	}
*/
//	funnyCode[0]= RET;
}
#endif // HAVE_MMX2

static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[], int stride[], int srcSliceY,
				      int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
				      int srcW, int srcH, int dstW, int dstH){


unsigned int lumXInc= (srcW << 16) / dstW;
unsigned int lumYInc= (srcH << 16) / dstH;
unsigned int chrXInc;
unsigned int chrYInc;

static int dstY;

// used to detect a size change
static int oldDstW= -1;
static int oldSrcW= -1;
static int oldDstH= -1;
static int oldSrcH= -1;
static int oldFlags=-1;

static int lastInLumBuf;
static int lastInChrBuf;

int chrDstW, chrDstH;

static int lumBufIndex=0;
static int chrBufIndex=0;

static int firstTime=1;

const int widthAlign= dstbpp==12 ? 16 : 8;
const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
			   : (((dstW+7)&(~7)))*bytespp - dststride;
if(dststride%widthAlign != 0)
{
	if(firstTime)
		fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
				"SwScaler: ->cannot do aligned memory accesses anymore\n",
				widthAlign);
}

if(over>0)
{
	if(firstTime)
		fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
				"SwScaler: and dststride is not large enough to handle %d extra bytes\n"
				"SwScaler: ->using unoptimized C version for last line(s)\n",
				over);
}
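/* "over" measures how many bytes the optimized writers (which round the row
   width up to a multiple of 8/16 pixels) would write past dststride; when it
   is positive, the last line(s) are rendered with the plain C output path
   (see the over<=0 check in the main loop below). */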

//printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
//printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);

#ifdef HAVE_MMX2
canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
{
	if(firstTime)
		fprintf(stderr, "SwScaler: output width is not a multiple of 32 -> no MMX2 scaler\n");
}
#else
canMMX2BeUsed=0; // should be 0 anyway but ...
#endif

if(firstTime)
{
#if defined (DITHER1XBPP) && defined (HAVE_MMX)
	char *dither= " dithered";
#else
	char *dither= "";
#endif
	if(sws_flags==SWS_FAST_BILINEAR)
		fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
	else if(sws_flags==SWS_BILINEAR)
		fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
	else if(sws_flags==SWS_BICUBIC)
		fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
	else
		fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");

	if(dstbpp==15)
		fprintf(stderr, "with%s BGR15 output ", dither);
	else if(dstbpp==16)
		fprintf(stderr, "with%s BGR16 output ", dither);
	else if(dstbpp==24)
		fprintf(stderr, "with BGR24 output ");
	else if(dstbpp==32)
		fprintf(stderr, "with BGR32 output ");
	else if(dstbpp==12)
		fprintf(stderr, "with YV12 output ");
	else
		fprintf(stderr, "without output ");

#ifdef HAVE_MMX2
	fprintf(stderr, "using MMX2\n");
#elif defined (HAVE_3DNOW)
	fprintf(stderr, "using 3DNOW\n");
#elif defined (HAVE_MMX)
	fprintf(stderr, "using MMX\n");
#elif defined (ARCH_X86)
	fprintf(stderr, "using X86 ASM\n");
#else
	fprintf(stderr, "using C\n");
#endif
}


// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// this is not perfect, but no one should notice the difference; the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(sws_flags==SWS_FAST_BILINEAR)
{
	if(canMMX2BeUsed) lumXInc+= 20;
#ifndef HAVE_MMX //we don't use the x86 asm scaler if MMX is available
	else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
#endif
}

if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
else				chrXInc= lumXInc,    chrDstW= (dstW+1)>>1;

if(dstbpp==12)	chrYInc= lumYInc,    chrDstH= (dstH+1)>>1;
else		chrYInc= lumYInc>>1, chrDstH= dstH;
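/* Horizontally, the YV12 source has half-width chroma planes, so stepping
   them with chrXInc==lumXInc yields half-width chroma output (chrDstW);
   fullUVIpol instead halves chrXInc to interpolate chroma up to full width
   for RGB output. Vertically the roles flip: YV12 output keeps half-height
   chroma, while RGB needs one chroma line per output line, hence the halved
   chrYInc. */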

	// force calculation of the horizontal interpolation of the first line

	if(srcSliceY ==0){
//		printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
		lumBufIndex=0;
		chrBufIndex=0;
		dstY=0;

		//precalculate horizontal scaler filter coefficients
		if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
		{
#ifdef HAVE_MMX
			const int filterAlign=4;
#else
			const int filterAlign=1;
#endif
			oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;

			RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
					   srcW      , dstW   , filterAlign, 1<<14);
			RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
					   (srcW+1)>>1, chrDstW, filterAlign, 1<<14);

#ifdef HAVE_MMX2
// can't downscale !!!
			if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
			{
				initMMX2HScaler(dstW   , lumXInc, funnyYCode);
				initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
			}
#endif
		} // Init Horizontal stuff

c1b0bfb4
MN
2303 if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
2304 {
2305 int i;
2306 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
d3f41512 2307
c1b0bfb4
MN
2308 // deallocate pixbufs
2309 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
d3f41512 2311
c1b0bfb4
MN
2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2313 srcH , dstH, 1, (1<<12)-4);
2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
e3d2500f 2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
d3f41512 2316
			// Calculate Buffer Sizes so that they won't run out while handling these damn slices
			vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
			for(i=0; i<dstH; i++)
			{
				int chrI= i*chrDstH / dstH;
				int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1,
						   ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
				nextSlice&= ~1; // Slices start at even boundaries
				if(vLumFilterPos[i ] + vLumBufSize < nextSlice)
					vLumBufSize= nextSlice - vLumFilterPos[i ];
				if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
					vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
			}
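			/* vLumBufSize/vChrBufSize become the worst-case number
			   of input lines that must stay buffered at once: a
			   slice may deliver lines up to the next even slice
			   boundary before the current output line's vertical
			   filter window is complete. */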

			// allocate pixbufs (we use dynamic allocation because otherwise we would need to
			// allocate several megabytes to handle all possible cases)
			for(i=0; i<vLumBufSize; i++)
				lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
			for(i=0; i<vChrBufSize; i++)
				chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);

			//try to avoid drawing green stuff between the right end and the stride end
			for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
			for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);

			ASSERT(chrDstH<=dstH)
			ASSERT(vLumFilterSize*dstH*4<16000)
			ASSERT(vChrFilterSize*chrDstH*4<16000)
#ifdef HAVE_MMX
			// pack filter data for mmx code
			for(i=0; i<vLumFilterSize*dstH; i++)
				lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
					vLumFilter[i];
			for(i=0; i<vChrFilterSize*chrDstH; i++)
				chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
					vChrFilter[i];
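			/* each coefficient is replicated into 4 consecutive
			   words, so the MMX code can load one filter tap with a
			   single movq and pmulhw it against 4 pixels at once */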
#endif
		}
		if(firstTime && verbose)
		{
#ifdef HAVE_MMX2
			int mmx2=1;
#else
			int mmx2=0;
#endif
#ifdef HAVE_MMX
			int mmx=1;
#else
			int mmx=0;
#endif

#ifdef HAVE_MMX
			if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
				printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
			else
			{
				if(hLumFilterSize==4)
					printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
				else if(hLumFilterSize==8)
					printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
				else
					printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");

				if(hChrFilterSize==4)
					printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
				else if(hChrFilterSize==8)
					printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
				else
					printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
			}
#elif defined (ARCH_X86)
			printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
#else
			if(sws_flags==SWS_FAST_BILINEAR)
				printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
			else
				printf("SwScaler: using C scaler for horizontal scaling\n");
#endif

			if(dstbpp==12)
			{
				if(vLumFilterSize==1)
					printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
				else
					printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
			}
			else
			{
				if(vLumFilterSize==1 && vChrFilterSize==2)
					printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
					       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
				else if(vLumFilterSize==2 && vChrFilterSize==2)
					printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
				else
					printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
			}

			if(dstbpp==24)
				printf("SwScaler: using %s YV12->BGR24 Converter\n",
				       mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
			else
				printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);

			printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
		}

		lastInLumBuf= -1;
		lastInChrBuf= -1;
	} // if(firstLine)

	for(;dstY < dstH; dstY++){
		unsigned char *dest =dstptr[0]+dststride*dstY;
		unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
		unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
		const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;

		const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
		const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
		const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
		const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

		if(sws_flags == SWS_FAST_BILINEAR)
		{
			//handle holes
			if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
			if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
		}
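		/* "holes": fast bilinear uses only 2 input lines per output
		   line, so whole input lines can be skipped; jump the
		   last-buffered markers forward rather than scaling lines no
		   output depends on */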

		ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
		ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

		// Do we have enough lines in this slice to output the dstY line
		if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
		{
			//Do horizontal scaling
			while(lastInLumBuf < lastLumSrcY)
			{
				uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
				lumBufIndex++;
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//				printf("%d %d\n", lumBufIndex, vLumBufSize);
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
				lastInLumBuf++;
			}
			while(lastInChrBuf < lastChrSrcY)
			{
				uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
				uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
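			/* the pixbuf pointer arrays are allocated double length
			   with lumPixBuf[i]==lumPixBuf[i+vLumBufSize], so the
			   vertical filter can address vFilterSize consecutive
			   entries from any ring position without a second wrap
			   check */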
		}
		else // not enough lines left in this slice -> load the rest into the buffer
		{
/*		printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
			firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
			lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
			vChrBufSize, vLumBufSize);
*/
			//Do horizontal scaling
			while(lastInLumBuf+1 < srcSliceY + srcSliceH)
			{
				uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
				lumBufIndex++;
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
				lastInLumBuf++;
			}
			while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
			{
				uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
				uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
				ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
			break; //we can't output a dstY line, so let's try with the next slice
		}

#ifdef HAVE_MMX
		b5Dither= dither8[dstY&1];
		g6Dither= dither4[dstY&1];
		g5Dither= dither8[dstY&1];
		r5Dither= dither8[(dstY+1)&1];
#endif
		if(dstY < dstH-2 || over<=0)
		{
			if(dstbpp==12) //YV12
			{
				if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
				if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
				{
					int16_t *lumBuf = lumPixBuf[0];
					int16_t *chrBuf= chrPixBuf[0];
					RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
				}
				else //General YV12
				{
					int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
					int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
					RENAME(yuv2yuvX)(
						vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
						vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, uDest, vDest, dstW,
						lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
				}
			}
			else
			{
				int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
				int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
				{
					int chrAlpha= vChrFilter[2*dstY+1];

					RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
							 dest, dstW, chrAlpha, dstbpp);
				}
				else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
				{
					int lumAlpha= vLumFilter[2*dstY+1];
					int chrAlpha= vChrFilter[2*dstY+1];

					RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
							 dest, dstW, lumAlpha, chrAlpha, dstbpp);
				}
				else //General RGB
				{
					RENAME(yuv2rgbX)(
						vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
						vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, dstW, dstbpp,
						lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
				}
			}
		}
		else // hmm, looks like we can't use MMX here without overwriting this array's tail
		{
			int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
			int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
			if(dstbpp==12) //YV12
			{
				if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
				yuv2yuvXinC(
					vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
					vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, vDest, dstW);
			}
			else
			{
				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				yuv2rgbXinC(
					vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
					vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, dstW, dstbpp);
			}
		}
	}

#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	firstTime=0;
}