
// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL

#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
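
/* Note: movntq is a non-temporal store; it bypasses the cache, which is a
   win for write-only destination buffers, but its stores are only guaranteed
   visible after an sfence (hence the SFENCE macro above). Without MMX2 we
   fall back to a plain movq. */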


#define YSCALEYUV2YV12X(x) \
    "xorl %%eax, %%eax              \n\t"\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "movl %0, %%edx                 \n\t"\
    ".balign 16                     \n\t" /* FIXME Unroll? */\
    "1:                             \n\t"\
    "movl (%1, %%edx, 4), %%esi     \n\t"\
    "movq (%2, %%edx, 8), %%mm0     \n\t" /* filterCoeff */\
    "movq " #x "(%%esi, %%eax, 2), %%mm2   \n\t" /* srcData */\
    "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm3             \n\t"\
    "paddw %%mm5, %%mm4             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 1b                        \n\t"\
    "psraw $3, %%mm3                \n\t"\
    "psraw $3, %%mm4                \n\t"\
    "packuswb %%mm4, %%mm3          \n\t"\
    MOVNTQ(%%mm3, (%3, %%eax))\
    "addl $8, %%eax                 \n\t"\
    "cmpl %4, %%eax                 \n\t"\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "movl %0, %%edx                 \n\t"\
    "jb 1b                          \n\t"

#define YSCALEYUV2YV121 \
    "movl %2, %%eax                 \n\t"\
    ".balign 16                     \n\t" /* FIXME Unroll? */\
    "1:                             \n\t"\
    "movq (%0, %%eax, 2), %%mm0     \n\t"\
    "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
    "psraw $7, %%mm0                \n\t"\
    "psraw $7, %%mm1                \n\t"\
    "packuswb %%mm1, %%mm0          \n\t"\
    MOVNTQ(%%mm0, (%1, %%eax))\
    "addl $8, %%eax                 \n\t"\
    "jnc 1b                         \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
    "movl %3, %%ebx                 \n\t" /* chrMmxFilter+lumFilterSize */\
    "movl %7, %%ecx                 \n\t" /* chrSrc+lumFilterSize */\
    "pxor %%mm3, %%mm3              \n\t"\
    "pxor %%mm4, %%mm4              \n\t"\
    "2:                             \n\t"\
    "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
    "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
    "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
    "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm3             \n\t"\
    "paddw %%mm5, %%mm4             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 2b                        \n\t"\
\
    "movl %0, %%edx                 \n\t" /* -lumFilterSize */\
    "movl %2, %%ebx                 \n\t" /* lumMmxFilter+lumFilterSize */\
    "movl %6, %%ecx                 \n\t" /* lumSrc+lumFilterSize */\
    "pxor %%mm1, %%mm1              \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "2:                             \n\t"\
    "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
    "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
    "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
    "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
    "pmulhw %%mm0, %%mm2            \n\t"\
    "pmulhw %%mm0, %%mm5            \n\t"\
    "paddw %%mm2, %%mm1             \n\t"\
    "paddw %%mm5, %%mm7             \n\t"\
    "addl $1, %%edx                 \n\t"\
    " jnz 2b                        \n\t"\
\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %6, %%mm6                 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w400, %%mm3              \n\t" /* 8(U-128)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
\
\
    "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "pmulhw ubCoeff, %%mm3          \n\t"\
    "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw ugCoeff, %%mm2          \n\t"\
    "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm0              \n\t" /* (V-128)8*/\
\
\
    "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
    "pmulhw vrCoeff, %%mm0          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
    "paddw %%mm1, %%mm3             \n\t" /* B*/\
    "paddw %%mm1, %%mm0             \n\t" /* R*/\
    "packuswb %%mm3, %%mm3          \n\t"\
\
    "packuswb %%mm0, %%mm0          \n\t"\
    "paddw %%mm4, %%mm2             \n\t"\
    "paddw %%mm2, %%mm1             \n\t" /* G*/\
\
    "packuswb %%mm1, %%mm1          \n\t"

#define YSCALEYUV2RGB \
    "movd %6, %%mm6                 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "punpcklwd %%mm6, %%mm6         \n\t"\
    "movq %%mm6, asm_yalpha1        \n\t"\
    "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "movq %%mm5, asm_uvalpha1       \n\t"\
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq asm_uvalpha1, %%mm0       \n\t"\
    "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax]*/\
    "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6             \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw asm_yalpha1, %%mm0      \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw asm_yalpha1, %%mm6      \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

#define YSCALEYUV2RGB1 \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
    "xorl %%eax, %%eax              \n\t"\
    ".balign 16                     \n\t"\
    "1:                             \n\t"\
    "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
    "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3          \n\t"\
    "pmulhw vgCoeff, %%mm4          \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw ubCoeff, %%mm2          \n\t"\
    "pmulhw vrCoeff, %%mm5          \n\t"\
    "psubw w80, %%mm1               \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7               \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1           \n\t"\
    "pmulhw yCoeff, %%mm7           \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"

#define WRITEBGR32 \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (%4, %%eax, 4))\
    MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
    MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
    MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR16 \
    "pand bF8, %%mm2                \n\t" /* B */\
    "pand bFC, %%mm4                \n\t" /* G */\
    "pand bF8, %%mm5                \n\t" /* R */\
    "psrlq $3, %%mm2                \n\t"\
\
    "movq %%mm2, %%mm1              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
\
    "punpcklbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm5, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm5, %%mm1         \n\t"\
\
    "psllq $3, %%mm3                \n\t"\
    "psllq $3, %%mm4                \n\t"\
\
    "por %%mm3, %%mm2               \n\t"\
    "por %%mm4, %%mm1               \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR15 \
    "pand bF8, %%mm2                \n\t" /* B */\
    "pand bF8, %%mm4                \n\t" /* G */\
    "pand bF8, %%mm5                \n\t" /* R */\
    "psrlq $3, %%mm2                \n\t"\
    "psrlq $1, %%mm5                \n\t"\
\
    "movq %%mm2, %%mm1              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
\
    "punpcklbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm5, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm5, %%mm1         \n\t"\
\
    "psllq $2, %%mm3                \n\t"\
    "psllq $2, %%mm4                \n\t"\
\
    "por %%mm3, %%mm2               \n\t"\
    "por %%mm4, %%mm1               \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR24OLD \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
    "pand bm00000111, %%mm4         \n\t" /* 00000RGB 0 */\
    "pand bm11111000, %%mm0         \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
    "pand bm00001111, %%mm2         \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
    "pand bm00000111, %%mm4         \n\t" /* 00000RGB 2 */\
    "pand bm11111000, %%mm1         \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
    "pand bm00000111, %%mm5         \n\t" /* 00000RGB 3 */\
    "pand bm11111000, %%mm3         \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (%%ebx))\
    MOVNTQ(%%mm2, 8(%%ebx))\
    MOVNTQ(%%mm3, 16(%%ebx))\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR24MMX \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1              \n\t" /* B */\
    "movq %%mm5, %%mm6              \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (%%ebx))\
\
    "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(%%ebx))\
\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#define WRITEBGR24MMX2 \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq M24A, %%mm0               \n\t"\
    "movq M24C, %%mm7               \n\t"\
    "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6               \n\t"\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, (%%ebx))\
\
    "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand M24B, %%mm1               \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
    "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
\
    "por %%mm1, %%mm3               \n\t" /* B5 G4 B4     G3 B3       */\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
    "pand M24B, %%mm6               \n\t" /* R7       R6        R5    */\
\
    "por %%mm1, %%mm3               \n\t"\
    "por %%mm3, %%mm6               \n\t"\
    MOVNTQ(%%mm6, 16(%%ebx))\
\
    "addl $24, %%ebx                \n\t"\
\
    "addl $8, %%eax                 \n\t"\
    "cmpl %5, %%eax                 \n\t"\
    " jb 1b                         \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif

static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV12X(0)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );

        asm volatile(
            YSCALEYUV2YV12X(4096)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );
    }

    asm volatile(
        YSCALEYUV2YV12X(0)
        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
        : "%eax", "%edx", "%esi"
    );
#else
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW);
#endif
}

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
           "g" (-dstW)
        : "%eax"
    );
#else
    //FIXME optimize (just quickly written, not optimized)
    //FIXME replace MINMAX with LUTs
    int i;
    for(i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7; // buffer holds 8-bit samples scaled by 128

        dest[i]= MIN(MAX(val, 0), 255); // clamp like packuswb in the MMX path
    }

    if(uDest != NULL)
        for(i=0; i<(dstW>>1); i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            uDest[i]= MIN(MAX(u, 0), 255);
            vDest[i]= MIN(MAX(v, 0), 255);
        }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
    if(fullUVIpol)
    {
        //FIXME
    }//FULL_UV_IPOL
    else
    {
#ifdef HAVE_MMX
        if(dstbpp == 32) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                WRITEBGR32

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==24) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize
                "addl %4, %%ebx                 \n\t"
                WRITEBGR24

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR15

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16

                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
#else
        yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    dest, dstW, dstbpp);

#endif
    } //!FULL_UV_IPOL
}


/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;
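    /* For 12-bit alpha, alpha^4095 == 4095-alpha, i.e. (almost exactly)
       1-alpha in fixed point, so expressions of the form
       buf0[i]*yalpha1 + buf1[i]*yalpha below are linear blends of the two
       source lines. */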

    if(fullUVIpol)
    {

#ifdef HAVE_MMX
        if(dstbpp == 32)
        {
            asm volatile(


                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%eax, 4))
                MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1              \n\t"
                "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
                "psrlq $8, %%mm3                \n\t" // GR0BGR00
                "pand bm00000111, %%mm2         \n\t" // BGR00000
                "pand bm11111000, %%mm3         \n\t" // 000BGR00
                "por %%mm2, %%mm3               \n\t" // BGRBGR00
                "movq %%mm1, %%mm2              \n\t"
                "psllq $48, %%mm1               \n\t" // 000000BG
                "por %%mm1, %%mm3               \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
                "psrld $16, %%mm2               \n\t" // R000R000
                "psrlq $24, %%mm1               \n\t" // 0BGR0000
                "por %%mm2, %%mm1               \n\t" // RBGRR000

                "movl %4, %%ebx                 \n\t"
                "addl %%eax, %%ebx              \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
                "psrlq $32, %%mm3               \n\t"
                "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
                "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb g5Dither, %%mm1        \n\t"
                "paddusb r5Dither, %%mm0        \n\t"
                "paddusb b5Dither, %%mm3        \n\t"
#endif
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $2, %%mm1                \n\t"
                "psllw $7, %%mm0                \n\t"
                "pand g15Mask, %%mm1            \n\t"
                "pand r15Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"

                MOVNTQ(%%mm0, (%4, %%eax, 2))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb g6Dither, %%mm1        \n\t"
                "paddusb r5Dither, %%mm0        \n\t"
                "paddusb b5Dither, %%mm3        \n\t"
#endif
                "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3                \n\t"
                "psllw $3, %%mm1                \n\t"
                "psllw $8, %%mm0                \n\t"
                "pand g16Mask, %%mm1            \n\t"
                "pand r16Mask, %%mm0            \n\t"

                "por %%mm3, %%mm1               \n\t"
                "por %%mm1, %%mm0               \n\t"

                MOVNTQ(%%mm0, (%4, %%eax, 2))

                "addl $4, %%eax                 \n\t"
                "cmpl %5, %%eax                 \n\t"
                " jb 1b                         \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
#else
        if(dstbpp==32 || dstbpp==24)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+=dstbpp>>3;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table16r[(Y + yuvtab_3343[V]) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table15r[(Y + yuvtab_3343[V]) >>13];
            }
        }
#endif
    }//FULL_UV_IPOL
    else
    {
#ifdef HAVE_MMX
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB
                WRITEBGR32

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx                 \n\t"
                YSCALEYUV2RGB
                WRITEBGR24

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR15

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
#else
        if(dstbpp==32)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
            }
        }
        else if(dstbpp==24)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[0]=clip_table[((Y1 + Cb) >>13)];
                dest[1]=clip_table[((Y1 + Cg) >>13)];
                dest[2]=clip_table[((Y1 + Cr) >>13)];

                dest[3]=clip_table[((Y2 + Cb) >>13)];
                dest[4]=clip_table[((Y2 + Cg) >>13)];
                dest[5]=clip_table[((Y2 + Cr) >>13)];
                dest+=6;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y1 + Cb) >>13] |
                    clip_table16g[(Y1 + Cg) >>13] |
                    clip_table16r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table16b[(Y2 + Cb) >>13] |
                    clip_table16g[(Y2 + Cg) >>13] |
                    clip_table16r[(Y2 + Cr) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y1 + Cb) >>13] |
                    clip_table15g[(Y1 + Cg) >>13] |
                    clip_table15r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table15b[(Y2 + Cb) >>13] |
                    clip_table15g[(Y2 + Cg) >>13] |
                    clip_table15r[(Y2 + Cr) >>13];
            }
        }
#endif
    } //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int uvalpha, int dstbpp)
{
    int uvalpha1=uvalpha^4095;
    const int yalpha1=0;

    if(fullUVIpol || allwaysIpol)
    {
        RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
        return;
    }

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx                 \n\t"
                YSCALEYUV2RGB1
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
    else
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx                 \n\t"
                YSCALEYUV2RGB1b
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g5Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2        \n\t"
                "paddusb g6Dither, %%mm4        \n\t"
                "paddusb r5Dither, %%mm5        \n\t"
#endif

                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
#else
    //FIXME write 2 versions (for even & odd lines)

    if(dstbpp==32)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

            dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    else if(dstbpp==24)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];

            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstbpp==16)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table16b[(Y1 + Cb) >>13] |
                clip_table16g[(Y1 + Cg) >>13] |
                clip_table16r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table16b[(Y2 + Cb) >>13] |
                clip_table16g[(Y2 + Cg) >>13] |
                clip_table16r[(Y2 + Cr) >>13];
        }
    }
    else if(dstbpp==15)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table15b[(Y1 + Cb) >>13] |
                clip_table15g[(Y1 + Cg) >>13] |
                clip_table15r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table15b[(Y2 + Cb) >>13] |
                clip_table15g[(Y2 + Cg) >>13] |
                clip_table15r[(Y2 + Cr) >>13];
        }
    }
#endif
}

// Bilinear / Bicubic scaling
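// computes, per output sample,
//   dst[i]= clip( (sum over j<filterSize of src[filterPos[i]+j]*filter[i*filterSize+j]) >>7 )
// the MMX paths below special-case filterSize 4 and 8 and use a generic
// two-level loop otherwise; the plain C version at the end is the reference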
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
    if(filterSize==4) // always true for upscaling, sometimes for downscaling too
    {
        int counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 4), %%mm1     \n\t"
            "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"

            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else if(filterSize==8)
    {
        int counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 8), %%mm1     \n\t"
            "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"

            "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
            "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
            "movd 4(%3, %%eax), %%mm4       \n\t"
            "movd 4(%3, %%ebx), %%mm2       \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"

            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"

            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else
    {
        int counter= -2*dstW;
//        filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq w02, %%mm6                \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movl %2, %%ecx                 \n\t"
            "movzwl (%%ecx, %0), %%eax      \n\t"
            "movzwl 2(%%ecx, %0), %%ebx     \n\t"
            "movl %5, %%ecx                 \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%ecx, %%eax), %%mm0     \n\t"
            "movd (%%ecx, %%ebx), %%mm2     \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "addl $8, %1                    \n\t"
            "addl $4, %%ecx                 \n\t"
            "cmpl %4, %%ecx                 \n\t"
            " jb 2b                         \n\t"
            "addl %6, %1                    \n\t"
            "psrad $8, %%mm4                \n\t"
            "psrad $8, %%mm5                \n\t"
            "packssdw %%mm5, %%mm4          \n\t"
            "pmaddwd %%mm6, %%mm4           \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "movl %3, %%eax                 \n\t"
            "movd %%mm4, (%%eax, %0)        \n\t"
            "addl $4, %0                    \n\t"
            " jnc 1b                        \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(src+filterSize),
              "m" (src), "r" (filterSize*2)
            : "%ebx", "%eax", "%ecx"
        );
    }
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
//        printf("filterPos: %d\n", filterPos[i]);
        for(j=0; j<filterSize; j++)
        {
//            printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
//        filter += hFilterSize;
        dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//        dst[i] = val>>7;
    }
#endif
}
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
{
#ifdef HAVE_MMX
    // use the new MMX scaler if the MMX2 one cannot be used (it is faster than the plain x86 asm one)
    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
#else
    if(sws_flags != SWS_FAST_BILINEAR)
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
            asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
                "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "movq %%mm6, %%mm2              \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t"
                "paddw %%mm6, %%mm2             \n\t"
                "psllq $16, %%mm2               \n\t" //0,t,2t,3t   t=xInc&0xFFFF
                "movq %%mm2, temp0              \n\t"
                "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6         \n\t"
                "punpcklwd %%mm6, %%mm6         \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                "movl %0, %%esi                 \n\t" // src
                "movl %1, %%edi                 \n\t" // buf1
                "movl %3, %%edx                 \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx              \n\t"
                "xorl %%ebx, %%ebx              \n\t"
                "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                PREFETCH" 1024(%%esi)           \n\t"\
                PREFETCH" 1056(%%esi)           \n\t"\
                PREFETCH" 1088(%%esi)           \n\t"\
                "call funnyYCode                \n\t"\
                "movq temp0, %%mm2              \n\t"\
                "xorl %%ecx, %%ecx              \n\t"

                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE

                :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                   "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
            );
            for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
            //no MMX2, just plain asm ...
            asm volatile(
                "xorl %%eax, %%eax              \n\t" // i
                "xorl %%ebx, %%ebx              \n\t" // xx
                "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
                ".balign 16                     \n\t"
                "1:                             \n\t"
                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, (%%edi, %%eax, 2)   \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
                "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                 \n\t"
                "shrl $9, %%esi                 \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
                "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                 \n\t" //xx+= xInc>>16 + carry


                "addl $2, %%eax                 \n\t"
                "cmpl %2, %%eax                 \n\t"
                " jb 1b                         \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
            );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
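        /* portable fallback: a 16.16 fixed-point DDA; xpos>>16 picks the
           source pixel, the top 7 fraction bits form xalpha, and each output
           sample is the linear interpolation
           (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha */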
1699 int i;
1700 unsigned int xpos=0;
1701 for(i=0;i<dstWidth;i++)
1702 {
1703 register unsigned int xx=xpos>>16;
1704 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1705 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1706 xpos+=xInc;
1707 }
1708 #endif
1709 }
1710 }
1711
1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc)
1714 {
1715 #ifdef HAVE_MMX
1716 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1717 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1718 #else
1719 if(sws_flags != SWS_FAST_BILINEAR)
1720 #endif
1721 {
1722 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1723 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1724 }
1725 else // Fast Bilinear upscale / crap downscale
1726 {
1727 #ifdef ARCH_X86
1728 #ifdef HAVE_MMX2
1729 int i;
1730 if(canMMX2BeUsed)
1731 {
1732 asm volatile(
1733 "pxor %%mm7, %%mm7 \n\t"
1734 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
1735 "movd %5, %%mm6 \n\t" // xInc&0xFFFF
1736 "punpcklwd %%mm6, %%mm6 \n\t"
1737 "punpcklwd %%mm6, %%mm6 \n\t"
1738 "movq %%mm6, %%mm2 \n\t"
1739 "psllq $16, %%mm2 \n\t"
1740 "paddw %%mm6, %%mm2 \n\t"
1741 "psllq $16, %%mm2 \n\t"
1742 "paddw %%mm6, %%mm2 \n\t"
1743 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
1744 "movq %%mm2, temp0 \n\t"
1745 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
1746 "punpcklwd %%mm6, %%mm6 \n\t"
1747 "punpcklwd %%mm6, %%mm6 \n\t"
1748 "xorl %%eax, %%eax \n\t" // i
1749 "movl %0, %%esi \n\t" // src
1750 "movl %1, %%edi \n\t" // buf1
1751 "movl %3, %%edx \n\t" // (xInc*4)>>16
1752 "xorl %%ecx, %%ecx \n\t"
1753 "xorl %%ebx, %%ebx \n\t"
1754 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
1755
1756 #define FUNNYUVCODE \
1757 PREFETCH" 1024(%%esi) \n\t"\
1758 PREFETCH" 1056(%%esi) \n\t"\
1759 PREFETCH" 1088(%%esi) \n\t"\
1760 "call funnyUVCode \n\t"\
1761 "movq temp0, %%mm2 \n\t"\
1762 "xorl %%ecx, %%ecx \n\t"
1763
1764 FUNNYUVCODE
1765 FUNNYUVCODE
1766 FUNNYUVCODE
1767 FUNNYUVCODE
1768
1769 FUNNYUVCODE
1770 FUNNYUVCODE
1771 FUNNYUVCODE
1772 FUNNYUVCODE
1773 "xorl %%eax, %%eax \n\t" // i
1774 "movl %6, %%esi \n\t" // src
1775 "movl %1, %%edi \n\t" // buf1
1776 "addl $4096, %%edi \n\t"
1777
1778 FUNNYUVCODE
1779 FUNNYUVCODE
1780 FUNNYUVCODE
1781 FUNNYUVCODE
1782
1783 FUNNYUVCODE
1784 FUNNYUVCODE
1785 FUNNYUVCODE
1786 FUNNYUVCODE
1787
1788 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1789 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
1790 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1791 );
1792 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1793 {
1794 // printf("%d %d %d\n", dstWidth, i, srcW);
1795 dst[i] = src1[srcW-1]*128;
1796 dst[i+2048] = src2[srcW-1]*128;
1797 }
1798 }
1799 else
1800 {
1801 #endif
1802 asm volatile(
1803 "xorl %%eax, %%eax \n\t" // i
1804 "xorl %%ebx, %%ebx \n\t" // xx
1805 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
1806 ".balign 16 \n\t"
1807 "1: \n\t"
1808 "movl %0, %%esi \n\t"
1809 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
1810 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
1811 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1812 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1813 "shll $16, %%edi \n\t"
1814 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1815 "movl %1, %%edi \n\t"
1816 "shrl $9, %%esi \n\t"
1817 "movw %%si, (%%edi, %%eax, 2) \n\t"
1818
1819 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
1820 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
1821 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
1822 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
1823 "shll $16, %%edi \n\t"
1824 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1825 "movl %1, %%edi \n\t"
1826 "shrl $9, %%esi \n\t"
1827 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1828
1829 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
1830 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
1831 "addl $1, %%eax \n\t"
1832 "cmpl %2, %%eax \n\t"
1833 " jb 1b \n\t"
1834
1835 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1836 "r" (src2)
1837 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1838 );
1839 #ifdef HAVE_MMX2
1840 } //if MMX2 cant be used
1841 #endif
1842 #else
1843 int i;
1844 unsigned int xpos=0;
1845 for(i=0;i<dstWidth;i++)
1846 {
1847 register unsigned int xx=xpos>>16;
1848 register unsigned int xalpha=(xpos&0xFFFF)>>9;
1849 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1850 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1851 /* slower
1852 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
1853 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
1854 */
1855 xpos+=xInc;
1856 }
1857 #endif
1858 }
1859 }
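/* A minimal reference sketch (hypothetical helper, not compiled in) of the
   16.16 fixed-point walk that the fast-bilinear paths above implement: the
   high 16 bits of xpos index the source sample, the top 7 fraction bits
   blend the two neighbours (xalpha^127 == 127-xalpha), and the result is
   upshifted by ~7 bits as the vertical scalers expect. */
#if 0
static void hScaleBilinearRef(uint16_t *dst, int dstWidth,
                              uint8_t *src, int srcW, int xInc)
{
	unsigned int xpos= 0;
	int i;
	for(i=0; i<dstWidth; i++)
	{
		unsigned int xx    = xpos>>16;         // integer source index
		unsigned int xalpha= (xpos&0xFFFF)>>9; // blend weight, 0..127
		dst[i]= src[xx]*(xalpha^127) + src[xx+1]*xalpha;
		xpos+= xInc; // same src[xx+1] tail caveat as handled above
	}
}
#endif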
1860
1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
1862 int srcW, int dstW, int filterAlign, int one)
1863 {
1864 int i;
1865 double filter[8000];
1866 #ifdef HAVE_MMX
1867 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non-MMX versions)
1868 #endif
1869
1870 if(ABS(xInc - 0x10000) <10) // unscaled
1871 {
1872 int i;
1873 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normally
1874 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
1875
1876 for(i=0; i<dstW; i++)
1877 {
1878 filter[i*(*filterSize)]=1;
1879 filterPos[i]=i;
1880 }
1881
1882 }
1883 else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
1884 {
1885 int i;
1886 int xDstInSrc;
1887 if(sws_flags==SWS_BICUBIC) *filterSize= 4;
1888 else *filterSize= 2;
1889 // printf("%d %d %d\n", filterSize, srcW, dstW);
1890 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1891
1892 xDstInSrc= xInc - 0x8000;
1893 for(i=0; i<dstW; i++)
1894 {
1895 int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
1896 int j;
1897
1898 filterPos[i]= xx;
1899 if(sws_flags == SWS_BICUBIC)
1900 {
1901 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
1902 double y1,y2,y3,y4;
1903 double A= -0.75;
1904 // Equation is from VirtualDub
1905 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
1906 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1907 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
1908 y4 = ( + A*d*d - A*d*d*d);
1909
1910 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1911 filter[i*(*filterSize) + 0]= y1;
1912 filter[i*(*filterSize) + 1]= y2;
1913 filter[i*(*filterSize) + 2]= y3;
1914 filter[i*(*filterSize) + 3]= y4;
1915 // printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
1916 }
1917 else
1918 {
1919 for(j=0; j<*filterSize; j++)
1920 {
1921 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
1922 double coeff= 1.0 - d;
1923 if(coeff<0) coeff=0;
1924 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1925 filter[i*(*filterSize) + j]= coeff;
1926 xx++;
1927 }
1928 }
1929 xDstInSrc+= xInc;
1930 }
1931 }
1932 else // downscale
1933 {
1934 int xDstInSrc;
1935 if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
1936 else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
1937 // printf("%d %d %d\n", *filterSize, srcW, dstW);
1938 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1939
1940 xDstInSrc= xInc - 0x8000;
1941 for(i=0; i<dstW; i++)
1942 {
1943 int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5);
1944 int j;
1945
1946 filterPos[i]= xx;
1947 for(j=0; j<*filterSize; j++)
1948 {
1949 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
1950 double coeff;
1951 if(sws_flags == SWS_BICUBIC)
1952 {
1953 double A= -0.75;
1954 // d*=2;
1955 // Equation is from VirtualDub
1956 if(d<1.0)
1957 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1958 else if(d<2.0)
1959 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
1960 else
1961 coeff=0.0;
1962 }
1963 else
1964 {
1965 coeff= 1.0 - d;
1966 if(coeff<0) coeff=0;
1967 }
1968 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1969 filter[i*(*filterSize) + j]= coeff;
1970 xx++;
1971 }
1972 xDstInSrc+= xInc;
1973 }
1974 }
1975
1976 //fix borders
1977 for(i=0; i<dstW; i++)
1978 {
1979 int j;
1980 if(filterPos[i] < 0)
1981 {
1982 // Move filter coeffs left to compensate for filterPos
1983 for(j=1; j<*filterSize; j++)
1984 {
1985 int left= MAX(j + filterPos[i], 0);
1986 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
1987 filter[i*(*filterSize) + j]=0;
1988 }
1989 filterPos[i]= 0;
1990 }
1991
1992 if(filterPos[i] + (*filterSize) > srcW)
1993 {
1994 int shift= filterPos[i] + (*filterSize) - srcW;
1995 // Move filter coeffs right to compensate for filterPos
1996 for(j=(*filterSize)-2; j>=0; j--)
1997 {
1998 int right= MIN(j + shift, (*filterSize)-1);
1999 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
2000 filter[i*(*filterSize) +j]=0;
2001 }
2002 filterPos[i]= srcW - (*filterSize);
2003 }
2004 }
2005
2006 //FIXME try to align filterPos if possible / try to shift filterPos to put zeros at the end
2007 // and skip those later
2008
2009 //Normalize
2010 for(i=0; i<dstW; i++)
2011 {
2012 int j;
2013 double sum=0;
2014 double scale= one;
2015 for(j=0; j<*filterSize; j++)
2016 {
2017 sum+= filter[i*(*filterSize) + j];
2018 }
2019 scale/= sum;
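		/* e.g. with one=1<<14 and raw taps summing to 2.0, scale becomes 8192,
		   so the quantized taps written below sum to ~16384 == 1.0 in Q14 */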
2020 for(j=0; j<*filterSize; j++)
2021 {
2022 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
2023 }
2024 }
2025 }
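/* Reference sketch (hypothetical helper, not compiled in) of the bicubic
   kernel used above with A=-0.75 (the VirtualDub variant). The downscale
   branch evaluates it piecewise per tap; the four upscale taps y1..y4 are
   the same kernel sampled at d+1, d, 1-d and 2-d:
     |d|<1: (A+2)|d|^3 - (A+3)|d|^2 + 1
     |d|<2: A|d|^3 - 5A|d|^2 + 8A|d| - 4A
     else : 0 */
#if 0
static double bicubicWeightRef(double d)
{
	const double A= -0.75;
	d= ABS(d);
	if(d < 1.0) return ((A+2.0)*d - (A+3.0))*d*d + 1.0;
	if(d < 2.0) return (((d - 5.0)*d + 8.0)*d - 4.0)*A;
	return 0.0;
}
#endif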
2026
2027 #ifdef HAVE_MMX2
2028 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
2029 {
2030 uint8_t *fragment;
2031 int imm8OfPShufW1;
2032 int imm8OfPShufW2;
2033 int fragmentLength;
2034
2035 int xpos, i;
2036
2037 // create an optimized horizontal scaling routine
2038
2039 //code fragment
2040
2041 asm volatile(
2042 "jmp 9f \n\t"
2043 // Begin
2044 "0: \n\t"
2045 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
2046 "movq %%mm0, %%mm1 \n\t"
2047 "psrlq $8, %%mm0 \n\t"
2048 "punpcklbw %%mm7, %%mm1 \n\t"
2049 "movq %%mm2, %%mm3 \n\t"
2050 "punpcklbw %%mm7, %%mm0 \n\t"
2051 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
2052 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
2053 "1: \n\t"
2054 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
2055 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
2056 "2: \n\t"
2057 "psrlw $9, %%mm3 \n\t"
2058 "psubw %%mm1, %%mm0 \n\t"
2059 "pmullw %%mm3, %%mm0 \n\t"
2060 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
2061 "psllw $7, %%mm1 \n\t"
2062 "paddw %%mm1, %%mm0 \n\t"
2063
2064 "movq %%mm0, (%%edi, %%eax) \n\t"
2065
2066 "addl $8, %%eax \n\t"
2067 // End
2068 "9: \n\t"
2069 // "int $3\n\t"
2070 "leal 0b, %0 \n\t"
2071 "leal 1b, %1 \n\t"
2072 "leal 2b, %2 \n\t"
2073 "decl %1 \n\t"
2074 "decl %2 \n\t"
2075 "subl %0, %1 \n\t"
2076 "subl %0, %2 \n\t"
2077 "leal 9b, %3 \n\t"
2078 "subl %0, %3 \n\t"
2079 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
2080 "=r" (fragmentLength)
2081 );
2082
2083 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
2084
2085 for(i=0; i<dstW/8; i++)
2086 {
2087 int xx=xpos>>16;
2088
2089 if((i&3) == 0)
2090 {
2091 int a=0;
2092 int b=((xpos+xInc)>>16) - xx;
2093 int c=((xpos+xInc*2)>>16) - xx;
2094 int d=((xpos+xInc*3)>>16) - xx;
2095
2096 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
2097
2098 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
2099 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
2100 a | (b<<2) | (c<<4) | (d<<6);
2101
2102 // if we don't need to read 8 bytes then don't :), reduces the chance of
2103 // crossing a cache line
2104 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E; // patch movq (0x6F) into movd (0x6E) -> 4 byte load
2105
2106 funnyCode[fragmentLength*(i+4)/4]= RET;
2107 }
2108 xpos+=xInc;
2109 }
2110 /*
2111 xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chroma samples
2112 for(i=0; i<dstUVw/8; i++)
2113 {
2114 int xx=xpos>>16;
2115
2116 if((i&3) == 0)
2117 {
2118 int a=0;
2119 int b=((xpos+chrXInc)>>16) - xx;
2120 int c=((xpos+chrXInc*2)>>16) - xx;
2121 int d=((xpos+chrXInc*3)>>16) - xx;
2122
2123 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
2124
2125 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
2126 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
2127 a | (b<<2) | (c<<4) | (d<<6);
2128
2129 // if we don't need to read 8 bytes then don't :), reduces the chance of
2130 // crossing a cache line
2131 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
2132
2133 funnyUVCode[fragmentLength*(i+4)/4]= RET;
2134 }
2135 xpos+=chrXInc;
2136 }
2137 */
2138 // funnyCode[0]= RET;
2139 }
2140 #endif // HAVE_MMX2
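/* Reference sketch (hypothetical helper, not compiled in): how the pshufw
   immediates patched into the generated fragments above are packed. Word i
   of the pshufw result selects source word (imm8>>(2*i))&3, so a..d are the
   per-pixel offsets (0..3) into the 8 source bytes just loaded. */
#if 0
static uint8_t packPShufWImm8Ref(int a, int b, int c, int d)
{
	return (uint8_t)(a | (b<<2) | (c<<4) | (d<<6));
}
#endif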
2141
2142 static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY ,
2143 int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
2144 int srcW, int srcH, int dstW, int dstH){
2145
2146
2147 unsigned int lumXInc= (srcW << 16) / dstW;
2148 unsigned int lumYInc= (srcH << 16) / dstH;
2149 unsigned int chrXInc;
2150 unsigned int chrYInc;
2151
2152 static int dstY;
2153
2154 // used to detect a size change
2155 static int oldDstW= -1;
2156 static int oldSrcW= -1;
2157 static int oldDstH= -1;
2158 static int oldSrcH= -1;
2159 static int oldFlags=-1;
2160
2161 static int lastInLumBuf;
2162 static int lastInChrBuf;
2163
2164 int chrDstW, chrDstH;
2165
2166 static int lumBufIndex=0;
2167 static int chrBufIndex=0;
2168
2169 static int firstTime=1;
2170
2171 const int widthAlign= dstbpp==12 ? 16 : 8;
2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
2173 const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
2174 : (((dstW+7)&(~7)))*bytespp - dststride;
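	/* e.g. dstbpp==16, dstW==100, dststride==200: bytespp==2 and the optimized
	   code writes ((100+7)&~7)*2 == 208 bytes per line, so over==8 and the last
	   line(s) must take the unoptimized path (checked below) */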
2175 if(dststride%widthAlign !=0 )
2176 {
2177 if(firstTime)
2178 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
2179 "SwScaler: ->cannot do aligned memory acesses anymore\n",
2180 widthAlign);
2181 }
2182
2183 if(over>0)
2184 {
2185 if(firstTime)
2186 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
2187 "SwScaler: and dststride is not large enough to handle %d extra bytes\n"
2188 "SwScaler: ->using unoptimized C version for last line(s)\n",
2189 over);
2190 }
2191
2192
2193
2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
2196
2197 #ifdef HAVE_MMX2
2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
2200 {
2201 if(firstTime)
2202 fprintf(stderr, "SwScaler: output width is not a multiple of 32 -> no MMX2 scaler\n");
2203 }
2204 #else
2205 canMMX2BeUsed=0; // should be 0 anyway but ...
2206 #endif
2207
2208 if(firstTime)
2209 {
2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX)
2211 char *dither= " dithered";
2212 #else
2213 char *dither= "";
2214 #endif
2215 if(sws_flags==SWS_FAST_BILINEAR)
2216 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
2217 else if(sws_flags==SWS_BILINEAR)
2218 fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
2219 else if(sws_flags==SWS_BICUBIC)
2220 fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
2221 else
2222 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
2223
2224 if(dstbpp==15)
2225 fprintf(stderr, "with%s BGR15 output ", dither);
2226 else if(dstbpp==16)
2227 fprintf(stderr, "with%s BGR16 output ", dither);
2228 else if(dstbpp==24)
2229 fprintf(stderr, "with BGR24 output ");
2230 else if(dstbpp==32)
2231 fprintf(stderr, "with BGR32 output ");
2232 else if(dstbpp==12)
2233 fprintf(stderr, "with YV12 output ");
2234 else
2235 fprintf(stderr, "without output ");
2236
2237 #ifdef HAVE_MMX2
2238 fprintf(stderr, "using MMX2\n");
2239 #elif defined (HAVE_3DNOW)
2240 fprintf(stderr, "using 3DNOW\n");
2241 #elif defined (HAVE_MMX)
2242 fprintf(stderr, "using MMX\n");
2243 #elif defined (ARCH_X86)
2244 fprintf(stderr, "using X86 ASM\n");
2245 #else
2246 fprintf(stderr, "using C\n");
2247 #endif
2248 }
2249
2250
2251 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2252 // n-2 is the last chrominance sample available
2253 // this is not perfect, but no one should notice the difference; the more correct variant
2254 // would be like the vertical one, but that would require some special code for the
2255 // first and last pixel
2256 if(sws_flags==SWS_FAST_BILINEAR)
2257 {
2258 if(canMMX2BeUsed) lumXInc+= 20;
2259 #ifndef HAVE_MMX //we don't use the x86 asm scaler if MMX is available
2260 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2261 #endif
2262 }
2263
2264 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
2265 else chrXInc= lumXInc, chrDstW= (dstW+1)>>1;
2266
2267 if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1;
2268 else chrYInc= lumYInc>>1, chrDstH= dstH;
2269
2270 // force calculation of the horizontal interpolation of the first line
2271
2272 if(srcSliceY ==0){
2273 // printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
2274 lumBufIndex=0;
2275 chrBufIndex=0;
2276 dstY=0;
2277
2278 //precalculate horizontal scaler filter coefficients
2279 if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
2280 {
2281 #ifdef HAVE_MMX
2282 const int filterAlign=4;
2283 #else
2284 const int filterAlign=1;
2285 #endif
2286 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
2287
2288 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
2289 srcW , dstW , filterAlign, 1<<14);
2290 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2291 (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
2292
2293 #ifdef HAVE_MMX2
2294 // can't downscale !!!
2295 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
2296 {
2297 initMMX2HScaler(dstW , lumXInc, funnyYCode);
2298 initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
2299 }
2300 #endif
2301 } // Init Horizontal stuff
2302
2303 if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
2304 {
2305 int i;
2306 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
2307
2308 // deallocate pixbufs
2309 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
2311
2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2313 srcH , dstH, 1, (1<<12)-4);
2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
2316
2317 // Calculate Buffer Sizes so that they won't run out while handling these damn slices
2318 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
2319 for(i=0; i<dstH; i++)
2320 {
2321 int chrI= i*chrDstH / dstH;
2322 int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1,
2323 ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
2324 nextSlice&= ~1; // Slices start at even boundaries
2325 if(vLumFilterPos[i ] + vLumBufSize < nextSlice)
2326 vLumBufSize= nextSlice - vLumFilterPos[i ];
2327 if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
2328 vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
2329 }
2330
2331 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2332 // allocate several megabytes to handle all possible cases)
2333 for(i=0; i<vLumBufSize; i++)
2334 lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
2335 for(i=0; i<vChrBufSize; i++)
2336 chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);
2337
2338 //try to avoid drawing green stuff between the right end and the stride end
2339 for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
2340 for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);
2341
2342 ASSERT(chrDstH<=dstH)
2343 ASSERT(vLumFilterSize*dstH*4<16000)
2344 ASSERT(vChrFilterSize*chrDstH*4<16000)
2345 #ifdef HAVE_MMX
2346 // pack filter data for mmx code
2347 for(i=0; i<vLumFilterSize*dstH; i++)
2348 lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
2349 vLumFilter[i];
2350 for(i=0; i<vChrFilterSize*chrDstH; i++)
2351 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
2352 vChrFilter[i];
2353 #endif
2354 }
2355
2356 if(firstTime && verbose)
2357 {
2358 #ifdef HAVE_MMX2
2359 int mmx2=1;
2360 #else
2361 int mmx2=0;
2362 #endif
2363 #ifdef HAVE_MMX
2364 int mmx=1;
2365 #else
2366 int mmx=0;
2367 #endif
2368
2369 #ifdef HAVE_MMX
2370 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
2371 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2372 else
2373 {
2374 if(hLumFilterSize==4)
2375 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2376 else if(hLumFilterSize==8)
2377 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2378 else
2379 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2380
2381 if(hChrFilterSize==4)
2382 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2383 else if(hChrFilterSize==8)
2384 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2385 else
2386 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2387 }
2388 #elif defined (ARCH_X86)
2389 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2390 #else
2391 if(sws_flags==SWS_FAST_BILINEAR)
2392 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2393 else
2394 printf("SwScaler: using C scaler for horizontal scaling\n");
2395 #endif
2396
2397 if(dstbpp==12)
2398 {
2399 if(vLumFilterSize==1)
2400 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2401 else
2402 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2403 }
2404 else
2405 {
2406 if(vLumFilterSize==1 && vChrFilterSize==2)
2407 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2408 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
2409 else if(vLumFilterSize==2 && vChrFilterSize==2)
2410 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2411 else
2412 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2413 }
2414
2415 if(dstbpp==24)
2416 printf("SwScaler: using %s YV12->BGR24 Converter\n",
2417 mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
2418 else
2419 printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
2420
2421 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2422 }
2423
2424 lastInLumBuf= -1;
2425 lastInChrBuf= -1;
2426 } // if(srcSliceY == 0)
2427
2428 for(;dstY < dstH; dstY++){
2429 unsigned char *dest =dstptr[0]+dststride*dstY;
2430 unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
2431 unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
2432 const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;
2433
2434 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2435 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2436 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2437 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2438
2439 if(sws_flags == SWS_FAST_BILINEAR)
2440 {
2441 //handle holes
2442 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2443 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2444 }
2445
2446 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2447 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2448
2449 // Do we have enough lines in this slice to output the dstY line?
2450 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
2451 {
2452 //Do horizontal scaling
2453 while(lastInLumBuf < lastLumSrcY)
2454 {
2455 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2456 lumBufIndex++;
2457 ASSERT(lumBufIndex < 2*vLumBufSize)
2458 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2459 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2460 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2461 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2462 lastInLumBuf++;
2463 }
2464 while(lastInChrBuf < lastChrSrcY)
2465 {
2466 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2467 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2468 chrBufIndex++;
2469 ASSERT(chrBufIndex < 2*vChrBufSize)
2470 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2471 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2472 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2473 lastInChrBuf++;
2474 }
2475 //wrap buf index around to stay inside the ring buffer
2476 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2477 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2478 }
2479 else // not enough lines left in this slice -> load the rest in the buffer
2480 {
2481 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2482 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2483 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2484 vChrBufSize, vLumBufSize);
2485 */
2486 //Do horizontal scaling
2487 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2488 {
2489 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
2490 lumBufIndex++;
2491 ASSERT(lumBufIndex < 2*vLumBufSize)
2492 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2493 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2494 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
2495 lastInLumBuf++;
2496 }
2497 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2498 {
2499 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
2500 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
2501 chrBufIndex++;
2502 ASSERT(chrBufIndex < 2*vChrBufSize)
2503 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
2504 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2505 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
2506 lastInChrBuf++;
2507 }
2508 //wrap buf index around to stay inside the ring buffer
2509 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2510 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2511 break; //we can't output a dstY line, so let's try with the next slice
2512 }
2513
2514 #ifdef HAVE_MMX
2515 b5Dither= dither8[dstY&1];
2516 g6Dither= dither4[dstY&1];
2517 g5Dither= dither8[dstY&1];
2518 r5Dither= dither8[(dstY+1)&1];
2519 #endif
2520 if(dstY < dstH-2 || over<=0)
2521 {
2522 if(dstbpp==12) //YV12
2523 {
2524 if(dstY&1) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2525 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2526 {
2527 int16_t *lumBuf = lumPixBuf[0];
2528 int16_t *chrBuf= chrPixBuf[0];
2529 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2530 }
2531 else //General YV12
2532 {
2533 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2534 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2535 RENAME(yuv2yuvX)(
2536 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2537 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2538 dest, uDest, vDest, dstW,
2539 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2540 }
2541 }
2542 else
2543 {
2544 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2545 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2546
2547 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2548 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2549 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2550 {
2551 int chrAlpha= vChrFilter[2*dstY+1];
2552
2553 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2554 dest, dstW, chrAlpha, dstbpp);
2555 }
2556 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2557 {
2558 int lumAlpha= vLumFilter[2*dstY+1];
2559 int chrAlpha= vChrFilter[2*dstY+1];
2560
2561 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2562 dest, dstW, lumAlpha, chrAlpha, dstbpp);
2563 }
2564 else //General RGB
2565 {
2566 RENAME(yuv2rgbX)(
2567 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2568 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2569 dest, dstW, dstbpp,
2570 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2571 }
2572 }
2573 }
2574 else // hmm, looks like we can't use MMX here without overwriting this array's tail
2575 {
2576 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2577 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2578 if(dstbpp==12) //YV12
2579 {
2580 if(dstY&1) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2581 yuv2yuvXinC(
2582 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2583 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2584 dest, uDest, vDest, dstW);
2585 }
2586 else
2587 {
2588 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2589 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2590 yuv2rgbXinC(
2591 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2592 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2593 dest, dstW, dstbpp);
2594 }
2595 }
2596 }
2597
2598 #ifdef HAVE_MMX
2599 __asm __volatile(SFENCE:::"memory");
2600 __asm __volatile(EMMS:::"memory");
2601 #endif
2602 firstTime=0;
2603 }
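/* Reference sketch (hypothetical helper, not compiled in) of the ring-buffer
   addressing used by the vertical scalers above: every line pointer is
   stored twice (lumPixBuf[i] == lumPixBuf[i+vLumBufSize]), so any window of
   vLumFilterSize consecutive lines can be addressed without an explicit
   wrap-around. */
#if 0
static int16_t **lumWindowRef(int16_t **lumPixBuf, int lumBufIndex,
                              int firstLumSrcY, int lastInLumBuf,
                              int vLumBufSize)
{
	/* lumBufIndex is the slot of line lastInLumBuf; step back to
	   firstLumSrcY and bias by vLumBufSize to stay in the doubled range */
	return lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
}
#endif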