/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS      "femms"
#else
#define EMMS      "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
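/* Note: MOVNTQ() forwarding to REAL_MOVNTQ() is the usual two-level macro
 * trick: it forces full expansion of the macro arguments before "#a"/"#b"
 * stringify them into the inline-asm text. The same pattern is used for
 * the REAL_ writer macros further below. */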

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
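/* A plain-C sketch of what YSCALEYUV2YV12X computes per output sample
 * (an illustration, not code from this file; av_clip_uint8() stands in
 * for the psraw + packuswb saturation):
 *
 *     int sum = rounder;                        // VROUNDER_OFFSET value
 *     for (j = 0; j < filterSize; j++)
 *         sum += (filter[j] * src[j][i]) >> 16; // pmulhw
 *     dest[i] = av_clip_uint8(sum >> 3);        // psraw $3 + packuswb
 */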

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
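/* The _ACCURATE variant differs from YSCALEYUV2YV12X by multiplying with
 * pmaddwd and accumulating in 32 bits (mm4-mm7), so the per-tap >>16
 * truncation of the pmulhw path is avoided; the rounder is added only
 * after packssdw narrows the sums back to 16 bits. */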

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %6, %%mm6 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "movd %7, %%mm5 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
    "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
    "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
    "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
    "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
    "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
    "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
    "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
    "paddw %%mm1, %%mm3 \n\t" /* B*/\
    "paddw %%mm1, %%mm0 \n\t" /* R*/\
    "packuswb %%mm3, %%mm3 \n\t"\
\
    "packuswb %%mm0, %%mm0 \n\t"\
    "paddw %%mm4, %%mm2 \n\t"\
    "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
    "packuswb %%mm1, %%mm1 \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
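/* The "1b" variants feed the one-row luma path with averaged chroma:
 * paddw + psrlw computes (uvbuf0[i] + uvbuf1[i]) / 2 with the extra >>1
 * folded into the shift count. This is what yuv2packed1() selects when
 * uvalpha is not below 2048, i.e. when the two chroma rows carry equal
 * weight; the non-b forms simply read uvbuf0 alone. */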

#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
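/* WRITEBGR32 interleaves the byte-packed B, G and R registers with zero
 * via punpcklbw/punpckhbw followed by punpcklwd/punpckhwd, giving two
 * 4-byte pixels per MMX register and 8 pixels (32 bytes) per iteration. */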

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
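/* WRITERGB16 builds classic RGB565 words. Per pixel this is equivalent
 * to (illustration):
 *
 *     dst[i] = ((R & 0xF8) << 8) | ((G & 0xFC) << 3) | (B >> 3);
 *
 * the bF8/bFC masks drop the low bits, and the optional DITHER1XBPP
 * paddusb in the callers biases the values before the masking. */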

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
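/* WRITERGB15 is the RGB555 analog of the above, i.e. per pixel
 * dst[i] = ((R & 0xF8) << 7) | ((G & 0xF8) << 2) | (B >> 3); the extra
 * psrlq $1 on R makes room for the unused top bit. */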

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */\
    "pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4     G3 B3    */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
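/* Both 24-bit writers emit 8 pixels (24 bytes) per iteration: the plain
 * MMX version splices the four 0RGB dwords together with shift/or chains,
 * while the MMX2 version uses pshufw plus the ff_M24A/B/C masks to build
 * each output quadword directly. */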

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
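/* WRITEYUY2 packs mm1/mm7 (8 luma bytes) with mm3/mm4 (U, V) into YUYV
 * byte order: the first punpcklbw merges U and V into a UVUV... vector,
 * and interleaving that with the luma bytes yields Y0 U0 Y1 V0 per dword. */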


static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    long p= uDest ? 3 : 1;
    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};

    if (c->flags & SWS_ACCURATE_RND){
        while(p--){
            asm volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }else{
        while(p--){
            asm volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }

#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if      (u<0)   u=0;
                else if (u>255) u=255;
                if      (v<0)   v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
                             dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if 0 //isn't used
    if (flags&SWS_FULL_CHR_H_INT)
    {
        switch(dstFormat)
        {
#ifdef HAVE_MMX
        case PIX_FMT_RGB32:
            asm volatile(


                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%REGa, 4))
                MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case PIX_FMT_BGR24:
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
                "psrlq $8, %%mm3 \n\t" // GR0BGR00
                "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
                "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
                "por %%mm2, %%mm3 \n\t" // BGRBGR00
                "movq %%mm1, %%mm2 \n\t"
                "psllq $48, %%mm1 \n\t" // 000000BG
                "por %%mm1, %%mm3 \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
                "psrld $16, %%mm2 \n\t" // R000R000
                "psrlq $24, %%mm1 \n\t" // 0BGR0000
                "por %%mm2, %%mm1 \n\t" // RBGRR000

                "mov %4, %%"REG_b" \n\t"
                "add %%"REG_a", %%"REG_b" \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
                "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#else
                "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
                "psrlq $32, %%mm3 \n\t"
                "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
                "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a, "%"REG_b
            );
            break;
        case PIX_FMT_BGR555:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $2, %%mm1 \n\t"
                "psllw $7, %%mm0 \n\t"
                "pand "MANGLE(g15Mask)", %%mm1 \n\t"
                "pand "MANGLE(r15Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case PIX_FMT_BGR565:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $3, %%mm1 \n\t"
                "psllw $8, %%mm0 \n\t"
                "pand "MANGLE(g16Mask)", %%mm1 \n\t"
                "pand "MANGLE(r16Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
#endif /* HAVE_MMX */
        case PIX_FMT_BGR32:
#ifndef HAVE_MMX
        case PIX_FMT_RGB32:
#endif
            if (dstFormat==PIX_FMT_RGB32)
            {
                int i;
#ifdef WORDS_BIGENDIAN
                dest++;
#endif
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 4;
                }
            }
            else if (dstFormat==PIX_FMT_BGR24)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 3;
                }
            }
            else if (dstFormat==PIX_FMT_BGR565)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                }
            }
            else if (dstFormat==PIX_FMT_BGR555)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                }
            }
    }//FULL_UV_IPOL
    else
    {
#endif // if 0
#ifdef HAVE_MMX
    switch(c->dstFormat)
    {
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    case PIX_FMT_RGB32:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR24:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_RGB555:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_RGB565:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_YUYV422:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    default: break;
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONOBLACK2_C)
}

1517 /**
1518 * YV12 to RGB without scaling or interpolating
1519 */
1520 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1522 {
1523 const int yalpha1=0;
1524 int i;
1525
1526 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527 const int yalpha= 4096; //FIXME ...
1528
1529 if (flags&SWS_FULL_CHR_H_INT)
1530 {
1531 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1532 return;
1533 }
1534
1535 #ifdef HAVE_MMX
1536 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1537 {
1538 switch(dstFormat)
1539 {
1540 case PIX_FMT_RGB32:
1541 asm volatile(
1542 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1543 "mov %4, %%"REG_b" \n\t"
1544 "push %%"REG_BP" \n\t"
1545 YSCALEYUV2RGB1(%%REGBP, %5)
1546 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1549
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551 "a" (&c->redDither)
1552 );
1553 return;
1554 case PIX_FMT_BGR24:
1555 asm volatile(
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1(%%REGBP, %5)
1560 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1563
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565 "a" (&c->redDither)
1566 );
1567 return;
1568 case PIX_FMT_RGB555:
1569 asm volatile(
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575 #ifdef DITHER1XBPP
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1579 #endif
1580 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1583
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1585 "a" (&c->redDither)
1586 );
1587 return;
1588 case PIX_FMT_RGB565:
1589 asm volatile(
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2RGB1(%%REGBP, %5)
1594 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1595 #ifdef DITHER1XBPP
1596 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1597 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1598 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1599 #endif
1600
1601 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602 "pop %%"REG_BP" \n\t"
1603 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1604
1605 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1606 "a" (&c->redDither)
1607 );
1608 return;
1609 case PIX_FMT_YUYV422:
1610 asm volatile(
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2PACKED1(%%REGBP, %5)
1615 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1618
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620 "a" (&c->redDither)
1621 );
1622 return;
1623 }
1624 }
1625 else
1626 {
1627 switch(dstFormat)
1628 {
1629 case PIX_FMT_RGB32:
1630 asm volatile(
1631 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1632 "mov %4, %%"REG_b" \n\t"
1633 "push %%"REG_BP" \n\t"
1634 YSCALEYUV2RGB1b(%%REGBP, %5)
1635 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636 "pop %%"REG_BP" \n\t"
1637 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1638
1639 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1640 "a" (&c->redDither)
1641 );
1642 return;
1643 case PIX_FMT_BGR24:
1644 asm volatile(
1645 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1646 "mov %4, %%"REG_b" \n\t"
1647 "push %%"REG_BP" \n\t"
1648 YSCALEYUV2RGB1b(%%REGBP, %5)
1649 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1652
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654 "a" (&c->redDither)
1655 );
1656 return;
1657 case PIX_FMT_RGB555:
1658 asm volatile(
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664 #ifdef DITHER1XBPP
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1668 #endif
1669 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1672
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1674 "a" (&c->redDither)
1675 );
1676 return;
1677 case PIX_FMT_RGB565:
1678 asm volatile(
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2RGB1b(%%REGBP, %5)
1683 /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1684 #ifdef DITHER1XBPP
1685 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1686 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1687 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1688 #endif
1689
1690 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691 "pop %%"REG_BP" \n\t"
1692 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1693
1694 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1695 "a" (&c->redDither)
1696 );
1697 return;
1698 case PIX_FMT_YUYV422:
1699 asm volatile(
1700 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1701 "mov %4, %%"REG_b" \n\t"
1702 "push %%"REG_BP" \n\t"
1703 YSCALEYUV2PACKED1b(%%REGBP, %5)
1704 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705 "pop %%"REG_BP" \n\t"
1706 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1707
1708 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1709 "a" (&c->redDither)
1710 );
1711 return;
1712 }
1713 }
1714 #endif /* HAVE_MMX */
1715 if (uvalpha < 2048)
1716 {
1717 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONOBLACK2_C)
1718 }else{
1719 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONOBLACK2_C)
1720 }
1721 }
1722
1723 //FIXME the yuy2* readers can read up to 7 samples too many
1724
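/* Extract the luma plane from packed YUYV input: Y lives in the even bytes,
 * so the C fallback simply copies src[2*i]; the MMX path masks off the
 * chroma bytes and repacks eight pixels per iteration. */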
1725 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1726 {
1727 #ifdef HAVE_MMX
1728 asm volatile(
1729 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1730 "mov %0, %%"REG_a" \n\t"
1731 "1: \n\t"
1732 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1733 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1734 "pand %%mm2, %%mm0 \n\t"
1735 "pand %%mm2, %%mm1 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "movq %%mm0, (%2, %%"REG_a") \n\t"
1738 "add $8, %%"REG_a" \n\t"
1739 " js 1b \n\t"
1740 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1741 : "%"REG_a
1742 );
1743 #else
1744 int i;
1745 for (i=0; i<width; i++)
1746 dst[i]= src[2*i];
1747 #endif
1748 }
1749
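/* Extract the chroma planes from packed YUYV input: U and V are bytes 1 and 3
 * of each 4-byte (2-pixel) group. Both source pointers must reference the
 * same line, see the assert at the end. */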
1750 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1751 {
1752 #ifdef HAVE_MMX
1753 asm volatile(
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1756 "1: \n\t"
1757 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1759 "psrlw $8, %%mm0 \n\t"
1760 "psrlw $8, %%mm1 \n\t"
1761 "packuswb %%mm1, %%mm0 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "psrlw $8, %%mm0 \n\t"
1764 "pand %%mm4, %%mm1 \n\t"
1765 "packuswb %%mm0, %%mm0 \n\t"
1766 "packuswb %%mm1, %%mm1 \n\t"
1767 "movd %%mm0, (%3, %%"REG_a") \n\t"
1768 "movd %%mm1, (%2, %%"REG_a") \n\t"
1769 "add $4, %%"REG_a" \n\t"
1770 " js 1b \n\t"
1771 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1772 : "%"REG_a
1773 );
1774 #else
1775 int i;
1776 for (i=0; i<width; i++)
1777 {
1778 dstU[i]= src1[4*i + 1];
1779 dstV[i]= src1[4*i + 3];
1780 }
1781 #endif
1782 assert(src1 == src2);
1783 }
1784
1785 /* This is almost identical to the previous one, and exists only because
1786  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1787 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1788 {
1789 #ifdef HAVE_MMX
1790 asm volatile(
1791 "mov %0, %%"REG_a" \n\t"
1792 "1: \n\t"
1793 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1794 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1795 "psrlw $8, %%mm0 \n\t"
1796 "psrlw $8, %%mm1 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "movq %%mm0, (%2, %%"REG_a") \n\t"
1799 "add $8, %%"REG_a" \n\t"
1800 " js 1b \n\t"
1801 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1802 : "%"REG_a
1803 );
1804 #else
1805 int i;
1806 for (i=0; i<width; i++)
1807 dst[i]= src[2*i+1];
1808 #endif
1809 }
1810
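/* Same as yuy2ToUV, but for UYVY, where U and V are bytes 0 and 2 of each
 * 4-byte group. */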
1811 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1812 {
1813 #ifdef HAVE_MMX
1814 asm volatile(
1815 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1816 "mov %0, %%"REG_a" \n\t"
1817 "1: \n\t"
1818 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1819 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1820 "pand %%mm4, %%mm0 \n\t"
1821 "pand %%mm4, %%mm1 \n\t"
1822 "packuswb %%mm1, %%mm0 \n\t"
1823 "movq %%mm0, %%mm1 \n\t"
1824 "psrlw $8, %%mm0 \n\t"
1825 "pand %%mm4, %%mm1 \n\t"
1826 "packuswb %%mm0, %%mm0 \n\t"
1827 "packuswb %%mm1, %%mm1 \n\t"
1828 "movd %%mm0, (%3, %%"REG_a") \n\t"
1829 "movd %%mm1, (%2, %%"REG_a") \n\t"
1830 "add $4, %%"REG_a" \n\t"
1831 " js 1b \n\t"
1832 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1833 : "%"REG_a
1834 );
1835 #else
1836 int i;
1837 for (i=0; i<width; i++)
1838 {
1839 dstU[i]= src1[4*i + 0];
1840 dstV[i]= src1[4*i + 2];
1841 }
1842 #endif
1843 assert(src1 == src2);
1844 }
1845
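/* Generate a packed-RGB -> luma converter per pixel format: extract r, g and b
 * with the given shifts and masks, then form the fixed-point weighted sum
 * RY*r + GY*g + BY*b, rounded and scaled down by S bits. The pre-shifted
 * coefficients in the instantiations below compensate for components that are
 * masked out in place in the 15/16-bit formats. */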
1846 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1847 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1848 {\
1849 int i;\
1850 for (i=0; i<width; i++)\
1851 {\
1852 int b= (((type*)src)[i]>>shb)&maskb;\
1853 int g= (((type*)src)[i]>>shg)&maskg;\
1854 int r= (((type*)src)[i]>>shr)&maskr;\
1855 \
1856 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1857 }\
1858 }
1859
1860 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1861 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1862 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1863 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1864 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1865 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1866
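/* Generate packed-RGB -> chroma converters per pixel format. Each
 * instantiation yields two functions: a full-resolution one and a _half
 * variant that averages each horizontal pixel pair before the transform, for
 * horizontally subsampled chroma. In the _half variant the two-pixel sum
 * needs one extra bit per component, hence the widened (mask|(2*mask)) masks. */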
1867 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1868 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1869 {\
1870 int i;\
1871 for (i=0; i<width; i++)\
1872 {\
1873 int b= (((type*)src)[i]&maskb)>>shb;\
1874 int g= (((type*)src)[i]&maskg)>>shg;\
1875 int r= (((type*)src)[i]&maskr)>>shr;\
1876 \
1877 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1878 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1879 }\
1880 }\
1881 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1882 {\
1883 int i;\
1884 for (i=0; i<width; i++)\
1885 {\
1886 int pix0= ((type*)src)[2*i+0];\
1887 int pix1= ((type*)src)[2*i+1];\
1888 int g= (pix0&maskg)+(pix1&maskg);\
1889 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1890 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1891 \
1892 g>>=shg;\
1893 \
1894 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1895 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1896 }\
1897 }
1898
1899 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1900 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1901 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1902 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1903 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1904 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1905
1906 #ifdef HAVE_MMX
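/* MMX 24-bit RGB/BGR -> luma: processes four pixels (12 bytes) per iteration,
 * loading overlapping dwords, widening the bytes to words and accumulating the
 * weighted sums with pmaddwd. The BGR and RGB variants differ only in the
 * coefficient tables loaded into mm5/mm6. */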
1907 static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1908 {
1909
1910 if(srcFormat == PIX_FMT_BGR24){
1911 asm volatile(
1912 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1913 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1914 :
1915 );
1916 }else{
1917 asm volatile(
1918 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1919 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1920 :
1921 );
1922 }
1923
1924 asm volatile(
1925 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1926 "mov %2, %%"REG_a" \n\t"
1927 "pxor %%mm7, %%mm7 \n\t"
1928 "1: \n\t"
1929 PREFETCH" 64(%0) \n\t"
1930 "movd (%0), %%mm0 \n\t"
1931 "movd 2(%0), %%mm1 \n\t"
1932 "movd 6(%0), %%mm2 \n\t"
1933 "movd 8(%0), %%mm3 \n\t"
1934 "add $12, %0 \n\t"
1935 "punpcklbw %%mm7, %%mm0 \n\t"
1936 "punpcklbw %%mm7, %%mm1 \n\t"
1937 "punpcklbw %%mm7, %%mm2 \n\t"
1938 "punpcklbw %%mm7, %%mm3 \n\t"
1939 "pmaddwd %%mm5, %%mm0 \n\t"
1940 "pmaddwd %%mm6, %%mm1 \n\t"
1941 "pmaddwd %%mm5, %%mm2 \n\t"
1942 "pmaddwd %%mm6, %%mm3 \n\t"
1943 "paddd %%mm1, %%mm0 \n\t"
1944 "paddd %%mm3, %%mm2 \n\t"
1945 "paddd %%mm4, %%mm0 \n\t"
1946 "paddd %%mm4, %%mm2 \n\t"
1947 "psrad $15, %%mm0 \n\t"
1948 "psrad $15, %%mm2 \n\t"
1949 "packssdw %%mm2, %%mm0 \n\t"
1950 "packuswb %%mm0, %%mm0 \n\t"
1951 "movd %%mm0, (%1, %%"REG_a") \n\t"
1952 "add $4, %%"REG_a" \n\t"
1953 " js 1b \n\t"
1954 : "+r" (src)
1955 : "r" (dst+width), "g" (-width)
1956 : "%"REG_a
1957 );
1958 }
1959
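/* Same scheme for chroma: the U and V coefficient vectors come from the
 * ff_bgr24toUV table, selected by source format, producing four U and four V
 * samples per iteration. */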
1960 static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1961 {
1962 asm volatile(
1963 "movq 24+%4, %%mm6 \n\t"
1964 "mov %3, %%"REG_a" \n\t"
1965 "pxor %%mm7, %%mm7 \n\t"
1966 "1: \n\t"
1967 PREFETCH" 64(%0) \n\t"
1968 "movd (%0), %%mm0 \n\t"
1969 "movd 2(%0), %%mm1 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm1 \n\t"
1972 "movq %%mm0, %%mm2 \n\t"
1973 "movq %%mm1, %%mm3 \n\t"
1974 "pmaddwd %4, %%mm0 \n\t"
1975 "pmaddwd 8+%4, %%mm1 \n\t"
1976 "pmaddwd 16+%4, %%mm2 \n\t"
1977 "pmaddwd %%mm6, %%mm3 \n\t"
1978 "paddd %%mm1, %%mm0 \n\t"
1979 "paddd %%mm3, %%mm2 \n\t"
1980
1981 "movd 6(%0), %%mm1 \n\t"
1982 "movd 8(%0), %%mm3 \n\t"
1983 "add $12, %0 \n\t"
1984 "punpcklbw %%mm7, %%mm1 \n\t"
1985 "punpcklbw %%mm7, %%mm3 \n\t"
1986 "movq %%mm1, %%mm4 \n\t"
1987 "movq %%mm3, %%mm5 \n\t"
1988 "pmaddwd %4, %%mm1 \n\t"
1989 "pmaddwd 8+%4, %%mm3 \n\t"
1990 "pmaddwd 16+%4, %%mm4 \n\t"
1991 "pmaddwd %%mm6, %%mm5 \n\t"
1992 "paddd %%mm3, %%mm1 \n\t"
1993 "paddd %%mm5, %%mm4 \n\t"
1994
1995 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1996 "paddd %%mm3, %%mm0 \n\t"
1997 "paddd %%mm3, %%mm2 \n\t"
1998 "paddd %%mm3, %%mm1 \n\t"
1999 "paddd %%mm3, %%mm4 \n\t"
2000 "psrad $15, %%mm0 \n\t"
2001 "psrad $15, %%mm2 \n\t"
2002 "psrad $15, %%mm1 \n\t"
2003 "psrad $15, %%mm4 \n\t"
2004 "packssdw %%mm1, %%mm0 \n\t"
2005 "packssdw %%mm4, %%mm2 \n\t"
2006 "packuswb %%mm0, %%mm0 \n\t"
2007 "packuswb %%mm2, %%mm2 \n\t"
2008 "movd %%mm0, (%1, %%"REG_a") \n\t"
2009 "movd %%mm2, (%2, %%"REG_a") \n\t"
2010 "add $4, %%"REG_a" \n\t"
2011 " js 1b \n\t"
2012 : "+r" (src)
2013 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2014 : "%"REG_a
2015 );
2016 }
2017 #endif
2018
2019 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2020 {
2021 #ifdef HAVE_MMX
2022 bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
2023 #else
2024 int i;
2025 for (i=0; i<width; i++)
2026 {
2027 int b= src[i*3+0];
2028 int g= src[i*3+1];
2029 int r= src[i*3+2];
2030
2031 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2032 }
2033 #endif /* HAVE_MMX */
2034 }
2035
2036 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2037 {
2038 #ifdef HAVE_MMX
2039 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
2040 #else
2041 int i;
2042 for (i=0; i<width; i++)
2043 {
2044 int b= src1[3*i + 0];
2045 int g= src1[3*i + 1];
2046 int r= src1[3*i + 2];
2047
2048 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2049 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2050 }
2051 #endif /* HAVE_MMX */
2052 assert(src1 == src2);
2053 }
2054
2055 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2056 {
2057 int i;
2058 for (i=0; i<width; i++)
2059 {
2060 int b= src1[6*i + 0] + src1[6*i + 3];
2061 int g= src1[6*i + 1] + src1[6*i + 4];
2062 int r= src1[6*i + 2] + src1[6*i + 5];
2063
2064 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2065 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2066 }
2067 assert(src1 == src2);
2068 }
2069
2070 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
2071 {
2072 #ifdef HAVE_MMX
2073 bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2074 #else
2075 int i;
2076 for (i=0; i<width; i++)
2077 {
2078 int r= src[i*3+0];
2079 int g= src[i*3+1];
2080 int b= src[i*3+2];
2081
2082 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2083 }
2084 #endif
2085 }
2086
2087 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2088 {
2089 int i;
2090 assert(src1==src2);
2091 #ifdef HAVE_MMX
2092 bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2093 #else
2094 for (i=0; i<width; i++)
2095 {
2096 int r= src1[3*i + 0];
2097 int g= src1[3*i + 1];
2098 int b= src1[3*i + 2];
2099
2100 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2101 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2102 }
2103 #endif
2104 }
2105
2106 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2107 {
2108 int i;
2109 assert(src1==src2);
2110 for (i=0; i<width; i++)
2111 {
2112 int r= src1[6*i + 0] + src1[6*i + 3];
2113 int g= src1[6*i + 1] + src1[6*i + 4];
2114 int b= src1[6*i + 2] + src1[6*i + 5];
2115
2116 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2117 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2118 }
2119 }
2120
2121
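/* Palette lookups: each pal[] entry is expected to hold a pre-converted YUV
 * triple with Y in bits 0-7, U in bits 8-15 and V in bits 16-23, as the byte
 * extraction below shows. */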
2122 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2123 {
2124 int i;
2125 for (i=0; i<width; i++)
2126 {
2127 int d= src[i];
2128
2129 dst[i]= pal[d] & 0xFF;
2130 }
2131 }
2132
2133 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2134 {
2135 int i;
2136 assert(src1 == src2);
2137 for (i=0; i<width; i++)
2138 {
2139 int p= pal[src1[i]];
2140
2141 dstU[i]= p>>8;
2142 dstV[i]= p>>16;
2143 }
2144 }
2145
2146 // bilinear / bicubic scaling
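// Generic horizontal scaler; for each output pixel i it computes
//     dst[i] = clip((sum over j<filterSize of src[filterPos[i]+j]*filter[filterSize*i+j]) >> 7)
// clipped to [0, 2^15-1]. The MMX code below special-cases filterSize 4 and 8;
// the generic MMX loop and the C fallback handle any size (a multiple of 4 for MMX).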
2147 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2148 int16_t *filter, int16_t *filterPos, long filterSize)
2149 {
2150 #ifdef HAVE_MMX
2151 assert(filterSize % 4 == 0 && filterSize>0);
2152     if (filterSize==4) // Always true for upscaling, sometimes for downscaling, too.
2153 {
2154 long counter= -2*dstW;
2155 filter-= counter*2;
2156 filterPos-= counter/2;
2157 dst-= counter/2;
2158 asm volatile(
2159 #if defined(PIC)
2160 "push %%"REG_b" \n\t"
2161 #endif
2162 "pxor %%mm7, %%mm7 \n\t"
2163 "movq "MANGLE(w02)", %%mm6 \n\t"
2164 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2165 "mov %%"REG_a", %%"REG_BP" \n\t"
2166 ASMALIGN(4)
2167 "1: \n\t"
2168 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2169 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2170 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2171 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2172 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2173 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2174 "punpcklbw %%mm7, %%mm0 \n\t"
2175 "punpcklbw %%mm7, %%mm2 \n\t"
2176 "pmaddwd %%mm1, %%mm0 \n\t"
2177 "pmaddwd %%mm2, %%mm3 \n\t"
2178 "psrad $8, %%mm0 \n\t"
2179 "psrad $8, %%mm3 \n\t"
2180 "packssdw %%mm3, %%mm0 \n\t"
2181 "pmaddwd %%mm6, %%mm0 \n\t"
2182 "packssdw %%mm0, %%mm0 \n\t"
2183 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2184 "add $4, %%"REG_BP" \n\t"
2185 " jnc 1b \n\t"
2186
2187 "pop %%"REG_BP" \n\t"
2188 #if defined(PIC)
2189 "pop %%"REG_b" \n\t"
2190 #endif
2191 : "+a" (counter)
2192 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2193 #if !defined(PIC)
2194 : "%"REG_b
2195 #endif
2196 );
2197 }
2198 else if (filterSize==8)
2199 {
2200 long counter= -2*dstW;
2201 filter-= counter*4;
2202 filterPos-= counter/2;
2203 dst-= counter/2;
2204 asm volatile(
2205 #if defined(PIC)
2206 "push %%"REG_b" \n\t"
2207 #endif
2208 "pxor %%mm7, %%mm7 \n\t"
2209 "movq "MANGLE(w02)", %%mm6 \n\t"
2210 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2211 "mov %%"REG_a", %%"REG_BP" \n\t"
2212 ASMALIGN(4)
2213 "1: \n\t"
2214 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2215 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2216 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2217 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2218 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2219 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2220 "punpcklbw %%mm7, %%mm0 \n\t"
2221 "punpcklbw %%mm7, %%mm2 \n\t"
2222 "pmaddwd %%mm1, %%mm0 \n\t"
2223 "pmaddwd %%mm2, %%mm3 \n\t"
2224
2225 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2226 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2227 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2228 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2229 "punpcklbw %%mm7, %%mm4 \n\t"
2230 "punpcklbw %%mm7, %%mm2 \n\t"
2231 "pmaddwd %%mm1, %%mm4 \n\t"
2232 "pmaddwd %%mm2, %%mm5 \n\t"
2233 "paddd %%mm4, %%mm0 \n\t"
2234 "paddd %%mm5, %%mm3 \n\t"
2235
2236 "psrad $8, %%mm0 \n\t"
2237 "psrad $8, %%mm3 \n\t"
2238 "packssdw %%mm3, %%mm0 \n\t"
2239 "pmaddwd %%mm6, %%mm0 \n\t"
2240 "packssdw %%mm0, %%mm0 \n\t"
2241 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2242 "add $4, %%"REG_BP" \n\t"
2243 " jnc 1b \n\t"
2244
2245 "pop %%"REG_BP" \n\t"
2246 #if defined(PIC)
2247 "pop %%"REG_b" \n\t"
2248 #endif
2249 : "+a" (counter)
2250 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2251 #if !defined(PIC)
2252 : "%"REG_b
2253 #endif
2254 );
2255 }
2256 else
2257 {
2258 uint8_t *offset = src+filterSize;
2259 long counter= -2*dstW;
2260 //filter-= counter*filterSize/2;
2261 filterPos-= counter/2;
2262 dst-= counter/2;
2263 asm volatile(
2264 "pxor %%mm7, %%mm7 \n\t"
2265 "movq "MANGLE(w02)", %%mm6 \n\t"
2266 ASMALIGN(4)
2267 "1: \n\t"
2268 "mov %2, %%"REG_c" \n\t"
2269 "movzwl (%%"REG_c", %0), %%eax \n\t"
2270 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2271 "mov %5, %%"REG_c" \n\t"
2272 "pxor %%mm4, %%mm4 \n\t"
2273 "pxor %%mm5, %%mm5 \n\t"
2274 "2: \n\t"
2275 "movq (%1), %%mm1 \n\t"
2276 "movq (%1, %6), %%mm3 \n\t"
2277 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2278 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2279 "punpcklbw %%mm7, %%mm0 \n\t"
2280 "punpcklbw %%mm7, %%mm2 \n\t"
2281 "pmaddwd %%mm1, %%mm0 \n\t"
2282 "pmaddwd %%mm2, %%mm3 \n\t"
2283 "paddd %%mm3, %%mm5 \n\t"
2284 "paddd %%mm0, %%mm4 \n\t"
2285 "add $8, %1 \n\t"
2286 "add $4, %%"REG_c" \n\t"
2287 "cmp %4, %%"REG_c" \n\t"
2288 " jb 2b \n\t"
2289 "add %6, %1 \n\t"
2290 "psrad $8, %%mm4 \n\t"
2291 "psrad $8, %%mm5 \n\t"
2292 "packssdw %%mm5, %%mm4 \n\t"
2293 "pmaddwd %%mm6, %%mm4 \n\t"
2294 "packssdw %%mm4, %%mm4 \n\t"
2295 "mov %3, %%"REG_a" \n\t"
2296 "movd %%mm4, (%%"REG_a", %0) \n\t"
2297 "add $4, %0 \n\t"
2298 " jnc 1b \n\t"
2299
2300 : "+r" (counter), "+r" (filter)
2301 : "m" (filterPos), "m" (dst), "m"(offset),
2302 "m" (src), "r" (filterSize*2)
2303 : "%"REG_a, "%"REG_c, "%"REG_d
2304 );
2305 }
2306 #else
2307 #ifdef HAVE_ALTIVEC
2308 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2309 #else
2310 int i;
2311 for (i=0; i<dstW; i++)
2312 {
2313 int j;
2314 int srcPos= filterPos[i];
2315 int val=0;
2316 //printf("filterPos: %d\n", filterPos[i]);
2317 for (j=0; j<filterSize; j++)
2318 {
2319 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2320 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2321 }
2322 //filter += hFilterSize;
2323 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2324 //dst[i] = val>>7;
2325 }
2326 #endif /* HAVE_ALTIVEC */
2327 #endif /* HAVE_MMX */
2328 }
2329 // *** horizontal scale Y line to temp buffer
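// Non-planar-YUV inputs (packed YUV, RGB/BGR, gray16, paletted) are first
// converted to 8-bit luma in formatConvBuffer, then scaled with either the
// generic hScale or, for SWS_FAST_BILINEAR, the MMX2/plain-asm fast paths;
// finally the luma range is converted if src and dst ranges differ.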
2330 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2331 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2332 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2333 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2334 int32_t *mmx2FilterPos, uint8_t *pal)
2335 {
2336 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2337 {
2338 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2339 src= formatConvBuffer;
2340 }
2341 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2342 {
2343 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2344 src= formatConvBuffer;
2345 }
2346 else if (srcFormat==PIX_FMT_RGB32)
2347 {
2348 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2349 src= formatConvBuffer;
2350 }
2351 else if (srcFormat==PIX_FMT_RGB32_1)
2352 {
2353 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2354 src= formatConvBuffer;
2355 }
2356 else if (srcFormat==PIX_FMT_BGR24)
2357 {
2358 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2359 src= formatConvBuffer;
2360 }
2361 else if (srcFormat==PIX_FMT_BGR565)
2362 {
2363 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2364 src= formatConvBuffer;
2365 }
2366 else if (srcFormat==PIX_FMT_BGR555)
2367 {
2368 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2369 src= formatConvBuffer;
2370 }
2371 else if (srcFormat==PIX_FMT_BGR32)
2372 {
2373 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2374 src= formatConvBuffer;
2375 }
2376 else if (srcFormat==PIX_FMT_BGR32_1)
2377 {
2378 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
2379 src= formatConvBuffer;
2380 }
2381 else if (srcFormat==PIX_FMT_RGB24)
2382 {
2383 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2384 src= formatConvBuffer;
2385 }
2386 else if (srcFormat==PIX_FMT_RGB565)
2387 {
2388 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2389 src= formatConvBuffer;
2390 }
2391 else if (srcFormat==PIX_FMT_RGB555)
2392 {
2393 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2394 src= formatConvBuffer;
2395 }
2396 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2397 {
2398 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2399 src= formatConvBuffer;
2400 }
2401
2402 #ifdef HAVE_MMX
2403 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2404 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2405 #else
2406 if (!(flags&SWS_FAST_BILINEAR))
2407 #endif
2408 {
2409 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2410 }
2411 else // fast bilinear upscale / crap downscale
2412 {
2413 #if defined(ARCH_X86)
2414 #ifdef HAVE_MMX2
2415 int i;
2416 #if defined(PIC)
2417 uint64_t ebxsave __attribute__((aligned(8)));
2418 #endif
2419 if (canMMX2BeUsed)
2420 {
2421 asm volatile(
2422 #if defined(PIC)
2423 "mov %%"REG_b", %5 \n\t"
2424 #endif
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2434
2435 #ifdef ARCH_X86_64
2436
2437 #define FUNNY_Y_CODE \
2438 "movl (%%"REG_b"), %%esi \n\t"\
2439 "call *%4 \n\t"\
2440 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2441 "add %%"REG_S", %%"REG_c" \n\t"\
2442 "add %%"REG_a", %%"REG_D" \n\t"\
2443 "xor %%"REG_a", %%"REG_a" \n\t"\
2444
2445 #else
2446
2447 #define FUNNY_Y_CODE \
2448 "movl (%%"REG_b"), %%esi \n\t"\
2449 "call *%4 \n\t"\
2450 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2451 "add %%"REG_a", %%"REG_D" \n\t"\
2452 "xor %%"REG_a", %%"REG_a" \n\t"\
2453
2454 #endif /* ARCH_X86_64 */
2455
2456 FUNNY_Y_CODE
2457 FUNNY_Y_CODE
2458 FUNNY_Y_CODE
2459 FUNNY_Y_CODE
2460 FUNNY_Y_CODE
2461 FUNNY_Y_CODE
2462 FUNNY_Y_CODE
2463 FUNNY_Y_CODE
2464
2465 #if defined(PIC)
2466 "mov %5, %%"REG_b" \n\t"
2467 #endif
2468 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2469 "m" (funnyYCode)
2470 #if defined(PIC)
2471 ,"m" (ebxsave)
2472 #endif
2473 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2474 #if !defined(PIC)
2475 ,"%"REG_b
2476 #endif
2477 );
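            // Replicate the last input pixel (scaled by 128, like the other
            // intermediate values) for output positions whose source index
            // would need to read past the last input pixel.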
2478 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2479 }
2480 else
2481 {
2482 #endif /* HAVE_MMX2 */
2483 long xInc_shr16 = xInc >> 16;
2484 uint16_t xInc_mask = xInc & 0xffff;
2485         // no MMX, just plain x86 asm ...
2486 asm volatile(
2487 "xor %%"REG_a", %%"REG_a" \n\t" // i
2488 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2489 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2490 ASMALIGN(4)
2491 "1: \n\t"
2492 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2493 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2494 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2495 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2496 "shll $16, %%edi \n\t"
2497 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2498 "mov %1, %%"REG_D" \n\t"
2499 "shrl $9, %%esi \n\t"
2500 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2501 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2502 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2503
2504 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2505 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2506 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2507 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2508 "shll $16, %%edi \n\t"
2509 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2510 "mov %1, %%"REG_D" \n\t"
2511 "shrl $9, %%esi \n\t"
2512 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2513 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2514 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2515
2516
2517 "add $2, %%"REG_a" \n\t"
2518 "cmp %2, %%"REG_a" \n\t"
2519 " jb 1b \n\t"
2520
2521
2522 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2523 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2524 );
2525 #ifdef HAVE_MMX2
2526 } //if MMX2 can't be used
2527 #endif
2528 #else
2529 int i;
2530 unsigned int xpos=0;
2531 for (i=0;i<dstWidth;i++)
2532 {
2533 register unsigned int xx=xpos>>16;
2534 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2535 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2536 xpos+=xInc;
2537 }
2538 #endif /* defined(ARCH_X86) */
2539 }
2540
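    /* Luma range conversion on the 15-bit intermediate (values are pixel*128):
     * roughly dst = dst*219/255 + 16*128 when compressing full range (JPEG) to
     * limited range (MPEG), and the clipped inverse when expanding, both done
     * in fixed point. */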
2541 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2542 int i;
2543         //FIXME all pal and rgb srcFormats could do this conversion as well
2544 //FIXME all scalers more complex than bilinear could do half of this transform
2545 if(c->srcRange){
2546 for (i=0; i<dstWidth; i++)
2547 dst[i]= (dst[i]*14071 + 33561947)>>14;
2548 }else{
2549 for (i=0; i<dstWidth; i++)
2550 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2551 }
2552 }
2553 }
2554
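/* Chroma counterpart of hyscale: converts both chroma lines (via the *ToUV or,
 * when chroma is horizontally subsampled, the *ToUV_half helpers for packed
 * YUV and RGB/BGR input), horizontally scales them into dst and dst+VOFW, and
 * applies chroma range conversion if needed. */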
2555 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2556 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2557 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2558 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2559 int32_t *mmx2FilterPos, uint8_t *pal)
2560 {
2561 if (srcFormat==PIX_FMT_YUYV422)
2562 {
2563 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2564 src1= formatConvBuffer;
2565 src2= formatConvBuffer+VOFW;
2566 }
2567 else if (srcFormat==PIX_FMT_UYVY422)
2568 {
2569 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2570 src1= formatConvBuffer;
2571 src2= formatConvBuffer+VOFW;
2572 }
2573 else if (srcFormat==PIX_FMT_RGB32)
2574 {
2575 if(c->chrSrcHSubSample)
2576 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2577 else
2578 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2579 src1= formatConvBuffer;
2580 src2= formatConvBuffer+VOFW;
2581 }
2582 else if (srcFormat==PIX_FMT_RGB32_1)
2583 {
2584 if(c->chrSrcHSubSample)
2585 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2586 else
2587 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2588 src1= formatConvBuffer;
2589 src2= formatConvBuffer+VOFW;
2590 }
2591 else if (srcFormat==PIX_FMT_BGR24)
2592 {
2593 if(c->chrSrcHSubSample)
2594 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2595 else
2596 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2597 src1= formatConvBuffer;
2598 src2= formatConvBuffer+VOFW;
2599 }
2600 else if (srcFormat==PIX_FMT_BGR565)
2601 {
2602 if(c->chrSrcHSubSample)
2603 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2604 else
2605 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2606 src1= formatConvBuffer;
2607 src2= formatConvBuffer+VOFW;
2608 }
2609 else if (srcFormat==PIX_FMT_BGR555)
2610 {
2611 if(c->chrSrcHSubSample)
2612 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2613 else
2614 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2615 src1= formatConvBuffer;
2616 src2= formatConvBuffer+VOFW;
2617 }
2618 else if (srcFormat==PIX_FMT_BGR32)
2619 {
2620 if(c->chrSrcHSubSample)
2621 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2622 else
2623 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2624 src1= formatConvBuffer;
2625 src2= formatConvBuffer+VOFW;
2626 }
2627 else if (srcFormat==PIX_FMT_BGR32_1)
2628 {
2629 if(c->chrSrcHSubSample)
2630 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2631 else
2632 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2633 src1= formatConvBuffer;
2634 src2= formatConvBuffer+VOFW;
2635 }
2636 else if (srcFormat==PIX_FMT_RGB24)
2637 {
2638 if(c->chrSrcHSubSample)
2639 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2640 else
2641 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2642 src1= formatConvBuffer;
2643 src2= formatConvBuffer+VOFW;
2644 }
2645 else if (srcFormat==PIX_FMT_RGB565)
2646 {
2647 if(c->chrSrcHSubSample)
2648 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2649 else
2650 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2651 src1= formatConvBuffer;
2652 src2= formatConvBuffer+VOFW;
2653 }
2654 else if (srcFormat==PIX_FMT_RGB555)
2655 {
2656 if(c->chrSrcHSubSample)
2657 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2658 else
2659 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2660 src1= formatConvBuffer;
2661 src2= formatConvBuffer+VOFW;
2662 }
2663 else if (isGray(srcFormat))
2664 {
2665 return;
2666 }
2667 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2668 {
2669 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2670 src1= formatConvBuffer;
2671 src2= formatConvBuffer+VOFW;
2672 }
2673
2674 #ifdef HAVE_MMX
2675 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2676 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2677 #else
2678 if (!(flags&SWS_FAST_BILINEAR))
2679 #endif
2680 {
2681 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2682 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2683 }
2684 else // fast bilinear upscale / crap downscale
2685 {
2686 #if defined(ARCH_X86)
2687 #ifdef HAVE_MMX2
2688 int i;
2689 #if defined(PIC)
2690 uint64_t ebxsave __attribute__((aligned(8)));
2691 #endif
2692 if (canMMX2BeUsed)
2693 {
2694 asm volatile(
2695 #if defined(PIC)
2696 "mov %%"REG_b", %6 \n\t"
2697 #endif
2698 "pxor %%mm7, %%mm7 \n\t"
2699 "mov %0, %%"REG_c" \n\t"
2700 "mov %1, %%"REG_D" \n\t"
2701 "mov %2, %%"REG_d" \n\t"
2702 "mov %3, %%"REG_b" \n\t"
2703 "xor %%"REG_a", %%"REG_a" \n\t" // i
2704 PREFETCH" (%%"REG_c") \n\t"
2705 PREFETCH" 32(%%"REG_c") \n\t"
2706 PREFETCH" 64(%%"REG_c") \n\t"
2707
2708 #ifdef ARCH_X86_64
2709
2710 #define FUNNY_UV_CODE \
2711 "movl (%%"REG_b"), %%esi \n\t"\
2712 "call *%4 \n\t"\
2713 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2714 "add %%"REG_S", %%"REG_c" \n\t"\
2715 "add %%"REG_a", %%"REG_D" \n\t"\
2716 "xor %%"REG_a", %%"REG_a" \n\t"\
2717
2718 #else
2719
2720 #define FUNNY_UV_CODE \
2721 "movl (%%"REG_b"), %%esi \n\t"\
2722 "call *%4 \n\t"\
2723 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2724 "add %%"REG_a", %%"REG_D" \n\t"\
2725 "xor %%"REG_a", %%"REG_a" \n\t"\
2726
2727 #endif /* ARCH_X86_64 */
2728
2729 FUNNY_UV_CODE
2730 FUNNY_UV_CODE
2731 FUNNY_UV_CODE
2732 FUNNY_UV_CODE
2733 "xor %%"REG_a", %%"REG_a" \n\t" // i
2734 "mov %5, %%"REG_c" \n\t" // src
2735 "mov %1, %%"REG_D" \n\t" // buf1
2736 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2737 PREFETCH" (%%"REG_c") \n\t"
2738 PREFETCH" 32(%%"REG_c") \n\t"
2739 PREFETCH" 64(%%"REG_c") \n\t"
2740
2741 FUNNY_UV_CODE
2742 FUNNY_UV_CODE
2743 FUNNY_UV_CODE
2744 FUNNY_UV_CODE
2745
2746 #if defined(PIC)
2747 "mov %6, %%"REG_b" \n\t"
2748 #endif
2749 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2750 "m" (funnyUVCode), "m" (src2)
2751 #if defined(PIC)
2752 ,"m" (ebxsave)
2753 #endif
2754 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2755 #if !defined(PIC)
2756 ,"%"REG_b
2757 #endif
2758 );
2759 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2760 {
2761 //printf("%d %d %d\n", dstWidth, i, srcW);
2762 dst[i] = src1[srcW-1]*128;
2763 dst[i+VOFW] = src2[srcW-1]*128;
2764 }
2765 }
2766 else
2767 {
2768 #endif /* HAVE_MMX2 */
2769 long xInc_shr16 = (long) (xInc >> 16);
2770 uint16_t xInc_mask = xInc & 0xffff;
2771 asm volatile(
2772 "xor %%"REG_a", %%"REG_a" \n\t" // i
2773 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2774 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2775 ASMALIGN(4)
2776 "1: \n\t"
2777 "mov %0, %%"REG_S" \n\t"
2778 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2779 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2780 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2781 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2782 "shll $16, %%edi \n\t"
2783 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2784 "mov %1, %%"REG_D" \n\t"
2785 "shrl $9, %%esi \n\t"
2786 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2787
2788 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2789 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2790 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2791 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2792 "shll $16, %%edi \n\t"
2793 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2794 "mov %1, %%"REG_D" \n\t"
2795 "shrl $9, %%esi \n\t"
2796 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2797
2798 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2799 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2800 "add $1, %%"REG_a" \n\t"
2801 "cmp %2, %%"REG_a" \n\t"
2802 " jb 1b \n\t"
2803
2804 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2805 which is needed to support GCC 4.0. */
2806 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2807 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2808 #else
2809 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2810 #endif
2811 "r" (src2)
2812 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2813 );
2814 #ifdef HAVE_MMX2
2815 } //if MMX2 can't be used
2816 #endif
2817 #else
2818 int i;
2819 unsigned int xpos=0;
2820 for (i=0;i<dstWidth;i++)
2821 {
2822 register unsigned int xx=xpos>>16;
2823 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2824 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2825 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2826 /* slower
2827 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2828 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2829 */
2830 xpos+=xInc;
2831 }
2832 #endif /* defined(ARCH_X86) */
2833 }
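    /* Chroma range conversion on the 15-bit intermediate, centered on 128*128:
     * roughly c = (c - 128*128)*224/255 + 128*128 when compressing to limited
     * range, and the clipped inverse when expanding, done in fixed point. */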
2834 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2835 int i;
2836         //FIXME all pal and rgb srcFormats could do this conversion as well
2837 //FIXME all scalers more complex than bilinear could do half of this transform
2838 if(c->srcRange){
2839 for (i=0; i<dstWidth; i++){
2840 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2841 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2842 }
2843 }else{
2844 for (i=0; i<dstWidth; i++){
2845 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2846 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2847 }
2848 }
2849 }
2850 }
2851
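/* Main scaling entry point of this template: for every output line it
 * horizontally scales just enough new input lines into the lumPixBuf/chrPixBuf
 * ring buffers to satisfy the vertical filters, then the vertical scaler that
 * follows produces the output line; when a slice runs out of input lines, the
 * remainder is buffered and the function returns until the next slice. */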
2852 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2853 int srcSliceH, uint8_t* dst[], int dstStride[]){
2854
2855     /* Load a few things into local vars to make the code more readable and faster. */
2856 const int srcW= c->srcW;
2857 const int dstW= c->dstW;
2858 const int dstH= c->dstH;
2859 const int chrDstW= c->chrDstW;
2860 const int chrSrcW= c->chrSrcW;
2861 const int lumXInc= c->lumXInc;
2862 const int chrXInc= c->chrXInc;
2863 const int dstFormat= c->dstFormat;
2864 const int srcFormat= c->srcFormat;
2865 const int flags= c->flags;
2866 const int canMMX2BeUsed= c->canMMX2BeUsed;
2867 int16_t *vLumFilterPos= c->vLumFilterPos;
2868 int16_t *vChrFilterPos= c->vChrFilterPos;
2869 int16_t *hLumFilterPos= c->hLumFilterPos;
2870 int16_t *hChrFilterPos= c->hChrFilterPos;
2871 int16_t *vLumFilter= c->vLumFilter;
2872 int16_t *vChrFilter= c->vChrFilter;
2873 int16_t *hLumFilter= c->hLumFilter;
2874 int16_t *hChrFilter= c->hChrFilter;
2875 int32_t *lumMmxFilter= c->lumMmxFilter;
2876 int32_t *chrMmxFilter= c->chrMmxFilter;
2877 const int vLumFilterSize= c->vLumFilterSize;
2878 const int vChrFilterSize= c->vChrFilterSize;
2879 const int hLumFilterSize= c->hLumFilterSize;
2880 const int hChrFilterSize= c->hChrFilterSize;
2881 int16_t **lumPixBuf= c->lumPixBuf;
2882 int16_t **chrPixBuf= c->chrPixBuf;
2883 const int vLumBufSize= c->vLumBufSize;
2884 const int vChrBufSize= c->vChrBufSize;
2885 uint8_t *funnyYCode= c->funnyYCode;
2886 uint8_t *funnyUVCode= c->funnyUVCode;
2887 uint8_t *formatConvBuffer= c->formatConvBuffer;
2888 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
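    // -((-x) >> n) rounds upward, so a partial last chroma line is counted.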
2889 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2890 int lastDstY;
2891 uint8_t *pal=NULL;
2892
2893 /* vars which will change and which we need to store back in the context */
2894 int dstY= c->dstY;
2895 int lumBufIndex= c->lumBufIndex;
2896 int chrBufIndex= c->chrBufIndex;
2897 int lastInLumBuf= c->lastInLumBuf;
2898 int lastInChrBuf= c->lastInChrBuf;
2899
2900 if (isPacked(c->srcFormat)){
2901 pal= src[1];
2902 src[0]=
2903 src[1]=
2904 src[2]= src[0];
2905 srcStride[0]=
2906 srcStride[1]=
2907 srcStride[2]= srcStride[0];
2908 }
2909 srcStride[1]<<= c->vChrDrop;
2910 srcStride[2]<<= c->vChrDrop;
2911
2912 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2913 // (int)dst[0], (int)dst[1], (int)dst[2]);
2914
2915 #if 0 //self test FIXME move to a vfilter or something
2916 {
2917 static volatile int i=0;
2918 i++;
2919 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2920 selfTest(src, srcStride, c->srcW, c->srcH);
2921 i--;
2922 }
2923 #endif
2924
2925 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2926 //dstStride[0],dstStride[1],dstStride[2]);
2927
2928 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2929 {
2930 static int firstTime=1; //FIXME move this into the context perhaps
2931 if (flags & SWS_PRINT_INFO && firstTime)
2932 {
2933 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2934 " ->cannot do aligned memory accesses anymore\n");
2935 firstTime=0;
2936 }
2937 }
2938
2939     /* Note: the user might start scaling in the middle of the picture, so this
2940        will not get executed. This is not really intended, but it works
2941        currently, so people might do it. */
2942 if (srcSliceY ==0){
2943 lumBufIndex=0;
2944 chrBufIndex=0;
2945 dstY=0;
2946 lastInLumBuf= -1;
2947 lastInChrBuf= -1;
2948 }
2949
2950 lastDstY= dstY;
2951
2952 for (;dstY < dstH; dstY++){
2953 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2954 const int chrDstY= dstY>>c->chrDstVSubSample;
2955 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2956 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2957
2958 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2959 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2960 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2961 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2962
2963 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2964 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2965 //handle holes (FAST_BILINEAR & weird filters)
2966 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2967 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2968 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2969 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2970 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2971
2972         // Do we have enough lines in this slice to output the dstY line?
2973 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2974 {
2975 //Do horizontal scaling
2976 while(lastInLumBuf < lastLumSrcY)
2977 {
2978 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2979 lumBufIndex++;
2980 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2981 assert(lumBufIndex < 2*vLumBufSize);
2982 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2983 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2984 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2985 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2986 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2987 funnyYCode, c->srcFormat, formatConvBuffer,
2988 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2989 lastInLumBuf++;
2990 }
2991 while(lastInChrBuf < lastChrSrcY)
2992 {
2993 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2994 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2995 chrBufIndex++;
2996 assert(chrBufIndex < 2*vChrBufSize);
2997 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2998 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2999 //FIXME replace parameters through context struct (some at least)
3000
3001 if (!(isGray(srcFormat) || isGray(dstFormat)))
3002 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3003 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3004 funnyUVCode, c->srcFormat, formatConvBuffer,
3005 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3006 lastInChrBuf++;
3007 }
3008 //wrap buf index around to stay inside the ring buffer
3009 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3010 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3011 }
3012 else // not enough lines left in this slice -> load the rest in the buffer
3013 {
3014 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3015 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3016 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3017 vChrBufSize, vLumBufSize);*/
3018
3019 //Do horizontal scaling
3020 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3021 {
3022 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3023 lumBufIndex++;
3024 assert(lumBufIndex < 2*vLumBufSize);
3025 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3026 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3027 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3028 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3029 funnyYCode, c->srcFormat, formatConvBuffer,
3030 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3031 lastInLumBuf++;
3032 }
3033 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3034 {
3035 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3036 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3037 chrBufIndex++;
3038 assert(chrBufIndex < 2*vChrBufSize);
3039 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3040 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3041
3042 if (!(isGray(srcFormat) || isGray(dstFormat)))
3043 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3044 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3045 funnyUVCode, c->srcFormat, formatConvBuffer,
3046 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3047 lastInChrBuf++;
3048 }
3049 //wrap buf index around to stay inside the ring buffer
3050 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3051 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3052 break; //we can't output a dstY line so let's try with the next slice
3053 }
3054