f08fcf5bf0e9c4ad8b5794ae1be72f3849107931
[libav.git] / libswscale / swscale_template.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
/* Drop any earlier definitions of the per-CPU helper macros so they can be
 * redefined below according to the HAVE_* feature flags. */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29 #undef EMMS
30 #undef SFENCE
31
/* EMMS: mnemonic string, "femms" when 3DNow! is available, "emms" otherwise. */
32 #if HAVE_AMD3DNOW
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34 #define EMMS "femms"
35 #else
36 #define EMMS "emms"
37 #endif
38
/* PREFETCH / PREFETCHW: prefetch mnemonics for 3DNow! or MMX2. Without either
 * they expand to " # nop"; presumably the assembler treats text after '#' as
 * a comment so nothing is emitted - confirm for the assembler in use. */
39 #if HAVE_AMD3DNOW
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif HAVE_MMX2
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
45 #else
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
48 #endif
49
/* SFENCE: "sfence" on MMX2, the same " # nop" placeholder otherwise. */
50 #if HAVE_MMX2
51 #define SFENCE "sfence"
52 #else
53 #define SFENCE " # nop"
54 #endif
55
/* PAVGB(a,b): builds a "pavgb a, b" (MMX2) or "pavgusb a, b" (3DNow!)
 * instruction line. It stays undefined when neither flag is set. */
56 #if HAVE_MMX2
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif HAVE_AMD3DNOW
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #endif
61
/* MOVNTQ(a,b): builds a quadword store line, "movntq" on MMX2 and plain
 * "movq" otherwise. The MOVNTQ -> REAL_MOVNTQ indirection lets macro
 * arguments (e.g. REGa) be expanded before REAL_MOVNTQ stringifies them. */
62 #if HAVE_MMX2
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64 #else
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66 #endif
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
68
69 #if HAVE_ALTIVEC
70 #include "swscale_altivec_template.c"
71 #endif
72
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): complete inline-asm statement that
 * applies a multi-tap filter to 16-bit samples and stores 8 output bytes per
 * outer iteration.
 *
 *   x      - string literal: displacement added to every source address
 *   offset - string literal: displacement of the filter list relative to
 *            &c->redDither (operand %0)
 *   dest   - output pointer (operand %1), indexed by the counter in REG_a
 *   width  - loop bound (operand %2); the outer loop repeats while REG_a is
 *            below it (unsigned compare, jb)
 *
 * The filter list is walked in 16-byte steps: a source pointer at +0 and the
 * coefficient quadword at +8; a zero source pointer ends the list. The high
 * halves of the products (pmulhw) are accumulated on top of the rounder
 * loaded from VROUNDER_OFFSET(%0), then shifted right by 3 and packed with
 * unsigned saturation. Label 1 is the target of both the inner (jnz) and the
 * outer (jb) loop; the accumulators and the list pointer are re-initialised
 * between the cmp and the jb.
 *
 * Needs a variable named c in scope. Declared clobbers are REG_a, REG_d and
 * REG_S only.
 * NOTE(review): the MMX registers and the memory written through dest are
 * not listed as clobbers - confirm this is intentional.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
74 __asm__ volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): same parameters and
 * operands as YSCALEYUV2YV12X, but the filter sum is accumulated in 32 bits.
 *
 * Each filter-list entry is APCK_SIZE bytes and supplies two source pointers
 * (at +0 and +APCK_PTR2) and one coefficient quadword (at +APCK_COEF).
 * Samples from the two sources are interleaved word-wise and multiplied and
 * pair-summed with pmaddwd; the dword sums live in mm4/mm5 (first four
 * samples) and mm6/mm7 (next four). The list ends when the pointer at the
 * start of the next entry is zero. The sums are then shifted right by 16,
 * packed to words, the rounder from VROUNDER_OFFSET(%0) is added, and the
 * result is shifted right by 3 and packed to 8 unsigned-saturated bytes that
 * are stored at dest + REG_a. As in YSCALEYUV2YV12X, label 1 serves both
 * loops and the state is reset between the cmp and the final jb.
 *
 * Needs a variable named c in scope; clobbers REG_a, REG_d and REG_S.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 __asm__ volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
170
/*
 * YSCALEYUV2YV121: asm fragment only (no __asm__ wrapper, no operand list;
 * the user supplies both). REG_a is loaded from operand %2; each iteration
 * reads 8 16-bit samples from %0 + 2*REG_a, shifts them right by 7, packs
 * them to 8 unsigned-saturated bytes and stores them at %1 + REG_a. REG_a
 * then advances by 8 and the loop repeats until the add sets the carry flag.
 * NOTE(review): presumably %2 is a negative count and %0/%1 point at the end
 * of their buffers, so the carry marks REG_a reaching zero - confirm at the
 * call site.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
183
/*
 * YSCALEYUV2YV121_ACCURATE: same fragment as YSCALEYUV2YV121 with rounding.
 * Before the loop mm7 is set to 64 in every word (all ones, logical shift
 * right by 15, shift left by 6); that value is added with signed saturation
 * to each sample before the shift right by 7. Operand use and loop
 * termination (jnc after add $8) match YSCALEYUV2YV121.
 */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
201
202 /*
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208 */
/*
 * YSCALEYUV2PACKEDX_UV: opens an __asm__ volatile( statement (it is closed
 * by YSCALEYUV2PACKEDX_END) and emits the chroma part of the filter.
 *
 * REG_a is zeroed once; label 1 starts the per-output-group loop, whose
 * closing branch is not part of this macro. For each group the filter list
 * at CHR_MMX_FILTER_OFFSET(%0) is walked in 16-byte steps (source pointer at
 * +0, coefficient at +8, zero pointer terminates). The high halves of the
 * products are accumulated on top of the rounder from VROUNDER_OFFSET(%0):
 * mm3 receives the sum for the data at (src + REG_a), mm4 the sum for the
 * data at VOF bytes beyond it (U and V per the inline comments).
 */
209 #define YSCALEYUV2PACKEDX_UV \
210 __asm__ volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2): asm fragment
 * that runs the same 16-byte-entry filter walk as YSCALEYUV2PACKEDX_UV, on
 * the list at offset(%0) and with caller-chosen MMX registers.
 *
 *   offset     - string literal: displacement of the filter list from %0
 *   coeff      - register that receives each coefficient quadword
 *   src1, src2 - scratch registers for the samples at (src + 2*REG_a) and
 *                8 bytes beyond
 *   dst1, dst2 - accumulators, preloaded with the rounder from
 *                VROUNDER_OFFSET(%0)
 *
 * Uses local label 2 and overwrites REG_d and REG_S; REG_a is only read.
 */
233 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
234 "lea "offset"(%0), %%"REG_d" \n\t"\
235 "mov (%%"REG_d"), %%"REG_S" \n\t"\
236 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
237 "movq "#dst1", "#dst2" \n\t"\
238 ASMALIGN(4)\
239 "2: \n\t"\
240 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
241 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
242 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
243 "add $16, %%"REG_d" \n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pmulhw "#coeff", "#src1" \n\t"\
246 "pmulhw "#coeff", "#src2" \n\t"\
247 "paddw "#src1", "#dst1" \n\t"\
248 "paddw "#src2", "#dst2" \n\t"\
249 "test %%"REG_S", %%"REG_S" \n\t"\
250 " jnz 2b \n\t"\
251
/*
 * YSCALEYUV2PACKEDX: chroma pass followed by the pass over the list at
 * LUM_MMX_FILTER_OFFSET. On exit mm3/mm4 hold the chroma sums and mm1/mm7
 * the sums of the second pass; mm0, mm2 and mm5 are scratch.
 */
252 #define YSCALEYUV2PACKEDX \
253 YSCALEYUV2PACKEDX_UV \
254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
255
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists that close the asm
 * statement opened by YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 * Operands: %0 = &c->redDither, %1..%3 = dummy (memory), %4 = dest,
 * %5 = dstW (memory). Clobbers REG_a, REG_d and REG_S. Needs variables named
 * c, dummy, dest and dstW in scope.
 * NOTE(review): the three dummy operands presumably exist only to keep dest
 * and dstW at %4/%5 - confirm against the users of this macro.
 */
256 #define YSCALEYUV2PACKEDX_END \
257 :: "r" (&c->redDither), \
258 "m" (dummy), "m" (dummy), "m" (dummy),\
259 "r" (dest), "m" (dstW) \
260 : "%"REG_a, "%"REG_d, "%"REG_S \
261 );
262
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: 32-bit-accumulating counterpart of
 * YSCALEYUV2PACKEDX_UV. It opens the __asm__ volatile( statement, zeroes
 * REG_a and starts the outer loop at label 1.
 *
 * The list at CHR_MMX_FILTER_OFFSET(%0) is walked in APCK_SIZE-byte entries,
 * each with two source pointers (+0 and +APCK_PTR2) and a coefficient
 * quadword (+APCK_COEF). Words from the two sources are interleaved and
 * combined with pmaddwd; mm4/mm5 collect the sums for the data at
 * (src + REG_a), mm6/mm7 the sums for the data VOF bytes beyond it. After
 * the list ends (zero pointer at the start of the next entry) the sums are
 * shifted right by 16, packed to words and the rounder from
 * VROUNDER_OFFSET(%0) is added. The two results are parked in memory at
 * U_TEMP(%0) and V_TEMP(%0), which frees the MMX registers.
 */
263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
264 __asm__ volatile(\
265 "xor %%"REG_a", %%"REG_a" \n\t"\
266 ASMALIGN(4)\
267 "nop \n\t"\
268 "1: \n\t"\
269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pxor %%mm4, %%mm4 \n\t"\
272 "pxor %%mm5, %%mm5 \n\t"\
273 "pxor %%mm6, %%mm6 \n\t"\
274 "pxor %%mm7, %%mm7 \n\t"\
275 ASMALIGN(4)\
276 "2: \n\t"\
277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
281 "movq %%mm0, %%mm3 \n\t"\
282 "punpcklwd %%mm1, %%mm0 \n\t"\
283 "punpckhwd %%mm1, %%mm3 \n\t"\
284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
285 "pmaddwd %%mm1, %%mm0 \n\t"\
286 "pmaddwd %%mm1, %%mm3 \n\t"\
287 "paddd %%mm0, %%mm4 \n\t"\
288 "paddd %%mm3, %%mm5 \n\t"\
289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
292 "test %%"REG_S", %%"REG_S" \n\t"\
293 "movq %%mm2, %%mm0 \n\t"\
294 "punpcklwd %%mm3, %%mm2 \n\t"\
295 "punpckhwd %%mm3, %%mm0 \n\t"\
296 "pmaddwd %%mm1, %%mm2 \n\t"\
297 "pmaddwd %%mm1, %%mm0 \n\t"\
298 "paddd %%mm2, %%mm6 \n\t"\
299 "paddd %%mm0, %%mm7 \n\t"\
300 " jnz 2b \n\t"\
301 "psrad $16, %%mm4 \n\t"\
302 "psrad $16, %%mm5 \n\t"\
303 "psrad $16, %%mm6 \n\t"\
304 "psrad $16, %%mm7 \n\t"\
305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
306 "packssdw %%mm5, %%mm4 \n\t"\
307 "packssdw %%mm7, %%mm6 \n\t"\
308 "paddw %%mm0, %%mm4 \n\t"\
309 "paddw %%mm0, %%mm6 \n\t"\
310 "movq %%mm4, "U_TEMP"(%0) \n\t"\
311 "movq %%mm6, "V_TEMP"(%0) \n\t"\
312
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): 32-bit-accumulating pass over the
 * filter list at offset(%0), using the same APCK_* entry layout as
 * YSCALEYUV2PACKEDX_ACCURATE_UV but with 16-bit samples addressed at
 * (src + 2*REG_a).
 *
 * Sums for the first four samples are built in mm1/mm5, for the next four in
 * mm7/mm6. After shifting right by 16, packing to words and adding the
 * rounder, the results are left in mm1 and mm7. Finally the values parked at
 * U_TEMP(%0) and V_TEMP(%0) are reloaded into mm3 and mm4. Overwrites REG_d,
 * REG_S and mm0..mm7.
 */
313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
314 "lea "offset"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm1, %%mm1 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm7, %%mm7 \n\t"\
319 "pxor %%mm6, %%mm6 \n\t"\
320 ASMALIGN(4)\
321 "2: \n\t"\
322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
326 "movq %%mm0, %%mm3 \n\t"\
327 "punpcklwd %%mm4, %%mm0 \n\t"\
328 "punpckhwd %%mm4, %%mm3 \n\t"\
329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
330 "pmaddwd %%mm4, %%mm0 \n\t"\
331 "pmaddwd %%mm4, %%mm3 \n\t"\
332 "paddd %%mm0, %%mm1 \n\t"\
333 "paddd %%mm3, %%mm5 \n\t"\
334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
337 "test %%"REG_S", %%"REG_S" \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "punpcklwd %%mm3, %%mm2 \n\t"\
340 "punpckhwd %%mm3, %%mm0 \n\t"\
341 "pmaddwd %%mm4, %%mm2 \n\t"\
342 "pmaddwd %%mm4, %%mm0 \n\t"\
343 "paddd %%mm2, %%mm7 \n\t"\
344 "paddd %%mm0, %%mm6 \n\t"\
345 " jnz 2b \n\t"\
346 "psrad $16, %%mm1 \n\t"\
347 "psrad $16, %%mm5 \n\t"\
348 "psrad $16, %%mm7 \n\t"\
349 "psrad $16, %%mm6 \n\t"\
350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
351 "packssdw %%mm5, %%mm1 \n\t"\
352 "packssdw %%mm6, %%mm7 \n\t"\
353 "paddw %%mm0, %%mm1 \n\t"\
354 "paddw %%mm0, %%mm7 \n\t"\
355 "movq "U_TEMP"(%0), %%mm3 \n\t"\
356 "movq "V_TEMP"(%0), %%mm4 \n\t"\
357
/*
 * YSCALEYUV2PACKEDX_ACCURATE: accurate chroma pass followed by the accurate
 * pass over the list at LUM_MMX_FILTER_OFFSET. Leaves the same register
 * layout as YSCALEYUV2PACKEDX: mm1/mm7 from the second pass, mm3/mm4 chroma.
 */
358 #define YSCALEYUV2PACKEDX_ACCURATE \
359 YSCALEYUV2PACKEDX_ACCURATE_UV \
360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
361
/*
 * YSCALEYUV2RGBX: asm fragment converting the sums left by
 * YSCALEYUV2PACKEDX(_ACCURATE) to packed 8-bit colour components.
 *
 * In:  mm1, mm7 = Y for the first and next four pixels, mm3 = U, mm4 = V.
 * The offsets and coefficients are read from the context at %0 (U_OFFSET,
 * V_OFFSET, Y_OFFSET, UG/VG/UB/VR_COEFF, Y_COEFF). Each chroma-derived word
 * is duplicated (punpck?wd reg,reg) so that one chroma sample serves two
 * adjacent luma samples.
 * Out: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, packed with
 * unsigned saturation. mm0, mm3 and mm6 are scratch.
 */
362 #define YSCALEYUV2RGBX \
363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
377 "paddw %%mm3, %%mm4 \n\t"\
378 "movq %%mm2, %%mm0 \n\t"\
379 "movq %%mm5, %%mm6 \n\t"\
380 "movq %%mm4, %%mm3 \n\t"\
381 "punpcklwd %%mm2, %%mm2 \n\t"\
382 "punpcklwd %%mm5, %%mm5 \n\t"\
383 "punpcklwd %%mm4, %%mm4 \n\t"\
384 "paddw %%mm1, %%mm2 \n\t"\
385 "paddw %%mm1, %%mm5 \n\t"\
386 "paddw %%mm1, %%mm4 \n\t"\
387 "punpckhwd %%mm0, %%mm0 \n\t"\
388 "punpckhwd %%mm6, %%mm6 \n\t"\
389 "punpckhwd %%mm3, %%mm3 \n\t"\
390 "paddw %%mm7, %%mm0 \n\t"\
391 "paddw %%mm7, %%mm6 \n\t"\
392 "paddw %%mm7, %%mm3 \n\t"\
393 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
394 "packuswb %%mm0, %%mm2 \n\t"\
395 "packuswb %%mm6, %%mm5 \n\t"\
396 "packuswb %%mm3, %%mm4 \n\t"\
397
/*
 * REAL_YSCALEYUV2PACKED(index, c): asm fragment that blends two input lines
 * and leaves the result at 8-bit scale in word registers.
 *
 *   index - register used as the loop counter (zeroed here)
 *   c     - base address expression of the context holding the coefficients
 * Operands per the inline comments: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1.
 *
 * Prologue: the coefficient quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back.
 * NOTE(review): this modifies the context in memory every time the fragment
 * runs - confirm callers re-initialise the coefficients beforehand.
 *
 * Loop body (label 1): each output is ((a - b) * alpha >> 16) + (b >> 7),
 * computed for the two chroma planes (at index and index + VOF) and for
 * eight luma samples. Out: mm3 = U, mm4 = V, mm1 = Y (first four),
 * mm7 = Y (next four). The loop is not closed here; the WRITE macros of this
 * file end with the matching "jb 1b".
 */
398 #define REAL_YSCALEYUV2PACKED(index, c) \
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
406 ASMALIGN(4)\
407 "1: \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
430 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
433
/* Wrapper so that macro arguments are expanded before being stringified. */
434 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
435
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): chroma half of the two-line blend used
 * for RGB output. Zeroes index, opens loop label 1, blends uvbuf0 (%2) and
 * uvbuf1 (%3) as ((a - b) * alpha >> 16) + (b >> 4) with the coefficient at
 * CHR_MMX_FILTER_OFFSET+8(c), subtracts U_OFFSET / V_OFFSET and starts the
 * colour-matrix multiply.
 * Out: mm2 = offset-corrected U, mm5 = offset-corrected V, mm3 = U * UG_COEFF
 * and mm4 = V * VG_COEFF (high halves of the products).
 */
436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
437 "xor "#index", "#index" \n\t"\
438 ASMALIGN(4)\
439 "1: \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
460
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blends eight 16-bit samples from
 * the two buffers b1 and b2 as ((b1 - b2) * alpha >> 16) + (b2 >> 4), with
 * alpha read from LUM_MMX_FILTER_OFFSET+8(c).
 * Out: mm1 = first four results, mm7 = next four. mm0 and mm6 are scratch.
 */
461 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
462 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
474
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): final colour-matrix step.
 * In:  mm1/mm7 = Y (first / next four), mm2 = U, mm5 = V (offset-corrected),
 *      mm3 = ug, mm4 = vg, as left by REAL_YSCALEYUV2RGB_UV and
 *      REAL_YSCALEYUV2RGB_YA.
 * Applies UB_COEFF, VR_COEFF, Y_OFFSET and Y_COEFF from the context at c,
 * duplicates every chroma term for two neighbouring pixels, adds luma and
 * packs with unsigned saturation.
 * Out: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
 */
475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
503
/* Wrapper that expands its arguments before REAL_YSCALEYUV2RGB_YA
 * stringifies them. */
504 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
505
/*
 * YSCALEYUV2RGB(index, c): full two-line blend plus colour conversion:
 * chroma blend, luma blend of operands %0 and %1, then the coefficient step.
 * Leaves mm2 = B, mm4 = G, mm5 = R (8 bytes each) inside the loop opened at
 * label 1.
 */
506 #define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
508 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
509 REAL_YSCALEYUV2RGB_COEFF(c)
510
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant of YSCALEYUV2PACKED;
 * no blending, only operand %2 (chroma) and operand %0 (luma) are read.
 * Zeroes index, opens loop label 1 and shifts every sample right by 7.
 * Out: mm3 = U, mm4 = V (from index + VOF), mm1/mm7 = Y first/next four.
 * The parameter c is not used by this fragment.
 */
511 #define REAL_YSCALEYUV2PACKED1(index, c) \
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
523
/* Wrapper so that macro arguments are expanded before being stringified. */
524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
525
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line YUV to RGB fragment. Reads
 * chroma from operand %2 only and luma from operand %0 only, shifts the
 * samples right by 4, then applies the same offset / coefficient / packing
 * sequence as REAL_YSCALEYUV2RGB_COEFF using the constants at c.
 * Out: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, inside the loop
 * opened at label 1.
 */
526 #define REAL_YSCALEYUV2RGB1(index, c) \
527 "xor "#index", "#index" \n\t"\
528 ASMALIGN(4)\
529 "1: \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
572
/* Wrapper so that macro arguments are expanded before being stringified. */
573 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
574
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like REAL_YSCALEYUV2PACKED1, but the
 * chroma is the average of the two chroma lines (operands %2 and %3): the
 * two samples are added and the sum is shifted right logically by 8. Luma
 * still comes from operand %0 only and is shifted right by 7.
 * Out: mm3 = U, mm4 = V, mm1/mm7 = Y. The parameter c is not used.
 */
575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
/* Wrapper so that macro arguments are expanded before being stringified. */
591 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
592
/*
 * REAL_YSCALEYUV2RGB1b(index, c): single luma line (operand %0) combined with
 * the sum of the two chroma lines (operands %2 and %3) shifted right
 * logically by 5, followed by the same offset / coefficient / packing
 * sequence as REAL_YSCALEYUV2RGB1.
 * Out: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, inside the loop
 * opened at label 1.
 */
593 // do vertical chrominance interpolation
594 #define REAL_YSCALEYUV2RGB1b(index, c) \
595 "xor "#index", "#index" \n\t"\
596 ASMALIGN(4)\
597 "1: \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
644
/* Wrapper so that macro arguments are expanded before being stringified. */
645 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
646
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t): interleaves
 * four registers of 8 bytes each into eight 32-bit pixels, stores them and
 * closes the loop opened at label 1.
 *
 *   dst, index - base register and pixel counter; 32 bytes are stored at
 *                dst + 4*index
 *   dstw       - loop bound compared against index (jb: unsigned below)
 *   b, g, r, a - input registers; b and r are overwritten
 *   q0, q2, q3, t - scratch registers
 *
 * Each stored dword holds b in its lowest byte, then g, r and a; the inline
 * comments list the bytes from most to least significant. index advances by
 * 8 pixels per iteration.
 */
647 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
648 "movq "#b", "#q2" \n\t" /* B */\
649 "movq "#r", "#t" \n\t" /* R */\
650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
660 \
661 MOVNTQ( q0, (dst, index, 4))\
662 MOVNTQ( b, 8(dst, index, 4))\
663 MOVNTQ( q2, 16(dst, index, 4))\
664 MOVNTQ( q3, 24(dst, index, 4))\
665 \
666 "add $8, "#index" \n\t"\
667 "cmp "#dstw", "#index" \n\t"\
668 " jb 1b \n\t"
/* Wrapper so that macro arguments are expanded before being stringified. */
669 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
670
/*
 * REAL_WRITERGB16(dst, dstw, index): packs mm2 (B), mm4 (G) and mm5 (R),
 * 8 bytes each, into eight 16-bit pixels, stores 16 bytes at dst + 2*index
 * and closes the loop opened at label 1 (index += 8, jb while below dstw).
 *
 * B and R are masked with bF8 and G with bFC; B is shifted right by 3, R
 * becomes the high byte of each word and G is shifted left by 3 inside the
 * word before the parts are OR-ed together.
 * NOTE(review): presumably this yields a 5-6-5 layout, given the mask names -
 * confirm the values of bF8 / bFC. mm7 is interleaved as the high byte of G
 * and is not set here, so it is presumably expected to be zero on entry.
 * mm1 and mm3 are scratch.
 */
671 #define REAL_WRITERGB16(dst, dstw, index) \
672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
673 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
675 "psrlq $3, %%mm2 \n\t"\
676 \
677 "movq %%mm2, %%mm1 \n\t"\
678 "movq %%mm4, %%mm3 \n\t"\
679 \
680 "punpcklbw %%mm7, %%mm3 \n\t"\
681 "punpcklbw %%mm5, %%mm2 \n\t"\
682 "punpckhbw %%mm7, %%mm4 \n\t"\
683 "punpckhbw %%mm5, %%mm1 \n\t"\
684 \
685 "psllq $3, %%mm3 \n\t"\
686 "psllq $3, %%mm4 \n\t"\
687 \
688 "por %%mm3, %%mm2 \n\t"\
689 "por %%mm4, %%mm1 \n\t"\
690 \
691 MOVNTQ(%%mm2, (dst, index, 2))\
692 MOVNTQ(%%mm1, 8(dst, index, 2))\
693 \
694 "add $8, "#index" \n\t"\
695 "cmp "#dstw", "#index" \n\t"\
696 " jb 1b \n\t"
/* Wrapper so that macro arguments are expanded before being stringified. */
697 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
698
/*
 * REAL_WRITERGB15(dst, dstw, index): same structure as REAL_WRITERGB16 with
 * different masks and shifts: all three components are masked with bF8, B is
 * shifted right by 3, R right by 1 (before becoming the high byte of each
 * word) and G left by 2 inside the word.
 * NOTE(review): presumably a 5-5-5 layout - confirm the value of bF8. As in
 * REAL_WRITERGB16, mm7 is presumably expected to be zero on entry.
 * Stores 16 bytes at dst + 2*index, advances index by 8 and loops to label 1
 * while index is below dstw. mm1 and mm3 are scratch.
 */
699 #define REAL_WRITERGB15(dst, dstw, index) \
700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 "psrlq $3, %%mm2 \n\t"\
704 "psrlq $1, %%mm5 \n\t"\
705 \
706 "movq %%mm2, %%mm1 \n\t"\
707 "movq %%mm4, %%mm3 \n\t"\
708 \
709 "punpcklbw %%mm7, %%mm3 \n\t"\
710 "punpcklbw %%mm5, %%mm2 \n\t"\
711 "punpckhbw %%mm7, %%mm4 \n\t"\
712 "punpckhbw %%mm5, %%mm1 \n\t"\
713 \
714 "psllq $2, %%mm3 \n\t"\
715 "psllq $2, %%mm4 \n\t"\
716 \
717 "por %%mm3, %%mm2 \n\t"\
718 "por %%mm4, %%mm1 \n\t"\
719 \
720 MOVNTQ(%%mm2, (dst, index, 2))\
721 MOVNTQ(%%mm1, 8(dst, index, 2))\
722 \
723 "add $8, "#index" \n\t"\
724 "cmp "#dstw", "#index" \n\t"\
725 " jb 1b \n\t"
/* Wrapper so that macro arguments are expanded before being stringified. */
726 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
727
/*
 * WRITEBGR24OLD(dst, dstw, index): packs eight pixels from mm2 (B), mm4 (G)
 * and mm5 (R), with mm7 expected to be zero, into 24 bytes of 3-byte pixels.
 * The components are first expanded to four registers of two 0RGB dwords
 * each, then the padding bytes are squeezed out with the bm00000111,
 * bm11111000 and bm00001111 masks plus shifts. Three quadwords are stored at
 * dst, dst+8 and dst+16; dst (which must be a register, since it is the
 * target of an add) advances by 24 and index by 8, and the loop opened at
 * label 1 repeats while index is below dstw. Overwrites mm0..mm6.
 */
728 #define WRITEBGR24OLD(dst, dstw, index) \
729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 "movq %%mm2, %%mm1 \n\t" /* B */\
731 "movq %%mm5, %%mm6 \n\t" /* R */\
732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
742 \
743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
751 \
752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
765 \
766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
774 \
775 MOVNTQ(%%mm0, (dst))\
776 MOVNTQ(%%mm2, 8(dst))\
777 MOVNTQ(%%mm3, 16(dst))\
778 "add $24, "#dst" \n\t"\
779 \
780 "add $8, "#index" \n\t"\
781 "cmp "#dstw", "#index" \n\t"\
782 " jb 1b \n\t"
783
/*
 * WRITEBGR24MMX(dst, dstw, index): same contract as WRITEBGR24OLD (inputs
 * mm2 = B, mm4 = G, mm5 = R, mm7 = 0; 24 bytes written per 8 pixels; dst
 * advanced by 24, index by 8, loop to label 1 while index is below dstw),
 * but the padding bytes are removed with shifts and punpckhdq instead of
 * mask constants. All of mm0..mm7 are overwritten, so mm7 is no longer zero
 * when the fragment falls through to the loop branch.
 * NOTE(review): since the loop jumps back to label 1, whatever precedes this
 * macro inside the loop presumably re-establishes mm7 = 0 - confirm.
 */
784 #define WRITEBGR24MMX(dst, dstw, index) \
785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 "movq %%mm2, %%mm1 \n\t" /* B */\
787 "movq %%mm5, %%mm6 \n\t" /* R */\
788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
798 \
799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
803 \
804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
808 \
809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
813 \
814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 MOVNTQ(%%mm0, (dst))\
819 \
820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 MOVNTQ(%%mm6, 8(dst))\
825 \
826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 MOVNTQ(%%mm5, 16(dst))\
830 \
831 "add $24, "#dst" \n\t"\
832 \
833 "add $8, "#index" \n\t"\
834 "cmp "#dstw", "#index" \n\t"\
835 " jb 1b \n\t"
836
/*
 * WRITEBGR24MMX2(dst, dstw, index): same job as WRITEBGR24MMX (8 pixels ->
 * 24 packed bytes at dst, 8(dst), 16(dst)) but built on pshufw plus the
 * byte-select masks ff_M24A / ff_M24B / ff_M24C, which are defined elsewhere.
 *
 * For each of the three output quadwords the B, G and R registers are
 * shuffled so the needed words line up, masked so that only the bytes that
 * belong in that quadword survive, and ORed together.
 *
 * mm2 (B) and mm5 (R) are only read; mm4 (G) is shifted right by one byte
 * after the first store; mm0, mm1, mm3, mm6 and mm7 are scratch (mm0 and mm7
 * hold ff_M24A and ff_M24C for the whole sequence).
 *
 * Afterwards dst is advanced by 24 and index by 8, and control jumps back to
 * the local label "1" while index is below dstw (unsigned compare).
 */
837 #define WRITEBGR24MMX2(dst, dstw, index) \
838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
844 \
845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
848 \
849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 "por %%mm1, %%mm6 \n\t"\
851 "por %%mm3, %%mm6 \n\t"\
852 MOVNTQ(%%mm6, (dst))\
853 \
854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
858 \
859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
862 \
863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 "por %%mm3, %%mm6 \n\t"\
865 MOVNTQ(%%mm6, 8(dst))\
866 \
867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
870 \
871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
874 \
875 "por %%mm1, %%mm3 \n\t"\
876 "por %%mm3, %%mm6 \n\t"\
877 MOVNTQ(%%mm6, 16(dst))\
878 \
879 "add $24, "#dst" \n\t"\
880 \
881 "add $8, "#index" \n\t"\
882 "cmp "#dstw", "#index" \n\t"\
883 " jb 1b \n\t"
884
/* Bind WRITEBGR24 to the pshufw-based writer when MMX2 is available and to
 * the plain MMX writer otherwise. The #undef is needed because this template
 * redefines the macro each time it is processed. */
885 #if HAVE_MMX2
886 #undef WRITEBGR24
887 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
888 #else
889 #undef WRITEBGR24
890 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
891 #endif
892
/*
 * REAL_WRITEYUY2(dst, dstw, index): pack word samples from mm1/mm7, mm3 and
 * mm4 to bytes and store 16 interleaved bytes at dst + 2*index.
 *
 * mm3 and mm4 are each packed to 4 bytes and interleaved with each other;
 * mm1 (low half) and mm7 (high half) are packed to 8 bytes and then
 * interleaved with that pair, giving the byte order
 * mm1[0] mm3[0] mm1[1] mm4[0] ...
 * NOTE(review): presumably mm1/mm7 = luma, mm3 = U, mm4 = V, which yields
 * Y U Y V - confirm against the macros that feed this one.
 *
 * Afterwards index is advanced by 8 and control jumps back to the local
 * label "1" while index is below dstw (unsigned compare).
 * WRITEYUY2 is an extra expansion layer so that macro arguments are expanded
 * before being stringified.
 */
893 #define REAL_WRITEYUY2(dst, dstw, index) \
894 "packuswb %%mm3, %%mm3 \n\t"\
895 "packuswb %%mm4, %%mm4 \n\t"\
896 "packuswb %%mm7, %%mm1 \n\t"\
897 "punpcklbw %%mm4, %%mm3 \n\t"\
898 "movq %%mm1, %%mm7 \n\t"\
899 "punpcklbw %%mm3, %%mm1 \n\t"\
900 "punpckhbw %%mm3, %%mm7 \n\t"\
901 \
902 MOVNTQ(%%mm1, (dst, index, 2))\
903 MOVNTQ(%%mm7, 8(dst, index, 2))\
904 \
905 "add $8, "#index" \n\t"\
906 "cmp "#dstw", "#index" \n\t"\
907 " jb 1b \n\t"
908 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
909
910
/*
 * Vertical scaling to planar output with an arbitrary-length filter.
 *
 * With MMX compiled in and SWS_BITEXACT not set, each plane is produced by one
 * expansion of YSCALEYUV2YV12X (or the _ACCURATE variant when SWS_ACCURATE_RND
 * is set). Those macros are defined elsewhere; they are given a source offset
 * string, a filter offset constant, the destination pointer and the width.
 * The chroma planes are only written when uDest is non-NULL; the second
 * chroma plane uses a source offset of VOF.
 * NOTE(review): in this path the filter/source arguments are not referenced
 * here - presumably the macros fetch them from the context via the *_OFFSET
 * constants; confirm in the macro definitions.
 *
 * Otherwise the call is forwarded to the AltiVec implementation when
 * HAVE_ALTIVEC is set, or to the C reference yuv2yuvXinC().
 */
911 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
912 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
913 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
914 {
915 #if HAVE_MMX
916 if(!(c->flags & SWS_BITEXACT)){
917 if (c->flags & SWS_ACCURATE_RND){
918 if (uDest){
919 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
920 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
921 }
922
923 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
924 }else{
925 if (uDest){
926 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
927 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
928 }
929
930 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
931 }
932 return;
933 }
934 #endif
935 #if HAVE_ALTIVEC
936 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
937 chrFilter, chrSrc, chrFilterSize,
938 dest, uDest, vDest, dstW, chrDstW);
939 #else //HAVE_ALTIVEC
940 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
941 chrFilter, chrSrc, chrFilterSize,
942 dest, uDest, vDest, dstW, chrDstW);
943 #endif //!HAVE_ALTIVEC
944 }
945
/*
 * Vertical scaling for NV12-style output. There is no SIMD version here:
 * every argument except the context c is forwarded unchanged to the C
 * implementation yuv2nv12XinC().
 */
946 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
947 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
948 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
949 {
950 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
951 chrFilter, chrSrc, chrFilterSize,
952 dest, uDest, dstW, chrDstW, dstFormat);
953 }
954
955 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
956 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
957 {
958 int i;
959 #if HAVE_MMX
960 if(!(c->flags & SWS_BITEXACT)){
961 long p= uDest ? 3 : 1;
962 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
963 uint8_t *dst[3]= {dest, uDest, vDest};
964 long counter[3] = {dstW, chrDstW, chrDstW};
965
966 if (c->flags & SWS_ACCURATE_RND){
967 while(p--){
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
975 }else{
976 while(p--){
977 __asm__ volatile(
978 YSCALEYUV2YV121
979 :: "r" (src[p]), "r" (dst[p] + counter[p]),
980 "g" (-counter[p])
981 : "%"REG_a
982 );
983 }
984 }
985 return;
986 }
987 #endif
988 for (i=0; i<dstW; i++)
989 {
990 int val= (lumSrc[i]+64)>>7;
991
992 if (val&256){
993 if (val<0) val=0;
994 else val=255;
995 }
996
997 dest[i]= val;
998 }
999
1000 if (uDest)
1001 for (i=0; i<chrDstW; i++)
1002 {
1003 int u=(chrSrc[i ]+64)>>7;
1004 int v=(chrSrc[i + VOFW]+64)>>7;
1005
1006 if ((u|v)&256){
1007 if (u<0) u=0;
1008 else if (u>255) u=255;
1009 if (v<0) v=0;
1010 else if (v>255) v=255;
1011 }
1012
1013 uDest[i]= u;
1014 vDest[i]= v;
1015 }
1016 }
1017
1018
1019 /**
1020 * vertical scale YV12 to RGB
1021 */
/*
 * Implementation notes for yuv2packedX:
 *
 * With MMX compiled in and SWS_BITEXACT not set, the destination format
 * selects one hand-written asm sequence. Each sequence is stitched together
 * from macros defined elsewhere: YSCALEYUV2PACKEDX (or its _ACCURATE variant
 * when SWS_ACCURATE_RND is set) performs the vertical filtering,
 * YSCALEYUV2RGBX converts to RGB, and a WRITE* macro stores the pixels.
 * NOTE(review): the __asm__ statement itself appears to be opened inside
 * YSCALEYUV2PACKEDX* and closed by YSCALEYUV2PACKEDX_END - confirm in the
 * macro definitions. The BGR24 cases spell out the operand list and the
 * closing ");" by hand because they additionally clobber REG_c.
 *
 * Operand numbering used by the visible asm text: %0 = &c->redDither,
 * %4 = dest, %5 = dstW (memory operand); %1-%3 are dummies.
 *
 * Formats without an MMX case fall out of the switch and reach the AltiVec
 * path (for the formats listed there, when SWS_BITEXACT is not set) or the C
 * implementation yuv2packedXinC().
 */
1022 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1023 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1024 uint8_t *dest, long dstW, long dstY)
1025 {
1026 #if HAVE_MMX
1027 long dummy=0;
1028 if(!(c->flags & SWS_BITEXACT)){
1029 if (c->flags & SWS_ACCURATE_RND){
1030 switch(c->dstFormat){
1031 case PIX_FMT_RGB32:
1032 YSCALEYUV2PACKEDX_ACCURATE
1033 YSCALEYUV2RGBX
1034 "pcmpeqd %%mm7, %%mm7 \n\t"
1035 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1036
1037 YSCALEYUV2PACKEDX_END
1038 return;
1039 case PIX_FMT_BGR24:
1040 YSCALEYUV2PACKEDX_ACCURATE
1041 YSCALEYUV2RGBX
1042 "pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3*REG_a: byte address of the current pixel in the 24-bit output */
1043 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1044 "add %4, %%"REG_c" \n\t"
1045 WRITEBGR24(%%REGc, %5, %%REGa)
1046
1047
1048 :: "r" (&c->redDither),
1049 "m" (dummy), "m" (dummy), "m" (dummy),
1050 "r" (dest), "m" (dstW)
1051 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1052 );
1053 return;
1054 case PIX_FMT_RGB555:
1055 YSCALEYUV2PACKEDX_ACCURATE
1056 YSCALEYUV2RGBX
1057 "pxor %%mm7, %%mm7 \n\t"
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1059 #ifdef DITHER1XBPP
1060 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1061 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1062 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1063 #endif
1064
1065 WRITERGB15(%4, %5, %%REGa)
1066 YSCALEYUV2PACKEDX_END
1067 return;
1068 case PIX_FMT_RGB565:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
1071 "pxor %%mm7, %%mm7 \n\t"
1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1073 #ifdef DITHER1XBPP
1074 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1075 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1076 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1077 #endif
1078
1079 WRITERGB16(%4, %5, %%REGa)
1080 YSCALEYUV2PACKEDX_END
1081 return;
1082 case PIX_FMT_YUYV422:
1083 YSCALEYUV2PACKEDX_ACCURATE
1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085
1086 "psraw $3, %%mm3 \n\t"
1087 "psraw $3, %%mm4 \n\t"
1088 "psraw $3, %%mm1 \n\t"
1089 "psraw $3, %%mm7 \n\t"
1090 WRITEYUY2(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 }
1094 }else{
1095 switch(c->dstFormat)
1096 {
1097 case PIX_FMT_RGB32:
1098 YSCALEYUV2PACKEDX
1099 YSCALEYUV2RGBX
1100 "pcmpeqd %%mm7, %%mm7 \n\t"
1101 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1102 YSCALEYUV2PACKEDX_END
1103 return;
1104 case PIX_FMT_BGR24:
1105 YSCALEYUV2PACKEDX
1106 YSCALEYUV2RGBX
1107 "pxor %%mm7, %%mm7 \n\t"
1108 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1109 "add %4, %%"REG_c" \n\t"
1110 WRITEBGR24(%%REGc, %5, %%REGa)
1111
1112 :: "r" (&c->redDither),
1113 "m" (dummy), "m" (dummy), "m" (dummy),
1114 "r" (dest), "m" (dstW)
1115 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1116 );
1117 return;
1118 case PIX_FMT_RGB555:
1119 YSCALEYUV2PACKEDX
1120 YSCALEYUV2RGBX
1121 "pxor %%mm7, %%mm7 \n\t"
1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1123 #ifdef DITHER1XBPP
1124 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1125 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1126 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1127 #endif
1128
1129 WRITERGB15(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1131 return;
1132 case PIX_FMT_RGB565:
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "pxor %%mm7, %%mm7 \n\t"
1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1137 #ifdef DITHER1XBPP
1138 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1139 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1140 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1141 #endif
1142
1143 WRITERGB16(%4, %5, %%REGa)
1144 YSCALEYUV2PACKEDX_END
1145 return;
1146 case PIX_FMT_YUYV422:
1147 YSCALEYUV2PACKEDX
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa)
1155 YSCALEYUV2PACKEDX_END
1156 return;
1157 }
1158 }
1159 }
1160 #endif /* HAVE_MMX */
1161 #if HAVE_ALTIVEC
1162 /* The following list of supported dstFormat values should
1163 match what's found in the body of ff_yuv2packedX_altivec() */
1164 if (!(c->flags & SWS_BITEXACT) &&
1165 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1166 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1167 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1168 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1169 chrFilter, chrSrc, chrFilterSize,
1170 dest, dstW, dstY);
1171 else
1172 #endif
1173 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174 chrFilter, chrSrc, chrFilterSize,
1175 dest, dstW, dstY);
1176 }
1177
1178 /**
1179 * vertical bilinear scale YV12 to RGB
1180 */
/*
 * Implementation notes for yuv2packed2:
 *
 * With MMX compiled in and SWS_BITEXACT not set, RGB32, BGR24, RGB555, RGB565
 * and YUYV422 each have an asm path; every other format (and the
 * bit-exact case) uses the C macro YSCALE_YUV_2_ANYRGB_C at the end, which
 * relies on the locals yalpha1, uvalpha1 and i declared at the top.
 *
 * All asm paths share one register protocol:
 *  - operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = uvbuf0 (REG_S),
 *    %3 = uvbuf1 (REG_D), %4 = dest (memory), %5 = &c->redDither (REG_a);
 *  - REG_b is saved to the context slot at ESP_OFFSET(%5), loaded with dest,
 *    and restored from that slot at the end;
 *  - REG_BP is pushed, used as the loop index by the YSCALE and WRITE macros,
 *    and popped afterwards;
 *  - the loop bound is read from 8280(%5), i.e. DSTW_OFFSET (see the note at
 *    the top of the switch).
 */
1181 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1182 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1183 {
1184 int yalpha1=4095- yalpha;
1185 int uvalpha1=4095-uvalpha;
1186 int i;
1187
1188 #if HAVE_MMX
1189 if(!(c->flags & SWS_BITEXACT)){
1190 switch(c->dstFormat)
1191 {
1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193 case PIX_FMT_RGB32:
1194 __asm__ volatile(
1195 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1196 "mov %4, %%"REG_b" \n\t"
1197 "push %%"REG_BP" \n\t"
1198 YSCALEYUV2RGB(%%REGBP, %5)
1199 "pcmpeqd %%mm7, %%mm7 \n\t"
1200 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1201 "pop %%"REG_BP" \n\t"
1202 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1203
1204 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1205 "a" (&c->redDither)
1206 );
1207 return;
1208 case PIX_FMT_BGR24:
1209 __asm__ volatile(
1210 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1211 "mov %4, %%"REG_b" \n\t"
1212 "push %%"REG_BP" \n\t"
1213 YSCALEYUV2RGB(%%REGBP, %5)
1214 "pxor %%mm7, %%mm7 \n\t"
1215 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1216 "pop %%"REG_BP" \n\t"
1217 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1218 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1219 "a" (&c->redDither)
1220 );
1221 return;
1222 case PIX_FMT_RGB555:
1223 __asm__ volatile(
1224 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1225 "mov %4, %%"REG_b" \n\t"
1226 "push %%"REG_BP" \n\t"
1227 YSCALEYUV2RGB(%%REGBP, %5)
1228 "pxor %%mm7, %%mm7 \n\t"
1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1230 #ifdef DITHER1XBPP
1231 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1232 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1234 #endif
1235
1236 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1237 "pop %%"REG_BP" \n\t"
1238 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1239
1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1241 "a" (&c->redDither)
1242 );
1243 return;
1244 case PIX_FMT_RGB565:
1245 __asm__ volatile(
1246 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1247 "mov %4, %%"REG_b" \n\t"
1248 "push %%"REG_BP" \n\t"
1249 YSCALEYUV2RGB(%%REGBP, %5)
1250 "pxor %%mm7, %%mm7 \n\t"
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1252 #ifdef DITHER1XBPP
1253 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1254 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1255 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1256 #endif
1257
1258 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1259 "pop %%"REG_BP" \n\t"
1260 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1261 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1262 "a" (&c->redDither)
1263 );
1264 return;
1265 case PIX_FMT_YUYV422:
1266 __asm__ volatile(
1267 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1268 "mov %4, %%"REG_b" \n\t"
1269 "push %%"REG_BP" \n\t"
1270 YSCALEYUV2PACKED(%%REGBP, %5)
1271 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1272 "pop %%"REG_BP" \n\t"
1273 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 "a" (&c->redDither)
1276 );
1277 return;
1278 default: break;
1279 }
1280 }
1281 #endif //HAVE_MMX
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1283 }
1284
1285 /**
1286 * YV12 to RGB without scaling or interpolating
1287 */
/*
 * Implementation notes for yuv2packed1:
 *
 * - When SWS_FULL_CHR_H_INT is set the work is delegated to yuv2packed2()
 *   with buf0 passed for both luma lines and a luma alpha of 0.
 * - With MMX compiled in and SWS_BITEXACT not set, RGB32, BGR24, RGB555,
 *   RGB565 and YUYV422 have asm paths. uvalpha < 2048 selects the "1"
 *   macro variants (YSCALEYUV2RGB1 / YSCALEYUV2PACKED1), otherwise the "1b"
 *   variants are used; see the comment on the uvalpha test for the trade-off.
 *   The switches have no default, so other formats drop through to the C code.
 * - The asm paths use the same register protocol as yuv2packed2(): REG_b is
 *   saved in the context at ESP_OFFSET(%5) and holds dest, REG_BP is pushed and
 *   used as the loop index, the loop bound is read from 8280(%5), and
 *   %5 = &c->redDither.
 * - yalpha1, buf1, yalpha and i are not referenced by the visible code outside
 *   the asm operand lists; presumably the YSCALE_YUV_2_*_C macros use them
 *   (see the FIXME notes on their declarations).
 */
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290 {
1291 const int yalpha1=0;
1292 int i;
1293
1294 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1295 const int yalpha= 4096; //FIXME ...
1296
1297 if (flags&SWS_FULL_CHR_H_INT)
1298 {
1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300 return;
1301 }
1302
1303 #if HAVE_MMX
1304 if(!(flags & SWS_BITEXACT)){
1305 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1306 {
1307 switch(dstFormat)
1308 {
1309 case PIX_FMT_RGB32:
1310 __asm__ volatile(
1311 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1312 "mov %4, %%"REG_b" \n\t"
1313 "push %%"REG_BP" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP, %5)
1315 "pcmpeqd %%mm7, %%mm7 \n\t"
1316 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1317 "pop %%"REG_BP" \n\t"
1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1319
1320 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1321 "a" (&c->redDither)
1322 );
1323 return;
1324 case PIX_FMT_BGR24:
1325 __asm__ volatile(
1326 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1327 "mov %4, %%"REG_b" \n\t"
1328 "push %%"REG_BP" \n\t"
1329 YSCALEYUV2RGB1(%%REGBP, %5)
1330 "pxor %%mm7, %%mm7 \n\t"
1331 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1332 "pop %%"REG_BP" \n\t"
1333 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1334
1335 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336 "a" (&c->redDither)
1337 );
1338 return;
1339 case PIX_FMT_RGB555:
1340 __asm__ volatile(
1341 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1342 "mov %4, %%"REG_b" \n\t"
1343 "push %%"REG_BP" \n\t"
1344 YSCALEYUV2RGB1(%%REGBP, %5)
1345 "pxor %%mm7, %%mm7 \n\t"
1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1347 #ifdef DITHER1XBPP
1348 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1351 #endif
1352 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1353 "pop %%"REG_BP" \n\t"
1354 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1355
1356 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1357 "a" (&c->redDither)
1358 );
1359 return;
1360 case PIX_FMT_RGB565:
1361 __asm__ volatile(
1362 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1363 "mov %4, %%"REG_b" \n\t"
1364 "push %%"REG_BP" \n\t"
1365 YSCALEYUV2RGB1(%%REGBP, %5)
1366 "pxor %%mm7, %%mm7 \n\t"
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368 #ifdef DITHER1XBPP
1369 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1370 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1371 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1372 #endif
1373
1374 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1375 "pop %%"REG_BP" \n\t"
1376 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1377
1378 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379 "a" (&c->redDither)
1380 );
1381 return;
1382 case PIX_FMT_YUYV422:
1383 __asm__ volatile(
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2PACKED1(%%REGBP, %5)
1388 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1389 "pop %%"REG_BP" \n\t"
1390 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1391
1392 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1393 "a" (&c->redDither)
1394 );
1395 return;
1396 }
1397 }
1398 else
1399 {
1400 switch(dstFormat)
1401 {
1402 case PIX_FMT_RGB32:
1403 __asm__ volatile(
1404 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1405 "mov %4, %%"REG_b" \n\t"
1406 "push %%"REG_BP" \n\t"
1407 YSCALEYUV2RGB1b(%%REGBP, %5)
1408 "pcmpeqd %%mm7, %%mm7 \n\t"
1409 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1410 "pop %%"REG_BP" \n\t"
1411 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1412
1413 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "a" (&c->redDither)
1415 );
1416 return;
1417 case PIX_FMT_BGR24:
1418 __asm__ volatile(
1419 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1420 "mov %4, %%"REG_b" \n\t"
1421 "push %%"REG_BP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP, %5)
1423 "pxor %%mm7, %%mm7 \n\t"
1424 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1425 "pop %%"REG_BP" \n\t"
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1427
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429 "a" (&c->redDither)
1430 );
1431 return;
1432 case PIX_FMT_RGB555:
1433 __asm__ volatile(
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_b" \n\t"
1436 "push %%"REG_BP" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
1438 "pxor %%mm7, %%mm7 \n\t"
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1440 #ifdef DITHER1XBPP
1441 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1442 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1443 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1444 #endif
1445 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1446 "pop %%"REG_BP" \n\t"
1447 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1448
1449 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1450 "a" (&c->redDither)
1451 );
1452 return;
1453 case PIX_FMT_RGB565:
1454 __asm__ volatile(
1455 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1456 "mov %4, %%"REG_b" \n\t"
1457 "push %%"REG_BP" \n\t"
1458 YSCALEYUV2RGB1b(%%REGBP, %5)
1459 "pxor %%mm7, %%mm7 \n\t"
1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1461 #ifdef DITHER1XBPP
1462 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1463 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1464 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1465 #endif
1466
1467 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1468 "pop %%"REG_BP" \n\t"
1469 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1470
1471 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1472 "a" (&c->redDither)
1473 );
1474 return;
1475 case PIX_FMT_YUYV422:
1476 __asm__ volatile(
1477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1478 "mov %4, %%"REG_b" \n\t"
1479 "push %%"REG_BP" \n\t"
1480 YSCALEYUV2PACKED1b(%%REGBP, %5)
1481 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1482 "pop %%"REG_BP" \n\t"
1483 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1484
1485 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1486 "a" (&c->redDither)
1487 );
1488 return;
1489 }
1490 }
1491 }
1492 #endif /* HAVE_MMX */
1493 if (uvalpha < 2048)
1494 {
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1496 }else{
1497 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1498 }
1499 }
1500
1501 //FIXME yuy2* can read up to 7 samples too much
1502
/*
 * Extract the luma bytes from a packed line: dst[i] = src[2*i] for
 * i in [0, width). The last parameter is not used.
 *
 * MMX version: src and dst are passed pointing at the END of the line and
 * REG_a counts from -width up to 0. Each iteration loads 16 source bytes,
 * masks every 16-bit word with bm01010101 (presumably 0x00FF per word, which
 * keeps the even-numbered bytes - matches the C fallback), packs the words to
 * bytes and stores 8 output bytes.
 * NOTE(review): the loop always works in steps of 8 output bytes, so a width
 * that is not a multiple of 8 makes it read and write past the end of the
 * line.
 */
1503 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1504 {
1505 #if HAVE_MMX
1506 __asm__ volatile(
1507 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1508 "mov %0, %%"REG_a" \n\t"
1509 "1: \n\t"
1510 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1511 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1512 "pand %%mm2, %%mm0 \n\t"
1513 "pand %%mm2, %%mm1 \n\t"
1514 "packuswb %%mm1, %%mm0 \n\t"
1515 "movq %%mm0, (%2, %%"REG_a") \n\t"
1516 "add $8, %%"REG_a" \n\t"
1517 " js 1b \n\t"
1518 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1519 : "%"REG_a
1520 );
1521 #else
1522 int i;
1523 for (i=0; i<width; i++)
1524 dst[i]= src[2*i];
1525 #endif
1526 }
1527
/*
 * Extract the chroma bytes from a packed line:
 * dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3] for i in [0, width).
 * src2 is only checked by the trailing assert (it must equal src1); the last
 * parameter is not used.
 *
 * MMX version: pointers are passed pointing at the END of the line and REG_a
 * counts from -width up to 0. Each iteration loads 16 source bytes, keeps the
 * odd-numbered bytes (psrlw $8 + pack), then splits those into their
 * odd-numbered bytes (second psrlw, stored through %3 = dstV) and
 * even-numbered bytes (pand with bm01010101, stored through %2 = dstU),
 * writing 4 bytes to each plane.
 * NOTE(review): works in steps of 4 chroma samples, so a width that is not a
 * multiple of 4 makes it read and write past the end of the line.
 */
1528 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1529 {
1530 #if HAVE_MMX
1531 __asm__ volatile(
1532 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1533 "mov %0, %%"REG_a" \n\t"
1534 "1: \n\t"
1535 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1536 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1537 "psrlw $8, %%mm0 \n\t"
1538 "psrlw $8, %%mm1 \n\t"
1539 "packuswb %%mm1, %%mm0 \n\t"
1540 "movq %%mm0, %%mm1 \n\t"
1541 "psrlw $8, %%mm0 \n\t"
1542 "pand %%mm4, %%mm1 \n\t"
1543 "packuswb %%mm0, %%mm0 \n\t"
1544 "packuswb %%mm1, %%mm1 \n\t"
1545 "movd %%mm0, (%3, %%"REG_a") \n\t"
1546 "movd %%mm1, (%2, %%"REG_a") \n\t"
1547 "add $4, %%"REG_a" \n\t"
1548 " js 1b \n\t"
1549 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1550 : "%"REG_a
1551 );
1552 #else
1553 int i;
1554 for (i=0; i<width; i++)
1555 {
1556 dstU[i]= src1[4*i + 1];
1557 dstV[i]= src1[4*i + 3];
1558 }
1559 #endif
1560 assert(src1 == src2);
1561 }
1562
1563 /* This is almost identical to the previous, and exists only because
1564 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/*
 * Extract the luma bytes from a packed line in which luma occupies the
 * odd-numbered bytes: dst[i] = src[2*i + 1] for i in [0, width).
 * The last parameter is not used.
 *
 * MMX version: src and dst are passed pointing at the END of the line and
 * REG_a counts from -width up to 0. Each iteration loads 16 source bytes,
 * shifts every 16-bit word right by 8 to keep its high byte, packs the words
 * to bytes and stores 8 output bytes.
 * NOTE(review): works in steps of 8 output bytes, so a width that is not a
 * multiple of 8 makes it read and write past the end of the line.
 */
1565 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1566 {
1567 #if HAVE_MMX
1568 __asm__ volatile(
1569 "mov %0, %%"REG_a" \n\t"
1570 "1: \n\t"
1571 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1572 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1573 "psrlw $8, %%mm0 \n\t"
1574 "psrlw $8, %%mm1 \n\t"
1575 "packuswb %%mm1, %%mm0 \n\t"
1576 "movq %%mm0, (%2, %%"REG_a") \n\t"
1577 "add $8, %%"REG_a" \n\t"
1578 " js 1b \n\t"
1579 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1580 : "%"REG_a
1581 );
1582 #else
1583 int i;
1584 for (i=0; i<width; i++)
1585 dst[i]= src[2*i+1];
1586 #endif
1587 }
1588
/*
 * Extract the chroma bytes from a packed line in which chroma occupies the
 * even-numbered bytes:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2] for i in [0, width).
 * src2 is only checked by the trailing assert (it must equal src1); the last
 * parameter is not used.
 *
 * MMX version: pointers are passed pointing at the END of the line and REG_a
 * counts from -width up to 0. Each iteration loads 16 source bytes, keeps the
 * even-numbered bytes (pand with bm01010101 + pack), then splits those into
 * their odd-numbered bytes (psrlw $8, stored through %3 = dstV) and
 * even-numbered bytes (pand, stored through %2 = dstU), writing 4 bytes to
 * each plane.
 * NOTE(review): works in steps of 4 chroma samples, so a width that is not a
 * multiple of 4 makes it read and write past the end of the line.
 */
1589 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1590 {
1591 #if HAVE_MMX
1592 __asm__ volatile(
1593 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1594 "mov %0, %%"REG_a" \n\t"
1595 "1: \n\t"
1596 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1597 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%3, %%"REG_a") \n\t"
1607 "movd %%mm1, (%2, %%"REG_a") \n\t"
1608 "add $4, %%"REG_a" \n\t"
1609 " js 1b \n\t"
1610 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1611 : "%"REG_a
1612 );
1613 #else
1614 int i;
1615 for (i=0; i<width; i++)
1616 {
1617 dstU[i]= src1[4*i + 0];
1618 dstV[i]= src1[4*i + 2];
1619 }
1620 #endif
1621 assert(src1 == src2);
1622 }
1623
/*
 * BGR2Y generates a luma converter RENAME(name) for a packed RGB format.
 *
 *   type            integer type of one packed pixel (uint32_t / uint16_t)
 *   shr, shg, shb   right shift applied to the pixel before masking
 *   maskr/g/b       mask applied after the shift to isolate each component
 *   RY, GY, BY      weights for the isolated components
 *   S               final right shift
 *
 * A component may be left at its in-pixel bit position (shift 0 with a
 * positioned mask); the instantiations below compensate by pre-shifting the
 * other weights and enlarging S. The rounding/offset term is 33 << (S-1).
 * The generated function ignores its last parameter.
 */
1624 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1625 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1626 {\
1627 int i;\
1628 for (i=0; i<width; i++)\
1629 {\
1630 int b= (((type*)src)[i]>>shb)&maskb;\
1631 int g= (((type*)src)[i]>>shg)&maskg;\
1632 int r= (((type*)src)[i]>>shr)&maskr;\
1633 \
1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1635 }\
1636 }
1637
/* One converter per supported packed layout: 32-bit, 16-bit (5-6-5 masks) and
 * 15-bit (5-5-5 masks), each in both component orders. */
1638 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1639 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1640 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1641 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1642 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1643 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1644
/*
 * BGR2UV generates two chroma converters for a packed RGB format:
 *
 *   RENAME(name)       one U and one V sample per input pixel;
 *   RENAME(name_half)  one U and one V sample per PAIR of horizontally
 *                      adjacent input pixels (the components of the two
 *                      pixels are summed and the final shift is one bit
 *                      larger, which averages them).
 *
 * Unlike BGR2Y, the mask is applied first and the shift second. The maska
 * argument is accepted but not referenced by the macro body. The
 * rounding/offset term is 257 << (S-1) (257 << S in the _half variant).
 *
 * In the _half variant the two pixels are added while still packed: g first
 * collects everything that is neither red nor blue, that sum is subtracted
 * from pix0+pix1 so that the red and blue sums cannot carry into each other,
 * and the masks are widened by one bit (mask | 2*mask) to hold the carry of
 * each two-pixel sum.
 * NOTE(review): pix0/pix1 are plain int, so for 32-bit pixels with the top
 * bit set the additions operate on negative values - confirm this cannot
 * overflow for the inputs that reach here.
 */
1645 #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1647 {\
1648 int i;\
1649 for (i=0; i<width; i++)\
1650 {\
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
1654 \
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1657 }\
1658 }\
1659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1660 {\
1661 int i;\
1662 for (i=0; i<width; i++)\
1663 {\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
1666 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1669 g&= maskg|(2*maskg);\
1670 \
1671 g>>=shg;\
1672 \
1673 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1674 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1675 }\
1676 }
1677
/* Chroma converters (full and _half) for the same six packed layouts that
 * BGR2Y is instantiated for. */
1678 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1679 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1680 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1681 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1682 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1683 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1684
1685 #if HAVE_MMX
/*
 * MMX luma conversion for packed 24-bit pixels; shared by the BGR24 and RGB24
 * entry points.
 *
 * srcFormat selects which coefficient pair is loaded into mm5/mm6:
 * the ff_bgr24toY* tables for PIX_FMT_BGR24, the ff_rgb24toY* tables for
 * anything else.
 * NOTE(review): the coefficients are loaded in a separate asm statement from
 * the loop that uses them, and no MMX register is declared as clobbered or as
 * an operand - this relies on the compiler not touching mm5/mm6 in between.
 *
 * The loop handles 4 pixels (12 source bytes, 4 output bytes) per iteration:
 * overlapping 4-byte loads at offsets 0, 2, 6 and 8 are widened to words,
 * multiplied and pair-summed with pmaddwd, offset by ff_bgr24toYOffset,
 * shifted right by 15 and packed to bytes. dst is passed pointing at the END
 * of the line and REG_a counts from -width up to 0; src is advanced directly.
 * NOTE(review): works in steps of 4 pixels, so a width that is not a multiple
 * of 4 makes it read and write past the end of the line.
 */
1686 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1687 {
1688
1689 if(srcFormat == PIX_FMT_BGR24){
1690 __asm__ volatile(
1691 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1692 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1693 :
1694 );
1695 }else{
1696 __asm__ volatile(
1697 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1698 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1699 :
1700 );
1701 }
1702
1703 __asm__ volatile(
1704 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1705 "mov %2, %%"REG_a" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1707 "1: \n\t"
1708 PREFETCH" 64(%0) \n\t"
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "movd 6(%0), %%mm2 \n\t"
1712 "movd 8(%0), %%mm3 \n\t"
1713 "add $12, %0 \n\t"
1714 "punpcklbw %%mm7, %%mm0 \n\t"
1715 "punpcklbw %%mm7, %%mm1 \n\t"
1716 "punpcklbw %%mm7, %%mm2 \n\t"
1717 "punpcklbw %%mm7, %%mm3 \n\t"
1718 "pmaddwd %%mm5, %%mm0 \n\t"
1719 "pmaddwd %%mm6, %%mm1 \n\t"
1720 "pmaddwd %%mm5, %%mm2 \n\t"
1721 "pmaddwd %%mm6, %%mm3 \n\t"
1722 "paddd %%mm1, %%mm0 \n\t"
1723 "paddd %%mm3, %%mm2 \n\t"
1724 "paddd %%mm4, %%mm0 \n\t"
1725 "paddd %%mm4, %%mm2 \n\t"
1726 "psrad $15, %%mm0 \n\t"
1727 "psrad $15, %%mm2 \n\t"
1728 "packssdw %%mm2, %%mm0 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "movd %%mm0, (%1, %%"REG_a") \n\t"
1731 "add $4, %%"REG_a" \n\t"
1732 " js 1b \n\t"
1733 : "+r" (src)
1734 : "r" (dst+width), "g" (-width)
1735 : "%"REG_a
1736 );
1737 }
1738
/*
 * MMX chroma conversion for packed 24-bit pixels; shared by the BGR24 and
 * RGB24 entry points.
 *
 * Operand %4 is the first element of the coefficient row
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]; the asm reads four quadwords from
 * it at byte offsets 0, 8, 16 and 24 (the last one is cached in mm6). The
 * results built from offsets 0 and 8 are stored through %1 (dstU), those
 * built from offsets 16 and 24 through %2 (dstV).
 *
 * The loop handles 4 pixels (12 source bytes) per iteration and writes
 * 4 bytes to each plane: overlapping 4-byte loads at offsets 0, 2, 6 and 8
 * are widened to words, multiplied and pair-summed with pmaddwd, offset by
 * ff_bgr24toUVOffset, shifted right by 15 and packed to bytes. dstU/dstV are
 * passed pointing at the END of the line and REG_a counts from -width up to
 * 0; src is advanced directly.
 * NOTE(review): works in steps of 4 pixels, so a width that is not a multiple
 * of 4 makes it read and write past the end of the line.
 */
1739 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1740 {
1741 __asm__ volatile(
1742 "movq 24+%4, %%mm6 \n\t"
1743 "mov %3, %%"REG_a" \n\t"
1744 "pxor %%mm7, %%mm7 \n\t"
1745 "1: \n\t"
1746 PREFETCH" 64(%0) \n\t"
1747 "movd (%0), %%mm0 \n\t"
1748 "movd 2(%0), %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm0 \n\t"
1750 "punpcklbw %%mm7, %%mm1 \n\t"
1751 "movq %%mm0, %%mm2 \n\t"
1752 "movq %%mm1, %%mm3 \n\t"
1753 "pmaddwd %4, %%mm0 \n\t"
1754 "pmaddwd 8+%4, %%mm1 \n\t"
1755 "pmaddwd 16+%4, %%mm2 \n\t"
1756 "pmaddwd %%mm6, %%mm3 \n\t"
1757 "paddd %%mm1, %%mm0 \n\t"
1758 "paddd %%mm3, %%mm2 \n\t"
1759
1760 "movd 6(%0), %%mm1 \n\t"
1761 "movd 8(%0), %%mm3 \n\t"
1762 "add $12, %0 \n\t"
1763 "punpcklbw %%mm7, %%mm1 \n\t"
1764 "punpcklbw %%mm7, %%mm3 \n\t"
1765 "movq %%mm1, %%mm4 \n\t"
1766 "movq %%mm3, %%mm5 \n\t"
1767 "pmaddwd %4, %%mm1 \n\t"
1768 "pmaddwd 8+%4, %%mm3 \n\t"
1769 "pmaddwd 16+%4, %%mm4 \n\t"
1770 "pmaddwd %%mm6, %%mm5 \n\t"
1771 "paddd %%mm3, %%mm1 \n\t"
1772 "paddd %%mm5, %%mm4 \n\t"
1773
1774 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1775 "paddd %%mm3, %%mm0 \n\t"
1776 "paddd %%mm3, %%mm2 \n\t"
1777 "paddd %%mm3, %%mm1 \n\t"
1778 "paddd %%mm3, %%mm4 \n\t"
1779 "psrad $15, %%mm0 \n\t"
1780 "psrad $15, %%mm2 \n\t"
1781 "psrad $15, %%mm1 \n\t"
1782 "psrad $15, %%mm4 \n\t"
1783 "packssdw %%mm1, %%mm0 \n\t"
1784 "packssdw %%mm4, %%mm2 \n\t"
1785 "packuswb %%mm0, %%mm0 \n\t"
1786 "packuswb %%mm2, %%mm2 \n\t"
1787 "movd %%mm0, (%1, %%"REG_a") \n\t"
1788 "movd %%mm2, (%2, %%"REG_a") \n\t"
1789 "add $4, %%"REG_a" \n\t"
1790 " js 1b \n\t"
1791 : "+r" (src)
1792 : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1793 : "%"REG_a
1794 );
1795 }
1796 #endif
1797
1798 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1799 {
1800 #if HAVE_MMX
1801 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1802 #else
1803 int i;
1804 for (i=0; i<width; i++)
1805 {
1806 int b= src[i*3+0];
1807 int g= src[i*3+1];
1808 int r= src[i*3+2];
1809
1810 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1811 }
1812 #endif /* HAVE_MMX */
1813 }
1814
1815 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1816 {
1817 #if HAVE_MMX
1818 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1819 #else
1820 int i;
1821 for (i=0; i<width; i++)
1822 {
1823 int b= src1[3*i + 0];
1824 int g= src1[3*i + 1];
1825 int r= src1[3*i + 2];
1826
1827 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1828 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1829 }
1830 #endif /* HAVE_MMX */
1831 assert(src1 == src2);
1832 }
1833
1834 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1835 {
1836 int i;
1837 for (i=0; i<width; i++)
1838 {
1839 int b= src1[6*i + 0] + src1[6*i + 3];
1840 int g= src1[6*i + 1] + src1[6*i + 4];
1841 int r= src1[6*i + 2] + src1[6*i + 5];
1842
1843 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1844 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1845 }
1846 assert(src1 == src2);
1847 }
1848
1849 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1850 {
1851 #if HAVE_MMX
1852 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1853 #else
1854 int i;
1855 for (i=0; i<width; i++)
1856 {
1857 int r= src[i*3+0];
1858 int g= src[i*3+1];
1859 int b= src[i*3+2];
1860
1861 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1862 }
1863 #endif
1864 }
1865
1866 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1867 {
1868 #if HAVE_MMX
1869 assert(src1==src2);
1870 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1871 #else
1872 int i;
1873 assert(src1==src2);
1874 for (i=0; i<width; i++)
1875 {
1876 int r= src1[3*i + 0];
1877 int g= src1[3*i + 1];
1878 int b= src1[3*i + 2];
1879
1880 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1881 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1882 }
1883 #endif
1884 }
1885
1886 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1887 {
1888 int i;
1889 assert(src1==src2);
1890 for (i=0; i<width; i++)
1891 {
1892 int r= src1[6*i + 0] + src1[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4];
1894 int b= src1[6*i + 2] + src1[6*i + 5];
1895
1896 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1897 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1898 }
1899 }
1900
1901
/**
 * Look up luma for one line of palettized pixels.
 *
 * Each source byte indexes pal; the lowest 8 bits of the selected entry are
 * stored as the output sample.
 *
 * @param dst   output luma line, width bytes
 * @param src   input line of palette indices, width bytes
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1912
/**
 * Look up chroma for one line of palettized pixels.
 *
 * Each source byte indexes pal; bits 8..15 of the selected entry are stored
 * as U and bits 16..23 as V. Only src1 is read; src2 must be the same pointer.
 *
 * The entry is kept unsigned so that palette values with the top byte set are
 * not converted to a negative int and right-shifted, which would be
 * implementation-defined.
 *
 * @param dstU  output U line, width bytes
 * @param dstV  output V line, width bytes
 * @param src1  input line of palette indices, width bytes
 * @param src2  must equal src1
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        const uint32_t p= pal[src1[i]];

        dstU[i]= (p>> 8) & 0xFF;
        dstV[i]= (p>>16) & 0xFF;
    }
}
1925
/**
 * Expand one line of 1-bit-per-pixel data, where a 0 bit maps to 255 and a
 * 1 bit maps to 0, to 8-bit luma.
 *
 * Bits are consumed most-significant first. When width is not a multiple of 8
 * the leading (width & 7) bits of one additional source byte are expanded so
 * that all width output samples are written.
 *
 * @param dst    output luma line, width bytes
 * @param src    input line, (width + 7) / 8 bytes
 * @param width  number of pixels to convert
 * @param unused ignored; present so all *ToY helpers share one signature
 */
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    const int tail= width&7;
    int i, j;
    for (i=0; i<width/8; i++){
        int d= ~src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
    if (tail){
        /* Partial last byte: only its top 'tail' bits describe pixels. */
        int d= ~src[i];
        for(j=0; j<tail; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
1935
/**
 * Expand one line of 1-bit-per-pixel data, where a 1 bit maps to 255 and a
 * 0 bit maps to 0, to 8-bit luma.
 *
 * Bits are consumed most-significant first. When width is not a multiple of 8
 * the leading (width & 7) bits of one additional source byte are expanded so
 * that all width output samples are written.
 *
 * @param dst    output luma line, width bytes
 * @param src    input line, (width + 7) / 8 bytes
 * @param width  number of pixels to convert
 * @param unused ignored; present so all *ToY helpers share one signature
 */
static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    const int tail= width&7;
    int i, j;
    for (i=0; i<width/8; i++){
        int d= src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
    if (tail){
        /* Partial last byte: only its top 'tail' bits describe pixels. */
        int d= src[i];
        for(j=0; j<tail; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
1945
1946 // bilinear / bicubic scaling
/**
 * Horizontally filter one line of 8-bit samples into 16-bit intermediates.
 *
 * For every output index i the result is
 *   (sum over j in [0, filterSize) of src[filterPos[i] + j] * filter[filterSize*i + j]) >> 7
 * The plain C path clamps the result to at most 32767; the MMX paths narrow
 * the 32-bit sums with packssdw.
 *
 * dst        output line
 * dstW       number of output samples
 * src        input line of 8-bit samples
 * srcW, xInc not used by the MMX and C paths; forwarded to the AltiVec version
 * filter     coefficients, filterSize consecutive entries per output sample
 * filterPos  index of the first source sample for each output sample
 * filterSize taps per output sample; the MMX build asserts that it is a
 *            positive multiple of 4
 */
1947 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1948 int16_t *filter, int16_t *filterPos, long filterSize)
1949 {
1950 #if HAVE_MMX
1951 assert(filterSize % 4 == 0 && filterSize>0);
1952 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
1953 {
/* counter is a negative byte offset (2 bytes per output sample) that counts
   up towards zero. filter, filterPos and dst are moved to the end of their
   arrays so that base+counter walks them from the start. Every pass of the
   asm loop produces two output samples ("add $4") and the loop stops when
   that add carries out ("jnc").
   NOTE(review): because two samples are written per pass, an odd dstW looks
   like it stores one sample past dstW - confirm the buffers are padded. */
1954 long counter= -2*dstW;
1955 filter-= counter*2;
1956 filterPos-= counter/2;
1957 dst-= counter/2;
1958 __asm__ volatile(
1959 #if defined(PIC)
1960 "push %%"REG_b" \n\t"
1961 #endif
1962 "pxor %%mm7, %%mm7 \n\t"
1963 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1964 "mov %%"REG_a", %%"REG_BP" \n\t"
1965 ASMALIGN(4)
1966 "1: \n\t"
1967 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1968 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1969 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1970 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1971 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1972 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1973 "punpcklbw %%mm7, %%mm0 \n\t"
1974 "punpcklbw %%mm7, %%mm2 \n\t"
1975 "pmaddwd %%mm1, %%mm0 \n\t"
1976 "pmaddwd %%mm2, %%mm3 \n\t"
1977 "movq %%mm0, %%mm4 \n\t"
1978 "punpckldq %%mm3, %%mm0 \n\t"
1979 "punpckhdq %%mm3, %%mm4 \n\t"
1980 "paddd %%mm4, %%mm0 \n\t"
1981 "psrad $7, %%mm0 \n\t"
1982 "packssdw %%mm0, %%mm0 \n\t"
1983 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1984 "add $4, %%"REG_BP" \n\t"
1985 " jnc 1b \n\t"
1986
1987 "pop %%"REG_BP" \n\t"
1988 #if defined(PIC)
1989 "pop %%"REG_b" \n\t"
1990 #endif
1991 : "+a" (counter)
1992 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1993 #if !defined(PIC)
1994 : "%"REG_b
1995 #endif
1996 );
1997 }
1998 else if (filterSize==8)
1999 {
/* Same biased-pointer scheme as the 4-tap case; filter holds 8 coefficients
   per output sample here, hence the larger bias and the scale of 8 in the
   addressing below. */
2000 long counter= -2*dstW;
2001 filter-= counter*4;
2002 filterPos-= counter/2;
2003 dst-= counter/2;
2004 __asm__ volatile(
2005 #if defined(PIC)
2006 "push %%"REG_b" \n\t"
2007 #endif
2008 "pxor %%mm7, %%mm7 \n\t"
2009 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2010 "mov %%"REG_a", %%"REG_BP" \n\t"
2011 ASMALIGN(4)
2012 "1: \n\t"
2013 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2014 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2015 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2016 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2017 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2018 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm0 \n\t"
2020 "punpcklbw %%mm7, %%mm2 \n\t"
2021 "pmaddwd %%mm1, %%mm0 \n\t"
2022 "pmaddwd %%mm2, %%mm3 \n\t"
2023
2024 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2025 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2026 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2027 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2028 "punpcklbw %%mm7, %%mm4 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "pmaddwd %%mm1, %%mm4 \n\t"
2031 "pmaddwd %%mm2, %%mm5 \n\t"
2032 "paddd %%mm4, %%mm0 \n\t"
2033 "paddd %%mm5, %%mm3 \n\t"
2034 "movq %%mm0, %%mm4 \n\t"
2035 "punpckldq %%mm3, %%mm0 \n\t"
2036 "punpckhdq %%mm3, %%mm4 \n\t"
2037 "paddd %%mm4, %%mm0 \n\t"
2038 "psrad $7, %%mm0 \n\t"
2039 "packssdw %%mm0, %%mm0 \n\t"
2040 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2041 "add $4, %%"REG_BP" \n\t"
2042 " jnc 1b \n\t"
2043
2044 "pop %%"REG_BP" \n\t"
2045 #if defined(PIC)
2046 "pop %%"REG_b" \n\t"
2047 #endif
2048 : "+a" (counter)
2049 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2050 #if !defined(PIC)
2051 : "%"REG_b
2052 #endif
2053 );
2054 }
2055 else
2056 {
/* Generic tap count. The outer loop (label 1) still produces two output
   samples per pass. The inner loop (label 2) consumes 4 taps per pass for
   both samples, walking a pointer from src up to offset (= src+filterSize).
   filter is advanced in place: 8 bytes per inner pass, then by another
   filterSize*2 bytes to step over the second sample's coefficients. */
2057 uint8_t *offset = src+filterSize;
2058 long counter= -2*dstW;
2059 //filter-= counter*filterSize/2;
2060 filterPos-= counter/2;
2061 dst-= counter/2;
2062 __asm__ volatile(
2063 "pxor %%mm7, %%mm7 \n\t"
2064 ASMALIGN(4)
2065 "1: \n\t"
2066 "mov %2, %%"REG_c" \n\t"
2067 "movzwl (%%"REG_c", %0), %%eax \n\t"
2068 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2069 "mov %5, %%"REG_c" \n\t"
2070 "pxor %%mm4, %%mm4 \n\t"
2071 "pxor %%mm5, %%mm5 \n\t"
2072 "2: \n\t"
2073 "movq (%1), %%mm1 \n\t"
2074 "movq (%1, %6), %%mm3 \n\t"
2075 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2076 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm0 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm0 \n\t"
2080 "pmaddwd %%mm2, %%mm3 \n\t"
2081 "paddd %%mm3, %%mm5 \n\t"
2082 "paddd %%mm0, %%mm4 \n\t"
2083 "add $8, %1 \n\t"
2084 "add $4, %%"REG_c" \n\t"
2085 "cmp %4, %%"REG_c" \n\t"
2086 " jb 2b \n\t"
2087 "add %6, %1 \n\t"
2088 "movq %%mm4, %%mm0 \n\t"
2089 "punpckldq %%mm5, %%mm4 \n\t"
2090 "punpckhdq %%mm5, %%mm0 \n\t"
2091 "paddd %%mm0, %%mm4 \n\t"
2092 "psrad $7, %%mm4 \n\t"
2093 "packssdw %%mm4, %%mm4 \n\t"
2094 "mov %3, %%"REG_a" \n\t"
2095 "movd %%mm4, (%%"REG_a", %0) \n\t"
2096 "add $4, %0 \n\t"
2097 " jnc 1b \n\t"
2098
2099 : "+r" (counter), "+r" (filter)
2100 : "m" (filterPos), "m" (dst), "m"(offset),
2101 "m" (src), "r" (filterSize*2)
2102 : "%"REG_a, "%"REG_c, "%"REG_d
2103 );
2104 }
2105 #else
2106 #if HAVE_ALTIVEC
2107 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2108 #else
/* Portable reference implementation: one output sample per outer pass. */
2109 int i;
2110 for (i=0; i<dstW; i++)
2111 {
2112 int j;
2113 int srcPos= filterPos[i];
2114 int val=0;
2115 //printf("filterPos: %d\n", filterPos[i]);
2116 for (j=0; j<filterSize; j++)
2117 {
2118 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2119 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2120 }
2121 //filter += hFilterSize;
2122 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2123 //dst[i] = val>>7;
2124 }
2125 #endif /* HAVE_ALTIVEC */
2126 #endif /* HAVE_MMX */
2127 }
2128 // *** horizontal scale Y line to temp buffer
/**
 * Horizontally scale one luma line into dst.
 *
 * Steps, in order:
 *  1. For the source formats listed in the if/else chain below, the line is
 *     first converted to 8-bit luma in formatConvBuffer by the matching *ToY
 *     helper and src is redirected to that buffer. Any other format is
 *     scaled directly from src.
 *  2. The line is scaled with hScale(), unless SWS_FAST_BILINEAR is set in
 *     flags (and, in MMX builds, canMMX2BeUsed is nonzero), in which case one
 *     of the fast bilinear paths is used: runtime-generated MMX2 code reached
 *     through funnyYCode, plain x86 asm, or portable C.
 *  3. If c->srcRange differs from c->dstRange and the destination format is
 *     neither RGB nor BGR, the scaled samples are remapped in place.
 *
 * dst holds dstWidth samples scaled by 128 (7 fractional bits) in the
 * bilinear paths.
 */
2129 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2130 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2131 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2132 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2133 int32_t *mmx2FilterPos, uint32_t *pal)
2134 {
/* The 16-bit gray formats share the packed-YUV helpers: GRAY16BE uses the
   same helper as YUYV422 and GRAY16LE the same helper as UYVY422. */
2135 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2136 {
2137 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2138 src= formatConvBuffer;
2139 }
2140 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2141 {
2142 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2143 src= formatConvBuffer;
2144 }
2145 else if (srcFormat==PIX_FMT_RGB32)
2146 {
2147 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2148 src= formatConvBuffer;
2149 }
2150 else if (srcFormat==PIX_FMT_RGB32_1)
2151 {
2152 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2153 src= formatConvBuffer;
2154 }
2155 else if (srcFormat==PIX_FMT_BGR24)
2156 {
2157 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2158 src= formatConvBuffer;
2159 }
2160 else if (srcFormat==PIX_FMT_BGR565)
2161 {
2162 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2163 src= formatConvBuffer;
2164 }
2165 else if (srcFormat==PIX_FMT_BGR555)
2166 {
2167 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2168 src= formatConvBuffer;
2169 }
2170 else if (srcFormat==PIX_FMT_BGR32)
2171 {
2172 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2173 src= formatConvBuffer;
2174 }
2175 else if (srcFormat==PIX_FMT_BGR32_1)
2176 {
2177 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2178 src= formatConvBuffer;
2179 }
2180 else if (srcFormat==PIX_FMT_RGB24)
2181 {
2182 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2183 src= formatConvBuffer;
2184 }
2185 else if (srcFormat==PIX_FMT_RGB565)
2186 {
2187 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2188 src= formatConvBuffer;
2189 }
2190 else if (srcFormat==PIX_FMT_RGB555)
2191 {
2192 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2193 src= formatConvBuffer;
2194 }
2195 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2196 {
2197 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2198 src= formatConvBuffer;
2199 }
2200 else if (srcFormat==PIX_FMT_MONOBLACK)
2201 {
2202 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2203 src= formatConvBuffer;
2204 }
2205 else if (srcFormat==PIX_FMT_MONOWHITE)
2206 {
2207 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2208 src= formatConvBuffer;
2209 }
2210
2211 #if HAVE_MMX
2212 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2213 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2214 #else
2215 if (!(flags&SWS_FAST_BILINEAR))
2216 #endif
2217 {
2218 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2219 }
2220 else // fast bilinear upscale / crap downscale
2221 {
2222 #if ARCH_X86 && CONFIG_GPL
2223 #if HAVE_MMX2
2224 int i;
2225 #if defined(PIC)
/* Memory slot used by the asm below to save and restore REG_b in PIC builds. */
2226 uint64_t ebxsave __attribute__((aligned(8)));
2227 #endif
2228 if (canMMX2BeUsed)
2229 {
/* Calls the code at funnyYCode eight times (FUNNY_Y_CODE), advancing the
   source and destination registers between calls using mmx2FilterPos. */
2230 __asm__ volatile(
2231 #if defined(PIC)
2232 "mov %%"REG_b", %5 \n\t"
2233 #endif
2234 "pxor %%mm7, %%mm7 \n\t"
2235 "mov %0, %%"REG_c" \n\t"
2236 "mov %1, %%"REG_D" \n\t"
2237 "mov %2, %%"REG_d" \n\t"
2238 "mov %3, %%"REG_b" \n\t"
2239 "xor %%"REG_a", %%"REG_a" \n\t" // i
2240 PREFETCH" (%%"REG_c") \n\t"
2241 PREFETCH" 32(%%"REG_c") \n\t"
2242 PREFETCH" 64(%%"REG_c") \n\t"
2243
2244 #if ARCH_X86_64
2245
2246 #define FUNNY_Y_CODE \
2247 "movl (%%"REG_b"), %%esi \n\t"\
2248 "call *%4 \n\t"\
2249 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2250 "add %%"REG_S", %%"REG_c" \n\t"\
2251 "add %%"REG_a", %%"REG_D" \n\t"\
2252 "xor %%"REG_a", %%"REG_a" \n\t"\
2253
2254 #else
2255
2256 #define FUNNY_Y_CODE \
2257 "movl (%%"REG_b"), %%esi \n\t"\
2258 "call *%4 \n\t"\
2259 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2260 "add %%"REG_a", %%"REG_D" \n\t"\
2261 "xor %%"REG_a", %%"REG_a" \n\t"\
2262
2263 #endif /* ARCH_X86_64 */
2264
2265 FUNNY_Y_CODE
2266 FUNNY_Y_CODE
2267 FUNNY_Y_CODE
2268 FUNNY_Y_CODE
2269 FUNNY_Y_CODE
2270 FUNNY_Y_CODE
2271 FUNNY_Y_CODE
2272 FUNNY_Y_CODE
2273
2274 #if defined(PIC)
2275 "mov %5, %%"REG_b" \n\t"
2276 #endif
2277 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2278 "m" (funnyYCode)
2279 #if defined(PIC)
2280 ,"m" (ebxsave)
2281 #endif
2282 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2283 #if !defined(PIC)
2284 ,"%"REG_b
2285 #endif
2286 );
/* Overwrite the trailing outputs whose source position is at or beyond the
   last source sample with that last sample, scaled by 128. */
2287 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2288 }
2289 else
2290 {
2291 #endif /* HAVE_MMX2 */
/* xInc is a 16.16 fixed-point step: the integer part and the 16-bit fraction
   are fed to the asm separately so the fraction's carry can be added to the
   integer source position with adc. The loop is unrolled twice. */
2292 long xInc_shr16 = xInc >> 16;
2293 uint16_t xInc_mask = xInc & 0xffff;
2294 //NO MMX just normal asm ...
2295 __asm__ volatile(
2296 "xor %%"REG_a", %%"REG_a" \n\t" // i
2297 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2298 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2299 ASMALIGN(4)
2300 "1: \n\t"
2301 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2302 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2303 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2304 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2305 "shll $16, %%edi \n\t"
2306 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2307 "mov %1, %%"REG_D" \n\t"
2308 "shrl $9, %%esi \n\t"
2309 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2310 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2311 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2312
2313 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2314 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2315 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2316 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2317 "shll $16, %%edi \n\t"
2318 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2319 "mov %1, %%"REG_D" \n\t"
2320 "shrl $9, %%esi \n\t"
2321 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2322 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2323 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2324
2325
2326 "add $2, %%"REG_a" \n\t"
2327 "cmp %2, %%"REG_a" \n\t"
2328 " jb 1b \n\t"
2329
2330
2331 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2332 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2333 );
2334 #if HAVE_MMX2
2335 } //if MMX2 can't be used
2336 #endif
2337 #else
/* Portable bilinear path: xpos is a 16.16 fixed-point source position and
   xalpha its top 7 fraction bits, so each output is src[xx]*128 plus the
   weighted difference to the next sample. */
2338 int i;
2339 unsigned int xpos=0;
2340 for (i=0;i<dstWidth;i++)
2341 {
2342 register unsigned int xx=xpos>>16;
2343 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2344 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2345 xpos+=xInc;
2346 }
2347 #endif /* ARCH_X86 */
2348 }
2349
/* Range remapping of the scaled samples. With c->srcRange set, the samples
   are scaled by 14071/16384 and offset; otherwise they are clipped to 30189
   and scaled by 19077/16384 with a negative offset.
   NOTE(review): presumably full-range <-> limited-range luma conversion in
   the 7-bit fixed-point domain - confirm against where srcRange is set. */
2350 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2351 int i;
2352 //FIXME all pal and rgb srcFormats could do this conversion as well
2353 //FIXME all scalers more complex than bilinear could do half of this transform
2354 if(c->srcRange){
2355 for (i=0; i<dstWidth; i++)
2356 dst[i]= (dst[i]*14071 + 33561947)>>14;
2357 }else{
2358 for (i=0; i<dstWidth; i++)
2359 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2360 }
2361 }
2362 }
2363
/**
 * Horizontally scale one pair of chroma lines into dst.
 *
 * The U result is written to dst[0 .. dstWidth-1] and the V result to
 * dst[VOFW .. VOFW+dstWidth-1].
 *
 * Steps, in order:
 *  1. For the packed, RGB and palettized source formats listed below, chroma
 *     is first extracted into formatConvBuffer (U at offset 0, V at offset
 *     VOFW) by the matching *ToUV helper; the *_half variants are used when
 *     c->chrSrcHSubSample is nonzero. Gray and monochrome sources return
 *     immediately without touching dst. Any other format is scaled directly
 *     from src1/src2.
 *  2. Both planes are scaled with hScale(), unless SWS_FAST_BILINEAR is set
 *     in flags (and, in MMX builds, canMMX2BeUsed is nonzero), in which case
 *     one of the fast bilinear paths is used: runtime-generated MMX2 code
 *     reached through funnyUVCode, plain x86 asm, or portable C.
 *  3. If c->srcRange differs from c->dstRange and the destination format is
 *     neither RGB nor BGR, both planes are remapped in place.
 */
2364 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2365 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2366 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2367 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2368 int32_t *mmx2FilterPos, uint32_t *pal)
2369 {
2370 if (srcFormat==PIX_FMT_YUYV422)
2371 {
2372 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2373 src1= formatConvBuffer;
2374 src2= formatConvBuffer+VOFW;
2375 }
2376 else if (srcFormat==PIX_FMT_UYVY422)
2377 {
2378 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2379 src1= formatConvBuffer;
2380 src2= formatConvBuffer+VOFW;
2381 }
2382 else if (srcFormat==PIX_FMT_RGB32)
2383 {
2384 if(c->chrSrcHSubSample)
2385 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2386 else
2387 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2388 src1= formatConvBuffer;
2389 src2= formatConvBuffer+VOFW;
2390 }
2391 else if (srcFormat==PIX_FMT_RGB32_1)
2392 {
2393 if(c->chrSrcHSubSample)
2394 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2395 else
2396 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+VOFW;
2399 }
2400 else if (srcFormat==PIX_FMT_BGR24)
2401 {
2402 if(c->chrSrcHSubSample)
2403 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2404 else
2405 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2406 src1= formatConvBuffer;
2407 src2= formatConvBuffer+VOFW;
2408 }
2409 else if (srcFormat==PIX_FMT_BGR565)
2410 {
2411 if(c->chrSrcHSubSample)
2412 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2413 else
2414 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2415 src1= formatConvBuffer;
2416 src2= formatConvBuffer+VOFW;
2417 }
2418 else if (srcFormat==PIX_FMT_BGR555)
2419 {
2420 if(c->chrSrcHSubSample)
2421 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2422 else
2423 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2424 src1= formatConvBuffer;
2425 src2= formatConvBuffer+VOFW;
2426 }
2427 else if (srcFormat==PIX_FMT_BGR32)
2428 {
2429 if(c->chrSrcHSubSample)
2430 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2431 else
2432 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2433 src1= formatConvBuffer;
2434 src2= formatConvBuffer+VOFW;
2435 }
2436 else if (srcFormat==PIX_FMT_BGR32_1)
2437 {
2438 if(c->chrSrcHSubSample)
2439 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2440 else
2441 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2442 src1= formatConvBuffer;
2443 src2= formatConvBuffer+VOFW;
2444 }
2445 else if (srcFormat==PIX_FMT_RGB24)
2446 {
2447 if(c->chrSrcHSubSample)
2448 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2449 else
2450 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2451 src1= formatConvBuffer;
2452 src2= formatConvBuffer+VOFW;
2453 }
2454 else if (srcFormat==PIX_FMT_RGB565)
2455 {
2456 if(c->chrSrcHSubSample)
2457 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2458 else
2459 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2460 src1= formatConvBuffer;
2461 src2= formatConvBuffer+VOFW;
2462 }
2463 else if (srcFormat==PIX_FMT_RGB555)
2464 {
2465 if(c->chrSrcHSubSample)
2466 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2467 else
2468 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2469 src1= formatConvBuffer;
2470 src2= formatConvBuffer+VOFW;
2471 }
2472 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2473 {
/* No chroma to scale for these sources; dst is left unmodified. */
2474 return;
2475 }
2476 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2477 {
2478 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2479 src1= formatConvBuffer;
2480 src2= formatConvBuffer+VOFW;
2481 }
2482
2483 #if HAVE_MMX
2484 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2485 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2486 #else
2487 if (!(flags&SWS_FAST_BILINEAR))
2488 #endif
2489 {
2490 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2491 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2492 }
2493 else // fast bilinear upscale / crap downscale
2494 {
2495 #if ARCH_X86 && CONFIG_GPL
2496 #if HAVE_MMX2
2497 int i;
2498 #if defined(PIC)
/* Memory slot used by the asm below to save and restore REG_b in PIC builds. */
2499 uint64_t ebxsave __attribute__((aligned(8)));
2500 #endif
2501 if (canMMX2BeUsed)
2502 {
/* Calls the code at funnyUVCode four times for src1 (written at dst), then
   reloads the registers and calls it four more times for src2 (written VOF
   bytes past dst). */
2503 __asm__ volatile(
2504 #if defined(PIC)
2505 "mov %%"REG_b", %6 \n\t"
2506 #endif
2507 "pxor %%mm7, %%mm7 \n\t"
2508 "mov %0, %%"REG_c" \n\t"
2509 "mov %1, %%"REG_D" \n\t"
2510 "mov %2, %%"REG_d" \n\t"
2511 "mov %3, %%"REG_b" \n\t"
2512 "xor %%"REG_a", %%"REG_a" \n\t" // i
2513 PREFETCH" (%%"REG_c") \n\t"
2514 PREFETCH" 32(%%"REG_c") \n\t"
2515 PREFETCH" 64(%%"REG_c") \n\t"
2516
2517 #if ARCH_X86_64
2518
2519 #define FUNNY_UV_CODE \
2520 "movl (%%"REG_b"), %%esi \n\t"\
2521 "call *%4 \n\t"\
2522 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2523 "add %%"REG_S", %%"REG_c" \n\t"\
2524 "add %%"REG_a", %%"REG_D" \n\t"\
2525 "xor %%"REG_a", %%"REG_a" \n\t"\
2526
2527 #else
2528
2529 #define FUNNY_UV_CODE \
2530 "movl (%%"REG_b"), %%esi \n\t"\
2531 "call *%4 \n\t"\
2532 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2533 "add %%"REG_a", %%"REG_D" \n\t"\
2534 "xor %%"REG_a", %%"REG_a" \n\t"\
2535
2536 #endif /* ARCH_X86_64 */
2537
2538 FUNNY_UV_CODE
2539 FUNNY_UV_CODE
2540 FUNNY_UV_CODE
2541 FUNNY_UV_CODE
2542 "xor %%"REG_a", %%"REG_a" \n\t" // i
2543 "mov %5, %%"REG_c" \n\t" // src
2544 "mov %1, %%"REG_D" \n\t" // buf1
2545 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2546 PREFETCH" (%%"REG_c") \n\t"
2547 PREFETCH" 32(%%"REG_c") \n\t"
2548 PREFETCH" 64(%%"REG_c") \n\t"
2549
2550 FUNNY_UV_CODE
2551 FUNNY_UV_CODE
2552 FUNNY_UV_CODE
2553 FUNNY_UV_CODE
2554
2555 #if defined(PIC)
2556 "mov %6, %%"REG_b" \n\t"
2557 #endif
2558 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2559 "m" (funnyUVCode), "m" (src2)
2560 #if defined(PIC)
2561 ,"m" (ebxsave)
2562 #endif
2563 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2564 #if !defined(PIC)
2565 ,"%"REG_b
2566 #endif
2567 );
/* Overwrite the trailing outputs whose source position is at or beyond the
   last source sample with that last sample, scaled by 128, in both planes. */
2568 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2569 {
2570 //printf("%d %d %d\n", dstWidth, i, srcW);
2571 dst[i] = src1[srcW-1]*128;
2572 dst[i+VOFW] = src2[srcW-1]*128;
2573 }
2574 }
2575 else
2576 {
2577 #endif /* HAVE_MMX2 */
/* xInc is a 16.16 fixed-point step: the integer part and the 16-bit fraction
   are fed to the asm separately so the fraction's carry can be added to the
   integer source position with adc. Each pass interpolates one U sample from
   src1 and one V sample from src2 at the same position. */
2578 long xInc_shr16 = (long) (xInc >> 16);
2579 uint16_t xInc_mask = xInc & 0xffff;
2580 __asm__ volatile(
2581 "xor %%"REG_a", %%"REG_a" \n\t" // i
2582 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2583 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2584 ASMALIGN(4)
2585 "1: \n\t"
2586 "mov %0, %%"REG_S" \n\t"
2587 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2588 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2589 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2590 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2591 "shll $16, %%edi \n\t"
2592 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2593 "mov %1, %%"REG_D" \n\t"
2594 "shrl $9, %%esi \n\t"
2595 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2596
2597 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2598 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2599 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2600 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2601 "shll $16, %%edi \n\t"
2602 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2603 "mov %1, %%"REG_D" \n\t"
2604 "shrl $9, %%esi \n\t"
2605 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2606
2607 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2608 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2609 "add $1, %%"REG_a" \n\t"
2610 "cmp %2, %%"REG_a" \n\t"
2611 " jb 1b \n\t"
2612
2613 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2614 which is needed to support GCC 4.0. */
2615 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2616 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2617 #else
2618 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2619 #endif
2620 "r" (src2)
2621 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2622 );
2623 #if HAVE_MMX2
2624 } //if MMX2 can't be used
2625 #endif
2626 #else
/* Portable bilinear path: xpos is a 16.16 fixed-point source position and
   xalpha its top 7 fraction bits (0..127). The two weights used here are
   xalpha^127 (= 127-xalpha) and xalpha, so they sum to 127 rather than 128. */
2627 int i;
2628 unsigned int xpos=0;
2629 for (i=0;i<dstWidth;i++)
2630 {
2631 register unsigned int xx=xpos>>16;
2632 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2633 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2634 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2635 /* slower
2636 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2637 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2638 */
2639 xpos+=xInc;
2640 }
2641 #endif /* ARCH_X86 */
2642 }
/* Range remapping of both scaled planes. With c->srcRange set, the samples
   are scaled by 1799/2048 and offset; otherwise they are clipped to 30775
   and scaled by 4663/4096 with a negative offset.
   NOTE(review): presumably full-range <-> limited-range chroma conversion in
   the 7-bit fixed-point domain - confirm against where srcRange is set. */
2643 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2644 int i;
2645 //FIXME all pal and rgb srcFormats could do this conversion as well
2646 //FIXME all scalers more complex than bilinear could do half of this transform
2647 if(c->srcRange){
2648 for (i=0; i<dstWidth; i++){
2649 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2650 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2651 }
2652 }else{
2653 for (i=0; i<dstWidth; i++){
2654 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2655 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2656 }
2657 }
2658 }
2659 }
2660
2661 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2662 int srcSliceH, uint8_t* dst[], int dstStride[]){
2663
2664 /* load a few things into local vars to make the code more readable? and faster */
2665 const int srcW= c->srcW;
2666 const int dstW= c->dstW;
2667 const int dstH= c->dstH;
2668 const int chrDstW= c->chrDstW;
2669 const int chrSrcW= c->chrSrcW;
2670 const int lumXInc= c->lumXInc;
2671 const int chrXInc= c->chrXInc;
2672 const int dstFormat= c->dstFormat;
2673 const int srcFormat= c->srcFormat;
2674 const int flags= c->flags;
2675 const int canMMX2BeUsed= c->canMMX2BeUsed;
2676 int16_t *vLumFilterPos= c->vLumFilterPos;
2677 int16_t *vChrFilterPos= c->vChrFilterPos;
2678 int16_t *hLumFilterPos= c->hLumFilterPos;
2679 int16_t *hChrFilterPos= c->hChrFilterPos;
2680 int16_t *vLumFilter= c->vLumFilter;
2681 int16_t *vChrFilter= c->vChrFilter;
2682 int16_t *hLumFilter= c->hLumFilter;
2683 int16_t *hChrFilter= c->hChrFilter;
2684 int32_t *lumMmxFilter= c->lumMmxFilter;
2685 int32_t *chrMmxFilter= c->chrMmxFilter;
2686 const int vLumFilterSize= c->vLumFilterSize;
2687 const int vChrFilterSize= c->vChrFilterSize;
2688 const int hLumFilterSize= c->hLumFilterSize;
2689 const int hChrFilterSize= c->hChrFilterSize;
2690 int16_t **lumPixBuf= c->lumPixBuf;
2691 int16_t **chrPixBuf= c->chrPixBuf;
2692 const int vLumBufSize= c->vLumBufSize;
2693 const int vChrBufSize= c->vChrBufSize;
2694 uint8_t *funnyYCode= c->funnyYCode;
2695 uint8_t *funnyUVCode= c->funnyUVCode;
2696 uint8_t *formatConvBuffer= c->formatConvBuffer;
2697 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2698 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2699 int lastDstY;
2700 uint32_t *pal=c->pal_yuv;
2701
2702 /* vars which will change and which we need to store back in the context */
2703 int dstY= c->dstY;
2704 int lumBufIndex= c->lumBufIndex;
2705 int chrBufIndex= c->chrBufIndex;
2706 int lastInLumBuf= c->lastInLumBuf;
2707 int lastInChrBuf= c->lastInChrBuf;
2708
2709 if (isPacked(c->srcFormat)){
2710 src[0]=
2711 src[1]=
2712 src[2]= src[0];
2713 srcStride[0]=
2714 srcStride[1]=
2715 srcStride[2]= srcStride[0];
2716 }
2717 srcStride[1]<<= c->vChrDrop;
2718 srcStride[2]<<= c->vChrDrop;
2719
2720 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2721 // (int)dst[0], (int)dst[1], (int)dst[2]);
2722
2723 #if 0 //self test FIXME move to a vfilter or something
2724 {
2725 static volatile int i=0;
2726 i++;
2727 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2728 selfTest(src, srcStride, c->srcW, c->srcH);
2729 i--;
2730 }
2731 #endif
2732
2733 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2734 //dstStride[0],dstStride[1],dstStride[2]);
2735
2736 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2737 {
2738 static int warnedAlready=0; //FIXME move this into the context perhaps
2739 if (flags & SWS_PRINT_INFO && !warnedAlready)
2740 {
2741 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2742 " ->cannot do aligned memory accesses anymore\n");
2743 warnedAlready=1;
2744 }
2745 }
2746
2747 /* Note the user might start scaling the picture in the middle so this
2748 will not get executed. This is not really intended but works
2749 currently, so people might do it. */
2750 if (srcSliceY ==0){
2751 lumBufIndex=0;
2752 chrBufIndex=0;
2753 dstY=0;
2754 lastInLumBuf= -1;
2755 lastInChrBuf= -1;
2756 }
2757
2758 lastDstY= dstY;
2759
2760 for (;dstY < dstH; dstY++){
2761 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2762 const int chrDstY= dstY>>c->chrDstVSubSample;
2763 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2764 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2765
2766 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2767 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2768 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2769 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2770
2771 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2772 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2773 //handle holes (FAST_BILINEAR & weird filters)
2774 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2775 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2776 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2777 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2778 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2779
2780 // Do we have enough lines in this slice to output the dstY line
2781 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2782 {
2783 //Do horizontal scaling
2784 while(lastInLumBuf < lastLumSrcY)
2785 {
2786 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2787 lumBufIndex++;
2788 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2789 assert(lumBufIndex < 2*vLumBufSize);
2790 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2791 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2792 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2793 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2794 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2795 funnyYCode, c->srcFormat, formatConvBuffer,
2796 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2797 lastInLumBuf++;
2798 }
2799 while(lastInChrBuf < lastChrSrcY)
2800 {
2801 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2802 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2803 chrBufIndex++;
2804 assert(chrBufIndex < 2*vChrBufSize);
2805 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2806 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2807 //FIXME replace parameters through context struct (some at least)
2808
2809 if (!(isGray(srcFormat) || isGray(dstFormat)))
2810 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2811 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2812 funnyUVCode, c->srcFormat, formatConvBuffer,
2813 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2814 lastInChrBuf++;
2815 }
2816 //wrap buf index around to stay inside the ring buffer
2817 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2818 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2819 }
2820 else // not enough lines left in this slice -> load the rest in the buffer
2821 {
2822 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2823 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2824 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2825 vChrBufSize, vLumBufSize);*/
2826
2827 //Do horizontal scaling
2828 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2829 {
2830 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2831 lumBufIndex++;
2832 assert(lumBufIndex < 2*vLumBufSize);
2833 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2834 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2835 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2836 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2837 funnyYCode, c->srcFormat, formatConvBuffer,
2838 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2839 lastInLumBuf++;
2840 }
2841 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2842 {
2843 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2844 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2845 chrBufIndex++;
2846 assert(chrBufIndex < 2*vChrBufSize);
2847 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
2848 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2849
2850 if (!(isGray(srcFormat) || isGray(dstFormat)))
2851 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2852 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2853 funnyUVCode, c->srcFormat, formatConvBuffer,
2854 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2855 lastInChrBuf++;
2856 }
2857 //wrap buf index around to stay inside the ring buffer
2858 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2859 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2860 break; //we can't output a dstY line so let's try with the next slice
2861 }
2862
2863 #if HAVE_MMX
2864 c->blueDither= ff_dither8[dstY&1];
2865 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2866 c->greenDither= ff_dither8[dstY&1];
2867 else
2868 c->greenDither= ff_dither4[dstY&1];
2869 c->redDither= ff_dither8[(dstY+1)&1];
2870 #endif
2871 if (dstY < dstH-2)
2872 {
2873 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2874 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2875 #if HAVE_MMX
2876 int i;
2877 if (flags & SWS_ACCURATE_RND){
2878 int s= APCK_SIZE / 8;
2879 for (i=0; i<vLumFilterSize; i+=2){
2880 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2881 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2882 lumMmxFilter[s*i+APCK_COEF/4 ]=
2883 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2884 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2885 }
2886 for (i=0; i<vChrFilterSize; i+=2){
2887 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2888 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2889 chrMmxFilter[s*i+APCK_COEF/4 ]=
2890 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2891 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2892 }
2893 }else{
2894 for (i=0; i<vLumFilterSize; i++)
2895 {
2896 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2897 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2898 lumMmxFilter[4*i+2]=
2899 lumMmxFilter[4*i+3]=
2900 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2901 }
2902 for (i=0; i<vChrFilterSize; i++)
2903 {
2904 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2905 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2906 chrMmxFilter[4*i+2]=
2907 chrMmxFilter[4*i+3]=
2908 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2909 }
2910 }
2911 #endif
2912 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2913 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2914 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2915 RENAME(yuv2nv12X)(c,
2916 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2917 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2918 dest, uDest, dstW, chrDstW, dstFormat);
2919 }
2920 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2921 {
2922 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2923 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2924 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2925 {
2926 int16_t *lumBuf = lumPixBuf[0];
2927 int16_t *chrBuf= chrPixBuf[0];
2928 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2929 }
2930 else //General YV12
2931 {
2932 RENAME(yuv2yuvX)(c,
2933 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2934 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2935 dest, uDest, vDest, dstW, chrDstW);
2936 }
2937 }
2938 else
2939 {
2940 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2941 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2942 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2943 {
2944 int chrAlpha= vChrFilter[2*dstY+1];
2945 if(flags & SWS_FULL_CHR_H_INT){
2946 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2947 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2948 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2949 dest, dstW, dstY);
2950 }else{
2951 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2952 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2953 }
2954 }
2955 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2956 {
2957 int lumAlpha= vLumFilter[2*dstY+1];
2958 int chrAlpha= vChrFilter[2*dstY+1];
2959 lumMmxFilter[2]=
2960 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2961 chrMmxFilter[2]=
2962 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2963 if(flags & SWS_FULL_CHR_H_INT){
2964 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2965 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2966 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2967 dest, dstW, dstY);
2968 }else{
2969 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2970 dest, dstW, lumAlpha, chrAlpha, dstY);
2971 }
2972 }
2973 else //general RGB
2974 {
2975 if(flags & SWS_FULL_CHR_H_INT){
2976 yuv2rgbXinC_full(c,
2977 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2978 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2979 dest, dstW, dstY);
2980 }else{
2981 RENAME(yuv2packedX)(c,
2982 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2983 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2984 dest, dstW, dstY);
2985 }
2986 }
2987 }
2988 }
2989 else // hmm looks like we can't use MMX here without overwriting this array's tail
2990 {
2991 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2992 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2993 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2994 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2995 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2996 yuv2nv12XinC(
2997 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2998 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2999 dest, uDest, dstW, chrDstW, dstFormat);
3000 }
3001 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3002 {
3003 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3004 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3005 yuv2yuvXinC(
3006 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3007 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3008 dest, uDest, vDest, dstW, chrDstW);
3009 }
3010 else
3011 {
3012 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3013 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3014 if(flags & SWS_FULL_CHR_H_INT){
3015 yuv2rgbXinC_full(c,
3016 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3017 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3018 dest, dstW, dstY);
3019 }else{
3020 yuv2packedXinC(c,
3021 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3022 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3023 dest, dstW, dstY);
3024 }
3025 }
3026 }
3027 }
3028
3029 #if HAVE_MMX
3030 __asm__ volatile(SFENCE:::"memory");
3031 __asm__ volatile(EMMS:::"memory");
3032 #endif
3033 /* store changed local vars back in the context */
3034 c->dstY= dstY;
3035 c->lumBufIndex= lumBufIndex;
3036 c->chrBufIndex= chrBufIndex;
3037 c->lastInLumBuf= lastInLumBuf;
3038 c->lastInChrBuf= lastInChrBuf;
3039
3040 return dstY - lastDstY;
3041 }