swscale: Use function pointers for swScale functions.
[libav.git] / libswscale / swscale_template.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
/*
 * Instruction-selection macros for the CPU variant being compiled.
 * Each name is #undef'ed before being redefined, presumably because this
 * template is compiled more than once with different HAVE_* settings
 * (TODO confirm against the including file).
 *
 * PREFETCH / PREFETCHW  "prefetch"/"prefetchw" with 3DNow!,
 *                       "prefetchnta"/"prefetcht0" with MMX2, otherwise the
 *                       string " # nop" (presumably an assembler comment).
 * PAVGB(a,b)            "pavgb" with MMX2, "pavgusb" with 3DNow!; it stays
 *                       undefined when neither is available.
 * MOVNTQ(a,b)           "movntq" with MMX2, plain "movq" otherwise.
 *                       It forwards to REAL_MOVNTQ so that the arguments are
 *                       macro-expanded before REAL_MOVNTQ stringifies them.
 */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29
30 #if HAVE_AMD3DNOW
31 #define PREFETCH "prefetch"
32 #define PREFETCHW "prefetchw"
33 #elif HAVE_MMX2
34 #define PREFETCH "prefetchnta"
35 #define PREFETCHW "prefetcht0"
36 #else
37 #define PREFETCH " # nop"
38 #define PREFETCHW " # nop"
39 #endif
40
41 #if HAVE_MMX2
42 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43 #elif HAVE_AMD3DNOW
44 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45 #endif
46
47 #if HAVE_MMX2
48 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
49 #else
50 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
51 #endif
52 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
53
54 #if HAVE_ALTIVEC
55 #include "ppc/swscale_altivec_template.c"
56 #endif
57
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement: applies a multi-tap filter to 16-bit samples and
 * writes 8 output bytes per outer iteration.
 *
 *   x       string literal, displacement added to every source address
 *   offset  string literal, displacement from %0 to the filter list
 *   dest    output pointer (%1), written at dest + REG_a
 *   width   loop bound (%2); REG_a runs from 0 in steps of 8 while below it
 *
 * %0 is &c->redDither, so a variable named c must be in scope at the
 * expansion site. The filter list is walked in 16-byte steps: a source
 * pointer at +0 and a coefficient at +8; a null source pointer ends it.
 * Each tap adds pmulhw(src, coeff) to mm3/mm4, which start from the value at
 * VROUNDER_OFFSET(%0). The sums are shifted right by 3 and packed to
 * unsigned bytes.
 *
 * Label 1 serves both loops: "jnz 1b" continues the tap loop, and "jb 1b"
 * restarts it for the next 8 samples after the accumulators and list pointer
 * have been re-initialised. The instructions between "cmp" and "jb" do not
 * modify the flags.
 *
 * NOTE(review): the clobber list names only REG_a/REG_d/REG_S; the MMX
 * registers and the store to dest are not declared.
 */
58 #define YSCALEYUV2YV12X(x, offset, dest, width) \
59 __asm__ volatile(\
60 "xor %%"REG_a", %%"REG_a" \n\t"\
61 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
62 "movq %%mm3, %%mm4 \n\t"\
63 "lea " offset "(%0), %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 ASMALIGN(4) /* FIXME Unroll? */\
66 "1: \n\t"\
67 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
68 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
69 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
70 "add $16, %%"REG_d" \n\t"\
71 "mov (%%"REG_d"), %%"REG_S" \n\t"\
72 "test %%"REG_S", %%"REG_S" \n\t"\
73 "pmulhw %%mm0, %%mm2 \n\t"\
74 "pmulhw %%mm0, %%mm5 \n\t"\
75 "paddw %%mm2, %%mm3 \n\t"\
76 "paddw %%mm5, %%mm4 \n\t"\
77 " jnz 1b \n\t"\
78 "psraw $3, %%mm3 \n\t"\
79 "psraw $3, %%mm4 \n\t"\
80 "packuswb %%mm4, %%mm3 \n\t"\
81 MOVNTQ(%%mm3, (%1, %%REGa))\
82 "add $8, %%"REG_a" \n\t"\
83 "cmp %2, %%"REG_a" \n\t"\
84 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
85 "movq %%mm3, %%mm4 \n\t"\
86 "lea " offset "(%0), %%"REG_d" \n\t"\
87 "mov (%%"REG_d"), %%"REG_S" \n\t"\
88 "jb 1b \n\t"\
89 :: "r" (&c->redDither),\
90 "r" (dest), "g" (width)\
91 : "%"REG_a, "%"REG_d, "%"REG_S\
92 );
93
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same parameters and operands as YSCALEYUV2YV12X (%0 = &c->redDither,
 * %1 = dest, %2 = width), but the filter sum is accumulated in 32 bits.
 *
 * Each filter-list entry is APCK_SIZE bytes and supplies two source pointers
 * (at +0 and at APCK_PTR2) and one coefficient quadword (at APCK_COEF).
 * Samples from the two sources are interleaved with punpck{l,h}wd and
 * multiplied with pmaddwd, so one pass through the loop body handles two
 * taps. The list ends when the pointer at the start of the next entry is
 * null.
 *
 * mm4..mm7 hold the 32-bit sums for the 8 output samples. After the tap loop
 * they are shifted right by 16, packed to words, offset by the value at
 * VROUNDER_OFFSET(%0), shifted right by 3 and packed to unsigned bytes.
 *
 * Label 1 serves both the tap loop and the outer loop; the re-initialisation
 * between "cmp" and "jb" uses only instructions that leave the flags alone.
 */
94 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
95 __asm__ volatile(\
96 "lea " offset "(%0), %%"REG_d" \n\t"\
97 "xor %%"REG_a", %%"REG_a" \n\t"\
98 "pxor %%mm4, %%mm4 \n\t"\
99 "pxor %%mm5, %%mm5 \n\t"\
100 "pxor %%mm6, %%mm6 \n\t"\
101 "pxor %%mm7, %%mm7 \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 ASMALIGN(4) \
104 "1: \n\t"\
105 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
106 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
107 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
108 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
109 "movq %%mm0, %%mm3 \n\t"\
110 "punpcklwd %%mm1, %%mm0 \n\t"\
111 "punpckhwd %%mm1, %%mm3 \n\t"\
112 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
113 "pmaddwd %%mm1, %%mm0 \n\t"\
114 "pmaddwd %%mm1, %%mm3 \n\t"\
115 "paddd %%mm0, %%mm4 \n\t"\
116 "paddd %%mm3, %%mm5 \n\t"\
117 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
118 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
119 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
120 "test %%"REG_S", %%"REG_S" \n\t"\
121 "movq %%mm2, %%mm0 \n\t"\
122 "punpcklwd %%mm3, %%mm2 \n\t"\
123 "punpckhwd %%mm3, %%mm0 \n\t"\
124 "pmaddwd %%mm1, %%mm2 \n\t"\
125 "pmaddwd %%mm1, %%mm0 \n\t"\
126 "paddd %%mm2, %%mm6 \n\t"\
127 "paddd %%mm0, %%mm7 \n\t"\
128 " jnz 1b \n\t"\
129 "psrad $16, %%mm4 \n\t"\
130 "psrad $16, %%mm5 \n\t"\
131 "psrad $16, %%mm6 \n\t"\
132 "psrad $16, %%mm7 \n\t"\
133 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
134 "packssdw %%mm5, %%mm4 \n\t"\
135 "packssdw %%mm7, %%mm6 \n\t"\
136 "paddw %%mm0, %%mm4 \n\t"\
137 "paddw %%mm0, %%mm6 \n\t"\
138 "psraw $3, %%mm4 \n\t"\
139 "psraw $3, %%mm6 \n\t"\
140 "packuswb %%mm6, %%mm4 \n\t"\
141 MOVNTQ(%%mm4, (%1, %%REGa))\
142 "add $8, %%"REG_a" \n\t"\
143 "cmp %2, %%"REG_a" \n\t"\
144 "lea " offset "(%0), %%"REG_d" \n\t"\
145 "pxor %%mm4, %%mm4 \n\t"\
146 "pxor %%mm5, %%mm5 \n\t"\
147 "pxor %%mm6, %%mm6 \n\t"\
148 "pxor %%mm7, %%mm7 \n\t"\
149 "mov (%%"REG_d"), %%"REG_S" \n\t"\
150 "jb 1b \n\t"\
151 :: "r" (&c->redDither),\
152 "r" (dest), "g" (width)\
153 : "%"REG_a, "%"REG_d, "%"REG_S\
154 );
155
/*
 * YSCALEYUV2YV121
 *
 * Asm string fragment (no __asm__ wrapper and no operand list): converts
 * 16-bit samples to bytes without filtering.
 *
 * Expects %0 = source of 16-bit samples, %1 = byte destination, %2 = initial
 * index, which is copied to REG_a. Each iteration reads 8 words at
 * %0 + 2*REG_a, shifts them right by 7 (arithmetic), packs them to unsigned
 * bytes with saturation and stores 8 bytes at %1 + REG_a.
 *
 * The loop repeats while "add $8" produces no carry, so the index is
 * presumably negative with both pointers biased to the end of their buffers
 * (TODO confirm at the call site).
 */
156 #define YSCALEYUV2YV121 \
157 "mov %2, %%"REG_a" \n\t"\
158 ASMALIGN(4) /* FIXME Unroll? */\
159 "1: \n\t"\
160 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
161 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
162 "psraw $7, %%mm0 \n\t"\
163 "psraw $7, %%mm1 \n\t"\
164 "packuswb %%mm1, %%mm0 \n\t"\
165 MOVNTQ(%%mm0, (%1, %%REGa))\
166 "add $8, %%"REG_a" \n\t"\
167 "jnc 1b \n\t"
168
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121 with the same operands (%0 = 16-bit
 * source, %1 = byte destination, %2 = initial index) and the same
 * carry-terminated loop.
 *
 * mm7 is set to 64 in every word (all ones, >>15 gives 1, <<6 gives 64) and
 * added with signed saturation before the >>7 shift, so the result is
 * rounded rather than truncated.
 */
169 #define YSCALEYUV2YV121_ACCURATE \
170 "mov %2, %%"REG_a" \n\t"\
171 "pcmpeqw %%mm7, %%mm7 \n\t"\
172 "psrlw $15, %%mm7 \n\t"\
173 "psllw $6, %%mm7 \n\t"\
174 ASMALIGN(4) /* FIXME Unroll? */\
175 "1: \n\t"\
176 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
177 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
178 "paddsw %%mm7, %%mm0 \n\t"\
179 "paddsw %%mm7, %%mm1 \n\t"\
180 "psraw $7, %%mm0 \n\t"\
181 "psraw $7, %%mm1 \n\t"\
182 "packuswb %%mm1, %%mm0 \n\t"\
183 MOVNTQ(%%mm0, (%1, %%REGa))\
184 "add $8, %%"REG_a" \n\t"\
185 "jnc 1b \n\t"
186
187 /*
188 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190 "r" (dest), "m" (dstW),
191 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
193 */
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an asm statement ("__asm__ volatile(") and emits the chroma part of
 * the multi-tap vertical filter for packed output. The statement is left
 * open: further fragments and finally an operand list (see
 * YSCALEYUV2PACKEDX_END) must follow.
 *
 * REG_a is zeroed and label 1 marks the start of the per-pixel-group loop,
 * which a later fragment must close with a jump to "1b". Label 2 is the tap
 * loop over the filter list at CHR_MMX_FILTER_OFFSET(%0): 16-byte entries
 * with a source pointer at +0 and a coefficient at +8, ended by a null
 * pointer.
 *
 * Result: mm3 = filtered U, mm4 = filtered V (read at displacement VOF from
 * the same source pointer). Both start from the value at
 * VROUNDER_OFFSET(%0). Chroma is indexed with REG_a unscaled.
 */
194 #define YSCALEYUV2PACKEDX_UV \
195 __asm__ volatile(\
196 "xor %%"REG_a", %%"REG_a" \n\t"\
197 ASMALIGN(4)\
198 "nop \n\t"\
199 "1: \n\t"\
200 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
201 "mov (%%"REG_d"), %%"REG_S" \n\t"\
202 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
203 "movq %%mm3, %%mm4 \n\t"\
204 ASMALIGN(4)\
205 "2: \n\t"\
206 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
207 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
208 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
209 "add $16, %%"REG_d" \n\t"\
210 "mov (%%"REG_d"), %%"REG_S" \n\t"\
211 "pmulhw %%mm0, %%mm2 \n\t"\
212 "pmulhw %%mm0, %%mm5 \n\t"\
213 "paddw %%mm2, %%mm3 \n\t"\
214 "paddw %%mm5, %%mm4 \n\t"\
215 "test %%"REG_S", %%"REG_S" \n\t"\
216 " jnz 2b \n\t"\
217
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm string fragment: multi-tap vertical filter for one plane of 16-bit
 * samples indexed by 2*REG_a (the inline comments label it luma; the macro
 * name suggests it is also used for alpha).
 *
 *   offset      string literal, displacement from %0 to the filter list
 *   coeff       MMX register used for the current coefficient
 *   src1, src2  MMX scratch registers for samples 0..3 and 4..7
 *   dst1, dst2  MMX registers receiving the sums for samples 0..3 and 4..7
 *
 * dst1/dst2 start from the value at VROUNDER_OFFSET(%0). The filter list
 * uses 16-byte entries (pointer at +0, coefficient at +8) and ends at a null
 * pointer. REG_d and REG_S are overwritten; label 2 is used locally.
 */
218 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219 "lea "offset"(%0), %%"REG_d" \n\t"\
220 "mov (%%"REG_d"), %%"REG_S" \n\t"\
221 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
222 "movq "#dst1", "#dst2" \n\t"\
223 ASMALIGN(4)\
224 "2: \n\t"\
225 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
226 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
227 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
228 "add $16, %%"REG_d" \n\t"\
229 "mov (%%"REG_d"), %%"REG_S" \n\t"\
230 "pmulhw "#coeff", "#src1" \n\t"\
231 "pmulhw "#coeff", "#src2" \n\t"\
232 "paddw "#src1", "#dst1" \n\t"\
233 "paddw "#src2", "#dst2" \n\t"\
234 "test %%"REG_S", %%"REG_S" \n\t"\
235 " jnz 2b \n\t"\
236
/*
 * YSCALEYUV2PACKEDX
 *
 * Chroma filter followed by the luma filter (list at
 * LUM_MMX_FILTER_OFFSET). Afterwards mm3 = U, mm4 = V, mm1 = Y for samples
 * 0..3 and mm7 = Y for samples 4..7; mm0, mm2 and mm5 are scratch.
 * The asm statement opened by the UV part is still open.
 */
237 #define YSCALEYUV2PACKEDX \
238 YSCALEYUV2PACKEDX_UV \
239 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
240
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand and clobber lists that close an asm statement opened by one of the
 * YSCALEYUV2PACKEDX* macros.
 *
 *   %0      &c->redDither (base for all the *_OFFSET displacements)
 *   %1..%3  dummy memory operands, presumably placeholders that keep the
 *           numbering of %4/%5 stable (TODO confirm)
 *   %4      dest
 *   %5      dstW (memory operand)
 *
 * The variables c, dummy, dest and dstW must be in scope at the expansion
 * site. Only REG_a, REG_d and REG_S are declared as clobbered.
 */
241 #define YSCALEYUV2PACKEDX_END \
242 :: "r" (&c->redDither), \
243 "m" (dummy), "m" (dummy), "m" (dummy),\
244 "r" (dest), "m" (dstW) \
245 : "%"REG_a, "%"REG_d, "%"REG_S \
246 );
247
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX_UV. It opens the asm
 * statement, zeroes REG_a and defines label 1 as the start of the
 * per-pixel-group loop.
 *
 * The tap loop (label 2) walks APCK_SIZE-byte entries at
 * CHR_MMX_FILTER_OFFSET(%0). Each entry has two source pointers (+0 and
 * APCK_PTR2) and a coefficient quadword (APCK_COEF), so two taps are
 * combined per pmaddwd. mm4/mm5 accumulate U and mm6/mm7 accumulate V.
 *
 * After the loop the sums are shifted right by 16, packed to words and
 * offset by the value at VROUNDER_OFFSET(%0). U is then stored to
 * U_TEMP(%0) and V to V_TEMP(%0); YSCALEYUV2PACKEDX_ACCURATE_YA reloads
 * them after reusing mm4..mm7.
 */
248 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
249 __asm__ volatile(\
250 "xor %%"REG_a", %%"REG_a" \n\t"\
251 ASMALIGN(4)\
252 "nop \n\t"\
253 "1: \n\t"\
254 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
255 "mov (%%"REG_d"), %%"REG_S" \n\t"\
256 "pxor %%mm4, %%mm4 \n\t"\
257 "pxor %%mm5, %%mm5 \n\t"\
258 "pxor %%mm6, %%mm6 \n\t"\
259 "pxor %%mm7, %%mm7 \n\t"\
260 ASMALIGN(4)\
261 "2: \n\t"\
262 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
263 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
264 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
265 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
266 "movq %%mm0, %%mm3 \n\t"\
267 "punpcklwd %%mm1, %%mm0 \n\t"\
268 "punpckhwd %%mm1, %%mm3 \n\t"\
269 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
270 "pmaddwd %%mm1, %%mm0 \n\t"\
271 "pmaddwd %%mm1, %%mm3 \n\t"\
272 "paddd %%mm0, %%mm4 \n\t"\
273 "paddd %%mm3, %%mm5 \n\t"\
274 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
275 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
276 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
277 "test %%"REG_S", %%"REG_S" \n\t"\
278 "movq %%mm2, %%mm0 \n\t"\
279 "punpcklwd %%mm3, %%mm2 \n\t"\
280 "punpckhwd %%mm3, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm2 \n\t"\
282 "pmaddwd %%mm1, %%mm0 \n\t"\
283 "paddd %%mm2, %%mm6 \n\t"\
284 "paddd %%mm0, %%mm7 \n\t"\
285 " jnz 2b \n\t"\
286 "psrad $16, %%mm4 \n\t"\
287 "psrad $16, %%mm5 \n\t"\
288 "psrad $16, %%mm6 \n\t"\
289 "psrad $16, %%mm7 \n\t"\
290 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
291 "packssdw %%mm5, %%mm4 \n\t"\
292 "packssdw %%mm7, %%mm6 \n\t"\
293 "paddw %%mm0, %%mm4 \n\t"\
294 "paddw %%mm0, %%mm6 \n\t"\
295 "movq %%mm4, "U_TEMP"(%0) \n\t"\
296 "movq %%mm6, "V_TEMP"(%0) \n\t"\
297
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * 32-bit-accumulating filter for one plane of 16-bit samples indexed by
 * 2*REG_a (labelled luma in the inline comments). It is meant to follow
 * YSCALEYUV2PACKEDX_ACCURATE_UV inside the same asm statement.
 *
 *   offset  string literal, displacement from %0 to the filter list, which
 *           uses the APCK_SIZE / APCK_PTR2 / APCK_COEF entry layout
 *
 * mm1/mm5 accumulate samples 0..3 and mm7/mm6 accumulate samples 4..7.
 * After the tap loop the sums are shifted right by 16, packed to words and
 * offset by the value at VROUNDER_OFFSET(%0).
 *
 * On exit: mm1 = samples 0..3, mm7 = samples 4..7, and mm3/mm4 are reloaded
 * from U_TEMP(%0)/V_TEMP(%0).
 */
298 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299 "lea "offset"(%0), %%"REG_d" \n\t"\
300 "mov (%%"REG_d"), %%"REG_S" \n\t"\
301 "pxor %%mm1, %%mm1 \n\t"\
302 "pxor %%mm5, %%mm5 \n\t"\
303 "pxor %%mm7, %%mm7 \n\t"\
304 "pxor %%mm6, %%mm6 \n\t"\
305 ASMALIGN(4)\
306 "2: \n\t"\
307 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
309 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
310 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
311 "movq %%mm0, %%mm3 \n\t"\
312 "punpcklwd %%mm4, %%mm0 \n\t"\
313 "punpckhwd %%mm4, %%mm3 \n\t"\
314 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
315 "pmaddwd %%mm4, %%mm0 \n\t"\
316 "pmaddwd %%mm4, %%mm3 \n\t"\
317 "paddd %%mm0, %%mm1 \n\t"\
318 "paddd %%mm3, %%mm5 \n\t"\
319 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
320 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
321 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
322 "test %%"REG_S", %%"REG_S" \n\t"\
323 "movq %%mm2, %%mm0 \n\t"\
324 "punpcklwd %%mm3, %%mm2 \n\t"\
325 "punpckhwd %%mm3, %%mm0 \n\t"\
326 "pmaddwd %%mm4, %%mm2 \n\t"\
327 "pmaddwd %%mm4, %%mm0 \n\t"\
328 "paddd %%mm2, %%mm7 \n\t"\
329 "paddd %%mm0, %%mm6 \n\t"\
330 " jnz 2b \n\t"\
331 "psrad $16, %%mm1 \n\t"\
332 "psrad $16, %%mm5 \n\t"\
333 "psrad $16, %%mm7 \n\t"\
334 "psrad $16, %%mm6 \n\t"\
335 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
336 "packssdw %%mm5, %%mm1 \n\t"\
337 "packssdw %%mm6, %%mm7 \n\t"\
338 "paddw %%mm0, %%mm1 \n\t"\
339 "paddw %%mm0, %%mm7 \n\t"\
340 "movq "U_TEMP"(%0), %%mm3 \n\t"\
341 "movq "V_TEMP"(%0), %%mm4 \n\t"\
342
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Accurate chroma filter followed by the accurate luma filter (list at
 * LUM_MMX_FILTER_OFFSET). It leaves the registers in the same layout as
 * YSCALEYUV2PACKEDX: mm3 = U, mm4 = V, mm1 = Y 0..3, mm7 = Y 4..7.
 * The asm statement is still open afterwards.
 */
343 #define YSCALEYUV2PACKEDX_ACCURATE \
344 YSCALEYUV2PACKEDX_ACCURATE_UV \
345 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346
/*
 * YSCALEYUV2RGBX
 *
 * Asm string fragment: YUV to RGB conversion for 8 pixels. All constants
 * are read relative to %0.
 *
 * Input:  mm3 = U, mm4 = V (4 words each), mm1 = Y 0..3, mm7 = Y 4..7.
 * Output: mm2 = B, mm4 = G, mm5 = R, 8 unsigned bytes each.
 *
 * U_OFFSET, V_OFFSET and Y_OFFSET are subtracted first; the components are
 * then scaled with pmulhw by UG/VG/UB/VR/Y_COEFF. Each chroma term is
 * duplicated with punpck{l,h}wd so that one chroma sample serves two
 * adjacent luma samples. mm0, mm3 and mm6 are used as scratch.
 */
347 #define YSCALEYUV2RGBX \
348 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
349 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
350 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
351 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
352 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
353 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
354 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
356 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
357 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
358 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
359 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
360 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
361 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362 "paddw %%mm3, %%mm4 \n\t"\
363 "movq %%mm2, %%mm0 \n\t"\
364 "movq %%mm5, %%mm6 \n\t"\
365 "movq %%mm4, %%mm3 \n\t"\
366 "punpcklwd %%mm2, %%mm2 \n\t"\
367 "punpcklwd %%mm5, %%mm5 \n\t"\
368 "punpcklwd %%mm4, %%mm4 \n\t"\
369 "paddw %%mm1, %%mm2 \n\t"\
370 "paddw %%mm1, %%mm5 \n\t"\
371 "paddw %%mm1, %%mm4 \n\t"\
372 "punpckhwd %%mm0, %%mm0 \n\t"\
373 "punpckhwd %%mm6, %%mm6 \n\t"\
374 "punpckhwd %%mm3, %%mm3 \n\t"\
375 "paddw %%mm7, %%mm0 \n\t"\
376 "paddw %%mm7, %%mm6 \n\t"\
377 "paddw %%mm7, %%mm3 \n\t"\
378 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379 "packuswb %%mm0, %%mm2 \n\t"\
380 "packuswb %%mm6, %%mm5 \n\t"\
381 "packuswb %%mm3, %%mm4 \n\t"\
382
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm string fragment: two-line vertical blend for packed YUV output.
 *
 *   index  register used as loop index (zeroed here)
 *   c      base register for the *_OFFSET displacements
 *
 * Operands expected: %0 = buf0, %1 = buf1 (luma, indexed by 2*index),
 * %2 = uvbuf0, %3 = uvbuf1 (chroma, indexed by index; V at displacement
 * VOF).
 *
 * Before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back, so
 * this fragment modifies the context in memory.
 *
 * Per iteration each plane is blended as
 *     ((line0 - line1) * alpha >> 16) + (line1 >> 7)
 * leaving mm3 = U, mm4 = V, mm1 = Y 0..3, mm7 = Y 4..7. Label 1 is the loop
 * head; the closing jump must come from a following WRITE* fragment.
 *
 * The YSCALEYUV2PACKED wrapper only forces macro expansion of its arguments
 * before they are stringified.
 */
383 #define REAL_YSCALEYUV2PACKED(index, c) \
384 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
385 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
386 "psraw $3, %%mm0 \n\t"\
387 "psraw $3, %%mm1 \n\t"\
388 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390 "xor "#index", "#index" \n\t"\
391 ASMALIGN(4)\
392 "1: \n\t"\
393 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
394 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
395 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
396 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
397 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
403 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
404 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
405 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
406 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
407 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
408 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
409 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
410 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
411 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
412 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
415 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
416 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
418
419 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
420
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm string fragment: chroma half of the two-line vertical blend for RGB
 * output. It zeroes index and defines label 1 as the loop head.
 *
 * Operands expected: %2 = uvbuf0, %3 = uvbuf1 (V at displacement VOF).
 * U and V are blended as
 *     ((line0 - line1) * coeff >> 16) + (line1 >> 4)
 * with the coefficient read from CHR_MMX_FILTER_OFFSET+8(c). U_OFFSET(c)
 * and V_OFFSET(c) are then subtracted.
 *
 * On exit: mm2 = offset-corrected U, mm5 = offset-corrected V,
 * mm3 = U * UG_COEFF(c) and mm4 = V * VG_COEFF(c) (pmulhw results).
 */
421 #define REAL_YSCALEYUV2RGB_UV(index, c) \
422 "xor "#index", "#index" \n\t"\
423 ASMALIGN(4)\
424 "1: \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
432 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
435 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
436 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
437 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
438 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
439 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
440 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
441 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
442 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
443 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
444 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
445
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm string fragment: blends two lines of 16-bit samples, b1 and b2, both
 * indexed by 2*index, as
 *     ((b1 - b2) * coeff >> 16) + (b2 >> 4)
 * with the coefficient read from LUM_MMX_FILTER_OFFSET+8(c).
 *
 * On exit: mm1 = samples 0..3, mm7 = samples 4..7; mm0 and mm6 are scratch.
 */
446 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
448 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
449 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
450 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
451 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
452 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
456 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
457 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm string fragment: final stage of the YUV to RGB conversion, with all
 * constants read relative to c.
 *
 * Input:  mm2 = offset-corrected U, mm5 = offset-corrected V, mm3 = ug,
 *         mm4 = vg, mm1 = Y 0..3, mm7 = Y 4..7 (the layout left by
 *         REAL_YSCALEYUV2RGB_UV followed by REAL_YSCALEYUV2RGB_YA).
 * Output: mm2 = B, mm4 = G, mm5 = R, 8 unsigned bytes each.
 *
 * Each chroma term is duplicated with punpck{l,h}wd so that one chroma
 * sample serves two adjacent luma samples. mm0, mm3 and mm6 are scratch.
 */
460 #define REAL_YSCALEYUV2RGB_COEFF(c) \
461 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
462 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
463 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
464 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
465 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
466 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
467 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468 "paddw %%mm3, %%mm4 \n\t"\
469 "movq %%mm2, %%mm0 \n\t"\
470 "movq %%mm5, %%mm6 \n\t"\
471 "movq %%mm4, %%mm3 \n\t"\
472 "punpcklwd %%mm2, %%mm2 \n\t"\
473 "punpcklwd %%mm5, %%mm5 \n\t"\
474 "punpcklwd %%mm4, %%mm4 \n\t"\
475 "paddw %%mm1, %%mm2 \n\t"\
476 "paddw %%mm1, %%mm5 \n\t"\
477 "paddw %%mm1, %%mm4 \n\t"\
478 "punpckhwd %%mm0, %%mm0 \n\t"\
479 "punpckhwd %%mm6, %%mm6 \n\t"\
480 "punpckhwd %%mm3, %%mm3 \n\t"\
481 "paddw %%mm7, %%mm0 \n\t"\
482 "paddw %%mm7, %%mm6 \n\t"\
483 "paddw %%mm7, %%mm3 \n\t"\
484 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485 "packuswb %%mm0, %%mm2 \n\t"\
486 "packuswb %%mm6, %%mm5 \n\t"\
487 "packuswb %%mm3, %%mm4 \n\t"\
488
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
 * are macro-expanded before being stringified.
 *
 * YSCALEYUV2RGB(index, c) chains the three stages of the two-line blend
 * with RGB conversion: chroma blend, luma blend of %0 and %1, then the
 * coefficient stage. It leaves mm2 = B, mm4 = G, mm5 = R and an open loop at
 * label 1.
 */
489 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
490
491 #define YSCALEYUV2RGB(index, c) \
492 REAL_YSCALEYUV2RGB_UV(index, c) \
493 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494 REAL_YSCALEYUV2RGB_COEFF(c)
495
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Asm string fragment: single-line variant with no vertical blending. It
 * zeroes index, defines label 1 as the loop head and loads
 *     mm3 = U, mm4 = V   from %2 (V at displacement VOF), each >> 7
 *     mm1 = Y 0..3, mm7 = Y 4..7   from %0 indexed by 2*index, each >> 7
 *
 * The parameter c is not referenced in the body. The wrapper only forces
 * macro expansion of the arguments before they are stringified.
 */
496 #define REAL_YSCALEYUV2PACKED1(index, c) \
497 "xor "#index", "#index" \n\t"\
498 ASMALIGN(4)\
499 "1: \n\t"\
500 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
501 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
502 "psraw $7, %%mm3 \n\t" \
503 "psraw $7, %%mm4 \n\t" \
504 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
505 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
506 "psraw $7, %%mm1 \n\t" \
507 "psraw $7, %%mm7 \n\t" \
508
509 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
510
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 *
 * Asm string fragment: single-line YUV to RGB conversion (no vertical
 * blending) for 8 pixels per iteration. It zeroes index and defines label 1
 * as the loop head.
 *
 * Operands expected: %0 = buf0 (luma, indexed by 2*index), %2 = uvbuf0
 * (chroma, indexed by index; V at displacement VOF). All samples are
 * shifted right by 4 before the offsets and coefficients, read relative to
 * c, are applied.
 *
 * Output: mm2 = B, mm4 = G, mm5 = R, 8 unsigned bytes each. mm0, mm1, mm3,
 * mm6 and mm7 are overwritten.
 *
 * The wrapper only forces macro expansion of the arguments before they are
 * stringified.
 */
511 #define REAL_YSCALEYUV2RGB1(index, c) \
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
518 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
519 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
520 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
521 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
522 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
523 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
524 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
525 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
527 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
528 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
529 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
530 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
531 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
532 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
533 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
534 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
535 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
536 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537 "paddw %%mm3, %%mm4 \n\t"\
538 "movq %%mm2, %%mm0 \n\t"\
539 "movq %%mm5, %%mm6 \n\t"\
540 "movq %%mm4, %%mm3 \n\t"\
541 "punpcklwd %%mm2, %%mm2 \n\t"\
542 "punpcklwd %%mm5, %%mm5 \n\t"\
543 "punpcklwd %%mm4, %%mm4 \n\t"\
544 "paddw %%mm1, %%mm2 \n\t"\
545 "paddw %%mm1, %%mm5 \n\t"\
546 "paddw %%mm1, %%mm4 \n\t"\
547 "punpckhwd %%mm0, %%mm0 \n\t"\
548 "punpckhwd %%mm6, %%mm6 \n\t"\
549 "punpckhwd %%mm3, %%mm3 \n\t"\
550 "paddw %%mm7, %%mm0 \n\t"\
551 "paddw %%mm7, %%mm6 \n\t"\
552 "paddw %%mm7, %%mm3 \n\t"\
553 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554 "packuswb %%mm0, %%mm2 \n\t"\
555 "packuswb %%mm6, %%mm5 \n\t"\
556 "packuswb %%mm3, %%mm4 \n\t"\
557
558 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
559
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 *
 * Asm string fragment: like YSCALEYUV2PACKED1, except that the chroma is
 * the sum of two lines (%2 = uvbuf0 and %3 = uvbuf1, V at displacement VOF)
 * shifted right by 8 with a logical shift, i.e. their average >> 7.
 * Luma is taken from %0 only, shifted right by 7.
 *
 * On exit: mm3 = U, mm4 = V, mm1 = Y 0..3, mm7 = Y 4..7. It zeroes index
 * and defines label 1 as the loop head. The parameter c is not referenced
 * in the body.
 */
560 #define REAL_YSCALEYUV2PACKED1b(index, c) \
561 "xor "#index", "#index" \n\t"\
562 ASMALIGN(4)\
563 "1: \n\t"\
564 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
565 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
566 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
567 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
568 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
569 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
570 "psrlw $8, %%mm3 \n\t" \
571 "psrlw $8, %%mm4 \n\t" \
572 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
573 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
574 "psraw $7, %%mm1 \n\t" \
575 "psraw $7, %%mm7 \n\t"
576 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
577
/*
 * REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 *
 * Asm string fragment: like YSCALEYUV2RGB1, except that the chroma is the
 * sum of two lines (%2 = uvbuf0 and %3 = uvbuf1, V at displacement VOF)
 * shifted right by 5 with a logical shift, i.e. their average >> 4.
 * The 16-bit sum is not widened, hence the FIXME about overflow below.
 * Luma is taken from %0 only, shifted right by 4.
 *
 * Output: mm2 = B, mm4 = G, mm5 = R, 8 unsigned bytes each. It zeroes index
 * and defines label 1 as the loop head.
 */
578 // do vertical chrominance interpolation
579 #define REAL_YSCALEYUV2RGB1b(index, c) \
580 "xor "#index", "#index" \n\t"\
581 ASMALIGN(4)\
582 "1: \n\t"\
583 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
584 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
585 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
586 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
587 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
588 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
589 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
590 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
591 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
592 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
593 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
594 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
595 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
596 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
597 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
598 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
599 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
600 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
601 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
602 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
603 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
604 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
605 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
606 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
607 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
608 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
609 "paddw %%mm3, %%mm4 \n\t"\
610 "movq %%mm2, %%mm0 \n\t"\
611 "movq %%mm5, %%mm6 \n\t"\
612 "movq %%mm4, %%mm3 \n\t"\
613 "punpcklwd %%mm2, %%mm2 \n\t"\
614 "punpcklwd %%mm5, %%mm5 \n\t"\
615 "punpcklwd %%mm4, %%mm4 \n\t"\
616 "paddw %%mm1, %%mm2 \n\t"\
617 "paddw %%mm1, %%mm5 \n\t"\
618 "paddw %%mm1, %%mm4 \n\t"\
619 "punpckhwd %%mm0, %%mm0 \n\t"\
620 "punpckhwd %%mm6, %%mm6 \n\t"\
621 "punpckhwd %%mm3, %%mm3 \n\t"\
622 "paddw %%mm7, %%mm0 \n\t"\
623 "paddw %%mm7, %%mm6 \n\t"\
624 "paddw %%mm7, %%mm3 \n\t"\
625 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
626 "packuswb %%mm0, %%mm2 \n\t"\
627 "packuswb %%mm6, %%mm5 \n\t"\
628 "packuswb %%mm3, %%mm4 \n\t"\
629
630 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
631
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index) / YSCALEYUV2RGB1_ALPHA(index)
 *
 * Asm string fragment: loads 8 16-bit alpha samples from %1 (indexed by
 * 2*index), shifts them right by 7 and packs them to 8 unsigned bytes in
 * mm7. mm1 is used as scratch.
 *
 * The wrapper only forces macro expansion of the argument before it is
 * stringified.
 */
632 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
633 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
634 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
635 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
636 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
637 "packuswb %%mm1, %%mm7 \n\t"
638 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 * WRITEBGR32(...)
 *
 * Asm string fragment: interleaves four registers of 8 bytes each into
 * 8 32-bit pixels and stores them, then closes the loop opened at label 1.
 *
 *   dst             destination base; pixels go to dst + 4*index
 *   dstw            loop bound compared against index
 *   index           loop index, advanced by 8 per iteration
 *   b, g, r, a      MMX registers holding the four components; b and r are
 *                   overwritten
 *   q0, q2, q3, t   MMX scratch registers
 *
 * Within each pixel the bytes are stored in the order b, g, r, a from the
 * lowest address. The loop repeats while index is below dstw (unsigned
 * compare). Stores go through MOVNTQ.
 *
 * The wrapper only forces macro expansion of the arguments before they are
 * stringified.
 */
640 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
641 "movq "#b", "#q2" \n\t" /* B */\
642 "movq "#r", "#t" \n\t" /* R */\
643 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
644 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
645 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
646 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
647 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
648 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
649 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
650 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
651 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
652 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
653 \
654 MOVNTQ( q0, (dst, index, 4))\
655 MOVNTQ( b, 8(dst, index, 4))\
656 MOVNTQ( q2, 16(dst, index, 4))\
657 MOVNTQ( q3, 24(dst, index, 4))\
658 \
659 "add $8, "#index" \n\t"\
660 "cmp "#dstw", "#index" \n\t"\
661 " jb 1b \n\t"
662 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663
/*
 * REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 *
 * Asm string fragment: packs 8 pixels into 16-bit words and stores 16 bytes
 * at dst + 2*index, then closes the loop opened at label 1.
 *
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each). mm7 is used as the high
 * byte when G is widened, so it is presumably expected to be zero (TODO
 * confirm at the call sites).
 *
 * B and R are masked with bF8 and G with bFC (constants defined elsewhere).
 * B is shifted right by 3, R becomes the high byte of each word and the
 * widened G is shifted left by 3 before being OR-ed in. With mm7 = 0 this
 * gives R in bits 15..11, G in bits 10..5 and B in bits 4..0.
 *
 * mm1 and mm3 are scratch. index advances by 8 while below dstw.
 */
664 #define REAL_WRITERGB16(dst, dstw, index) \
665 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
666 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
667 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
668 "psrlq $3, %%mm2 \n\t"\
669 \
670 "movq %%mm2, %%mm1 \n\t"\
671 "movq %%mm4, %%mm3 \n\t"\
672 \
673 "punpcklbw %%mm7, %%mm3 \n\t"\
674 "punpcklbw %%mm5, %%mm2 \n\t"\
675 "punpckhbw %%mm7, %%mm4 \n\t"\
676 "punpckhbw %%mm5, %%mm1 \n\t"\
677 \
678 "psllq $3, %%mm3 \n\t"\
679 "psllq $3, %%mm4 \n\t"\
680 \
681 "por %%mm3, %%mm2 \n\t"\
682 "por %%mm4, %%mm1 \n\t"\
683 \
684 MOVNTQ(%%mm2, (dst, index, 2))\
685 MOVNTQ(%%mm1, 8(dst, index, 2))\
686 \
687 "add $8, "#index" \n\t"\
688 "cmp "#dstw", "#index" \n\t"\
689 " jb 1b \n\t"
690 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
691
/*
 * REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 *
 * Asm string fragment: same structure as WRITERGB16 with a different bit
 * layout.
 *
 * Input: mm2 = B, mm4 = G, mm5 = R (8 bytes each); mm7 is presumably
 * expected to be zero (TODO confirm at the call sites).
 *
 * All three components are masked with bF8. B is shifted right by 3, R by 1
 * and the widened G left by 2. With mm7 = 0 this gives R in bits 14..10,
 * G in bits 9..5 and B in bits 4..0.
 *
 * 16 bytes are stored at dst + 2*index; index advances by 8 while below
 * dstw. mm1 and mm3 are scratch.
 */
692 #define REAL_WRITERGB15(dst, dstw, index) \
693 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
694 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
695 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
696 "psrlq $3, %%mm2 \n\t"\
697 "psrlq $1, %%mm5 \n\t"\
698 \
699 "movq %%mm2, %%mm1 \n\t"\
700 "movq %%mm4, %%mm3 \n\t"\
701 \
702 "punpcklbw %%mm7, %%mm3 \n\t"\
703 "punpcklbw %%mm5, %%mm2 \n\t"\
704 "punpckhbw %%mm7, %%mm4 \n\t"\
705 "punpckhbw %%mm5, %%mm1 \n\t"\
706 \
707 "psllq $2, %%mm3 \n\t"\
708 "psllq $2, %%mm4 \n\t"\
709 \
710 "por %%mm3, %%mm2 \n\t"\
711 "por %%mm4, %%mm1 \n\t"\
712 \
713 MOVNTQ(%%mm2, (dst, index, 2))\
714 MOVNTQ(%%mm1, 8(dst, index, 2))\
715 \
716 "add $8, "#index" \n\t"\
717 "cmp "#dstw", "#index" \n\t"\
718 " jb 1b \n\t"
719 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
720
/*
 * WRITEBGR24OLD(dst, dstw, index)
 *
 * Asm string fragment: packs 8 pixels into 24 bytes (3 bytes per pixel)
 * and stores them at dst, dst+8 and dst+16, then closes the loop opened at
 * label 1.
 *
 * Input: mm2 = B, mm4 = G, mm5 = R, mm7 = 0 (see the first inline comment).
 * The pixels are first expanded to four 0RGB0RGB quadwords and then
 * squeezed together with shifts and the bm* byte masks (defined elsewhere).
 *
 * dst itself is advanced by 24 each iteration, so it must be a register
 * the asm is allowed to modify. index advances by 8 while below dstw.
 * mm0..mm6 are overwritten.
 */
721 #define WRITEBGR24OLD(dst, dstw, index) \
722 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
723 "movq %%mm2, %%mm1 \n\t" /* B */\
724 "movq %%mm5, %%mm6 \n\t" /* R */\
725 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
726 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
727 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
728 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
729 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
730 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
731 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
732 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
733 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
734 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
735 \
736 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
737 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
738 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
739 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
740 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
741 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
742 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
743 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
744 \
745 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
746 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
747 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
748 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
749 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
750 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
751 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
752 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
753 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
754 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
755 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
756 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
757 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
758 \
759 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
760 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
761 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
762 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
763 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
764 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
765 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
766 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
767 \
768 MOVNTQ(%%mm0, (dst))\
769 MOVNTQ(%%mm2, 8(dst))\
770 MOVNTQ(%%mm3, 16(dst))\
771 "add $24, "#dst" \n\t"\
772 \
773 "add $8, "#index" \n\t"\
774 "cmp "#dstw", "#index" \n\t"\
775 " jb 1b \n\t"
776
/*
 * WRITEBGR24MMX(dst, dstw, index)
 *
 * Asm string fragment with the same contract as WRITEBGR24OLD: 8 pixels in
 * mm2 = B, mm4 = G, mm5 = R (mm7 = 0 on entry) are packed into 24 bytes and
 * stored at dst, dst+8 and dst+16. dst advances by 24, index by 8, and the
 * loop at label 1 repeats while index is below dstw.
 *
 * This variant uses only shifts and punpckhdq, with no memory mask
 * constants. Every MMX register mm0..mm7 is overwritten, including mm7.
 */
777 #define WRITEBGR24MMX(dst, dstw, index) \
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
779 "movq %%mm2, %%mm1 \n\t" /* B */\
780 "movq %%mm5, %%mm6 \n\t" /* R */\
781 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
782 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
783 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
784 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
785 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
786 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
787 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
788 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
789 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
790 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
791 \
792 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
793 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
794 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
795 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
796 \
797 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
798 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
799 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
800 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
801 \
802 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
803 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
804 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
805 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
806 \
807 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
809 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
810 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
811 MOVNTQ(%%mm0, (dst))\
812 \
813 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
814 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
815 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
816 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
817 MOVNTQ(%%mm6, 8(dst))\
818 \
819 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
820 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
821 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
822 MOVNTQ(%%mm5, 16(dst))\
823 \
824 "add $24, "#dst" \n\t"\
825 \
826 "add $8, "#index" \n\t"\
827 "cmp "#dstw", "#index" \n\t"\
828 " jb 1b \n\t"
829
/*
 * WRITEBGR24MMX2(dst, dstw, index)
 *
 * Variant of the 24-bit writer that uses pshufw plus the ff_M24A / ff_M24B /
 * ff_M24C byte masks to build the three output quadwords directly.
 *
 * Expected on entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each).
 * mm0 and mm7 are loaded with ff_M24A and ff_M24C (so the incoming mm7 is not
 * used), mm4 is shifted right by one byte part-way through, and mm1, mm3 and
 * mm6 are used as scratch.
 *
 * dst / dstw / index behave as in the other writer fragments: 24 bytes are
 * stored at dst, dst advances by 24, index advances by 8, and "jb 1b" jumps
 * back to the enclosing loop's "1:" label while index is below dstw.
 */
830 #define WRITEBGR24MMX2(dst, dstw, index) \
831 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
832 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
833 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
834 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
835 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
836 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
837 \
838 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
839 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
840 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
841 \
842 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
843 "por %%mm1, %%mm6 \n\t"\
844 "por %%mm3, %%mm6 \n\t"\
845 MOVNTQ(%%mm6, (dst))\
846 \
847 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
848 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
849 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
850 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
851 \
852 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
853 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
854 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
855 \
856 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
857 "por %%mm3, %%mm6 \n\t"\
858 MOVNTQ(%%mm6, 8(dst))\
859 \
860 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
861 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
862 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
863 \
864 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
865 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
866 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
867 \
868 "por %%mm1, %%mm3 \n\t"\
869 "por %%mm3, %%mm6 \n\t"\
870 MOVNTQ(%%mm6, 16(dst))\
871 \
872 "add $24, "#dst" \n\t"\
873 \
874 "add $8, "#index" \n\t"\
875 "cmp "#dstw", "#index" \n\t"\
876 " jb 1b \n\t"
877
/*
 * WRITEBGR24 is the 24-bit writer used by the scalers in this template.
 * Drop any definition left over from a previous inclusion of the template,
 * then bind it to the pshufw-based writer when MMX2 is available and to the
 * plain MMX writer otherwise.
 */
#undef WRITEBGR24
#if !HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#endif
885
/*
 * REAL_WRITEYUY2(dst, dstw, index) / WRITEYUY2(dst, dstw, index)
 *
 * Asm-string fragment that packs 8 pixels into 16 bytes of interleaved
 * output and closes the enclosing loop.
 *
 * Inputs are 16-bit words in mm1/mm7 (packed together into 8 bytes) and in
 * mm3 and mm4 (each packed to bytes, then interleaved byte-wise as
 * mm3,mm4,mm3,mm4,...).  The final punpck pair alternates one byte of
 * mm1/mm7 with one byte of that mm3/mm4 stream, which matches a
 * Y0 U Y1 V layout if mm1/mm7 hold luma and mm3/mm4 hold U/V
 * (presumably so, given the macro name -- confirm against the callers).
 *
 * Two quadwords are stored at dst + 2*index.  index advances by 8 and
 * "jb 1b" loops back to the caller-provided "1:" label while index is
 * below dstw.
 *
 * WRITEYUY2 is a forwarding wrapper; the extra macro level allows the
 * arguments to be macro-expanded before REAL_WRITEYUY2 stringifies them
 * with '#'.
 */
886 #define REAL_WRITEYUY2(dst, dstw, index) \
887 "packuswb %%mm3, %%mm3 \n\t"\
888 "packuswb %%mm4, %%mm4 \n\t"\
889 "packuswb %%mm7, %%mm1 \n\t"\
890 "punpcklbw %%mm4, %%mm3 \n\t"\
891 "movq %%mm1, %%mm7 \n\t"\
892 "punpcklbw %%mm3, %%mm1 \n\t"\
893 "punpckhbw %%mm3, %%mm7 \n\t"\
894 \
895 MOVNTQ(%%mm1, (dst, index, 2))\
896 MOVNTQ(%%mm7, 8(dst, index, 2))\
897 \
898 "add $8, "#index" \n\t"\
899 "cmp "#dstw", "#index" \n\t"\
900 " jb 1b \n\t"
901 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
902
903
/**
 * Vertical filtering of one output line to planar 8-bit output.
 *
 * Dispatch order:
 *  - HAVE_MMX and SWS_BITEXACT not set: the YSCALEYUV2YV12X asm macros are
 *    used (the _ACCURATE variants when SWS_ACCURATE_RND is set) and the
 *    function returns.  Chroma is written only when uDest is non-NULL, alpha
 *    only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL, luma
 *    always.
 *  - otherwise, with HAVE_ALTIVEC: yuv2yuvX_altivec_real() is called; note
 *    that alpSrc/aDest are not passed to it.
 *  - otherwise: the plain C implementation yuv2yuvXinC().
 *
 * The lumFilter / lumSrc / chrFilter / chrSrc arguments are consumed only by
 * the AltiVec and C paths here; the MMX macros take just an offset, a
 * destination and a width (NOTE(review): they presumably read the filter
 * data from the context via these *_MMX_FILTER_OFFSET values -- confirm in
 * the macro definitions).
 */
904 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
905 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
906 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
907 {
908 #if HAVE_MMX
909 if(!(c->flags & SWS_BITEXACT)){
910 if (c->flags & SWS_ACCURATE_RND){
911 if (uDest){
    /* U and V share CHR_MMX_FILTER_OFFSET; V passes VOF (stringified) as the first argument */
912 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
913 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
914 }
915 if (CONFIG_SWSCALE_ALPHA && aDest){
916 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
917 }
918
919 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920 }else{
921 if (uDest){
922 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
923 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
924 }
925 if (CONFIG_SWSCALE_ALPHA && aDest){
926 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
927 }
928
929 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
930 }
931 return;
932 }
933 #endif
    /* Reached when MMX is unavailable or bit-exact output was requested. */
934 #if HAVE_ALTIVEC
935 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
936 chrFilter, chrSrc, chrFilterSize,
937 dest, uDest, vDest, dstW, chrDstW);
938 #else //HAVE_ALTIVEC
939 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
940 chrFilter, chrSrc, chrFilterSize,
941 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
942 #endif //!HAVE_ALTIVEC
943 }
944
945 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
946 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
947 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
948 {
949 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
950 chrFilter, chrSrc, chrFilterSize,
951 dest, uDest, dstW, chrDstW, dstFormat);
952 }
953
954 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
955 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
956 {
957 int i;
958 #if HAVE_MMX
959 if(!(c->flags & SWS_BITEXACT)){
960 long p= 4;
961 uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
964
965 if (c->flags & SWS_ACCURATE_RND){
966 while(p--){
967 if (dst[p]){
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
975 }
976 }else{
977 while(p--){
978 if (dst[p]){
979 __asm__ volatile(
980 YSCALEYUV2YV121
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
982 "g" (-counter[p])
983 : "%"REG_a
984 );
985 }
986 }
987 }
988 return;
989 }
990 #endif
991 for (i=0; i<dstW; i++)
992 {
993 int val= (lumSrc[i]+64)>>7;
994
995 if (val&256){
996 if (val<0) val=0;
997 else val=255;
998 }
999
1000 dest[i]= val;
1001 }
1002
1003 if (uDest)
1004 for (i=0; i<chrDstW; i++)
1005 {
1006 int u=(chrSrc[i ]+64)>>7;
1007 int v=(chrSrc[i + VOFW]+64)>>7;
1008
1009 if ((u|v)&256){
1010 if (u<0) u=0;
1011 else if (u>255) u=255;
1012 if (v<0) v=0;
1013 else if (v>255) v=255;
1014 }
1015
1016 uDest[i]= u;
1017 vDest[i]= v;
1018 }
1019
1020 if (CONFIG_SWSCALE_ALPHA && aDest)
1021 for (i=0; i<dstW; i++){
1022 int val= (alpSrc[i]+64)>>7;
1023 aDest[i]= av_clip_uint8(val);
1024 }
1025 }
1026
1027
1028 /**
1029 * vertical scale YV12 to RGB
 *
 * Vertical filtering of one output line to a packed destination format.
 *
 * With HAVE_MMX and SWS_BITEXACT not set, inline asm handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 * PIX_FMT_YUYV422 and returns; SWS_ACCURATE_RND selects the *_ACCURATE
 * macro variants.  The switches have no default case, so any other format
 * falls out of the MMX section.
 *
 * After that, with HAVE_ALTIVEC, ff_yuv2packedX_altivec() is used for the
 * formats listed in the condition below (only without alpha and without
 * SWS_BITEXACT); everything else goes to yuv2packedXinC().
 *
 * The BGR24 cases spell out their own operand and clobber lists (they
 * additionally clobber REG_c, used as the 3*index output pointer); the
 * other cases end with YSCALEYUV2PACKEDX_END (NOTE(review): presumably
 * that macro supplies the operand list and closes the asm statement --
 * its definition is outside this section).
1030 */
1031 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1034 {
1035 #if HAVE_MMX
1036 x86_reg dummy=0;
1037 if(!(c->flags & SWS_BITEXACT)){
1038 if (c->flags & SWS_ACCURATE_RND){
1039 switch(c->dstFormat){
1040 case PIX_FMT_RGB32:
1041 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
    /* RGB results are parked in the U_TEMP/V_TEMP/Y_TEMP slots while the
     * alpha line is filtered; only Y_TEMP is reloaded (into mm5) here. */
1042 YSCALEYUV2PACKEDX_ACCURATE
1043 YSCALEYUV2RGBX
1044 "movq %%mm2, "U_TEMP"(%0) \n\t"
1045 "movq %%mm4, "V_TEMP"(%0) \n\t"
1046 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1047 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1049 "psraw $3, %%mm1 \n\t"
1050 "psraw $3, %%mm7 \n\t"
1051 "packuswb %%mm7, %%mm1 \n\t"
1052 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053
1054 YSCALEYUV2PACKEDX_END
1055 }else{
    /* no alpha plane: pcmpeqd sets mm7 to all ones, passed as the alpha argument */
1056 YSCALEYUV2PACKEDX_ACCURATE
1057 YSCALEYUV2RGBX
1058 "pcmpeqd %%mm7, %%mm7 \n\t"
1059 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1060
1061 YSCALEYUV2PACKEDX_END
1062 }
1063 return;
1064 case PIX_FMT_BGR24:
    /* REG_c = dest + 3*REG_a: byte address of the current pixel in 24-bit output */
1065 YSCALEYUV2PACKEDX_ACCURATE
1066 YSCALEYUV2RGBX
1067 "pxor %%mm7, %%mm7 \n\t"
1068 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069 "add %4, %%"REG_c" \n\t"
1070 WRITEBGR24(%%REGc, %5, %%REGa)
1071
1072
1073 :: "r" (&c->redDither),
1074 "m" (dummy), "m" (dummy), "m" (dummy),
1075 "r" (dest), "m" (dstW)
1076 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077 );
1078 return;
1079 case PIX_FMT_RGB555:
1080 YSCALEYUV2PACKEDX_ACCURATE
1081 YSCALEYUV2RGBX
1082 "pxor %%mm7, %%mm7 \n\t"
1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084 #ifdef DITHER1XBPP
1085 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088 #endif
1089
1090 WRITERGB15(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 case PIX_FMT_RGB565:
1094 YSCALEYUV2PACKEDX_ACCURATE
1095 YSCALEYUV2RGBX
1096 "pxor %%mm7, %%mm7 \n\t"
1097 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1098 #ifdef DITHER1XBPP
1099 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102 #endif
1103
1104 WRITERGB16(%4, %5, %%REGa)
1105 YSCALEYUV2PACKEDX_END
1106 return;
1107 case PIX_FMT_YUYV422:
    /* no RGB conversion: the filtered values are scaled down and written as YUY2 */
1108 YSCALEYUV2PACKEDX_ACCURATE
1109 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1110
1111 "psraw $3, %%mm3 \n\t"
1112 "psraw $3, %%mm4 \n\t"
1113 "psraw $3, %%mm1 \n\t"
1114 "psraw $3, %%mm7 \n\t"
1115 WRITEYUY2(%4, %5, %%REGa)
1116 YSCALEYUV2PACKEDX_END
1117 return;
1118 }
1119 }else{
    /* Same set of formats, using the non-ACCURATE filter macros. */
1120 switch(c->dstFormat)
1121 {
1122 case PIX_FMT_RGB32:
1123 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124 YSCALEYUV2PACKEDX
1125 YSCALEYUV2RGBX
1126 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 "packuswb %%mm7, %%mm1 \n\t"
1130 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131 YSCALEYUV2PACKEDX_END
1132 }else{
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "pcmpeqd %%mm7, %%mm7 \n\t"
1136 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137 YSCALEYUV2PACKEDX_END
1138 }
1139 return;
1140 case PIX_FMT_BGR24:
1141 YSCALEYUV2PACKEDX
1142 YSCALEYUV2RGBX
1143 "pxor %%mm7, %%mm7 \n\t"
1144 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1145 "add %4, %%"REG_c" \n\t"
1146 WRITEBGR24(%%REGc, %5, %%REGa)
1147
1148 :: "r" (&c->redDither),
1149 "m" (dummy), "m" (dummy), "m" (dummy),
1150 "r" (dest), "m" (dstW)
1151 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152 );
1153 return;
1154 case PIX_FMT_RGB555:
1155 YSCALEYUV2PACKEDX
1156 YSCALEYUV2RGBX
1157 "pxor %%mm7, %%mm7 \n\t"
1158 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1159 #ifdef DITHER1XBPP
1160 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1161 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1162 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1163 #endif
1164
1165 WRITERGB15(%4, %5, %%REGa)
1166 YSCALEYUV2PACKEDX_END
1167 return;
1168 case PIX_FMT_RGB565:
1169 YSCALEYUV2PACKEDX
1170 YSCALEYUV2RGBX
1171 "pxor %%mm7, %%mm7 \n\t"
1172 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1173 #ifdef DITHER1XBPP
1174 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1175 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1176 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1177 #endif
1178
1179 WRITERGB16(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1181 return;
1182 case PIX_FMT_YUYV422:
1183 YSCALEYUV2PACKEDX
1184 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1185
1186 "psraw $3, %%mm3 \n\t"
1187 "psraw $3, %%mm4 \n\t"
1188 "psraw $3, %%mm1 \n\t"
1189 "psraw $3, %%mm7 \n\t"
1190 WRITEYUY2(%4, %5, %%REGa)
1191 YSCALEYUV2PACKEDX_END
1192 return;
1193 }
1194 }
1195 }
1196 #endif /* HAVE_MMX */
1197 #if HAVE_ALTIVEC
1198 /* The following list of supported dstFormat values should
1199 match what's found in the body of ff_yuv2packedX_altivec() */
1200 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1201 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1202 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1203 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1204 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205 chrFilter, chrSrc, chrFilterSize,
1206 dest, dstW, dstY);
1207 else
1208 #endif
    /* generic C fallback; also the target of the dangling "else" above */
1209 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210 chrFilter, chrSrc, chrFilterSize,
1211 alpSrc, dest, dstW, dstY);
1212 }
1213
1214 /**
1215 * vertical bilinear scale YV12 to RGB
 *
 * Blends two source lines (buf0/buf1, uvbuf0/uvbuf1, optionally
 * abuf0/abuf1) into one packed output line.
 *
 * With HAVE_MMX and SWS_BITEXACT not set, inline asm handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 * PIX_FMT_YUYV422 and returns; every other case reaches the
 * YSCALE_YUV_2_ANYRGB_C expansion at the end.
 *
 * Register usage shared by the asm blocks (except the x86-64 alpha one):
 * operands 0-3 are pinned to c/d/S/D (the four source lines), operand 5 is
 * &c->redDither in REG_a, and dest is a memory operand.  REG_b is saved to
 * ESP_OFFSET(%5) and then loaded with dest; REG_BP is pushed and used as
 * the loop counter; both are restored at the end.  The asm therefore
 * declares no clobbers of its own.
 *
 * yalpha / uvalpha are not asm operands (NOTE(review): presumably the
 * macros read the blend weights from the context relative to operand 5 --
 * confirm in the YSCALEYUV2RGB definition).
1216 */
1217 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1219 {
    /* complementary weights and loop index; not referenced by the asm below,
     * presumably consumed by the C fallback macro at the end */
1220 int yalpha1=4095- yalpha;
1221 int uvalpha1=4095-uvalpha;
1222 int i;
1223
1224 #if HAVE_MMX
1225 if(!(c->flags & SWS_BITEXACT)){
1226 switch(c->dstFormat)
1227 {
1228 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229 case PIX_FMT_RGB32:
1230 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231 #if ARCH_X86_64
    /* x86-64: the two alpha lines get their own register operands (%6, %7)
     * and REG_BP is declared as a clobber instead of being pushed/popped */
1232 __asm__ volatile(
1233 YSCALEYUV2RGB(%%REGBP, %5)
1234 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237 "packuswb %%mm7, %%mm1 \n\t"
1238 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1239
1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1241 "a" (&c->redDither)
1242 ,"r" (abuf0), "r" (abuf1)
1243 : "%"REG_BP
1244 );
1245 #else
    /* Other x86: the alpha line pointers are stashed in the context's
     * u_temp/v_temp slots and temporarily loaded into operands 0/1, whose
     * original values are pushed and popped around the alpha blend.
     * NOTE(review): the casts below store a const pointer through a
     * non-const pointer type. */
1246 *(uint16_t **)(&c->u_temp)=abuf0;
1247 *(uint16_t **)(&c->v_temp)=abuf1;
1248 __asm__ volatile(
1249 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1250 "mov %4, %%"REG_b" \n\t"
1251 "push %%"REG_BP" \n\t"
1252 YSCALEYUV2RGB(%%REGBP, %5)
1253 "push %0 \n\t"
1254 "push %1 \n\t"
1255 "mov "U_TEMP"(%5), %0 \n\t"
1256 "mov "V_TEMP"(%5), %1 \n\t"
1257 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260 "packuswb %%mm7, %%mm1 \n\t"
1261 "pop %1 \n\t"
1262 "pop %0 \n\t"
1263 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264 "pop %%"REG_BP" \n\t"
1265 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1266
1267 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268 "a" (&c->redDither)
1269 );
1270 #endif
1271 }else{
    /* no alpha plane: pcmpeqd sets mm7 to all ones, passed as the alpha argument */
1272 __asm__ volatile(
1273 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1274 "mov %4, %%"REG_b" \n\t"
1275 "push %%"REG_BP" \n\t"
1276 YSCALEYUV2RGB(%%REGBP, %5)
1277 "pcmpeqd %%mm7, %%mm7 \n\t"
1278 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279 "pop %%"REG_BP" \n\t"
1280 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1281
1282 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283 "a" (&c->redDither)
1284 );
1285 }
1286 return;
1287 case PIX_FMT_BGR24:
1288 __asm__ volatile(
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2RGB(%%REGBP, %5)
1293 "pxor %%mm7, %%mm7 \n\t"
1294 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295 "pop %%"REG_BP" \n\t"
1296 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1297 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298 "a" (&c->redDither)
1299 );
1300 return;
1301 case PIX_FMT_RGB555:
1302 __asm__ volatile(
1303 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1304 "mov %4, %%"REG_b" \n\t"
1305 "push %%"REG_BP" \n\t"
1306 YSCALEYUV2RGB(%%REGBP, %5)
1307 "pxor %%mm7, %%mm7 \n\t"
1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1309 #ifdef DITHER1XBPP
1310 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1311 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1312 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1313 #endif
1314
1315 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1316 "pop %%"REG_BP" \n\t"
1317 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1318
1319 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320 "a" (&c->redDither)
1321 );
1322 return;
1323 case PIX_FMT_RGB565:
1324 __asm__ volatile(
1325 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1326 "mov %4, %%"REG_b" \n\t"
1327 "push %%"REG_BP" \n\t"
1328 YSCALEYUV2RGB(%%REGBP, %5)
1329 "pxor %%mm7, %%mm7 \n\t"
1330 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1331 #ifdef DITHER1XBPP
1332 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1333 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1334 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1335 #endif
1336
1337 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1338 "pop %%"REG_BP" \n\t"
1339 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1340 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341 "a" (&c->redDither)
1342 );
1343 return;
1344 case PIX_FMT_YUYV422:
1345 __asm__ volatile(
1346 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1347 "mov %4, %%"REG_b" \n\t"
1348 "push %%"REG_BP" \n\t"
1349 YSCALEYUV2PACKED(%%REGBP, %5)
1350 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351 "pop %%"REG_BP" \n\t"
1352 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354 "a" (&c->redDither)
1355 );
1356 return;
1357 default: break;
1358 }
1359 }
1360 #endif //HAVE_MMX
    /* C fallback for every case not handled (and returned from) above */
1361 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1362 }
1363
1364 /**
1365 * YV12 to RGB without scaling or interpolating
 *
 * Converts a single luma line (buf0) plus chroma lines uvbuf0/uvbuf1 to one
 * packed output line.
 *
 * - If SWS_FULL_CHR_H_INT is set in flags, the work is delegated to
 *   c->yuv2packed2() with buf0 and abuf0 passed twice and yalpha 0.
 * - With HAVE_MMX and SWS_BITEXACT not set, inline asm handles
 *   PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and
 *   PIX_FMT_YUYV422.  uvalpha < 2048 selects the YSCALEYUV2RGB1 /
 *   YSCALEYUV2PACKED1 macros, otherwise the "1b" variants are used.
 *   The switches have no default case, so other formats fall through.
 * - Everything else is handled by the YSCALE_YUV_2_ANYRGB_C expansions at
 *   the end, again selected by uvalpha < 2048.
 *
 * The asm blocks follow the convention used by yuv2packed2: REG_b is saved
 * to ESP_OFFSET(%5) and loaded with dest, REG_BP is pushed and used as the
 * loop counter, and both are restored afterwards.  In the alpha-enabled
 * RGB32 blocks the "d" operand carries abuf0 instead of buf1.
1366 */
1367 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1369 {
1370 const int yalpha1=0;
1371 int i;
1372
1373 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1374 const int yalpha= 4096; //FIXME ...
1375
1376 if (flags&SWS_FULL_CHR_H_INT)
1377 {
1378 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1379 return;
1380 }
1381
1382 #if HAVE_MMX
1383 if(!(flags & SWS_BITEXACT)){
1384 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1385 {
1386 switch(dstFormat)
1387 {
1388 case PIX_FMT_RGB32:
1389 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1390 __asm__ volatile(
1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_b" \n\t"
1393 "push %%"REG_BP" \n\t"
1394 YSCALEYUV2RGB1(%%REGBP, %5)
1395 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397 "pop %%"REG_BP" \n\t"
1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1399
1400 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401 "a" (&c->redDither)
1402 );
1403 }else{
1404 __asm__ volatile(
1405 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1406 "mov %4, %%"REG_b" \n\t"
1407 "push %%"REG_BP" \n\t"
1408 YSCALEYUV2RGB1(%%REGBP, %5)
1409 "pcmpeqd %%mm7, %%mm7 \n\t"
1410 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411 "pop %%"REG_BP" \n\t"
1412 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1413
1414 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415 "a" (&c->redDither)
1416 );
1417 }
1418 return;
1419 case PIX_FMT_BGR24:
1420 __asm__ volatile(
1421 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1422 "mov %4, %%"REG_b" \n\t"
1423 "push %%"REG_BP" \n\t"
1424 YSCALEYUV2RGB1(%%REGBP, %5)
1425 "pxor %%mm7, %%mm7 \n\t"
1426 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427 "pop %%"REG_BP" \n\t"
1428 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429
1430 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431 "a" (&c->redDither)
1432 );
1433 return;
1434 case PIX_FMT_RGB555:
1435 __asm__ volatile(
1436 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_b" \n\t"
1438 "push %%"REG_BP" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP, %5)
1440 "pxor %%mm7, %%mm7 \n\t"
1441 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1442 #ifdef DITHER1XBPP
1443 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1444 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1445 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1446 #endif
1447 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448 "pop %%"REG_BP" \n\t"
1449 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1450
1451 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452 "a" (&c->redDither)
1453 );
1454 return;
1455 case PIX_FMT_RGB565:
1456 __asm__ volatile(
1457 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1458 "mov %4, %%"REG_b" \n\t"
1459 "push %%"REG_BP" \n\t"
1460 YSCALEYUV2RGB1(%%REGBP, %5)
1461 "pxor %%mm7, %%mm7 \n\t"
1462 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1463 #ifdef DITHER1XBPP
1464 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1465 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1466 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1467 #endif
1468
1469 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470 "pop %%"REG_BP" \n\t"
1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1472
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474 "a" (&c->redDither)
1475 );
1476 return;
1477 case PIX_FMT_YUYV422:
1478 __asm__ volatile(
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1480 "mov %4, %%"REG_b" \n\t"
1481 "push %%"REG_BP" \n\t"
1482 YSCALEYUV2PACKED1(%%REGBP, %5)
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484 "pop %%"REG_BP" \n\t"
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1486
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 "a" (&c->redDither)
1489 );
1490 return;
1491 }
1492 }
1493 else
1494 {
    /* uvalpha >= 2048: same formats, using the "1b" macro variants */
1495 switch(dstFormat)
1496 {
1497 case PIX_FMT_RGB32:
1498 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1499 __asm__ volatile(
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1508
1509 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510 "a" (&c->redDither)
1511 );
1512 }else{
1513 __asm__ volatile(
1514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1515 "mov %4, %%"REG_b" \n\t"
1516 "push %%"REG_BP" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP, %5)
1518 "pcmpeqd %%mm7, %%mm7 \n\t"
1519 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520 "pop %%"REG_BP" \n\t"
1521 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1522
1523 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524 "a" (&c->redDither)
1525 );
1526 }
1527 return;
1528 case PIX_FMT_BGR24:
1529 __asm__ volatile(
1530 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1531 "mov %4, %%"REG_b" \n\t"
1532 "push %%"REG_BP" \n\t"
1533 YSCALEYUV2RGB1b(%%REGBP, %5)
1534 "pxor %%mm7, %%mm7 \n\t"
1535 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536 "pop %%"REG_BP" \n\t"
1537 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1538
1539 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540 "a" (&c->redDither)
1541 );
1542 return;
1543 case PIX_FMT_RGB555:
1544 __asm__ volatile(
1545 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1546 "mov %4, %%"REG_b" \n\t"
1547 "push %%"REG_BP" \n\t"
1548 YSCALEYUV2RGB1b(%%REGBP, %5)
1549 "pxor %%mm7, %%mm7 \n\t"
1550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1551 #ifdef DITHER1XBPP
1552 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1553 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1554 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1555 #endif
1556 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557 "pop %%"REG_BP" \n\t"
1558 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1559
1560 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561 "a" (&c->redDither)
1562 );
1563 return;
1564 case PIX_FMT_RGB565:
1565 __asm__ volatile(
1566 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1567 "mov %4, %%"REG_b" \n\t"
1568 "push %%"REG_BP" \n\t"
1569 YSCALEYUV2RGB1b(%%REGBP, %5)
1570 "pxor %%mm7, %%mm7 \n\t"
1571 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1572 #ifdef DITHER1XBPP
1573 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1574 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1575 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1576 #endif
1577
1578 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1581
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583 "a" (&c->redDither)
1584 );
1585 return;
1586 case PIX_FMT_YUYV422:
1587 __asm__ volatile(
1588 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1589 "mov %4, %%"REG_b" \n\t"
1590 "push %%"REG_BP" \n\t"
1591 YSCALEYUV2PACKED1b(%%REGBP, %5)
1592 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593 "pop %%"REG_BP" \n\t"
1594 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1595
1596 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597 "a" (&c->redDither)
1598 );
1599 return;
1600 }
1601 }
1602 }
1603 #endif /* HAVE_MMX */
    /* C fallback; yalpha, yalpha1, buf1 and i declared at the top are not used
     * by the asm above and are presumably consumed by these macro expansions */
1604 if (uvalpha < 2048)
1605 {
1606 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1607 }else{
1608 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1609 }
1610 }
1611
1612 //FIXME yuy2* can read up to 7 samples too much
1613
/**
 * Extracts the luma samples from a packed YUY2 line: dst[i] = src[2*i].
 *
 * @param dst    output, width bytes
 * @param src    packed input, 2 bytes per output sample
 * @param width  number of luma samples
 * @param unused not referenced
 *
 * The MMX loop handles 8 output samples per iteration with no tail
 * handling, so it can read and write past width when width is not a
 * multiple of 8 (see the FIXME above).
 */
1614 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1615 {
1616 #if HAVE_MMX
    /* Both pointers are passed pre-advanced to the end of the line and REG_a
     * runs from -width upwards; "js" repeats while the counter is negative. */
1617 __asm__ volatile(
1618 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1619 "mov %0, %%"REG_a" \n\t"
1620 "1: \n\t"
1621 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1622 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1623 "pand %%mm2, %%mm0 \n\t"
1624 "pand %%mm2, %%mm1 \n\t"
1625 "packuswb %%mm1, %%mm0 \n\t"
1626 "movq %%mm0, (%2, %%"REG_a") \n\t"
1627 "add $8, %%"REG_a" \n\t"
1628 " js 1b \n\t"
1629 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1630 : "%"REG_a
1631 );
1632 #else
1633 int i;
1634 for (i=0; i<width; i++)
1635 dst[i]= src[2*i];
1636 #endif
1637 }
1638
/**
 * Extracts the chroma samples from a packed YUY2 line:
 * dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3].
 *
 * @param dstU   U output, width bytes
 * @param dstV   V output, width bytes
 * @param src1   packed input, 4 bytes per chroma sample pair
 * @param src2   must equal src1 (checked by the assert at the end)
 * @param width  number of chroma samples per plane
 * @param unused not referenced
 *
 * The MMX loop produces 4 U and 4 V samples per iteration with no tail
 * handling.
 */
1639 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1640 {
1641 #if HAVE_MMX
    /* psrlw $8 keeps the odd bytes (U,V,U,V,...); after packing, a second
     * psrlw isolates V in mm0 while the mask isolates U in mm1. */
1642 __asm__ volatile(
1643 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1644 "mov %0, %%"REG_a" \n\t"
1645 "1: \n\t"
1646 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1647 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1648 "psrlw $8, %%mm0 \n\t"
1649 "psrlw $8, %%mm1 \n\t"
1650 "packuswb %%mm1, %%mm0 \n\t"
1651 "movq %%mm0, %%mm1 \n\t"
1652 "psrlw $8, %%mm0 \n\t"
1653 "pand %%mm4, %%mm1 \n\t"
1654 "packuswb %%mm0, %%mm0 \n\t"
1655 "packuswb %%mm1, %%mm1 \n\t"
1656 "movd %%mm0, (%3, %%"REG_a") \n\t"
1657 "movd %%mm1, (%2, %%"REG_a") \n\t"
1658 "add $4, %%"REG_a" \n\t"
1659 " js 1b \n\t"
1660 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1661 : "%"REG_a
1662 );
1663 #else
1664 int i;
1665 for (i=0; i<width; i++)
1666 {
1667 dstU[i]= src1[4*i + 1];
1668 dstV[i]= src1[4*i + 3];
1669 }
1670 #endif
1671 assert(src1 == src2);
1672 }
1673
1674 /* This is almost identical to the previous, and exists only because
1675 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extracts the luma samples from a packed UYVY line: dst[i] = src[2*i + 1].
 *
 * @param dst    output, width bytes
 * @param src    packed input, 2 bytes per output sample
 * @param width  number of luma samples
 * @param unused not referenced
 *
 * The MMX loop handles 8 output samples per iteration with no tail handling.
 */
1676 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1677 {
1678 #if HAVE_MMX
    /* psrlw $8 keeps the odd byte of every 16-bit word, i.e. the luma samples */
1679 __asm__ volatile(
1680 "mov %0, %%"REG_a" \n\t"
1681 "1: \n\t"
1682 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1683 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1684 "psrlw $8, %%mm0 \n\t"
1685 "psrlw $8, %%mm1 \n\t"
1686 "packuswb %%mm1, %%mm0 \n\t"
1687 "movq %%mm0, (%2, %%"REG_a") \n\t"
1688 "add $8, %%"REG_a" \n\t"
1689 " js 1b \n\t"
1690 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1691 : "%"REG_a
1692 );
1693 #else
1694 int i;
1695 for (i=0; i<width; i++)
1696 dst[i]= src[2*i+1];
1697 #endif
1698 }
1699
/**
 * Extracts the chroma samples from a packed UYVY line:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2].
 *
 * @param dstU   U output, width bytes
 * @param dstV   V output, width bytes
 * @param src1   packed input, 4 bytes per chroma sample pair
 * @param src2   must equal src1 (checked by the assert at the end)
 * @param width  number of chroma samples per plane
 * @param unused not referenced
 *
 * The MMX loop produces 4 U and 4 V samples per iteration with no tail
 * handling.
 */
1700 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1701 {
1702 #if HAVE_MMX
    /* The mask keeps the even bytes (U,V,U,V,...); after packing, psrlw
     * isolates V in mm0 while the mask isolates U in mm1. */
1703 __asm__ volatile(
1704 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1705 "mov %0, %%"REG_a" \n\t"
1706 "1: \n\t"
1707 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1708 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1709 "pand %%mm4, %%mm0 \n\t"
1710 "pand %%mm4, %%mm1 \n\t"
1711 "packuswb %%mm1, %%mm0 \n\t"
1712 "movq %%mm0, %%mm1 \n\t"
1713 "psrlw $8, %%mm0 \n\t"
1714 "pand %%mm4, %%mm1 \n\t"
1715 "packuswb %%mm0, %%mm0 \n\t"
1716 "packuswb %%mm1, %%mm1 \n\t"
1717 "movd %%mm0, (%3, %%"REG_a") \n\t"
1718 "movd %%mm1, (%2, %%"REG_a") \n\t"
1719 "add $4, %%"REG_a" \n\t"
1720 " js 1b \n\t"
1721 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1722 : "%"REG_a
1723 );
1724 #else
1725 int i;
1726 for (i=0; i<width; i++)
1727 {
1728 dstU[i]= src1[4*i + 0];
1729 dstV[i]= src1[4*i + 2];
1730 }
1731 #endif
1732 assert(src1 == src2);
1733 }
1734
/*
 * BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 *
 * Defines RENAME(name)(dst, src, width, unused): a C routine converting
 * `width` packed RGB pixels of integer type `type` to 8-bit luma.
 *
 * Each component is obtained as (pixel >> sh?) & mask?, so a component may
 * stay at its in-pixel bit position; the caller compensates through the
 * scale of RY/GY/BY and the final shift S.
 * 33 << (S-1) equals (16 + 0.5) << S: the +16 luma offset plus rounding.
 * The `unused` argument is not referenced.
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
{\
    const type *pixels= (const type*)src;\
    int x= 0;\
\
    while (x < width)\
    {\
        const type px= pixels[x];\
        int r= (px>>shr)&maskr;\
        int g= (px>>shg)&maskg;\
        int b= (px>>shb)&maskb;\
\
        dst[x]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
        x++;\
    }\
}
1748
1749 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1750 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1751 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1752 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1753 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1754 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1755
1756 static inline void RENAME(abgrToA)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused){
1757 int i;
1758 for (i=0; i<width; i++){
1759 dst[i]= src[4*i];
1760 }
1761 }
1762
/*
 * BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 *
 * Defines two C routines converting packed RGB pixels of integer type
 * `type` to 8-bit chroma:
 *
 *   RENAME(name)        one U/V pair per input pixel
 *   RENAME(name_half)   one U/V pair per two horizontally adjacent pixels
 *
 * Components are extracted as (pixel & mask?) >> sh?; the caller scales the
 * coefficients and S to match the resulting bit positions.
 * 257 << (S-1) equals (128 + 0.5) << S: the +128 chroma offset plus rounding.
 * `maska`, `dummy` and `unused` are accepted but not referenced.
 *
 * In the _half variant the two pixels are added before the components are
 * separated: everything that is neither red nor blue goes into gsum, the
 * remainder into rbsum, and each field is then masked with one extra bit
 * (mask | 2*mask) to hold the carry of the addition.  These sums use
 * unsigned arithmetic because adding two 32-bit pixels that include the
 * alpha byte can exceed INT_MAX, which is undefined behaviour for signed
 * int.  The masked results are small and non-negative, so r/g/b remain int
 * and the weighted sum is evaluated exactly as before.
 */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((const type*)src)[i]&maskb)>>shb;\
        int g= (((const type*)src)[i]&maskg)>>shg;\
        int r= (((const type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        unsigned pix0= ((const type*)src)[2*i+0];\
        unsigned pix1= ((const type*)src)[2*i+1];\
        unsigned gsum= (pix0&~(unsigned)(maskr|maskb))+(pix1&~(unsigned)(maskr|maskb));\
        unsigned rbsum= pix0+pix1-gsum;\
        int b= (rbsum&(maskb|(2*maskb)))>>shb;\
        int r= (rbsum&(maskr|(2*maskr)))>>shr;\
        int g= (gsum&(maskg|(2*maskg)))>>shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1795
1796 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1797 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1798 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1799 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1800 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1801 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1802
1803 #if HAVE_MMX
/**
 * MMX conversion of a packed 24-bit RGB/BGR line to 8-bit luma.
 *
 * @param dst       output, width bytes
 * @param src       packed input, 3 bytes per pixel
 * @param width     number of pixels
 * @param srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY* coefficients; any
 *                  other value selects the ff_rgb24toY* coefficients
 *
 * Four pixels (12 input bytes) are converted per iteration with no tail
 * handling.
 *
 * NOTE(review): the coefficients are loaded into mm5/mm6 by a separate asm
 * statement that declares no outputs or clobbers, and the main loop relies
 * on those registers still holding them.
 */
1804 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1805 {
1806
1807 if(srcFormat == PIX_FMT_BGR24){
1808 __asm__ volatile(
1809 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1810 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1811 :
1812 );
1813 }else{
1814 __asm__ volatile(
1815 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1816 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1817 :
1818 );
1819 }
1820
    /* src (%0) advances by 12 bytes per iteration; dst is passed pre-advanced
     * to its end and indexed by REG_a, which runs from -width upwards in steps
     * of 4 ("js" repeats while it is negative).  Each pixel pair is loaded
     * with overlapping movd reads at offsets 0/2 and 6/8, widened to words,
     * multiplied-and-added with the two coefficient registers, offset by mm4,
     * shifted right by 15 and packed down to bytes. */
1821 __asm__ volatile(
1822 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1823 "mov %2, %%"REG_a" \n\t"
1824 "pxor %%mm7, %%mm7 \n\t"
1825 "1: \n\t"
1826 PREFETCH" 64(%0) \n\t"
1827 "movd (%0), %%mm0 \n\t"
1828 "movd 2(%0), %%mm1 \n\t"
1829 "movd 6(%0), %%mm2 \n\t"
1830 "movd 8(%0), %%mm3 \n\t"
1831 "add $12, %0 \n\t"
1832 "punpcklbw %%mm7, %%mm0 \n\t"
1833 "punpcklbw %%mm7, %%mm1 \n\t"
1834 "punpcklbw %%mm7, %%mm2 \n\t"
1835 "punpcklbw %%mm7, %%mm3 \n\t"
1836 "pmaddwd %%mm5, %%mm0 \n\t"
1837 "pmaddwd %%mm6, %%mm1 \n\t"
1838 "pmaddwd %%mm5, %%mm2 \n\t"
1839 "pmaddwd %%mm6, %%mm3 \n\t"
1840 "paddd %%mm1, %%mm0 \n\t"
1841 "paddd %%mm3, %%mm2 \n\t"
1842 "paddd %%mm4, %%mm0 \n\t"
1843 "paddd %%mm4, %%mm2 \n\t"
1844 "psrad $15, %%mm0 \n\t"
1845 "psrad $15, %%mm2 \n\t"
1846 "packssdw %%mm2, %%mm0 \n\t"
1847 "packuswb %%mm0, %%mm0 \n\t"
1848 "movd %%mm0, (%1, %%"REG_a") \n\t"
1849 "add $4, %%"REG_a" \n\t"
1850 " js 1b \n\t"
1851 : "+r" (src)
1852 : "r" (dst+width), "g" ((x86_reg)-width)
1853 : "%"REG_a
1854 );
1855 }
1856
/**
 * MMX conversion of one line of packed 24-bit pixels to 8-bit U and V.
 *
 * Operand %4 is the row of ff_bgr24toUV selected by (srcFormat == PIX_FMT_RGB24); the asm
 * reads four quadwords from it (at +0, +8, +16 and +24). The sums built from the +0/+8
 * quadwords are stored through %1 (dstU), those from +16/+24 through %2 (dstV).
 * Each iteration consumes 12 source bytes (four pixels) and stores four bytes to each
 * destination, so the number of samples written is width rounded up to a multiple of 4.
 */
1857 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1858 {
1859 __asm__ volatile(
/* mm6 caches the fourth coefficient quadword; REG_a = -width counts up to zero; mm7 = 0. */
1860 "movq 24+%4, %%mm6 \n\t"
1861 "mov %3, %%"REG_a" \n\t"
1862 "pxor %%mm7, %%mm7 \n\t"
1863 "1: \n\t"
1864 PREFETCH" 64(%0) \n\t"
/* First half: source bytes 0-5. mm0 accumulates towards dstU, mm2 towards dstV. */
1865 "movd (%0), %%mm0 \n\t"
1866 "movd 2(%0), %%mm1 \n\t"
1867 "punpcklbw %%mm7, %%mm0 \n\t"
1868 "punpcklbw %%mm7, %%mm1 \n\t"
1869 "movq %%mm0, %%mm2 \n\t"
1870 "movq %%mm1, %%mm3 \n\t"
1871 "pmaddwd %4, %%mm0 \n\t"
1872 "pmaddwd 8+%4, %%mm1 \n\t"
1873 "pmaddwd 16+%4, %%mm2 \n\t"
1874 "pmaddwd %%mm6, %%mm3 \n\t"
1875 "paddd %%mm1, %%mm0 \n\t"
1876 "paddd %%mm3, %%mm2 \n\t"
1877
/* Second half: source bytes 6-11. mm1 accumulates towards dstU, mm4 towards dstV. */
1878 "movd 6(%0), %%mm1 \n\t"
1879 "movd 8(%0), %%mm3 \n\t"
1880 "add $12, %0 \n\t"
1881 "punpcklbw %%mm7, %%mm1 \n\t"
1882 "punpcklbw %%mm7, %%mm3 \n\t"
1883 "movq %%mm1, %%mm4 \n\t"
1884 "movq %%mm3, %%mm5 \n\t"
1885 "pmaddwd %4, %%mm1 \n\t"
1886 "pmaddwd 8+%4, %%mm3 \n\t"
1887 "pmaddwd 16+%4, %%mm4 \n\t"
1888 "pmaddwd %%mm6, %%mm5 \n\t"
1889 "paddd %%mm3, %%mm1 \n\t"
1890 "paddd %%mm5, %%mm4 \n\t"
1891
/* Add the offset constant, shift down by 15, pack to bytes and store four U and four V samples. */
1892 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1893 "paddd %%mm3, %%mm0 \n\t"
1894 "paddd %%mm3, %%mm2 \n\t"
1895 "paddd %%mm3, %%mm1 \n\t"
1896 "paddd %%mm3, %%mm4 \n\t"
1897 "psrad $15, %%mm0 \n\t"
1898 "psrad $15, %%mm2 \n\t"
1899 "psrad $15, %%mm1 \n\t"
1900 "psrad $15, %%mm4 \n\t"
1901 "packssdw %%mm1, %%mm0 \n\t"
1902 "packssdw %%mm4, %%mm2 \n\t"
1903 "packuswb %%mm0, %%mm0 \n\t"
1904 "packuswb %%mm2, %%mm2 \n\t"
1905 "movd %%mm0, (%1, %%"REG_a") \n\t"
1906 "movd %%mm2, (%2, %%"REG_a") \n\t"
1907 "add $4, %%"REG_a" \n\t"
1908 " js 1b \n\t"
1909 : "+r" (src)
1910 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1911 : "%"REG_a
1912 );
1913 }
1914 #endif
1915
1916 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1917 {
1918 #if HAVE_MMX
1919 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1920 #else
1921 int i;
1922 for (i=0; i<width; i++)
1923 {
1924 int b= src[i*3+0];
1925 int g= src[i*3+1];
1926 int r= src[i*3+2];
1927
1928 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1929 }
1930 #endif /* HAVE_MMX */
1931 }
1932
1933 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1934 {
1935 #if HAVE_MMX
1936 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1937 #else
1938 int i;
1939 for (i=0; i<width; i++)
1940 {
1941 int b= src1[3*i + 0];
1942 int g= src1[3*i + 1];
1943 int r= src1[3*i + 2];
1944
1945 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1946 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1947 }
1948 #endif /* HAVE_MMX */
1949 assert(src1 == src2);
1950 }
1951
1952 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1953 {
1954 int i;
1955 for (i=0; i<width; i++)
1956 {
1957 int b= src1[6*i + 0] + src1[6*i + 3];
1958 int g= src1[6*i + 1] + src1[6*i + 4];
1959 int r= src1[6*i + 2] + src1[6*i + 5];
1960
1961 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1962 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1963 }
1964 assert(src1 == src2);
1965 }
1966
1967 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1968 {
1969 #if HAVE_MMX
1970 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1971 #else
1972 int i;
1973 for (i=0; i<width; i++)
1974 {
1975 int r= src[i*3+0];
1976 int g= src[i*3+1];
1977 int b= src[i*3+2];
1978
1979 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1980 }
1981 #endif
1982 }
1983
1984 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1985 {
1986 #if HAVE_MMX
1987 assert(src1==src2);
1988 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1989 #else
1990 int i;
1991 assert(src1==src2);
1992 for (i=0; i<width; i++)
1993 {
1994 int r= src1[3*i + 0];
1995 int g= src1[3*i + 1];
1996 int b= src1[3*i + 2];
1997
1998 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1999 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2000 }
2001 #endif
2002 }
2003
2004 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2005 {
2006 int i;
2007 assert(src1==src2);
2008 for (i=0; i<width; i++)
2009 {
2010 int r= src1[6*i + 0] + src1[6*i + 3];
2011 int g= src1[6*i + 1] + src1[6*i + 4];
2012 int b= src1[6*i + 2] + src1[6*i + 5];
2013
2014 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2015 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2016 }
2017 }
2018
2019
2020 static inline void RENAME(palToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
2021 {
2022 int i;
2023 for (i=0; i<width; i++)
2024 {
2025 int d= src[i];
2026
2027 dst[i]= pal[d] & 0xFF;
2028 }
2029 }
2030
2031 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV,
2032 const uint8_t *src1, const uint8_t *src2,
2033 long width, uint32_t *pal)
2034 {
2035 int i;
2036 assert(src1 == src2);
2037 for (i=0; i<width; i++)
2038 {
2039 int p= pal[src1[i]];
2040
2041 dstU[i]= p>>8;
2042 dstV[i]= p>>16;
2043 }
2044 }
2045
2046 static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2047 {
2048 int i, j;
2049 for (i=0; i<width/8; i++){
2050 int d= ~src[i];
2051 for(j=0; j<8; j++)
2052 dst[8*i+j]= ((d>>(7-j))&1)*255;
2053 }
2054 }
2055
2056 static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2057 {
2058 int i, j;
2059 for (i=0; i<width/8; i++){
2060 int d= src[i];
2061 for(j=0; j<8; j++)
2062 dst[8*i+j]= ((d>>(7-j))&1)*255;
2063 }
2064 }
2065
2066 // bilinear / bicubic scaling
2067 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2068 const int16_t *filter, const int16_t *filterPos, long filterSize)
2069 {
2070 #if HAVE_MMX
2071 assert(filterSize % 4 == 0 && filterSize>0);
2072 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2073 {
2074 x86_reg counter= -2*dstW;
2075 filter-= counter*2;
2076 filterPos-= counter/2;
2077 dst-= counter/2;
2078 __asm__ volatile(
2079 #if defined(PIC)
2080 "push %%"REG_b" \n\t"
2081 #endif
2082 "pxor %%mm7, %%mm7 \n\t"
2083 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2084 "mov %%"REG_a", %%"REG_BP" \n\t"
2085 ASMALIGN(4)
2086 "1: \n\t"
2087 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2088 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2089 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2090 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2091 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2092 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2093 "punpcklbw %%mm7, %%mm0 \n\t"
2094 "punpcklbw %%mm7, %%mm2 \n\t"
2095 "pmaddwd %%mm1, %%mm0 \n\t"
2096 "pmaddwd %%mm2, %%mm3 \n\t"
2097 "movq %%mm0, %%mm4 \n\t"
2098 "punpckldq %%mm3, %%mm0 \n\t"
2099 "punpckhdq %%mm3, %%mm4 \n\t"
2100 "paddd %%mm4, %%mm0 \n\t"
2101 "psrad $7, %%mm0 \n\t"
2102 "packssdw %%mm0, %%mm0 \n\t"
2103 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2104 "add $4, %%"REG_BP" \n\t"
2105 " jnc 1b \n\t"
2106
2107 "pop %%"REG_BP" \n\t"
2108 #if defined(PIC)
2109 "pop %%"REG_b" \n\t"
2110 #endif
2111 : "+a" (counter)
2112 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2113 #if !defined(PIC)
2114 : "%"REG_b
2115 #endif
2116 );
2117 }
2118 else if (filterSize==8)
2119 {
2120 x86_reg counter= -2*dstW;
2121 filter-= counter*4;
2122 filterPos-= counter/2;
2123 dst-= counter/2;
2124 __asm__ volatile(
2125 #if defined(PIC)
2126 "push %%"REG_b" \n\t"
2127 #endif
2128 "pxor %%mm7, %%mm7 \n\t"
2129 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2130 "mov %%"REG_a", %%"REG_BP" \n\t"
2131 ASMALIGN(4)
2132 "1: \n\t"
2133 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2134 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2135 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2136 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2137 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2138 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2139 "punpcklbw %%mm7, %%mm0 \n\t"
2140 "punpcklbw %%mm7, %%mm2 \n\t"
2141 "pmaddwd %%mm1, %%mm0 \n\t"
2142 "pmaddwd %%mm2, %%mm3 \n\t"
2143
2144 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2145 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2146 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2147 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2148 "punpcklbw %%mm7, %%mm4 \n\t"
2149 "punpcklbw %%mm7, %%mm2 \n\t"
2150 "pmaddwd %%mm1, %%mm4 \n\t"
2151 "pmaddwd %%mm2, %%mm5 \n\t"
2152 "paddd %%mm4, %%mm0 \n\t"
2153 "paddd %%mm5, %%mm3 \n\t"
2154 "movq %%mm0, %%mm4 \n\t"
2155 "punpckldq %%mm3, %%mm0 \n\t"
2156 "punpckhdq %%mm3, %%mm4 \n\t"
2157 "paddd %%mm4, %%mm0 \n\t"
2158 "psrad $7, %%mm0 \n\t"
2159 "packssdw %%mm0, %%mm0 \n\t"
2160 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2161 "add $4, %%"REG_BP" \n\t"
2162 " jnc 1b \n\t"
2163
2164 "pop %%"REG_BP" \n\t"
2165 #if defined(PIC)
2166 "pop %%"REG_b" \n\t"
2167 #endif
2168 : "+a" (counter)
2169 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2170 #if !defined(PIC)
2171 : "%"REG_b
2172 #endif
2173 );
2174 }
2175 else
2176 {
2177 uint8_t *offset = src+filterSize;
2178 x86_reg counter= -2*dstW;
2179 //filter-= counter*filterSize/2;
2180 filterPos-= counter/2;
2181 dst-= counter/2;
2182 __asm__ volatile(
2183 "pxor %%mm7, %%mm7 \n\t"
2184 ASMALIGN(4)
2185 "1: \n\t"
2186 "mov %2, %%"REG_c" \n\t"
2187 "movzwl (%%"REG_c", %0), %%eax \n\t"
2188 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2189 "mov %5, %%"REG_c" \n\t"
2190 "pxor %%mm4, %%mm4 \n\t"
2191 "pxor %%mm5, %%mm5 \n\t"
2192 "2: \n\t"
2193 "movq (%1), %%mm1 \n\t"
2194 "movq (%1, %6), %%mm3 \n\t"
2195 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2196 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2197 "punpcklbw %%mm7, %%mm0 \n\t"
2198 "punpcklbw %%mm7, %%mm2 \n\t"
2199 "pmaddwd %%mm1, %%mm0 \n\t"
2200 "pmaddwd %%mm2, %%mm3 \n\t"
2201 "paddd %%mm3, %%mm5 \n\t"
2202 "paddd %%mm0, %%mm4 \n\t"
2203 "add $8, %1 \n\t"
2204 "add $4, %%"REG_c" \n\t"
2205 "cmp %4, %%"REG_c" \n\t"
2206 " jb 2b \n\t"
2207 "add %6, %1 \n\t"
2208 "movq %%mm4, %%mm0 \n\t"
2209 "punpckldq %%mm5, %%mm4 \n\t"
2210 "punpckhdq %%mm5, %%mm0 \n\t"
2211 "paddd %%mm0, %%mm4 \n\t"
2212 "psrad $7, %%mm4 \n\t"
2213 "packssdw %%mm4, %%mm4 \n\t"
2214 "mov %3, %%"REG_a" \n\t"
2215 "movd %%mm4, (%%"REG_a", %0) \n\t"
2216 "add $4, %0 \n\t"
2217 " jnc 1b \n\t"
2218
2219 : "+r" (counter), "+r" (filter)
2220 : "m" (filterPos), "m" (dst), "m"(offset),
2221 "m" (src), "r" ((x86_reg)filterSize*2)
2222 : "%"REG_a, "%"REG_c, "%"REG_d
2223 );
2224 }
2225 #else
2226 #if HAVE_ALTIVEC
2227 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2228 #else
2229 int i;
2230 for (i=0; i<dstW; i++)
2231 {
2232 int j;
2233 int srcPos= filterPos[i];
2234 int val=0;
2235 //printf("filterPos: %d\n", filterPos[i]);
2236 for (j=0; j<filterSize; j++)
2237 {
2238 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2239 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2240 }
2241 //filter += hFilterSize;
2242 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2243 //dst[i] = val>>7;
2244 }
2245 #endif /* HAVE_ALTIVEC */
2246 #endif /* HAVE_MMX */
2247 }
2248
2249 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2250 int dstWidth, const uint8_t *src, int srcW,
2251 int xInc)
2252 {
2253 int i;
2254 unsigned int xpos=0;
2255 for (i=0;i<dstWidth;i++)
2256 {
2257 register unsigned int xx=xpos>>16;
2258 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2259 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2260 xpos+=xInc;
2261 }
2262 }
2263
2264 // *** horizontal scale Y line to temp buffer
/**
 * Horizontally scale one luma (or alpha) source line into dst.
 *
 * Steps, in order:
 *  1. adjust src for 32-bit packed formats (alpha byte at +3 for RGB32/BGR32 when
 *     isAlpha is set; ALT32_CORR for RGB32_1/BGR32_1 otherwise);
 *  2. if c->hyscale_internal is set, call it to write formatConvBuffer and scale
 *     from that buffer instead;
 *  3. scale with c->hScale, or with one of the fast-bilinear paths when
 *     SWS_FAST_BILINEAR is requested (on MMX builds only if canMMX2BeUsed);
 *  4. for non-alpha lines, when source and destination ranges differ and the
 *     destination is not RGB/BGR, rescale the results in place.
 */
2265 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2266 int flags, const int16_t *hLumFilter,
2267 const int16_t *hLumFilterPos, int hLumFilterSize,
2268 int srcFormat, uint8_t *formatConvBuffer,
2269 uint32_t *pal, int isAlpha)
2270 {
2271 int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
2272 int16_t *mmx2Filter = c->lumMmx2Filter;
2273 int canMMX2BeUsed = c->canMMX2BeUsed;
2274 void *funnyYCode = c->funnyYCode;
2275
2276 if (isAlpha) {
2277 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
2278 src += 3;
2279 } else {
2280 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2281 src += ALT32_CORR;
2282 }
2283
2284 if (c->hyscale_internal) {
2285 c->hyscale_internal(formatConvBuffer, src, srcW, pal);
2286 src= formatConvBuffer;
2287 }
2288
2289 #if HAVE_MMX
2290 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2291 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2292 #else
2293 if (!(flags&SWS_FAST_BILINEAR))
2294 #endif
2295 {
2296 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2297 }
2298 else // fast bilinear upscale / crap downscale
2299 {
2300 #if ARCH_X86 && CONFIG_GPL
2301 #if HAVE_MMX2
2302 int i;
2303 #if defined(PIC)
/* scratch slot used by the asm below to save and restore REG_b */
2304 uint64_t ebxsave __attribute__((aligned(8)));
2305 #endif
2306 if (canMMX2BeUsed)
2307 {
/* MMX2 path: calls the code behind funnyYCode eight times, advancing the source (REG_c)
 * and destination (REG_D) pointers between calls. */
2308 __asm__ volatile(
2309 #if defined(PIC)
2310 "mov %%"REG_b", %5 \n\t"
2311 #endif
2312 "pxor %%mm7, %%mm7 \n\t"
2313 "mov %0, %%"REG_c" \n\t"
2314 "mov %1, %%"REG_D" \n\t"
2315 "mov %2, %%"REG_d" \n\t"
2316 "mov %3, %%"REG_b" \n\t"
2317 "xor %%"REG_a", %%"REG_a" \n\t" // i
2318 PREFETCH" (%%"REG_c") \n\t"
2319 PREFETCH" 32(%%"REG_c") \n\t"
2320 PREFETCH" 64(%%"REG_c") \n\t"
2321
2322 #if ARCH_X86_64
2323
2324 #define FUNNY_Y_CODE \
2325 "movl (%%"REG_b"), %%esi \n\t"\
2326 "call *%4 \n\t"\
2327 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2328 "add %%"REG_S", %%"REG_c" \n\t"\
2329 "add %%"REG_a", %%"REG_D" \n\t"\
2330 "xor %%"REG_a", %%"REG_a" \n\t"\
2331
2332 #else
2333
2334 #define FUNNY_Y_CODE \
2335 "movl (%%"REG_b"), %%esi \n\t"\
2336 "call *%4 \n\t"\
2337 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2338 "add %%"REG_a", %%"REG_D" \n\t"\
2339 "xor %%"REG_a", %%"REG_a" \n\t"\
2340
2341 #endif /* ARCH_X86_64 */
2342
2343 FUNNY_Y_CODE
2344 FUNNY_Y_CODE
2345 FUNNY_Y_CODE
2346 FUNNY_Y_CODE
2347 FUNNY_Y_CODE
2348 FUNNY_Y_CODE
2349 FUNNY_Y_CODE
2350 FUNNY_Y_CODE
2351
2352 #if defined(PIC)
2353 "mov %5, %%"REG_b" \n\t"
2354 #endif
2355 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2356 "m" (funnyYCode)
2357 #if defined(PIC)
2358 ,"m" (ebxsave)
2359 #endif
2360 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2361 #if !defined(PIC)
2362 ,"%"REG_b
2363 #endif
2364 );
/* Right-edge fix-up: outputs whose source position is at or beyond the last sample
 * are overwritten with the last sample scaled by 128. */
2365 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2366 }
2367 else
2368 {
2369 #endif /* HAVE_MMX2 */
/* xInc split into its integer part and 16-bit fraction for the add/adc stepping below */
2370 x86_reg xInc_shr16 = xInc >> 16;
2371 uint16_t xInc_mask = xInc & 0xffff;
2372 //NO MMX just normal asm ...
/* The loop body is unrolled twice: it writes dst[i] and dst[i+1], then adds 2 to i. */
2373 __asm__ volatile(
2374 "xor %%"REG_a", %%"REG_a" \n\t" // i
2375 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2376 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2377 ASMALIGN(4)
2378 "1: \n\t"
2379 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2380 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2381 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2382 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2383 "shll $16, %%edi \n\t"
2384 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2385 "mov %1, %%"REG_D" \n\t"
2386 "shrl $9, %%esi \n\t"
2387 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2388 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2389 "adc %3, %%"REG_d" \n\t" //xx+= (xInc>>16) + carry
2390
2391 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2392 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2393 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2394 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2395 "shll $16, %%edi \n\t"
2396 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2397 "mov %1, %%"REG_D" \n\t"
2398 "shrl $9, %%esi \n\t"
2399 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2400 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2401 "adc %3, %%"REG_d" \n\t" //xx+= (xInc>>16) + carry
2402
2403
2404 "add $2, %%"REG_a" \n\t"
2405 "cmp %2, %%"REG_a" \n\t"
2406 " jb 1b \n\t"
2407
2408
2409 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2410 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2411 );
2412 #if HAVE_MMX2
2413 } //if MMX2 can't be used
2414 #endif
2415 #else
2416 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2417 #endif /* ARCH_X86 && CONFIG_GPL */
2418 }
2419
/* Range conversion of the scaled line; skipped for alpha and for RGB/BGR destinations.
 * The branch taken depends on c->srcRange; the second branch clamps the input before scaling. */
2420 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2421 int i;
2422 //FIXME all pal and rgb srcFormats could do this conversion as well
2423 //FIXME all scalers more complex than bilinear could do half of this transform
2424 if(c->srcRange){
2425 for (i=0; i<dstWidth; i++)
2426 dst[i]= (dst[i]*14071 + 33561947)>>14;
2427 }else{
2428 for (i=0; i<dstWidth; i++)
2429 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2430 }
2431 }
2432 }
2433
2434 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2435 int dstWidth, const uint8_t *src1,
2436 const uint8_t *src2, int srcW, int xInc)
2437 {
2438 int i;
2439 unsigned int xpos=0;
2440 for (i=0;i<dstWidth;i++)
2441 {
2442 register unsigned int xx=xpos>>16;
2443 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2444 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2445 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2446 /* slower
2447 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2448 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2449 */
2450 xpos+=xInc;
2451 }
2452 }
2453
2454 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2455 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2456 const int16_t *hChrFilterPos, int hChrFilterSize,
2457 int srcFormat, uint8_t *formatConvBuffer,
2458 uint32_t *pal)
2459 {
2460 int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
2461 int16_t *mmx2Filter = c->chrMmx2Filter;
2462 int canMMX2BeUsed = c->canMMX2BeUsed;
2463 void *funnyUVCode = c->funnyUVCode;
2464
2465 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2466 return;
2467
2468 if (srcFormat==PIX_FMT_RGB32_1) {
2469 src1 += ALT32_CORR;
2470 src2 += ALT32_CORR;
2471 }
2472
2473 if (c->hcscale_internal) {
2474 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2475 src1= formatConvBuffer;
2476 src2= formatConvBuffer+VOFW;
2477 }
2478
2479 #if HAVE_MMX
2480 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2481 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2482 #else
2483 if (!(flags&SWS_FAST_BILINEAR))
2484 #endif
2485 {
2486 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2487 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2488 }
2489 else // fast bilinear upscale / crap downscale
2490 {
2491 #if ARCH_X86 && CONFIG_GPL
2492 #if HAVE_MMX2
2493 int i;
2494 #if defined(PIC)
2495 uint64_t ebxsave __attribute__((aligned(8)));
2496 #endif
2497 if (canMMX2BeUsed)
2498 {
2499 __asm__ volatile(
2500 #if defined(PIC)
2501 "mov %%"REG_b", %6 \n\t"
2502 #endif
2503 "pxor %%mm7, %%mm7 \n\t"
2504 "mov %0, %%"REG_c" \n\t"
2505 "mov %1, %%"REG_D" \n\t"
2506 "mov %2, %%"REG_d" \n\t"
2507 "mov %3, %%"REG_b" \n\t"
2508 "xor %%"REG_a", %%"REG_a" \n\t" // i
2509 PREFETCH" (%%"REG_c") \n\t"
2510 PREFETCH" 32(%%"REG_c") \n\t"
2511 PREFETCH" 64(%%"REG_c") \n\t"
2512
2513 #if ARCH_X86_64
2514
2515 #define FUNNY_UV_CODE \
2516 "movl (%%"REG_b"), %%esi \n\t"\
2517 "call *%4 \n\t"\
2518 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2519 "add %%"REG_S", %%"REG_c" \n\t"\
2520 "add %%"REG_a", %%"REG_D" \n\t"\
2521 "xor %%"REG_a", %%"REG_a" \n\t"\
2522
2523 #else
2524
2525 #define FUNNY_UV_CODE \
2526 "movl (%%"REG_b"), %%esi \n\t"\
2527 "call *%4 \n\t"\
2528 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2529 "add %%"REG_a", %%"REG_D" \n\t"\
2530 "xor %%"REG_a", %%"REG_a" \n\t"\
2531
2532 #endif /* ARCH_X86_64 */
2533
2534 FUNNY_UV_CODE
2535 FUNNY_UV_CODE
2536 FUNNY_UV_CODE
2537 FUNNY_UV_CODE
2538 "xor %%"REG_a", %%"REG_a" \n\t" // i
2539 "mov %5, %%"REG_c" \n\t" // src
2540 "mov %1, %%"REG_D" \n\t" // buf1
2541 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2542 PREFETCH" (%%"REG_c") \n\t"
2543 PREFETCH" 32(%%"REG_c") \n\t"
2544 PREFETCH" 64(%%"REG_c") \n\t"
2545
2546 FUNNY_UV_CODE
2547 FUNNY_UV_CODE
2548 FUNNY_UV_CODE
2549 FUNNY_UV_CODE
2550
2551 #if defined(PIC)
2552 "mov %6, %%"REG_b" \n\t"
2553 #endif
2554 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2555 "m" (funnyUVCode), "m" (src2)
2556 #if defined(PIC)
2557 ,"m" (ebxsave)
2558 #endif
2559 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2560 #if !defined(PIC)
2561 ,"%"REG_b
2562 #endif
2563 );
2564 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2565 {
2566 //printf("%d %d %d\n", dstWidth, i, srcW);
2567 dst[i] = src1[srcW-1]*128;
2568 dst[i+VOFW] = src2[srcW-1]*128;
2569 }
2570 }
2571 else
2572 {
2573 #endif /* HAVE_MMX2 */
2574 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2575 uint16_t xInc_mask = xInc & 0xffff;
2576 __asm__ volatile(
2577 "xor %%"REG_a", %%"REG_a" \n\t" // i
2578 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2579 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2580 ASMALIGN(4)
2581 "1: \n\t"
2582 "mov %0, %%"REG_S" \n\t"
2583 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2584 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2585 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2586 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2587 "shll $16, %%edi \n\t"
2588 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2589 "mov %1, %%"REG_D" \n\t"
2590 "shrl $9, %%esi \n\t"
2591 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2592
2593 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2594 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2595 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2596 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2597 "shll $16, %%edi \n\t"
2598 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2599 "mov %1, %%"REG_D" \n\t"
2600 "shrl $9, %%esi \n\t"
2601 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2602
2603 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2604 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2605 "add $1, %%"REG_a" \n\t"
2606 "cmp %2, %%"REG_a" \n\t"
2607 " jb 1b \n\t"
2608
2609 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2610 which is needed to support GCC 4.0. */
2611 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2612 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2613 #else
2614 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2615 #endif
2616 "r" (src2)
2617 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2618 );
2619 #if HAVE_MMX2
2620 } //if MMX2 can't be used
2621 #endif
2622 #else
2623 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2624 #endif /* ARCH_X86 */
2625 }
2626 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2627 int i;
2628 //FIXME all pal and rgb srcFormats could do this convertion as well
2629 //FIXME all scalers more complex than bilinear could do half of this transform
2630 if(c->srcRange){
2631 for (i=0; i<dstWidth; i++){
2632 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2633 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2634 }
2635 }else{
2636 for (i=0; i<dstWidth; i++){
2637 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2638 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2639 }
2640 }
2641 }
2642 }
2643
2644 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2645 int srcSliceH, uint8_t* dst[], int dstStride[]){
2646
2647 /* load a few things into local vars to make the code more readable? and faster */
2648 const int srcW= c->srcW;
2649 const int dstW= c->dstW;
2650 const int dstH= c->dstH;
2651 const int chrDstW= c->chrDstW;
2652 const int chrSrcW= c->chrSrcW;
2653 const int lumXInc= c->lumXInc;
2654 const int chrXInc= c->chrXInc;
2655 const int dstFormat= c->dstFormat;
2656 const int srcFormat= c->srcFormat;
2657 const int flags= c->flags;
2658 int16_t *vLumFilterPos= c->vLumFilterPos;
2659 int16_t *vChrFilterPos= c->vChrFilterPos;
2660 int16_t *hLumFilterPos= c->hLumFilterPos;
2661 int16_t *hChrFilterPos= c->hChrFilterPos;
2662 int16_t *vLumFilter= c->vLumFilter;
2663 int16_t *vChrFilter= c->vChrFilter;
2664 int16_t *hLumFilter= c->hLumFilter;
2665 int16_t *hChrFilter= c->hChrFilter;
2666 int32_t *lumMmxFilter= c->lumMmxFilter;
2667 int32_t *chrMmxFilter= c->chrMmxFilter;
2668 int32_t *alpMmxFilter= c->alpMmxFilter;
2669 const int vLumFilterSize= c->vLumFilterSize;
2670 const int vChrFilterSize= c->vChrFilterSize;
2671 const int hLumFilterSize= c->hLumFilterSize;
2672 const int hChrFilterSize= c->hChrFilterSize;
2673 int16_t **lumPixBuf= c->lumPixBuf;
2674 int16_t **chrPixBuf= c->chrPixBuf;
2675 int16_t **alpPixBuf= c->alpPixBuf;
2676 const int vLumBufSize= c->vLumBufSize;
2677 const int vChrBufSize= c->vChrBufSize;
2678 uint8_t *formatConvBuffer