/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
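
/*
 * CPU dispatch for this template: PREFETCH/PREFETCHW select the 3DNow!
 * or SSE prefetch hints (or a plain nop), PAVGB selects the byte-average
 * instruction, and MOVNTQ becomes a non-temporal store on MMX2 builds
 * (the destinations are write-only, so bypassing the cache avoids
 * polluting it) and an ordinary movq otherwise.
 */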

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
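
/*
 * Roughly the C equivalent of the loop above (a sketch of the fixed-point
 * math, not the actual scalar fallback):
 *
 *     for (i = 0; i < width; i++) {
 *         int val = rounder;                        // VROUNDER_OFFSET bias
 *         for (j = 0; j < filterSize; j++)
 *             val += (src[j][i] * coeff[j]) >> 16;  // pmulhw
 *         dest[i] = av_clip_uint8(val >> 3);        // psraw $3 + packuswb
 *     }
 *
 * The filter is stored as {source pointer, coefficient} pairs starting at
 * `offset`; a NULL source pointer terminates the inner loop (the test/jnz
 * sequence above).
 */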

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
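
/*
 * YSCALEYUV2YV12X_ACCURATE is the SWS_ACCURATE_RND variant of the loop
 * above: it interleaves two source rows per iteration and accumulates
 * full 32-bit dot products with pmaddwd, rounding and packing only once
 * at the end, instead of truncating to 16 bits after every pmulhw.
 */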

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

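/*
 * YSCALEYUV2RGBX turns the still 16-bit Y/U/V words into packed 8-bit
 * RGB with the usual fixed-point matrix:
 *
 *     R = clip((Y-16)*Ycoeff + (V-128)*VRcoeff)
 *     G = clip((Y-16)*Ycoeff + (U-128)*UGcoeff + (V-128)*VGcoeff)
 *     B = clip((Y-16)*Ycoeff + (U-128)*UBcoeff)
 *
 * The *_OFFSET/*_COEFF values live in the SwsContext (addressed off %0);
 * the punpcklwd/punpckhwd tail duplicates each (half-resolution) chroma
 * term for its two luma samples, and packuswb does the clipping.
 */
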
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
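
/*
 * YSCALEYUV2RGB is assembled from three parts: _UV blends the two chroma
 * rows vertically and applies the chroma offsets/coefficients, _YA does
 * the same blend for eight luma samples, and _COEFF finishes the matrix
 * multiply and packs to bytes. The 1/1b variants below are the one-row
 * and two-row-averaging specializations of the same scheme.
 */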

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(q0,   (dst, index, 4))\
    MOVNTQ( b,  8(dst, index, 4))\
    MOVNTQ(q2, 16(dst, index, 4))\
    MOVNTQ(q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
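
/*
 * WRITEBGR32 interleaves four byte planes (b, g, r, a) into 8 ARGB-order
 * pixels with the classic two-level punpck tree:
 *     punpcklbw/punpckhbw: b,g -> GBGB...,  r,a -> ARAR...
 *     punpcklwd/punpckhwd: GB,AR -> ARGBARGB
 * i.e. four output quadwords per iteration.
 */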

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
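
/*
 * WRITERGB16 packs to RGB565: B and R keep their top 5 bits (bF8 mask),
 * G its top 6 (bFC); after the shifts and punpck merges each 16-bit
 * pixel is laid out as rrrrrggg gggbbbbb. WRITERGB15 below is the same
 * with 5 green bits.
 */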

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
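
/*
 * Three generations of the 24-bit writer: WRITEBGR24OLD assembles the
 * packed triplets with shift/mask/or chains, WRITEBGR24MMX uses the
 * punpck tree plus quadword shifts, and WRITEBGR24MMX2 gathers the
 * B/G/R bytes directly with pshufw and the ff_M24* masks. The dispatch
 * above picks the pshufw version when the template is built for MMX2.
 */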

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
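
/*
 * WRITEYUY2 re-interleaves luma (mm1/mm7) and chroma (mm3=U, mm4=V) into
 * Y0 U Y1 V byte order: the chroma words are narrowed and merged into a
 * UVUV... register, which is then punpck'd against the luma bytes.
 */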

static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE("0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE("0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X("0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X("0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if      (u<0)   u=0;
                else if (u>255) u=255;
                if      (v<0)   v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)


                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%REGBP, %5)
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                       "a" (&c->redDither)
                       ,"r" (abuf0), "r" (abuf1)
                    : "%"REG_BP
                );
#else
                *(uint16_t **)(&c->u_temp)=abuf0;
                *(uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1600
1601 //FIXME yuy2* can read up to 7 samples too much
1602
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1604 {
1605 #if COMPILE_TEMPLATE_MMX
1606 __asm__ volatile(
1607 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1608 "mov %0, %%"REG_a" \n\t"
1609 "1: \n\t"
1610 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1611 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1612 "pand %%mm2, %%mm0 \n\t"
1613 "pand %%mm2, %%mm1 \n\t"
1614 "packuswb %%mm1, %%mm0 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "add $8, %%"REG_a" \n\t"
1617 " js 1b \n\t"
1618 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1619 : "%"REG_a
1620 );
1621 #else
1622 int i;
1623 for (i=0; i<width; i++)
1624 dst[i]= src[2*i];
1625 #endif
1626 }
1627
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1629 {
1630 #if COMPILE_TEMPLATE_MMX
1631 __asm__ volatile(
1632 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1633 "mov %0, %%"REG_a" \n\t"
1634 "1: \n\t"
1635 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1636 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "psrlw $8, %%mm1 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "movq %%mm0, %%mm1 \n\t"
1641 "psrlw $8, %%mm0 \n\t"
1642 "pand %%mm4, %%mm1 \n\t"
1643 "packuswb %%mm0, %%mm0 \n\t"
1644 "packuswb %%mm1, %%mm1 \n\t"
1645 "movd %%mm0, (%3, %%"REG_a") \n\t"
1646 "movd %%mm1, (%2, %%"REG_a") \n\t"
1647 "add $4, %%"REG_a" \n\t"
1648 " js 1b \n\t"
1649 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1650 : "%"REG_a
1651 );
1652 #else
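/* YUYV422 memory layout is Y0 U0 Y1 V0 per 2-pixel group, so U sits at byte
 * 4*i+1 and V at byte 4*i+3. */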
1653 int i;
1654 for (i=0; i<width; i++) {
1655 dstU[i]= src1[4*i + 1];
1656 dstV[i]= src1[4*i + 3];
1657 }
1658 #endif
1659 assert(src1 == src2);
1660 }
1661
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1663 {
1664 #if COMPILE_TEMPLATE_MMX
1665 __asm__ volatile(
1666 "mov %0, %%"REG_a" \n\t"
1667 "1: \n\t"
1668 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1669 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1670 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1671 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1672 "psrlw $8, %%mm0 \n\t"
1673 "psrlw $8, %%mm1 \n\t"
1674 "psrlw $8, %%mm2 \n\t"
1675 "psrlw $8, %%mm3 \n\t"
1676 "packuswb %%mm1, %%mm0 \n\t"
1677 "packuswb %%mm3, %%mm2 \n\t"
1678 "movq %%mm0, (%3, %%"REG_a") \n\t"
1679 "movq %%mm2, (%4, %%"REG_a") \n\t"
1680 "add $8, %%"REG_a" \n\t"
1681 " js 1b \n\t"
1682 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1683 : "%"REG_a
1684 );
1685 #else
1686 int i;
1687 for (i=0; i<width; i++) {
1688 dstU[i]= src1[2*i + 1];
1689 dstV[i]= src2[2*i + 1];
1690 }
1691 #endif
1692 }
1693
1694 /* This is almost identical to the previous, and exists only because
1695 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1697 {
1698 #if COMPILE_TEMPLATE_MMX
1699 __asm__ volatile(
1700 "mov %0, %%"REG_a" \n\t"
1701 "1: \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "psrlw $8, %%mm0 \n\t"
1705 "psrlw $8, %%mm1 \n\t"
1706 "packuswb %%mm1, %%mm0 \n\t"
1707 "movq %%mm0, (%2, %%"REG_a") \n\t"
1708 "add $8, %%"REG_a" \n\t"
1709 " js 1b \n\t"
1710 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1711 : "%"REG_a
1712 );
1713 #else
1714 int i;
1715 for (i=0; i<width; i++)
1716 dst[i]= src[2*i+1];
1717 #endif
1718 }
1719
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1721 {
1722 #if COMPILE_TEMPLATE_MMX
1723 __asm__ volatile(
1724 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1725 "mov %0, %%"REG_a" \n\t"
1726 "1: \n\t"
1727 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1728 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1729 "pand %%mm4, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm1, %%mm0 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "psrlw $8, %%mm0 \n\t"
1734 "pand %%mm4, %%mm1 \n\t"
1735 "packuswb %%mm0, %%mm0 \n\t"
1736 "packuswb %%mm1, %%mm1 \n\t"
1737 "movd %%mm0, (%3, %%"REG_a") \n\t"
1738 "movd %%mm1, (%2, %%"REG_a") \n\t"
1739 "add $4, %%"REG_a" \n\t"
1740 " js 1b \n\t"
1741 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1742 : "%"REG_a
1743 );
1744 #else
1745 int i;
1746 for (i=0; i<width; i++) {
1747 dstU[i]= src1[4*i + 0];
1748 dstV[i]= src1[4*i + 2];
1749 }
1750 #endif
1751 assert(src1 == src2);
1752 }
1753
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1755 {
1756 #if COMPILE_TEMPLATE_MMX
1757 __asm__ volatile(
1758 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1759 "mov %0, %%"REG_a" \n\t"
1760 "1: \n\t"
1761 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1762 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1763 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1764 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1765 "pand %%mm4, %%mm0 \n\t"
1766 "pand %%mm4, %%mm1 \n\t"
1767 "pand %%mm4, %%mm2 \n\t"
1768 "pand %%mm4, %%mm3 \n\t"
1769 "packuswb %%mm1, %%mm0 \n\t"
1770 "packuswb %%mm3, %%mm2 \n\t"
1771 "movq %%mm0, (%3, %%"REG_a") \n\t"
1772 "movq %%mm2, (%4, %%"REG_a") \n\t"
1773 "add $8, %%"REG_a" \n\t"
1774 " js 1b \n\t"
1775 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1776 : "%"REG_a
1777 );
1778 #else
1779 int i;
1780 for (i=0; i<width; i++) {
1781 dstU[i]= src1[2*i];
1782 dstV[i]= src2[2*i];
1783 }
1784 #endif
1785 }
1786
1787 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1788 const uint8_t *src, long width)
1789 {
1790 #if COMPILE_TEMPLATE_MMX
1791 __asm__ volatile(
1792 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1793 "mov %0, %%"REG_a" \n\t"
1794 "1: \n\t"
1795 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1796 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1797 "movq %%mm0, %%mm2 \n\t"
1798 "movq %%mm1, %%mm3 \n\t"
1799 "pand %%mm4, %%mm0 \n\t"
1800 "pand %%mm4, %%mm1 \n\t"
1801 "psrlw $8, %%mm2 \n\t"
1802 "psrlw $8, %%mm3 \n\t"
1803 "packuswb %%mm1, %%mm0 \n\t"
1804 "packuswb %%mm3, %%mm2 \n\t"
1805 "movq %%mm0, (%2, %%"REG_a") \n\t"
1806 "movq %%mm2, (%3, %%"REG_a") \n\t"
1807 "add $8, %%"REG_a" \n\t"
1808 " js 1b \n\t"
1809 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1810 : "%"REG_a
1811 );
1812 #else
1813 int i;
1814 for (i = 0; i < width; i++) {
1815 dst1[i] = src[2*i+0];
1816 dst2[i] = src[2*i+1];
1817 }
1818 #endif
1819 }
1820
1821 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822 const uint8_t *src1, const uint8_t *src2,
1823 long width, uint32_t *unused)
1824 {
1825 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1826 }
1827
1828 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1829 const uint8_t *src1, const uint8_t *src2,
1830 long width, uint32_t *unused)
1831 {
1832 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1833 }
1834
1835 #if COMPILE_TEMPLATE_MMX
1836 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1837 {
1838
1839 if(srcFormat == PIX_FMT_BGR24) {
1840 __asm__ volatile(
1841 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1842 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1843 :
1844 );
1845 } else {
1846 __asm__ volatile(
1847 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1848 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1849 :
1850 );
1851 }
1852
1853 __asm__ volatile(
1854 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1855 "mov %2, %%"REG_a" \n\t"
1856 "pxor %%mm7, %%mm7 \n\t"
1857 "1: \n\t"
1858 PREFETCH" 64(%0) \n\t"
1859 "movd (%0), %%mm0 \n\t"
1860 "movd 2(%0), %%mm1 \n\t"
1861 "movd 6(%0), %%mm2 \n\t"
1862 "movd 8(%0), %%mm3 \n\t"
1863 "add $12, %0 \n\t"
1864 "punpcklbw %%mm7, %%mm0 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "pmaddwd %%mm5, %%mm0 \n\t"
1869 "pmaddwd %%mm6, %%mm1 \n\t"
1870 "pmaddwd %%mm5, %%mm2 \n\t"
1871 "pmaddwd %%mm6, %%mm3 \n\t"
1872 "paddd %%mm1, %%mm0 \n\t"
1873 "paddd %%mm3, %%mm2 \n\t"
1874 "paddd %%mm4, %%mm0 \n\t"
1875 "paddd %%mm4, %%mm2 \n\t"
1876 "psrad $15, %%mm0 \n\t"
1877 "psrad $15, %%mm2 \n\t"
1878 "packssdw %%mm2, %%mm0 \n\t"
1879 "packuswb %%mm0, %%mm0 \n\t"
1880 "movd %%mm0, (%1, %%"REG_a") \n\t"
1881 "add $4, %%"REG_a" \n\t"
1882 " js 1b \n\t"
1883 : "+r" (src)
1884 : "r" (dst+width), "g" ((x86_reg)-width)
1885 : "%"REG_a
1886 );
1887 }
1888
1889 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1890 {
1891 __asm__ volatile(
1892 "movq 24+%4, %%mm6 \n\t"
1893 "mov %3, %%"REG_a" \n\t"
1894 "pxor %%mm7, %%mm7 \n\t"
1895 "1: \n\t"
1896 PREFETCH" 64(%0) \n\t"
1897 "movd (%0), %%mm0 \n\t"
1898 "movd 2(%0), %%mm1 \n\t"
1899 "punpcklbw %%mm7, %%mm0 \n\t"
1900 "punpcklbw %%mm7, %%mm1 \n\t"
1901 "movq %%mm0, %%mm2 \n\t"
1902 "movq %%mm1, %%mm3 \n\t"
1903 "pmaddwd %4, %%mm0 \n\t"
1904 "pmaddwd 8+%4, %%mm1 \n\t"
1905 "pmaddwd 16+%4, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 "paddd %%mm1, %%mm0 \n\t"
1908 "paddd %%mm3, %%mm2 \n\t"
1909
1910 "movd 6(%0), %%mm1 \n\t"
1911 "movd 8(%0), %%mm3 \n\t"
1912 "add $12, %0 \n\t"
1913 "punpcklbw %%mm7, %%mm1 \n\t"
1914 "punpcklbw %%mm7, %%mm3 \n\t"
1915 "movq %%mm1, %%mm4 \n\t"
1916 "movq %%mm3, %%mm5 \n\t"
1917 "pmaddwd %4, %%mm1 \n\t"
1918 "pmaddwd 8+%4, %%mm3 \n\t"
1919 "pmaddwd 16+%4, %%mm4 \n\t"
1920 "pmaddwd %%mm6, %%mm5 \n\t"
1921 "paddd %%mm3, %%mm1 \n\t"
1922 "paddd %%mm5, %%mm4 \n\t"
1923
1924 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1925 "paddd %%mm3, %%mm0 \n\t"
1926 "paddd %%mm3, %%mm2 \n\t"
1927 "paddd %%mm3, %%mm1 \n\t"
1928 "paddd %%mm3, %%mm4 \n\t"
1929 "psrad $15, %%mm0 \n\t"
1930 "psrad $15, %%mm2 \n\t"
1931 "psrad $15, %%mm1 \n\t"
1932 "psrad $15, %%mm4 \n\t"
1933 "packssdw %%mm1, %%mm0 \n\t"
1934 "packssdw %%mm4, %%mm2 \n\t"
1935 "packuswb %%mm0, %%mm0 \n\t"
1936 "packuswb %%mm2, %%mm2 \n\t"
1937 "movd %%mm0, (%1, %%"REG_a") \n\t"
1938 "movd %%mm2, (%2, %%"REG_a") \n\t"
1939 "add $4, %%"REG_a" \n\t"
1940 " js 1b \n\t"
1941 : "+r" (src)
1942 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1943 : "%"REG_a
1944 );
1945 }
1946 #endif /* COMPILE_TEMPLATE_MMX */
1947
1948 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1949 {
1950 #if COMPILE_TEMPLATE_MMX
1951 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1952 #else
1953 int i;
1954 for (i=0; i<width; i++) {
1955 int b= src[i*3+0];
1956 int g= src[i*3+1];
1957 int r= src[i*3+2];
1958
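/* 33 << (RGB2YUV_SHIFT-1) == 16.5 << RGB2YUV_SHIFT: the +16 luma offset
 * and the +0.5 rounding term folded into a single constant. */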
1959 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1960 }
1961 #endif /* COMPILE_TEMPLATE_MMX */
1962 }
1963
1964 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1965 {
1966 #if COMPILE_TEMPLATE_MMX
1967 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1968 #else
1969 int i;
1970 for (i=0; i<width; i++) {
1971 int b= src1[3*i + 0];
1972 int g= src1[3*i + 1];
1973 int r= src1[3*i + 2];
1974
1975 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1976 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1977 }
1978 #endif /* COMPILE_TEMPLATE_MMX */
1979 assert(src1 == src2);
1980 }
1981
1982 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1983 {
1984 int i;
1985 for (i=0; i<width; i++) {
1986 int b= src1[6*i + 0] + src1[6*i + 3];
1987 int g= src1[6*i + 1] + src1[6*i + 4];
1988 int r= src1[6*i + 2] + src1[6*i + 5];
1989
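/* b/g/r are two-pixel sums here, so the chroma offset doubles:
 * 257 << RGB2YUV_SHIFT == 2 * (128.5 << RGB2YUV_SHIFT), and the final shift
 * is RGB2YUV_SHIFT+1 to divide the sum back by two. */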
1990 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1991 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1992 }
1993 assert(src1 == src2);
1994 }
1995
1996 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1997 {
1998 #if COMPILE_TEMPLATE_MMX
1999 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2000 #else
2001 int i;
2002 for (i=0; i<width; i++) {
2003 int r= src[i*3+0];
2004 int g= src[i*3+1];
2005 int b= src[i*3+2];
2006
2007 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2008 }
2009 #endif
2010 }
2011
2012 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2013 {
2014 #if COMPILE_TEMPLATE_MMX
2015 assert(src1==src2);
2016 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2017 #else
2018 int i;
2019 assert(src1==src2);
2020 for (i=0; i<width; i++) {
2021 int r= src1[3*i + 0];
2022 int g= src1[3*i + 1];
2023 int b= src1[3*i + 2];
2024
2025 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2026 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2027 }
2028 #endif
2029 }
2030
2031 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2032 {
2033 int i;
2034 assert(src1==src2);
2035 for (i=0; i<width; i++) {
2036 int r= src1[6*i + 0] + src1[6*i + 3];
2037 int g= src1[6*i + 1] + src1[6*i + 4];
2038 int b= src1[6*i + 2] + src1[6*i + 5];
2039
2040 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2041 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2042 }
2043 }
2044
2045
2046 // bilinear / bicubic scaling
2047 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2048 const int16_t *filter, const int16_t *filterPos, long filterSize)
2049 {
2050 #if COMPILE_TEMPLATE_MMX
2051 assert(filterSize % 4 == 0 && filterSize>0);
2052 if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
2053 x86_reg counter= -2*dstW;
2054 filter-= counter*2;
2055 filterPos-= counter/2;
2056 dst-= counter/2;
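/* Negative-counter idiom: counter runs from -2*dstW up to 0 in byte units of
 * the int16_t output, and filter/filterPos/dst were biased above so that
 * (base + counter) walks the arrays forward; the loop exits once "add $4"
 * wraps the counter past zero and sets the carry flag. */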
2057 __asm__ volatile(
2058 #if defined(PIC)
2059 "push %%"REG_b" \n\t"
2060 #endif
2061 "pxor %%mm7, %%mm7 \n\t"
2062 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2063 "mov %%"REG_a", %%"REG_BP" \n\t"
2064 ASMALIGN(4)
2065 "1: \n\t"
2066 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2067 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2068 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2069 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2070 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2071 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2072 "punpcklbw %%mm7, %%mm0 \n\t"
2073 "punpcklbw %%mm7, %%mm2 \n\t"
2074 "pmaddwd %%mm1, %%mm0 \n\t"
2075 "pmaddwd %%mm2, %%mm3 \n\t"
2076 "movq %%mm0, %%mm4 \n\t"
2077 "punpckldq %%mm3, %%mm0 \n\t"
2078 "punpckhdq %%mm3, %%mm4 \n\t"
2079 "paddd %%mm4, %%mm0 \n\t"
2080 "psrad $7, %%mm0 \n\t"
2081 "packssdw %%mm0, %%mm0 \n\t"
2082 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2083 "add $4, %%"REG_BP" \n\t"
2084 " jnc 1b \n\t"
2085
2086 "pop %%"REG_BP" \n\t"
2087 #if defined(PIC)
2088 "pop %%"REG_b" \n\t"
2089 #endif
2090 : "+a" (counter)
2091 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2092 #if !defined(PIC)
2093 : "%"REG_b
2094 #endif
2095 );
2096 } else if (filterSize==8) {
2097 x86_reg counter= -2*dstW;
2098 filter-= counter*4;
2099 filterPos-= counter/2;
2100 dst-= counter/2;
2101 __asm__ volatile(
2102 #if defined(PIC)
2103 "push %%"REG_b" \n\t"
2104 #endif
2105 "pxor %%mm7, %%mm7 \n\t"
2106 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2107 "mov %%"REG_a", %%"REG_BP" \n\t"
2108 ASMALIGN(4)
2109 "1: \n\t"
2110 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2111 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2112 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2113 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2114 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2115 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm0 \n\t"
2117 "punpcklbw %%mm7, %%mm2 \n\t"
2118 "pmaddwd %%mm1, %%mm0 \n\t"
2119 "pmaddwd %%mm2, %%mm3 \n\t"
2120
2121 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2122 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2123 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2124 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm4 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "pmaddwd %%mm1, %%mm4 \n\t"
2128 "pmaddwd %%mm2, %%mm5 \n\t"
2129 "paddd %%mm4, %%mm0 \n\t"
2130 "paddd %%mm5, %%mm3 \n\t"
2131 "movq %%mm0, %%mm4 \n\t"
2132 "punpckldq %%mm3, %%mm0 \n\t"
2133 "punpckhdq %%mm3, %%mm4 \n\t"
2134 "paddd %%mm4, %%mm0 \n\t"
2135 "psrad $7, %%mm0 \n\t"
2136 "packssdw %%mm0, %%mm0 \n\t"
2137 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2138 "add $4, %%"REG_BP" \n\t"
2139 " jnc 1b \n\t"
2140
2141 "pop %%"REG_BP" \n\t"
2142 #if defined(PIC)
2143 "pop %%"REG_b" \n\t"
2144 #endif
2145 : "+a" (counter)
2146 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2147 #if !defined(PIC)
2148 : "%"REG_b
2149 #endif
2150 );
2151 } else {
2152 const uint8_t *offset = src+filterSize;
2153 x86_reg counter= -2*dstW;
2154 //filter-= counter*filterSize/2;
2155 filterPos-= counter/2;
2156 dst-= counter/2;
2157 __asm__ volatile(
2158 "pxor %%mm7, %%mm7 \n\t"
2159 ASMALIGN(4)
2160 "1: \n\t"
2161 "mov %2, %%"REG_c" \n\t"
2162 "movzwl (%%"REG_c", %0), %%eax \n\t"
2163 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2164 "mov %5, %%"REG_c" \n\t"
2165 "pxor %%mm4, %%mm4 \n\t"
2166 "pxor %%mm5, %%mm5 \n\t"
2167 "2: \n\t"
2168 "movq (%1), %%mm1 \n\t"
2169 "movq (%1, %6), %%mm3 \n\t"
2170 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2171 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm0 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "pmaddwd %%mm1, %%mm0 \n\t"
2175 "pmaddwd %%mm2, %%mm3 \n\t"
2176 "paddd %%mm3, %%mm5 \n\t"
2177 "paddd %%mm0, %%mm4 \n\t"
2178 "add $8, %1 \n\t"
2179 "add $4, %%"REG_c" \n\t"
2180 "cmp %4, %%"REG_c" \n\t"
2181 " jb 2b \n\t"
2182 "add %6, %1 \n\t"
2183 "movq %%mm4, %%mm0 \n\t"
2184 "punpckldq %%mm5, %%mm4 \n\t"
2185 "punpckhdq %%mm5, %%mm0 \n\t"
2186 "paddd %%mm0, %%mm4 \n\t"
2187 "psrad $7, %%mm4 \n\t"
2188 "packssdw %%mm4, %%mm4 \n\t"
2189 "mov %3, %%"REG_a" \n\t"
2190 "movd %%mm4, (%%"REG_a", %0) \n\t"
2191 "add $4, %0 \n\t"
2192 " jnc 1b \n\t"
2193
2194 : "+r" (counter), "+r" (filter)
2195 : "m" (filterPos), "m" (dst), "m"(offset),
2196 "m" (src), "r" ((x86_reg)filterSize*2)
2197 : "%"REG_a, "%"REG_c, "%"REG_d
2198 );
2199 }
2200 #else
2201 #if COMPILE_TEMPLATE_ALTIVEC
2202 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2203 #else
2204 int i;
2205 for (i=0; i<dstW; i++) {
2206 int j;
2207 int srcPos= filterPos[i];
2208 int val=0;
2209 //printf("filterPos: %d\n", filterPos[i]);
2210 for (j=0; j<filterSize; j++) {
2211 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2212 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2213 }
2214 //filter += hFilterSize;
2215 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2216 //dst[i] = val>>7;
2217 }
2218 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2219 #endif /* COMPILE_TEMPLATE_MMX */
2220 }
2221
2222 //FIXME all pal and rgb srcFormats could do this conversion as well
2223 //FIXME all scalers more complex than bilinear could do half of this transform
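/* The constants below are the MPEG<->JPEG range conversions in fixed point,
 * applied to the <<7-scaled line buffers: 19077/2^14 ~= 255/219 and
 * 4663/2^12 ~= 255/224 are the expansion factors, 14071/2^14 ~= 219/255 and
 * 1799/2^11 ~= 224/255 their inverses; e.g. black (16<<7 == 2048) maps to
 * (2048*19077 - 39057361) >> 14 == 0. The FFMIN clamps keep the expanded
 * result from overflowing int16_t. */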
2224 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2225 {
2226 int i;
2227 for (i = 0; i < width; i++) {
2228 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2229 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2230 }
2231 }
2232 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2233 {
2234 int i;
2235 for (i = 0; i < width; i++) {
2236 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2237 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2238 }
2239 }
2240 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2241 {
2242 int i;
2243 for (i = 0; i < width; i++)
2244 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2245 }
2246 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2247 {
2248 int i;
2249 for (i = 0; i < width; i++)
2250 dst[i] = (dst[i]*14071 + 33561947)>>14;
2251 }
2252
2253 #define FAST_BILINEAR_X86 \
2254 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2255 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2256 "shll $16, %%edi \n\t" \
2257 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2258 "mov %1, %%"REG_D"\n\t" \
2259 "shrl $9, %%esi \n\t" \
2260
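/* C reference for the fast bilinear horizontal scaler: xpos is a 16.16
 * fixed-point source position, xx its integer part, and xalpha the top 7
 * fractional bits; the output keeps 7 extra fractional bits (<<7). */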
2261 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2262 int dstWidth, const uint8_t *src, int srcW,
2263 int xInc)
2264 {
2265 int i;
2266 unsigned int xpos=0;
2267 for (i=0;i<dstWidth;i++) {
2268 register unsigned int xx=xpos>>16;
2269 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2270 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2271 xpos+=xInc;
2272 }
2273 }
2274
2275 // *** horizontal scale Y line to temp buffer
2276 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2277 int flags, const int16_t *hLumFilter,
2278 const int16_t *hLumFilterPos, int hLumFilterSize,
2279 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2280 uint32_t *pal, int isAlpha)
2281 {
2282 int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2283 int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
2284 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2285 void av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
2286 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
2287 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2288
2289 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2290
2291 if (internal_func) {
2292 internal_func(formatConvBuffer, src, srcW, pal);
2293 src= formatConvBuffer;
2294 }
2295
2296 if (!c->hyscale_fast)
2297 {
2298 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2299 } else { // fast bilinear upscale / crap downscale
2300 #if ARCH_X86 && CONFIG_GPL
2301 #if COMPILE_TEMPLATE_MMX2
2302 int i;
2303 #if defined(PIC)
2304 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2305 #endif
2306 if (canMMX2BeUsed) {
2307 __asm__ volatile(
2308 #if defined(PIC)
2309 "mov %%"REG_b", %5 \n\t"
2310 #endif
2311 "pxor %%mm7, %%mm7 \n\t"
2312 "mov %0, %%"REG_c" \n\t"
2313 "mov %1, %%"REG_D" \n\t"
2314 "mov %2, %%"REG_d" \n\t"
2315 "mov %3, %%"REG_b" \n\t"
2316 "xor %%"REG_a", %%"REG_a" \n\t" // i
2317 PREFETCH" (%%"REG_c") \n\t"
2318 PREFETCH" 32(%%"REG_c") \n\t"
2319 PREFETCH" 64(%%"REG_c") \n\t"
2320
2321 #if ARCH_X86_64
2322
2323 #define CALL_MMX2_FILTER_CODE \
2324 "movl (%%"REG_b"), %%esi \n\t"\
2325 "call *%4 \n\t"\
2326 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2327 "add %%"REG_S", %%"REG_c" \n\t"\
2328 "add %%"REG_a", %%"REG_D" \n\t"\
2329 "xor %%"REG_a", %%"REG_a" \n\t"\
2330
2331 #else
2332
2333 #define CALL_MMX2_FILTER_CODE \
2334 "movl (%%"REG_b"), %%esi \n\t"\
2335 "call *%4 \n\t"\
2336 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2337 "add %%"REG_a", %%"REG_D" \n\t"\
2338 "xor %%"REG_a", %%"REG_a" \n\t"\
2339
2340 #endif /* ARCH_X86_64 */
2341
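/* Each invocation runs one chunk of the run-time-generated MMX2 scaler
 * (mmx2FilterCode): the generated code advances REG_a by the output bytes it
 * produced, then the glue steps the src pointer by the next mmx2FilterPos
 * entry and the dst pointer by that count before clearing REG_a. */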
2342 CALL_MMX2_FILTER_CODE
2343 CALL_MMX2_FILTER_CODE
2344 CALL_MMX2_FILTER_CODE
2345 CALL_MMX2_FILTER_CODE
2346 CALL_MMX2_FILTER_CODE
2347 CALL_MMX2_FILTER_CODE
2348 CALL_MMX2_FILTER_CODE
2349 CALL_MMX2_FILTER_CODE
2350
2351 #if defined(PIC)
2352 "mov %5, %%"REG_b" \n\t"
2353 #endif
2354 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2355 "m" (mmx2FilterCode)
2356 #if defined(PIC)
2357 ,"m" (ebxsave)
2358 #endif
2359 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2360 #if !defined(PIC)
2361 ,"%"REG_b
2362 #endif
2363 );
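/* Rewrite output pixels whose source position reaches the last input pixel:
 * they must not blend with data past the end of the line, so they get
 * src[srcW-1] scaled to the <<7 output format (128 == 1<<7). */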
2364 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2365 } else {
2366 #endif /* COMPILE_TEMPLATE_MMX2 */
2367 x86_reg xInc_shr16 = xInc >> 16;
2368 uint16_t xInc_mask = xInc & 0xffff;
2369 //NO MMX just normal asm ...
2370 __asm__ volatile(
2371 "xor %%"REG_a", %%"REG_a" \n\t" // i
2372 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2373 "xorl %%ecx, %%ecx \n\t" // xalpha
2374 ASMALIGN(4)
2375 "1: \n\t"
2376 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2377 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2378 FAST_BILINEAR_X86
2379 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2380 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2381 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2382
2383 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2384 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2385 FAST_BILINEAR_X86
2386 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2387 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2388 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2389
2390
2391 "add $2, %%"REG_a" \n\t"
2392 "cmp %2, %%"REG_a" \n\t"
2393 " jb 1b \n\t"
2394
2395
2396 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2397 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2398 );
2399 #if COMPILE_TEMPLATE_MMX2
2400 } //if MMX2 can't be used
2401 #endif
2402 #else
2403 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2404 #endif /* ARCH_X86 */
2405 }
2406
2407 if (convertRange)
2408 convertRange(dst, dstWidth);
2409 }
2410
2411 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2412 int dstWidth, const uint8_t *src1,
2413 const uint8_t *src2, int srcW, int xInc)
2414 {
2415 int i;
2416 unsigned int xpos=0;
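/* xalpha is 7 bits (0..127), so (xalpha ^ 127) == 127 - xalpha: a cheap
 * approximation of the exact 128-xalpha weighting used by the "slower"
 * variant in the comment below. */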
2417 for (i=0;i<dstWidth;i++) {
2418 register unsigned int xx=xpos>>16;
2419 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2420 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2421 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2422 /* slower
2423 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2424 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2425 */
2426 xpos+=xInc;
2427 }
2428 }
2429
2430 static inline void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2431 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2432 const int16_t *hChrFilterPos, int hChrFilterSize,
2433 enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2434 uint32_t *pal)
2435 {
2436 int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2437 int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
2438 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2439 void av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
2440
2441 src1 += c->chrSrcOffset;
2442 src2 += c->chrSrcOffset;
2443
2444 if (c->hcscale_internal) {
2445 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2446 src1= formatConvBuffer;
2447 src2= formatConvBuffer+VOFW;
2448 }
2449
2450 if (!c->hcscale_fast)
2451 {
2452 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2453 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2454 } else { // fast bilinear upscale / crap downscale
2455 #if ARCH_X86 && CONFIG_GPL
2456 #if COMPILE_TEMPLATE_MMX2
2457 int i;
2458 #if defined(PIC)
2459 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2460 #endif
2461 if (canMMX2BeUsed) {
2462 __asm__ volatile(
2463 #if defined(PIC)
2464 "mov %%"REG_b", %6 \n\t"
2465 #endif
2466 "pxor %%mm7, %%mm7 \n\t"
2467 "mov %0, %%"REG_c" \n\t"
2468 "mov %1, %%"REG_D" \n\t"
2469 "mov %2, %%"REG_d" \n\t"
2470 "mov %3, %%"REG_b" \n\t"
2471 "xor %%"REG_a", %%"REG_a" \n\t" // i
2472 PREFETCH" (%%"REG_c") \n\t"
2473 PREFETCH" 32(%%"REG_c") \n\t"
2474 PREFETCH" 64(%%"REG_c") \n\t"
2475
2476 CALL_MMX2_FILTER_CODE
2477 CALL_MMX2_FILTER_CODE
2478 CALL_MMX2_FILTER_CODE
2479 CALL_MMX2_FILTER_CODE
2480 "xor %%"REG_a", %%"REG_a" \n\t" // i
2481 "mov %5, %%"REG_c" \n\t" // src
2482 "mov %1, %%"REG_D" \n\t" // buf1
2483 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2484 PREFETCH" (%%"REG_c") \n\t"
2485 PREFETCH" 32(%%"REG_c") \n\t"
2486 PREFETCH" 64(%%"REG_c") \n\t"
2487
2488 CALL_MMX2_FILTER_CODE
2489 CALL_MMX2_FILTER_CODE
2490 CALL_MMX2_FILTER_CODE
2491 CALL_MMX2_FILTER_CODE
2492
2493 #if defined(PIC)
2494 "mov %6, %%"REG_b" \n\t"
2495 #endif
2496 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2497 "m" (mmx2FilterCode), "m" (src2)
2498 #if defined(PIC)
2499 ,"m" (ebxsave)
2500 #endif
2501 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2502 #if !defined(PIC)
2503 ,"%"REG_b
2504 #endif
2505 );
2506 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2507 //printf("%d %d %d\n", dstWidth, i, srcW);
2508 dst[i] = src1[srcW-1]*128;
2509 dst[i+VOFW] = src2[srcW-1]*128;
2510 }
2511 } else {
2512 #endif /* COMPILE_TEMPLATE_MMX2 */
2513 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2514 uint16_t xInc_mask = xInc & 0xffff;
2515 __asm__ volatile(
2516 "xor %%"REG_a", %%"REG_a" \n\t" // i
2517 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2518 "xorl %%ecx, %%ecx \n\t" // xalpha
2519 ASMALIGN(4)
2520 "1: \n\t"
2521 "mov %0, %%"REG_S" \n\t"
2522 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2523 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2524 FAST_BILINEAR_X86
2525 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2526
2527 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2528 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2529 FAST_BILINEAR_X86
2530 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2531
2532 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2533 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2534 "add $1, %%"REG_a" \n\t"
2535 "cmp %2, %%"REG_a" \n\t"
2536 " jb 1b \n\t"
2537
2538 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2539 which is needed to support GCC 4.0. */
2540 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2541 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2542 #else
2543 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2544 #endif
2545 "r" (src2)
2546 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2547 );
2548 #if COMPILE_TEMPLATE_MMX2
2549 } //if MMX2 can't be used
2550 #endif
2551 #else
2552 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2553 #endif /* ARCH_X86 */
2554 }
2555
2556 if (c->chrConvertRange)
2557 c->chrConvertRange(dst, dstWidth);
2558 }
2559
2560 #define DEBUG_SWSCALE_BUFFERS 0
2561 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2562
2563 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2564 int srcSliceH, uint8_t* dst[], int dstStride[])
2565 {
2566 /* load a few things into local vars to make the code more readable and faster */
2567 const int srcW= c->srcW;
2568 const int dstW= c->dstW;
2569 const int dstH= c->dstH;
2570 const int chrDstW= c->chrDstW;
2571 const int chrSrcW= c->chrSrcW;
2572 const int lumXInc= c->lumXInc;
2573 const int chrXInc= c->chrXInc;
2574 const enum PixelFormat dstFormat= c->dstFormat;
2575 const enum PixelFormat srcFormat= c->srcFormat;
2576 const int flags= c->flags;
2577 int16_t *vLumFilterPos= c->vLumFilterPos;
2578 int16_t *vChrFilterPos= c->vChrFilterPos;
2579 int16_t *hLumFilterPos= c->hLumFilterPos;
2580 int16_t *hChrFilterPos= c->hChrFilterPos;
2581 int16_t *vLumFilter= c->vLumFilter;
2582 int16_t *vChrFilter= c->vChrFilter;
2583 int16_t *hLumFilter= c->hLumFilter;
2584 int16_t *hChrFilter= c->hChrFilter;
2585 int32_t *lumMmxFilter= c->lumMmxFilter;
2586 int32_t *chrMmxFilter= c->chrMmxFilter;
2587 int32_t *alpMmxFilter= c->alpMmxFilter;
2588 const int vLumFilterSize= c->vLumFilterSize;
2589 const int vChrFilterSize= c->vChrFilterSize;
2590 const int hLumFilterSize= c->hLumFilterSize;
2591 const int hChrFilterSize= c->hChrFilterSize;
2592 int16_t **lumPixBuf= c->lumPixBuf;
2593 int16_t **chrPixBuf= c->chrPixBuf;
2594 int16_t **alpPixBuf= c->alpPixBuf;
2595 const int vLumBufSize= c->vLumBufSize;
2596 const int vChrBufSize= c->vChrBufSize;
2597 uint8_t *formatConvBuffer= c->formatConvBuffer;
2598 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
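// -((-x) >> n) rounds the division by 2^n up instead of down (arithmetic shift floors)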
2599 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2600 int lastDstY;
2601 uint32_t *pal=c->pal_yuv;
2602
2603 /* vars which will change and which we need to store back in the context */
2604 int dstY= c->dstY;
2605 int lumBufIndex= c->lumBufIndex;
2606 int chrBufIndex= c->chrBufIndex;
2607 int lastInLumBuf= c->lastInLumBuf;
2608 int lastInChrBuf= c->lastInChrBuf;
2609
2610 if (isPacked(c->srcFormat)) {
2611 src[0]=
2612 src[1]=
2613 src[2]=
2614 src[3]= src[0];
2615 srcStride[0]=
2616 srcStride[1]=
2617 srcStride[2]=
2618 srcStride[3]= srcStride[0];
2619 }
2620 srcStride[1]<<= c->vChrDrop;
2621 srcStride[2]<<= c->vChrDrop;
2622
2623 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2624 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2625 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2626 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2627 srcSliceY, srcSliceH, dstY, dstH);
2628 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2629 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2630
2631 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2632 static int warnedAlready=0; //FIXME move this into the context perhaps
2633 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2634 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2635 " ->cannot do aligned memory accesses anymore\n");
2636 warnedAlready=1;
2637 }
2638 }
2639
2640 /* Note that the user might start scaling in the middle of the picture, so this
2641 will not get executed. This is not really intended, but it currently works,
2642 so people might do it. */
2643 if (srcSliceY ==0) {
2644 lumBufIndex=-1;
2645 chrBufIndex=-1;
2646 dstY=0;
2647 lastInLumBuf= -1;
2648 lastInChrBuf= -1;
2649 }
2650
2651 lastDstY= dstY;
2652
2653 for (;dstY < dstH; dstY++) {
2654 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2655 const int chrDstY= dstY>>c->chrDstVSubSample;
2656 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2657 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2658 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2659
2660 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2661 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2662 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2663 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2664 int enough_lines;
2665
2666 //handle holes (FAST_BILINEAR & weird filters)
2667 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2668 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2669 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2670 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2671
2672 // Do we have enough lines in this slice to output the dstY line
2673 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2674 if (!enough_lines) {
2675 lastLumSrcY = srcSliceY + srcSliceH - 1;
2676 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2677 }
2678
2679 DEBUG_BUFFERS("dstY: %d\n", dstY);
2680 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2681 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2682 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2683 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2684
2685 //Do horizontal scaling
2686 while(lastInLumBuf < lastLumSrcY) {
2687 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2688 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2689 lumBufIndex++;
2690 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2691 lumBufIndex, lastInLumBuf);
2692 assert(lumBufIndex < 2*vLumBufSize);
2693 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2694 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2695 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2696 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2697 c->srcFormat, formatConvBuffer,
2698 pal, 0);
2699 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2700 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2701 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2702 c->srcFormat, formatConvBuffer,
2703 pal, 1);
2704 lastInLumBuf++;
2705 }
2706 while(lastInChrBuf < lastChrSrcY) {
2707 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2708 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2709 chrBufIndex++;
2710 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2711 chrBufIndex, lastInChrBuf);
2712 assert(chrBufIndex < 2*vChrBufSize);
2713 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2714 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2715 //FIXME pass parameters through the context struct (at least some of them)
2716
2717 if (c->needs_hcscale)
2718 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2719 flags, hChrFilter, hChrFilterPos, hChrFilterSize,
2720 c->srcFormat, formatConvBuffer,
2721 pal);
2722 lastInChrBuf++;
2723 }
2724 //wrap buf index around to stay inside the ring buffer
2725 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2726 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2727 if (!enough_lines)
2728 break; //we can't output a dstY line so let's try with the next slice
2729
2730 #if COMPILE_TEMPLATE_MMX
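/* 5-bit channels get the coarser ff_dither8 pattern, the 6-bit green of 565
 * formats the finer ff_dither4; red uses the opposite line phase so the
 * patterns do not coincide. */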
2731 c->blueDither= ff_dither8[dstY&1];
2732 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2733 c->greenDither= ff_dither8[dstY&1];
2734 else
2735 c->greenDither= ff_dither4[dstY&1];
2736 c->redDither= ff_dither8[(dstY+1)&1];
2737 #endif
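/* lumPixBuf/chrPixBuf hold 2*vLumBufSize / 2*vChrBufSize line pointers with
 * the second half repeating the first, so the pointers computed below always
 * give the vertical filter a window of consecutive entries that never has to
 * wrap around the ring buffer. */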
2738 if (dstY < dstH-2) {
2739 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2740 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2741 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2742 #if COMPILE_TEMPLATE_MMX
2743 int i;
2744 if (flags & SWS_ACCURATE_RND) {
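/* Pack, per pair of taps, the two source-line pointers and both 16-bit
 * coefficients (stored twice in one dword) into APCK_SIZE-byte records for
 * the accurate-rounding vertical scaler. */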
2745 int s= APCK_SIZE / 8;
2746 for (i=0; i<vLumFilterSize; i+=2) {
2747 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2748 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2749 lumMmxFilter[s*i+APCK_COEF/4 ]=
2750 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2751 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2752 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2753 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2754 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2755 alpMmxFilter[s*i+APCK_COEF/4 ]=
2756 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2757 }
2758 }
2759 for (i=0; i<vChrFilterSize; i+=2) {
2760 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2761 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2762 chrMmxFilter[s*i+APCK_COEF/4 ]=
2763 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2764 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2765 }
2766 } else {
2767 for (i=0; i<vLumFilterSize; i++) {
2768 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2769 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2770 lumMmxFilter[4*i+2]=
2771 lumMmxFilter[4*i+3]=
2772 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2773 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2774 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2775 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2776 alpMmxFilter[4*i+2]=
2777 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2778 }
2779 }
2780 for (i=0; i<vChrFilterSize; i++) {
2781 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2782 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2783 chrMmxFilter[4*i+2]=
2784 chrMmxFilter[4*i+3]=
2785 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2786 }
2787 }
2788 #endif
2789 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2790 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2791 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
2792 c->yuv2nv12X(c,
2793 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2794 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2795 dest, uDest, dstW, chrDstW, dstFormat);
2796 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2797 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2798 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2799 if (is16BPS(dstFormat)) {
2800 yuv2yuvX16inC(
2801 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2802 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2803 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2804 dstFormat);
2805 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2806 int16_t *lumBuf = lumSrcPtr[0];
2807 int16_t *chrBuf= chrSrcPtr[0];
2808 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2809 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2810 } else { //General YV12
2811 c->yuv2yuvX(c,
2812 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2813 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2814 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2815 }
2816 } else {
2817 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2818 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2819 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2820 int chrAlpha= vChrFilter[2*dstY+1];
2821 if(flags & SWS_FULL_CHR_H_INT) {
2822 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2823 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2824 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2825 alpSrcPtr, dest, dstW, dstY);
2826 } else {
2827 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2828 alpPixBuf ? *alpSrcPtr : NULL,
2829 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2830 }
2831 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2832 int lumAlpha= vLumFilter[2*dstY+1];
2833 int chrAlpha= vChrFilter[2*dstY+1];
2834 lumMmxFilter[2]=
2835 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2836 chrMmxFilter[2]=
2837 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2838 if(flags & SWS_FULL_CHR_H_INT) {
2839 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2840 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2841 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2842 alpSrcPtr, dest, dstW, dstY);
2843 } else {
2844 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2845 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2846 dest, dstW, lumAlpha, chrAlpha, dstY);
2847 }
2848 } else { //general RGB
2849 if(flags & SWS_FULL_CHR_H_INT) {
2850 yuv2rgbXinC_full(c,
2851 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2852 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853 alpSrcPtr, dest, dstW, dstY);
2854 } else {
2855 c->yuv2packedX(c,
2856 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2857 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2858 alpSrcPtr, dest, dstW, dstY);
2859 }
2860 }
2861 }
2862 } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2863 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2864 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2865 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2866 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2867 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2868 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
2869 yuv2nv12XinC(
2870 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2871 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2872 dest, uDest, dstW, chrDstW, dstFormat);
2873 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2874 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2875 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
2876 if (is16BPS(dstFormat)) {
2877 yuv2yuvX16inC(
2878 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2879 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2881 dstFormat);
2882 } else {
2883 yuv2yuvXinC(
2884 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2885 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2886 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2887 }
2888 } else {
2889 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2890 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2891 if(flags & SWS_FULL_CHR_H_INT) {
2892 yuv2rgbXinC_full(c,
2893 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2894 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2895 alpSrcPtr, dest, dstW, dstY);
2896 } else {
2897 yuv2packedXinC(c,
2898 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2899 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2900 alpSrcPtr, dest, dstW, dstY);
2901 }
2902 }
2903 }
2904 }
2905
2906 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2907 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2908
2909 #if COMPILE_TEMPLATE_MMX
2910 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2911 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2912 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2913 else __asm__ volatile("emms" :::"memory");
2914 #endif
2915 /* store changed local vars back in the context */
2916 c->dstY= dstY;
2917 c->lumBufIndex= lumBufIndex;
2918 c->chrBufIndex= chrBufIndex;
2919 c->lastInLumBuf= lastInLumBuf;
2920 c->lastInChrBuf= lastInChrBuf;
2921
2922 return dstY - lastDstY;
2923 }
2924
2925 static void RENAME(sws_init_swScale)(SwsContext *c)
2926 {
2927 enum PixelFormat srcFormat = c->srcFormat;
2928
2929 c->yuv2nv12X = RENAME(yuv2nv12X );
2930 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2931 c->yuv2yuvX = RENAME(yuv2yuvX );
2932 c->yuv2packed1 = RENAME(yuv2packed1 );
2933 c->yuv2packed2 = RENAME(yuv2packed2 );
2934 c->yuv2packedX = RENAME(yuv2packedX );
2935
2936 c->hScale = RENAME(hScale );
2937
2938 #if COMPILE_TEMPLATE_MMX
2939 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2940 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2941 #else
2942 if (c->flags & SWS_FAST_BILINEAR)
2943 #endif
2944 {
2945 c->hyscale_fast = RENAME(hyscale_fast);
2946 c->hcscale_fast = RENAME(hcscale_fast);
2947 }
2948
2949 c->hcscale_internal = NULL;
2950 switch(srcFormat) {
2951 case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break;
2952 case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break;
2953 case PIX_FMT_NV12 : c->hcscale_internal = RENAME(nv12ToUV); break;
2954 case PIX_FMT_NV21 : c->hcscale_internal = RENAME(nv21ToUV); break;
2955 case PIX_FMT_RGB8 :
2956 case PIX_FMT_BGR8 :
2957 case PIX_FMT_PAL8 :
2958 case PIX_FMT_BGR4_BYTE:
2959 case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
2960 case PIX_FMT_YUV420P16BE:
2961 case PIX_FMT_YUV422P16BE:
2962 case PIX_FMT_YUV444P16BE: c->hcscale_internal = RENAME(BEToUV); break;
2963 case PIX_FMT_YUV420P16LE:
2964 case PIX_FMT_YUV422P16LE:
2965 case PIX_FMT_YUV444P16LE: c->hcscale_internal = RENAME(LEToUV); break;
2966 }
2967 if (c->chrSrcHSubSample) {
2968 switch(srcFormat) {
2969 case PIX_FMT_RGB48BE:
2970 case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
2971 case PIX_FMT_RGB32 :
2972 case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
2973 case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
2974 case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
2975 case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
2976 case PIX_FMT_BGR32 :
2977 case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
2978 case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
2979 case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
2980 case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
2981 }