Replace the wrong condition name in an #endif comment with the correct one.
[libav.git] / libswscale / swscale_template.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28
29 #if COMPILE_TEMPLATE_AMD3DNOW
30 #define PREFETCH "prefetch"
31 #elif COMPILE_TEMPLATE_MMX2
32 #define PREFETCH "prefetchnta"
33 #else
34 #define PREFETCH " # nop"
35 #endif
36
37 #if COMPILE_TEMPLATE_MMX2
38 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39 #elif COMPILE_TEMPLATE_AMD3DNOW
40 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41 #endif
42
43 #if COMPILE_TEMPLATE_MMX2
44 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
45 #else
46 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
47 #endif
48 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
49
50 #if COMPILE_TEMPLATE_ALTIVEC
51 #include "ppc/swscale_altivec_template.c"
52 #endif
53
54 #define YSCALEYUV2YV12X(x, offset, dest, width) \
55 __asm__ volatile(\
56 "xor %%"REG_a", %%"REG_a" \n\t"\
57 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
58 "movq %%mm3, %%mm4 \n\t"\
59 "lea " offset "(%0), %%"REG_d" \n\t"\
60 "mov (%%"REG_d"), %%"REG_S" \n\t"\
61 ASMALIGN(4) /* FIXME Unroll? */\
62 "1: \n\t"\
63 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
64 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
65 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
66 "add $16, %%"REG_d" \n\t"\
67 "mov (%%"REG_d"), %%"REG_S" \n\t"\
68 "test %%"REG_S", %%"REG_S" \n\t"\
69 "pmulhw %%mm0, %%mm2 \n\t"\
70 "pmulhw %%mm0, %%mm5 \n\t"\
71 "paddw %%mm2, %%mm3 \n\t"\
72 "paddw %%mm5, %%mm4 \n\t"\
73 " jnz 1b \n\t"\
74 "psraw $3, %%mm3 \n\t"\
75 "psraw $3, %%mm4 \n\t"\
76 "packuswb %%mm4, %%mm3 \n\t"\
77 MOVNTQ(%%mm3, (%1, %%REGa))\
78 "add $8, %%"REG_a" \n\t"\
79 "cmp %2, %%"REG_a" \n\t"\
80 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
81 "movq %%mm3, %%mm4 \n\t"\
82 "lea " offset "(%0), %%"REG_d" \n\t"\
83 "mov (%%"REG_d"), %%"REG_S" \n\t"\
84 "jb 1b \n\t"\
85 :: "r" (&c->redDither),\
86 "r" (dest), "g" (width)\
87 : "%"REG_a, "%"REG_d, "%"REG_S\
88 );
89
90 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
91 __asm__ volatile(\
92 "lea " offset "(%0), %%"REG_d" \n\t"\
93 "xor %%"REG_a", %%"REG_a" \n\t"\
94 "pxor %%mm4, %%mm4 \n\t"\
95 "pxor %%mm5, %%mm5 \n\t"\
96 "pxor %%mm6, %%mm6 \n\t"\
97 "pxor %%mm7, %%mm7 \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 ASMALIGN(4) \
100 "1: \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
102 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
103 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
104 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
105 "movq %%mm0, %%mm3 \n\t"\
106 "punpcklwd %%mm1, %%mm0 \n\t"\
107 "punpckhwd %%mm1, %%mm3 \n\t"\
108 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
109 "pmaddwd %%mm1, %%mm0 \n\t"\
110 "pmaddwd %%mm1, %%mm3 \n\t"\
111 "paddd %%mm0, %%mm4 \n\t"\
112 "paddd %%mm3, %%mm5 \n\t"\
113 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
114 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
115 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
116 "test %%"REG_S", %%"REG_S" \n\t"\
117 "movq %%mm2, %%mm0 \n\t"\
118 "punpcklwd %%mm3, %%mm2 \n\t"\
119 "punpckhwd %%mm3, %%mm0 \n\t"\
120 "pmaddwd %%mm1, %%mm2 \n\t"\
121 "pmaddwd %%mm1, %%mm0 \n\t"\
122 "paddd %%mm2, %%mm6 \n\t"\
123 "paddd %%mm0, %%mm7 \n\t"\
124 " jnz 1b \n\t"\
125 "psrad $16, %%mm4 \n\t"\
126 "psrad $16, %%mm5 \n\t"\
127 "psrad $16, %%mm6 \n\t"\
128 "psrad $16, %%mm7 \n\t"\
129 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
130 "packssdw %%mm5, %%mm4 \n\t"\
131 "packssdw %%mm7, %%mm6 \n\t"\
132 "paddw %%mm0, %%mm4 \n\t"\
133 "paddw %%mm0, %%mm6 \n\t"\
134 "psraw $3, %%mm4 \n\t"\
135 "psraw $3, %%mm6 \n\t"\
136 "packuswb %%mm6, %%mm4 \n\t"\
137 MOVNTQ(%%mm4, (%1, %%REGa))\
138 "add $8, %%"REG_a" \n\t"\
139 "cmp %2, %%"REG_a" \n\t"\
140 "lea " offset "(%0), %%"REG_d" \n\t"\
141 "pxor %%mm4, %%mm4 \n\t"\
142 "pxor %%mm5, %%mm5 \n\t"\
143 "pxor %%mm6, %%mm6 \n\t"\
144 "pxor %%mm7, %%mm7 \n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "jb 1b \n\t"\
147 :: "r" (&c->redDither),\
148 "r" (dest), "g" (width)\
149 : "%"REG_a, "%"REG_d, "%"REG_S\
150 );
151
152 #define YSCALEYUV2YV121 \
153 "mov %2, %%"REG_a" \n\t"\
154 ASMALIGN(4) /* FIXME Unroll? */\
155 "1: \n\t"\
156 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
157 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
158 "psraw $7, %%mm0 \n\t"\
159 "psraw $7, %%mm1 \n\t"\
160 "packuswb %%mm1, %%mm0 \n\t"\
161 MOVNTQ(%%mm0, (%1, %%REGa))\
162 "add $8, %%"REG_a" \n\t"\
163 "jnc 1b \n\t"
164
165 #define YSCALEYUV2YV121_ACCURATE \
166 "mov %2, %%"REG_a" \n\t"\
167 "pcmpeqw %%mm7, %%mm7 \n\t"\
168 "psrlw $15, %%mm7 \n\t"\
169 "psllw $6, %%mm7 \n\t"\
170 ASMALIGN(4) /* FIXME Unroll? */\
171 "1: \n\t"\
172 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
173 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
174 "paddsw %%mm7, %%mm0 \n\t"\
175 "paddsw %%mm7, %%mm1 \n\t"\
176 "psraw $7, %%mm0 \n\t"\
177 "psraw $7, %%mm1 \n\t"\
178 "packuswb %%mm1, %%mm0 \n\t"\
179 MOVNTQ(%%mm0, (%1, %%REGa))\
180 "add $8, %%"REG_a" \n\t"\
181 "jnc 1b \n\t"
182
183 /*
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
189 */
190 #define YSCALEYUV2PACKEDX_UV \
191 __asm__ volatile(\
192 "xor %%"REG_a", %%"REG_a" \n\t"\
193 ASMALIGN(4)\
194 "nop \n\t"\
195 "1: \n\t"\
196 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
197 "mov (%%"REG_d"), %%"REG_S" \n\t"\
198 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
199 "movq %%mm3, %%mm4 \n\t"\
200 ASMALIGN(4)\
201 "2: \n\t"\
202 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
203 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
204 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
205 "add $16, %%"REG_d" \n\t"\
206 "mov (%%"REG_d"), %%"REG_S" \n\t"\
207 "pmulhw %%mm0, %%mm2 \n\t"\
208 "pmulhw %%mm0, %%mm5 \n\t"\
209 "paddw %%mm2, %%mm3 \n\t"\
210 "paddw %%mm5, %%mm4 \n\t"\
211 "test %%"REG_S", %%"REG_S" \n\t"\
212 " jnz 2b \n\t"\
213
214 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
215 "lea "offset"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
218 "movq "#dst1", "#dst2" \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw "#coeff", "#src1" \n\t"\
227 "pmulhw "#coeff", "#src2" \n\t"\
228 "paddw "#src1", "#dst1" \n\t"\
229 "paddw "#src2", "#dst2" \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
233 #define YSCALEYUV2PACKEDX \
234 YSCALEYUV2PACKEDX_UV \
235 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
236
237 #define YSCALEYUV2PACKEDX_END \
238 :: "r" (&c->redDither), \
239 "m" (dummy), "m" (dummy), "m" (dummy),\
240 "r" (dest), "m" (dstW) \
241 : "%"REG_a, "%"REG_d, "%"REG_S \
242 );
243
244 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
245 __asm__ volatile(\
246 "xor %%"REG_a", %%"REG_a" \n\t"\
247 ASMALIGN(4)\
248 "nop \n\t"\
249 "1: \n\t"\
250 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
251 "mov (%%"REG_d"), %%"REG_S" \n\t"\
252 "pxor %%mm4, %%mm4 \n\t"\
253 "pxor %%mm5, %%mm5 \n\t"\
254 "pxor %%mm6, %%mm6 \n\t"\
255 "pxor %%mm7, %%mm7 \n\t"\
256 ASMALIGN(4)\
257 "2: \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
259 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
260 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
261 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
262 "movq %%mm0, %%mm3 \n\t"\
263 "punpcklwd %%mm1, %%mm0 \n\t"\
264 "punpckhwd %%mm1, %%mm3 \n\t"\
265 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
266 "pmaddwd %%mm1, %%mm0 \n\t"\
267 "pmaddwd %%mm1, %%mm3 \n\t"\
268 "paddd %%mm0, %%mm4 \n\t"\
269 "paddd %%mm3, %%mm5 \n\t"\
270 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
271 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
272 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
273 "test %%"REG_S", %%"REG_S" \n\t"\
274 "movq %%mm2, %%mm0 \n\t"\
275 "punpcklwd %%mm3, %%mm2 \n\t"\
276 "punpckhwd %%mm3, %%mm0 \n\t"\
277 "pmaddwd %%mm1, %%mm2 \n\t"\
278 "pmaddwd %%mm1, %%mm0 \n\t"\
279 "paddd %%mm2, %%mm6 \n\t"\
280 "paddd %%mm0, %%mm7 \n\t"\
281 " jnz 2b \n\t"\
282 "psrad $16, %%mm4 \n\t"\
283 "psrad $16, %%mm5 \n\t"\
284 "psrad $16, %%mm6 \n\t"\
285 "psrad $16, %%mm7 \n\t"\
286 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
287 "packssdw %%mm5, %%mm4 \n\t"\
288 "packssdw %%mm7, %%mm6 \n\t"\
289 "paddw %%mm0, %%mm4 \n\t"\
290 "paddw %%mm0, %%mm6 \n\t"\
291 "movq %%mm4, "U_TEMP"(%0) \n\t"\
292 "movq %%mm6, "V_TEMP"(%0) \n\t"\
293
294 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
295 "lea "offset"(%0), %%"REG_d" \n\t"\
296 "mov (%%"REG_d"), %%"REG_S" \n\t"\
297 "pxor %%mm1, %%mm1 \n\t"\
298 "pxor %%mm5, %%mm5 \n\t"\
299 "pxor %%mm7, %%mm7 \n\t"\
300 "pxor %%mm6, %%mm6 \n\t"\
301 ASMALIGN(4)\
302 "2: \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
304 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
305 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
306 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
307 "movq %%mm0, %%mm3 \n\t"\
308 "punpcklwd %%mm4, %%mm0 \n\t"\
309 "punpckhwd %%mm4, %%mm3 \n\t"\
310 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
311 "pmaddwd %%mm4, %%mm0 \n\t"\
312 "pmaddwd %%mm4, %%mm3 \n\t"\
313 "paddd %%mm0, %%mm1 \n\t"\
314 "paddd %%mm3, %%mm5 \n\t"\
315 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
316 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
317 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
318 "test %%"REG_S", %%"REG_S" \n\t"\
319 "movq %%mm2, %%mm0 \n\t"\
320 "punpcklwd %%mm3, %%mm2 \n\t"\
321 "punpckhwd %%mm3, %%mm0 \n\t"\
322 "pmaddwd %%mm4, %%mm2 \n\t"\
323 "pmaddwd %%mm4, %%mm0 \n\t"\
324 "paddd %%mm2, %%mm7 \n\t"\
325 "paddd %%mm0, %%mm6 \n\t"\
326 " jnz 2b \n\t"\
327 "psrad $16, %%mm1 \n\t"\
328 "psrad $16, %%mm5 \n\t"\
329 "psrad $16, %%mm7 \n\t"\
330 "psrad $16, %%mm6 \n\t"\
331 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
332 "packssdw %%mm5, %%mm1 \n\t"\
333 "packssdw %%mm6, %%mm7 \n\t"\
334 "paddw %%mm0, %%mm1 \n\t"\
335 "paddw %%mm0, %%mm7 \n\t"\
336 "movq "U_TEMP"(%0), %%mm3 \n\t"\
337 "movq "V_TEMP"(%0), %%mm4 \n\t"\
338
339 #define YSCALEYUV2PACKEDX_ACCURATE \
340 YSCALEYUV2PACKEDX_ACCURATE_UV \
341 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
342
343 #define YSCALEYUV2RGBX \
344 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
345 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
346 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
347 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
348 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
349 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
350 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
351 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
352 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
353 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
354 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
355 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
356 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
357 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
358 "paddw %%mm3, %%mm4 \n\t"\
359 "movq %%mm2, %%mm0 \n\t"\
360 "movq %%mm5, %%mm6 \n\t"\
361 "movq %%mm4, %%mm3 \n\t"\
362 "punpcklwd %%mm2, %%mm2 \n\t"\
363 "punpcklwd %%mm5, %%mm5 \n\t"\
364 "punpcklwd %%mm4, %%mm4 \n\t"\
365 "paddw %%mm1, %%mm2 \n\t"\
366 "paddw %%mm1, %%mm5 \n\t"\
367 "paddw %%mm1, %%mm4 \n\t"\
368 "punpckhwd %%mm0, %%mm0 \n\t"\
369 "punpckhwd %%mm6, %%mm6 \n\t"\
370 "punpckhwd %%mm3, %%mm3 \n\t"\
371 "paddw %%mm7, %%mm0 \n\t"\
372 "paddw %%mm7, %%mm6 \n\t"\
373 "paddw %%mm7, %%mm3 \n\t"\
374 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
375 "packuswb %%mm0, %%mm2 \n\t"\
376 "packuswb %%mm6, %%mm5 \n\t"\
377 "packuswb %%mm3, %%mm4 \n\t"\
378
379 #define REAL_YSCALEYUV2PACKED(index, c) \
380 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
381 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
382 "psraw $3, %%mm0 \n\t"\
383 "psraw $3, %%mm1 \n\t"\
384 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
385 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
386 "xor "#index", "#index" \n\t"\
387 ASMALIGN(4)\
388 "1: \n\t"\
389 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
390 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
391 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
392 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
393 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
394 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
396 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
397 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
399 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
400 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
401 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
402 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
403 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
404 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
405 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
406 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
407 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
408 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
409 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
410 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
411 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
412 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
413 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
414
415 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
416
417 #define REAL_YSCALEYUV2RGB_UV(index, c) \
418 "xor "#index", "#index" \n\t"\
419 ASMALIGN(4)\
420 "1: \n\t"\
421 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
422 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
423 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
424 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
425 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
426 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
427 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
428 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
429 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
430 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
431 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
432 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
433 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
434 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
435 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
436 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
437 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
438 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
439 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
440 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
441
442 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
443 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
444 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
445 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
446 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
447 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
448 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
449 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
455
456 #define REAL_YSCALEYUV2RGB_COEFF(c) \
457 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
458 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
459 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
460 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
461 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
462 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
463 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
464 "paddw %%mm3, %%mm4 \n\t"\
465 "movq %%mm2, %%mm0 \n\t"\
466 "movq %%mm5, %%mm6 \n\t"\
467 "movq %%mm4, %%mm3 \n\t"\
468 "punpcklwd %%mm2, %%mm2 \n\t"\
469 "punpcklwd %%mm5, %%mm5 \n\t"\
470 "punpcklwd %%mm4, %%mm4 \n\t"\
471 "paddw %%mm1, %%mm2 \n\t"\
472 "paddw %%mm1, %%mm5 \n\t"\
473 "paddw %%mm1, %%mm4 \n\t"\
474 "punpckhwd %%mm0, %%mm0 \n\t"\
475 "punpckhwd %%mm6, %%mm6 \n\t"\
476 "punpckhwd %%mm3, %%mm3 \n\t"\
477 "paddw %%mm7, %%mm0 \n\t"\
478 "paddw %%mm7, %%mm6 \n\t"\
479 "paddw %%mm7, %%mm3 \n\t"\
480 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
481 "packuswb %%mm0, %%mm2 \n\t"\
482 "packuswb %%mm6, %%mm5 \n\t"\
483 "packuswb %%mm3, %%mm4 \n\t"\
484
485 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
486
487 #define YSCALEYUV2RGB(index, c) \
488 REAL_YSCALEYUV2RGB_UV(index, c) \
489 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
490 REAL_YSCALEYUV2RGB_COEFF(c)
491
492 #define REAL_YSCALEYUV2PACKED1(index, c) \
493 "xor "#index", "#index" \n\t"\
494 ASMALIGN(4)\
495 "1: \n\t"\
496 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
497 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
498 "psraw $7, %%mm3 \n\t" \
499 "psraw $7, %%mm4 \n\t" \
500 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
501 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
502 "psraw $7, %%mm1 \n\t" \
503 "psraw $7, %%mm7 \n\t" \
504
505 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
506
507 #define REAL_YSCALEYUV2RGB1(index, c) \
508 "xor "#index", "#index" \n\t"\
509 ASMALIGN(4)\
510 "1: \n\t"\
511 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
512 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
513 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
516 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
517 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
518 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
519 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
520 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
521 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
523 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
524 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
527 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
528 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
529 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
530 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
531 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
532 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533 "paddw %%mm3, %%mm4 \n\t"\
534 "movq %%mm2, %%mm0 \n\t"\
535 "movq %%mm5, %%mm6 \n\t"\
536 "movq %%mm4, %%mm3 \n\t"\
537 "punpcklwd %%mm2, %%mm2 \n\t"\
538 "punpcklwd %%mm5, %%mm5 \n\t"\
539 "punpcklwd %%mm4, %%mm4 \n\t"\
540 "paddw %%mm1, %%mm2 \n\t"\
541 "paddw %%mm1, %%mm5 \n\t"\
542 "paddw %%mm1, %%mm4 \n\t"\
543 "punpckhwd %%mm0, %%mm0 \n\t"\
544 "punpckhwd %%mm6, %%mm6 \n\t"\
545 "punpckhwd %%mm3, %%mm3 \n\t"\
546 "paddw %%mm7, %%mm0 \n\t"\
547 "paddw %%mm7, %%mm6 \n\t"\
548 "paddw %%mm7, %%mm3 \n\t"\
549 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550 "packuswb %%mm0, %%mm2 \n\t"\
551 "packuswb %%mm6, %%mm5 \n\t"\
552 "packuswb %%mm3, %%mm4 \n\t"\
553
554 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
555
556 #define REAL_YSCALEYUV2PACKED1b(index, c) \
557 "xor "#index", "#index" \n\t"\
558 ASMALIGN(4)\
559 "1: \n\t"\
560 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
561 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
562 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
563 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
564 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
565 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
566 "psrlw $8, %%mm3 \n\t" \
567 "psrlw $8, %%mm4 \n\t" \
568 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
569 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
570 "psraw $7, %%mm1 \n\t" \
571 "psraw $7, %%mm7 \n\t"
572 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
573
574 // do vertical chrominance interpolation
575 #define REAL_YSCALEYUV2RGB1b(index, c) \
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
586 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
587 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
588 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
589 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
590 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
591 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
592 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
593 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
594 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
595 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
596 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
597 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
598 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
599 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
600 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
601 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
602 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
603 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
604 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
605 "paddw %%mm3, %%mm4 \n\t"\
606 "movq %%mm2, %%mm0 \n\t"\
607 "movq %%mm5, %%mm6 \n\t"\
608 "movq %%mm4, %%mm3 \n\t"\
609 "punpcklwd %%mm2, %%mm2 \n\t"\
610 "punpcklwd %%mm5, %%mm5 \n\t"\
611 "punpcklwd %%mm4, %%mm4 \n\t"\
612 "paddw %%mm1, %%mm2 \n\t"\
613 "paddw %%mm1, %%mm5 \n\t"\
614 "paddw %%mm1, %%mm4 \n\t"\
615 "punpckhwd %%mm0, %%mm0 \n\t"\
616 "punpckhwd %%mm6, %%mm6 \n\t"\
617 "punpckhwd %%mm3, %%mm3 \n\t"\
618 "paddw %%mm7, %%mm0 \n\t"\
619 "paddw %%mm7, %%mm6 \n\t"\
620 "paddw %%mm7, %%mm3 \n\t"\
621 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
622 "packuswb %%mm0, %%mm2 \n\t"\
623 "packuswb %%mm6, %%mm5 \n\t"\
624 "packuswb %%mm3, %%mm4 \n\t"\
625
626 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
627
628 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
629 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
630 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
631 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
632 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
633 "packuswb %%mm1, %%mm7 \n\t"
634 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
635
636 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
637 "movq "#b", "#q2" \n\t" /* B */\
638 "movq "#r", "#t" \n\t" /* R */\
639 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
640 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
641 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
642 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
643 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
644 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
645 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
646 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
647 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
648 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
649 \
650 MOVNTQ( q0, (dst, index, 4))\
651 MOVNTQ( b, 8(dst, index, 4))\
652 MOVNTQ( q2, 16(dst, index, 4))\
653 MOVNTQ( q3, 24(dst, index, 4))\
654 \
655 "add $8, "#index" \n\t"\
656 "cmp "#dstw", "#index" \n\t"\
657 " jb 1b \n\t"
658 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
659
660 #define REAL_WRITERGB16(dst, dstw, index) \
661 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
662 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
663 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
664 "psrlq $3, %%mm2 \n\t"\
665 \
666 "movq %%mm2, %%mm1 \n\t"\
667 "movq %%mm4, %%mm3 \n\t"\
668 \
669 "punpcklbw %%mm7, %%mm3 \n\t"\
670 "punpcklbw %%mm5, %%mm2 \n\t"\
671 "punpckhbw %%mm7, %%mm4 \n\t"\
672 "punpckhbw %%mm5, %%mm1 \n\t"\
673 \
674 "psllq $3, %%mm3 \n\t"\
675 "psllq $3, %%mm4 \n\t"\
676 \
677 "por %%mm3, %%mm2 \n\t"\
678 "por %%mm4, %%mm1 \n\t"\
679 \
680 MOVNTQ(%%mm2, (dst, index, 2))\
681 MOVNTQ(%%mm1, 8(dst, index, 2))\
682 \
683 "add $8, "#index" \n\t"\
684 "cmp "#dstw", "#index" \n\t"\
685 " jb 1b \n\t"
686 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
687
688 #define REAL_WRITERGB15(dst, dstw, index) \
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
690 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
692 "psrlq $3, %%mm2 \n\t"\
693 "psrlq $1, %%mm5 \n\t"\
694 \
695 "movq %%mm2, %%mm1 \n\t"\
696 "movq %%mm4, %%mm3 \n\t"\
697 \
698 "punpcklbw %%mm7, %%mm3 \n\t"\
699 "punpcklbw %%mm5, %%mm2 \n\t"\
700 "punpckhbw %%mm7, %%mm4 \n\t"\
701 "punpckhbw %%mm5, %%mm1 \n\t"\
702 \
703 "psllq $2, %%mm3 \n\t"\
704 "psllq $2, %%mm4 \n\t"\
705 \
706 "por %%mm3, %%mm2 \n\t"\
707 "por %%mm4, %%mm1 \n\t"\
708 \
709 MOVNTQ(%%mm2, (dst, index, 2))\
710 MOVNTQ(%%mm1, 8(dst, index, 2))\
711 \
712 "add $8, "#index" \n\t"\
713 "cmp "#dstw", "#index" \n\t"\
714 " jb 1b \n\t"
715 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
716
717 #define WRITEBGR24OLD(dst, dstw, index) \
718 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
719 "movq %%mm2, %%mm1 \n\t" /* B */\
720 "movq %%mm5, %%mm6 \n\t" /* R */\
721 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
722 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
723 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
724 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
725 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
726 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
727 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
728 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
729 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
730 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
731 \
732 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
733 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
734 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
735 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
736 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
737 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
738 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
739 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
740 \
741 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
742 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
743 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
744 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
745 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
746 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
747 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
748 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
749 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
750 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
751 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
752 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
753 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
754 \
755 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
756 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
757 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
758 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
759 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
760 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
761 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
762 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
763 \
764 MOVNTQ(%%mm0, (dst))\
765 MOVNTQ(%%mm2, 8(dst))\
766 MOVNTQ(%%mm3, 16(dst))\
767 "add $24, "#dst" \n\t"\
768 \
769 "add $8, "#index" \n\t"\
770 "cmp "#dstw", "#index" \n\t"\
771 " jb 1b \n\t"
772
/*
 * Pack the pixel bytes held in %%mm2 (B), %%mm4 (G) and %%mm5 (R), with
 * %%mm7 zeroed, into 24bpp BGR and emit three 8-byte MOVNTQ stores at
 * dst, dst+8 and dst+16.  Advances dst by 24 and index by 8, looping
 * back to label "1" while index < dstw.  Plain-MMX variant built from
 * punpck/shift/por sequences; WRITEBGR24MMX2 below is the pshufw-based
 * MMX2 equivalent.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
825
/*
 * MMX2 variant of the 24bpp BGR writer: same inputs (%%mm2=B, %%mm4=G,
 * %%mm5=R, %%mm7=0) and same three 8-byte MOVNTQ stores per iteration,
 * but uses pshufw plus the ff_M24A/ff_M24B/ff_M24C byte masks to build
 * the interleaved output.  Clobbers %%mm0 and %%mm7 with mask constants.
 * Advances dst by 24 and index by 8, looping to label "1" while
 * index < dstw.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */\
    "pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4     G3 B3    */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst"    \n\t"\
\
    "add $8, "#index"   \n\t"\
    "cmp "#dstw", "#index"  \n\t"\
    " jb 1b             \n\t"
873
/* Pick the 24bpp writer matching the compile target: the pshufw-based
 * MMX2 variant when this template is compiled for MMX2, the plain-MMX
 * fallback otherwise. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
881
/*
 * Interleave the packed chroma in %%mm3/%%mm4 with the luma in
 * %%mm1/%%mm7 into YUY2 order and emit two 8-byte MOVNTQ stores
 * (16 output bytes, i.e. 8 pixels) at dst + index*2 per iteration.
 * Advances index by 8 and loops to label "1" while index < dstw.
 * The REAL_/plain macro pair exists so that macro arguments are
 * expanded before token pasting.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
898
899
/**
 * Vertically scale/filter multiple source lines into planar YV12 output
 * (luma, U, V and optionally an alpha plane).
 * Uses the MMX path when compiled for MMX and SWS_BITEXACT is not set
 * (with an accurate-rounding variant under SWS_ACCURATE_RND), the
 * AltiVec path when compiled for AltiVec, and the plain C fallback
 * otherwise.  NOTE: the AltiVec call does not receive alpSrc/aDest,
 * so that path does not write the alpha plane.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //COMPILE_TEMPLATE_ALTIVEC
}
940
/**
 * Vertically scale/filter into an interleaved-chroma (NV12-family)
 * destination.  No SIMD version exists here; this simply forwards to
 * the C implementation, with dstFormat passed through unchanged.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
949
/**
 * Write single (unfiltered) source lines to planar YV12 output, i.e.
 * round each 16-bit intermediate down by 7 bits and clip to 8 bits.
 * The MMX path loops over up to four planes (alpha, luma, U, V) with
 * one asm invocation each; the C fallback does the same per-plane
 * arithmetic explicitly.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Plane order: alpha, luma, U, V; V chroma lives at offset VOFW
         * within the shared chroma buffer. */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {   /* skip planes the caller did not request */
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set means out of [0,255] for the limited range the
         * >>7 produces here; sign then picks the clip direction */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* combined range check; clip each component separately */
            if ((u|v)&256) {
                if      (u<0)   u=0;
                else if (u>255) u=255;
                if      (v<0)   v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1020
1021
1022 /**
1023 * vertical scale YV12 to RGB
1024 */
1025 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1026 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1027 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1028 {
1029 #if COMPILE_TEMPLATE_MMX
1030 x86_reg dummy=0;
1031 if(!(c->flags & SWS_BITEXACT)) {
1032 if (c->flags & SWS_ACCURATE_RND) {
1033 switch(c->dstFormat) {
1034 case PIX_FMT_RGB32:
1035 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1036 YSCALEYUV2PACKEDX_ACCURATE
1037 YSCALEYUV2RGBX
1038 "movq %%mm2, "U_TEMP"(%0) \n\t"
1039 "movq %%mm4, "V_TEMP"(%0) \n\t"
1040 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1041 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1042 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1043 "psraw $3, %%mm1 \n\t"
1044 "psraw $3, %%mm7 \n\t"
1045 "packuswb %%mm7, %%mm1 \n\t"
1046 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1047
1048 YSCALEYUV2PACKEDX_END
1049 } else {
1050 YSCALEYUV2PACKEDX_ACCURATE
1051 YSCALEYUV2RGBX
1052 "pcmpeqd %%mm7, %%mm7 \n\t"
1053 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1054
1055 YSCALEYUV2PACKEDX_END
1056 }
1057 return;
1058 case PIX_FMT_BGR24:
1059 YSCALEYUV2PACKEDX_ACCURATE
1060 YSCALEYUV2RGBX
1061 "pxor %%mm7, %%mm7 \n\t"
1062 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1063 "add %4, %%"REG_c" \n\t"
1064 WRITEBGR24(%%REGc, %5, %%REGa)
1065
1066
1067 :: "r" (&c->redDither),
1068 "m" (dummy), "m" (dummy), "m" (dummy),
1069 "r" (dest), "m" (dstW)
1070 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1071 );
1072 return;
1073 case PIX_FMT_RGB555:
1074 YSCALEYUV2PACKEDX_ACCURATE
1075 YSCALEYUV2RGBX
1076 "pxor %%mm7, %%mm7 \n\t"
1077 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078 #ifdef DITHER1XBPP
1079 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1080 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1081 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1082 #endif
1083
1084 WRITERGB15(%4, %5, %%REGa)
1085 YSCALEYUV2PACKEDX_END
1086 return;
1087 case PIX_FMT_RGB565:
1088 YSCALEYUV2PACKEDX_ACCURATE
1089 YSCALEYUV2RGBX
1090 "pxor %%mm7, %%mm7 \n\t"
1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1092 #ifdef DITHER1XBPP
1093 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1094 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1095 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1096 #endif
1097
1098 WRITERGB16(%4, %5, %%REGa)
1099 YSCALEYUV2PACKEDX_END
1100 return;
1101 case PIX_FMT_YUYV422:
1102 YSCALEYUV2PACKEDX_ACCURATE
1103 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1104
1105 "psraw $3, %%mm3 \n\t"
1106 "psraw $3, %%mm4 \n\t"
1107 "psraw $3, %%mm1 \n\t"
1108 "psraw $3, %%mm7 \n\t"
1109 WRITEYUY2(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1111 return;
1112 }
1113 } else {
1114 switch(c->dstFormat) {
1115 case PIX_FMT_RGB32:
1116 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1117 YSCALEYUV2PACKEDX
1118 YSCALEYUV2RGBX
1119 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1120 "psraw $3, %%mm1 \n\t"
1121 "psraw $3, %%mm7 \n\t"
1122 "packuswb %%mm7, %%mm1 \n\t"
1123 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1124 YSCALEYUV2PACKEDX_END
1125 } else {
1126 YSCALEYUV2PACKEDX
1127 YSCALEYUV2RGBX
1128 "pcmpeqd %%mm7, %%mm7 \n\t"
1129 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1130 YSCALEYUV2PACKEDX_END
1131 }
1132 return;
1133 case PIX_FMT_BGR24:
1134 YSCALEYUV2PACKEDX
1135 YSCALEYUV2RGBX
1136 "pxor %%mm7, %%mm7 \n\t"
1137 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1138 "add %4, %%"REG_c" \n\t"
1139 WRITEBGR24(%%REGc, %5, %%REGa)
1140
1141 :: "r" (&c->redDither),
1142 "m" (dummy), "m" (dummy), "m" (dummy),
1143 "r" (dest), "m" (dstW)
1144 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145 );
1146 return;
1147 case PIX_FMT_RGB555:
1148 YSCALEYUV2PACKEDX
1149 YSCALEYUV2RGBX
1150 "pxor %%mm7, %%mm7 \n\t"
1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1152 #ifdef DITHER1XBPP
1153 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1154 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1155 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1156 #endif
1157
1158 WRITERGB15(%4, %5, %%REGa)
1159 YSCALEYUV2PACKEDX_END
1160 return;
1161 case PIX_FMT_RGB565:
1162 YSCALEYUV2PACKEDX
1163 YSCALEYUV2RGBX
1164 "pxor %%mm7, %%mm7 \n\t"
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1166 #ifdef DITHER1XBPP
1167 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1168 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1169 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1170 #endif
1171
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1174 return;
1175 case PIX_FMT_YUYV422:
1176 YSCALEYUV2PACKEDX
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1178
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1185 return;
1186 }
1187 }
1188 }
1189 #endif /* COMPILE_TEMPLATE_MMX */
1190 #if COMPILE_TEMPLATE_ALTIVEC
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of ff_yuv2packedX_altivec() */
1193 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1194 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1195 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1196 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1197 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1198 chrFilter, chrSrc, chrFilterSize,
1199 dest, dstW, dstY);
1200 else
1201 #endif
1202 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1203 chrFilter, chrSrc, chrFilterSize,
1204 alpSrc, dest, dstW, dstY);
1205 }
1206
1207 /**
1208 * vertical bilinear scale YV12 to RGB
1209 */
1210 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1211 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1212 {
1213 int yalpha1=4095- yalpha;
1214 int uvalpha1=4095-uvalpha;
1215 int i;
1216
1217 #if COMPILE_TEMPLATE_MMX
1218 if(!(c->flags & SWS_BITEXACT)) {
1219 switch(c->dstFormat) {
1220 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1221 case PIX_FMT_RGB32:
1222 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1223 #if ARCH_X86_64
1224 __asm__ volatile(
1225 YSCALEYUV2RGB(%%r8, %5)
1226 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1227 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1228 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1229 "packuswb %%mm7, %%mm1 \n\t"
1230 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1231
1232 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1233 "a" (&c->redDither)
1234 ,"r" (abuf0), "r" (abuf1)
1235 : "%r8"
1236 );
1237 #else
1238 *(const uint16_t **)(&c->u_temp)=abuf0;
1239 *(const uint16_t **)(&c->v_temp)=abuf1;
1240 __asm__ volatile(
1241 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1242 "mov %4, %%"REG_b" \n\t"
1243 "push %%"REG_BP" \n\t"
1244 YSCALEYUV2RGB(%%REGBP, %5)
1245 "push %0 \n\t"
1246 "push %1 \n\t"
1247 "mov "U_TEMP"(%5), %0 \n\t"
1248 "mov "V_TEMP"(%5), %1 \n\t"
1249 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1250 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1251 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252 "packuswb %%mm7, %%mm1 \n\t"
1253 "pop %1 \n\t"
1254 "pop %0 \n\t"
1255 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1256 "pop %%"REG_BP" \n\t"
1257 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1258
1259 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1260 "a" (&c->redDither)
1261 );
1262 #endif
1263 } else {
1264 __asm__ volatile(
1265 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1266 "mov %4, %%"REG_b" \n\t"
1267 "push %%"REG_BP" \n\t"
1268 YSCALEYUV2RGB(%%REGBP, %5)
1269 "pcmpeqd %%mm7, %%mm7 \n\t"
1270 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1271 "pop %%"REG_BP" \n\t"
1272 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1273
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 "a" (&c->redDither)
1276 );
1277 }
1278 return;
1279 case PIX_FMT_BGR24:
1280 __asm__ volatile(
1281 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1282 "mov %4, %%"REG_b" \n\t"
1283 "push %%"REG_BP" \n\t"
1284 YSCALEYUV2RGB(%%REGBP, %5)
1285 "pxor %%mm7, %%mm7 \n\t"
1286 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1287 "pop %%"REG_BP" \n\t"
1288 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1289 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1290 "a" (&c->redDither)
1291 );
1292 return;
1293 case PIX_FMT_RGB555:
1294 __asm__ volatile(
1295 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1296 "mov %4, %%"REG_b" \n\t"
1297 "push %%"REG_BP" \n\t"
1298 YSCALEYUV2RGB(%%REGBP, %5)
1299 "pxor %%mm7, %%mm7 \n\t"
1300 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1301 #ifdef DITHER1XBPP
1302 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1303 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1304 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1305 #endif
1306
1307 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1308 "pop %%"REG_BP" \n\t"
1309 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1310
1311 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1312 "a" (&c->redDither)
1313 );
1314 return;
1315 case PIX_FMT_RGB565:
1316 __asm__ volatile(
1317 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1318 "mov %4, %%"REG_b" \n\t"
1319 "push %%"REG_BP" \n\t"
1320 YSCALEYUV2RGB(%%REGBP, %5)
1321 "pxor %%mm7, %%mm7 \n\t"
1322 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1323 #ifdef DITHER1XBPP
1324 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1325 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1326 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1327 #endif
1328
1329 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1330 "pop %%"REG_BP" \n\t"
1331 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1332 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1333 "a" (&c->redDither)
1334 );
1335 return;
1336 case PIX_FMT_YUYV422:
1337 __asm__ volatile(
1338 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1339 "mov %4, %%"REG_b" \n\t"
1340 "push %%"REG_BP" \n\t"
1341 YSCALEYUV2PACKED(%%REGBP, %5)
1342 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1343 "pop %%"REG_BP" \n\t"
1344 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1345 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1346 "a" (&c->redDither)
1347 );
1348 return;
1349 default: break;
1350 }
1351 }
1352 #endif //COMPILE_TEMPLATE_MMX
1353 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1354 }
1355
1356 /**
1357 * YV12 to RGB without scaling or interpolating
1358 */
1359 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1360 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1361 {
1362 const int yalpha1=0;
1363 int i;
1364
1365 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1366 const int yalpha= 4096; //FIXME ...
1367
1368 if (flags&SWS_FULL_CHR_H_INT) {
1369 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1370 return;
1371 }
1372
1373 #if COMPILE_TEMPLATE_MMX
1374 if(!(flags & SWS_BITEXACT)) {
1375 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1376 switch(dstFormat) {
1377 case PIX_FMT_RGB32:
1378 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1379 __asm__ volatile(
1380 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1381 "mov %4, %%"REG_b" \n\t"
1382 "push %%"REG_BP" \n\t"
1383 YSCALEYUV2RGB1(%%REGBP, %5)
1384 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1385 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1386 "pop %%"REG_BP" \n\t"
1387 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1388
1389 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1390 "a" (&c->redDither)
1391 );
1392 } else {
1393 __asm__ volatile(
1394 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1395 "mov %4, %%"REG_b" \n\t"
1396 "push %%"REG_BP" \n\t"
1397 YSCALEYUV2RGB1(%%REGBP, %5)
1398 "pcmpeqd %%mm7, %%mm7 \n\t"
1399 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1400 "pop %%"REG_BP" \n\t"
1401 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1402
1403 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1404 "a" (&c->redDither)
1405 );
1406 }
1407 return;
1408 case PIX_FMT_BGR24:
1409 __asm__ volatile(
1410 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1411 "mov %4, %%"REG_b" \n\t"
1412 "push %%"REG_BP" \n\t"
1413 YSCALEYUV2RGB1(%%REGBP, %5)
1414 "pxor %%mm7, %%mm7 \n\t"
1415 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1416 "pop %%"REG_BP" \n\t"
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1418
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1420 "a" (&c->redDither)
1421 );
1422 return;
1423 case PIX_FMT_RGB555:
1424 __asm__ volatile(
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1426 "mov %4, %%"REG_b" \n\t"
1427 "push %%"REG_BP" \n\t"
1428 YSCALEYUV2RGB1(%%REGBP, %5)
1429 "pxor %%mm7, %%mm7 \n\t"
1430 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1431 #ifdef DITHER1XBPP
1432 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1433 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1434 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1435 #endif
1436 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1439
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441 "a" (&c->redDither)
1442 );
1443 return;
1444 case PIX_FMT_RGB565:
1445 __asm__ volatile(
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB1(%%REGBP, %5)
1450 "pxor %%mm7, %%mm7 \n\t"
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1452 #ifdef DITHER1XBPP
1453 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1454 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1455 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1456 #endif
1457
1458 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1459 "pop %%"REG_BP" \n\t"
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1461
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463 "a" (&c->redDither)
1464 );
1465 return;
1466 case PIX_FMT_YUYV422:
1467 __asm__ volatile(
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_b" \n\t"
1470 "push %%"REG_BP" \n\t"
1471 YSCALEYUV2PACKED1(%%REGBP, %5)
1472 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1473 "pop %%"REG_BP" \n\t"
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1475
1476 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1477 "a" (&c->redDither)
1478 );
1479 return;
1480 }
1481 } else {
1482 switch(dstFormat) {
1483 case PIX_FMT_RGB32:
1484 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1485 __asm__ volatile(
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB1b(%%REGBP, %5)
1490 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1491 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494
1495 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1496 "a" (&c->redDither)
1497 );
1498 } else {
1499 __asm__ volatile(
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2RGB1b(%%REGBP, %5)
1504 "pcmpeqd %%mm7, %%mm7 \n\t"
1505 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506 "pop %%"REG_BP" \n\t"
1507 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1508
1509 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510 "a" (&c->redDither)
1511 );
1512 }
1513 return;
1514 case PIX_FMT_BGR24:
1515 __asm__ volatile(
1516 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1517 "mov %4, %%"REG_b" \n\t"
1518 "push %%"REG_BP" \n\t"
1519 YSCALEYUV2RGB1b(%%REGBP, %5)
1520 "pxor %%mm7, %%mm7 \n\t"
1521 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1522 "pop %%"REG_BP" \n\t"
1523 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1524
1525 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1526 "a" (&c->redDither)
1527 );
1528 return;
1529 case PIX_FMT_RGB555:
1530 __asm__ volatile(
1531 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1532 "mov %4, %%"REG_b" \n\t"
1533 "push %%"REG_BP" \n\t"
1534 YSCALEYUV2RGB1b(%%REGBP, %5)
1535 "pxor %%mm7, %%mm7 \n\t"
1536 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1537 #ifdef DITHER1XBPP
1538 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1539 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1540 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1541 #endif
1542 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1543 "pop %%"REG_BP" \n\t"
1544 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1545
1546 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1547 "a" (&c->redDither)
1548 );
1549 return;
1550 case PIX_FMT_RGB565:
1551 __asm__ volatile(
1552 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1553 "mov %4, %%"REG_b" \n\t"
1554 "push %%"REG_BP" \n\t"
1555 YSCALEYUV2RGB1b(%%REGBP, %5)
1556 "pxor %%mm7, %%mm7 \n\t"
1557 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1558 #ifdef DITHER1XBPP
1559 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1560 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1561 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1562 #endif
1563
1564 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1565 "pop %%"REG_BP" \n\t"
1566 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1567
1568 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1569 "a" (&c->redDither)
1570 );
1571 return;
1572 case PIX_FMT_YUYV422:
1573 __asm__ volatile(
1574 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1575 "mov %4, %%"REG_b" \n\t"
1576 "push %%"REG_BP" \n\t"
1577 YSCALEYUV2PACKED1b(%%REGBP, %5)
1578 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1579 "pop %%"REG_BP" \n\t"
1580 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1581
1582 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583 "a" (&c->redDither)
1584 );
1585 return;
1586 }
1587 }
1588 }
1589 #endif /* COMPILE_TEMPLATE_MMX */
1590 if (uvalpha < 2048) {
1591 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1592 } else {
1593 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1594 }
1595 }
1596
1597 //FIXME yuy2* can read up to 7 samples too much
1598
/**
 * Extract the luma plane from YUY2 input: dst[i] = src[2*i].
 * The MMX path masks out the even bytes with bm01010101 and packs
 * 16 source bytes down to 8 luma bytes per iteration; the loop runs a
 * negative index up to zero so it may read/write a few bytes past
 * width when width is not a multiple of 8 (see the over-read FIXME
 * above these functions).
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1623
/**
 * Extract the chroma planes from YUY2 input:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].
 * Only a single source line is supported (asserted: src1 == src2).
 * The MMX path drops the luma bytes via psrlw/packuswb, then splits
 * the remaining UVUV stream into U (masked) and V (shifted), writing
 * 4 bytes to each plane per iteration.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1657
/**
 * Take the second (high, for little-endian 16-bit samples) byte of
 * each 16-bit element of two separate source lines:
 * dstU[i] = src1[2*i+1], dstV[i] = src2[2*i+1].
 * The MMX path shifts out the low bytes with psrlw and packs 8 output
 * bytes per plane per iteration.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
1689
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y|UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma plane from UYVY input: dst[i] = src[2*i+1].
 * MMX path: psrlw drops the chroma bytes, packuswb compresses 16
 * source bytes into 8 luma bytes per iteration.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t"
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1715
/**
 * Extract the chroma planes from UYVY input:
 * dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2].
 * Only a single source line is supported (asserted: src1 == src2).
 * Same structure as yuy2ToUV, but the chroma bytes sit in the even
 * positions, so the first step masks with bm01010101 instead of
 * shifting.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1749
/**
 * Take the first (low-address) byte of each 16-bit element of two
 * separate source lines: dstU[i] = src1[2*i], dstV[i] = src2[2*i].
 * Counterpart to LEToUV above, masking the even bytes with bm01010101
 * instead of shifting.
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
1782
/**
 * Deinterleave a byte-interleaved chroma line:
 * dst1[i] = src[2*i], dst2[i] = src[2*i+1].
 * Shared worker for the NV12/NV21 readers below, which only differ in
 * which destination receives the even/odd bytes.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
1816
/* NV12: interleaved chroma bytes are U,V -- even bytes go to dstU,
 * odd bytes to dstV.  src2 and unused are ignored. */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1823
/* NV21: interleaved chroma bytes are V,U -- same worker as NV12 with
 * the destination planes swapped.  src2 and unused are ignored. */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1830
1831 #if COMPILE_TEMPLATE_MMX
/**
 * Convert a packed 24bpp BGR or RGB line to luma using MMX.
 * First loads the per-format coefficient pairs into %%mm5/%%mm6
 * (BGR24 vs RGB24 constants), then per iteration unpacks four pixels
 * (12 source bytes), multiply-accumulates with pmaddwd, adds the
 * rounding offset ff_bgr24toYOffset, shifts down by 15 and stores
 * 4 luma bytes.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"  \n\t"
        "pxor                    %%mm7, %%mm7      \n\t"
        "1:                                        \n\t"
        PREFETCH"               64(%0)             \n\t"
        "movd                     (%0), %%mm0      \n\t"
        "movd                    2(%0), %%mm1      \n\t"
        "movd                    6(%0), %%mm2      \n\t"
        "movd                    8(%0), %%mm3      \n\t"
        "add                       $12, %0         \n\t"
        "punpcklbw               %%mm7, %%mm0      \n\t"
        "punpcklbw               %%mm7, %%mm1      \n\t"
        "punpcklbw               %%mm7, %%mm2      \n\t"
        "punpcklbw               %%mm7, %%mm3      \n\t"
        "pmaddwd                 %%mm5, %%mm0      \n\t"
        "pmaddwd                 %%mm6, %%mm1      \n\t"
        "pmaddwd                 %%mm5, %%mm2      \n\t"
        "pmaddwd                 %%mm6, %%mm3      \n\t"
        "paddd                   %%mm1, %%mm0      \n\t"
        "paddd                   %%mm3, %%mm2      \n\t"
        "paddd                   %%mm4, %%mm0      \n\t"
        "paddd                   %%mm4, %%mm2      \n\t"
        "psrad                     $15, %%mm0      \n\t"
        "psrad                     $15, %%mm2      \n\t"
        "packssdw                %%mm2, %%mm0      \n\t"
        "packuswb                %%mm0, %%mm0      \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"  \n\t"
        " js                        1b             \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1884
/* Convert one line of packed 24-bit BGR/RGB to U and V planes; the 4x16-bit
 * coefficient table ff_bgr24toUV (indexed by srcFormat) is passed as memory
 * operand %4, with U coefficients at +0/+8 and V coefficients at +16/+24.
 * Processes 4 output pixels (12 input bytes) per iteration. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24+%4, %%mm6 \n\t" // second V coefficient quadword
        "mov %3, %%"REG_a" \n\t" // REG_a: -width .. 0
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        /* first two pixels: U in mm0, V in mm2 */
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd %4, %%mm0 \n\t"
        "pmaddwd 8+%4, %%mm1 \n\t"
        "pmaddwd 16+%4, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"

        /* next two pixels: U in mm1, V in mm4 */
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd %4, %%mm1 \n\t"
        "pmaddwd 8+%4, %%mm3 \n\t"
        "pmaddwd 16+%4, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"

        /* add rounding/bias offset, scale down and pack to bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "psrad $15, %%mm1 \n\t"
        "psrad $15, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t" // 4 U bytes
        "movd %%mm2, (%2, %%"REG_a") \n\t" // 4 V bytes
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
        : "%"REG_a
    );
}
1942 #endif
1943
/* BGR24 -> luma for one line: dispatches to the MMX kernel when compiled
 * for MMX, otherwise computes Y with the RY/GY/BY fixed-point coefficients
 * (RGB2YUV_SHIFT bits) and round-to-nearest bias. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src[i*3+0]; // BGR byte order: blue first
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}
1959
/* BGR24 -> chroma (U and V) for one line. src1 and src2 must be the same
 * pointer (asserted below); only src1 is read. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
    assert(src1 == src2);
}
1977
1978 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1979 {
1980 int i;
1981 for (i=0; i<width; i++) {
1982 int b= src1[6*i + 0] + src1[6*i + 3];
1983 int g= src1[6*i + 1] + src1[6*i + 4];
1984 int r= src1[6*i + 2] + src1[6*i + 5];
1985
1986 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1988 }
1989 assert(src1 == src2);
1990 }
1991
/* RGB24 -> luma for one line: same as bgr24ToY but with red first in memory;
 * the MMX kernel is told via PIX_FMT_RGB24 to use the swapped coefficients. */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[i*3+0]; // RGB byte order: red first
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}
2007
/* RGB24 -> chroma (U and V) for one line; src1 and src2 must alias. */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
}
2026
2027 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2028 {
2029 int i;
2030 assert(src1==src2);
2031 for (i=0; i<width; i++) {
2032 int r= src1[6*i + 0] + src1[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5];
2035
2036 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2038 }
2039 }
2040
2041
2042 // bilinear / bicubic scaling
/* Horizontal scaler: for each of dstW outputs, multiply-accumulate
 * filterSize source bytes (starting at filterPos[i]) with 16-bit filter
 * coefficients and store a 15-bit-clipped result. MMX path has dedicated
 * kernels for filterSize 4 and 8 plus a generic inner-loop version. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* counter runs from -2*dstW to 0; pointers pre-biased accordingly. */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t" // ebx is the PIC register, preserve it
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t" // filterPos for two outputs
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" // 4 coeffs each
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t" // 4 src bytes each
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t" // two int16 results
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            /* first 4 taps of each of the two outputs */
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"

            /* remaining 4 taps */
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Generic filterSize: inner loop ("2:") runs over the taps, bounded
         * by the offset pointer; outer loop ("1:") over output samples. */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* Portable fallback: plain multiply-accumulate per output sample. */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_TEMPLATE_MMX */
}
2217
2218 //FIXME all pal and rgb srcFormats could do this convertion as well
2219 //FIXME all scalers more complex than bilinear could do half of this transform
2220 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2221 {
2222 int i;
2223 for (i = 0; i < width; i++) {
2224 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2225 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2226 }
2227 }
2228 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2229 {
2230 int i;
2231 for (i = 0; i < width; i++) {
2232 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2233 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2234 }
2235 }
2236 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2237 {
2238 int i;
2239 for (i = 0; i < width; i++)
2240 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2241 }
2242 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2243 {
2244 int i;
2245 for (i = 0; i < width; i++)
2246 dst[i] = (dst[i]*14071 + 33561947)>>14;
2247 }
2248
/* Shared inner step of the non-MMX2 fast bilinear scalers below: expects
 * src[xx] in %edi, src[xx+1] in %esi and xalpha in %ecx; leaves the
 * interpolated value in %esi and reloads the dst pointer into REG_D. */
#define FAST_BILINEAR_X86 \
    "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
    "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
    "shll $16, %%edi \n\t" \
    "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
    "mov %1, %%"REG_D"\n\t" \
    "shrl $9, %%esi \n\t" \
2256
/* Fast bilinear horizontal scaling of one luma line. On x86 with MMX2 it
 * calls the runtime-generated filter code (c->lumMmx2FilterCode) in 8
 * chunks; otherwise it uses plain x86 asm, or portable C elsewhere. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave); // ebx is the PIC register; save it
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64

/* Invoke one chunk of the generated scaler code and advance src (REG_c)
 * and dst (REG_D) by the per-chunk increments stored at (REG_b). */
#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Pad the tail that would read past srcW with the last source pixel. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    /* Two outputs per iteration; the carry of the 16.16 fixed-point step
     * propagates into the source index via adc. */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // xalpha
        ASMALIGN(4)
        "1: \n\t"
        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry


        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* Portable C: 16.16 fixed-point position, 7-bit blend factor. */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2378
2379 // *** horizontal scale Y line to temp buffer
2380 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2381 const int16_t *hLumFilter,
2382 const int16_t *hLumFilterPos, int hLumFilterSize,
2383 uint8_t *formatConvBuffer,
2384 uint32_t *pal, int isAlpha)
2385 {
2386 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2387 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2388
2389 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2390
2391 if (toYV12) {
2392 toYV12(formatConvBuffer, src, srcW, pal);
2393 src= formatConvBuffer;
2394 }
2395
2396 if (!c->hyscale_fast) {
2397 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2398 } else { // fast bilinear upscale / crap downscale
2399 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2400 }
2401
2402 if (convertRange)
2403 convertRange(dst, dstWidth);
2404 }
2405
/* Fast bilinear horizontal scaling of one chroma line pair: U from src1
 * into dst, V from src2 into dst + VOFW. Same structure as hyscale_fast
 * but with 4+4 generated-code chunks (one run per plane) on MMX2. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave); // ebx is the PIC register; save it
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            /* first plane (src1 -> dst) */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane (src2 -> dst + VOF) */
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            "mov %5, %%"REG_c" \n\t" // src
            "mov %1, %%"REG_D" \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b" \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Pad the tail that would read past srcW with the last source pixel. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
    uint16_t xInc_mask = xInc & 0xffff;
    /* One U and one V output per iteration, sharing the fixed-point step. */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // xalpha
        ASMALIGN(4)
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

        "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
        :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
        "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* Portable C: 16.16 fixed-point position, 7-bit blend factor. */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2526
2527 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2528 int srcW, int xInc, const int16_t *hChrFilter,
2529 const int16_t *hChrFilterPos, int hChrFilterSize,
2530 uint8_t *formatConvBuffer,
2531 uint32_t *pal)
2532 {
2533
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
2536
2537 if (c->chrToYV12) {
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2539 src1= formatConvBuffer;
2540 src2= formatConvBuffer+VOFW;
2541 }
2542
2543 if (!c->hcscale_fast) {
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2546 } else { // fast bilinear upscale / crap downscale
2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2548 }
2549
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
2552 }
2553
2554 #define DEBUG_SWSCALE_BUFFERS 0
2555 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2556
2557 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2558 int srcSliceH, uint8_t* dst[], int dstStride[])
2559 {
2560 /* load a few things into local vars to make the code more readable? and faster */
2561 const int srcW= c->srcW;
2562 const int dstW= c->dstW;
2563 const int dstH= c->dstH;
2564 const int chrDstW= c->chrDstW;
2565 const int chrSrcW= c->chrSrcW;
2566 const int lumXInc= c->lumXInc;
2567 const int chrXInc= c->chrXInc;
2568 const enum PixelFormat dstFormat= c->dstFormat;
2569 const int flags= c->flags;
2570 int16_t *vLumFilterPos= c->vLumFilterPos;
2571 int16_t *vChrFilterPos= c->vChrFilterPos;
2572 int16_t *hLumFilterPos= c->hLumFilterPos;
2573 int16_t *hChrFilterPos= c->hChrFilterPos;
2574 int16_t *vLumFilter= c->vLumFilter;
2575 int16_t *vChrFilter= c->vChrFilter;
2576 int16_t *hLumFilter= c->hLumFilter;
2577 int16_t *hChrFilter= c->hChrFilter;
2578 int32_t *lumMmxFilter= c->lumMmxFilter;
2579 int32_t *chrMmxFilter= c->chrMmxFilter;
2580 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2581 const int vLumFilterSize= c->vLumFilterSize;
2582 const int vChrFilterSize= c->vChrFilterSize;
2583 const int hLumFilterSize= c->hLumFilterSize;
2584 const int hChrFilterSize= c->hChrFilterSize;
2585 int16_t **lumPixBuf= c->lumPixBuf;
2586 int16_t **chrPixBuf= c->chrPixBuf;
2587 int16_t **alpPixBuf= c->alpPixBuf;
2588 const int vLumBufSize= c->vLumBufSize;
2589 const int vChrBufSize= c->vChrBufSize;
2590 uint8_t *formatConvBuffer= c->formatConvBuffer;
2591 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2592 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2593 int lastDstY;
2594 uint32_t *pal=c->pal_yuv;
2595
2596 /* vars which will change and which we need to store back in the context */
2597 int dstY= c->dstY;
2598 int lumBufIndex= c->lumBufIndex;
2599 int chrBufIndex= c->chrBufIndex;
2600 int lastInLumBuf= c->lastInLumBuf;
2601 int lastInChrBuf= c->lastInChrBuf;
2602
2603 if (isPacked(c->srcFormat)) {
2604 src[0]=
2605 src[1]=
2606 src[2]=
2607 src[3]= src[0];
2608 srcStride[0]=
2609 srcStride[1]=
2610 srcStride[2]=
2611 srcStride[3]= srcStride[0];
2612 }
2613 srcStride[1]<<= c->vChrDrop;
2614 srcStride[2]<<= c->vChrDrop;
2615
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2618 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY, srcSliceH, dstY, dstH);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2623
2624 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2625 static int warnedAlready=0; //FIXME move this into the context perhaps
2626 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2627 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2628 " ->cannot do aligned memory accesses anymore\n");
2629 warnedAlready=1;
2630 }
2631 }
2632
2633 /* Note the user might start scaling the picture in the middle so this
2634 will not get executed. This is not really intended but works
2635 currently, so people might do it. */
2636 if (srcSliceY ==0) {
2637 lumBufIndex=-1;
2638 chrBufIndex=-1;
2639 dstY=0;
2640 lastInLumBuf= -1;
2641 lastInChrBuf= -1;
2642 }
2643
2644 lastDstY= dstY;
2645
2646 for (;dstY < dstH; dstY++) {
2647 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2648 const int chrDstY= dstY>>c->chrDstVSubSample;
2649 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2650 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2651 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2652
2653 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2654 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2655 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2656 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2657 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2658 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2659 int enough_lines;
2660
2661 //handle holes (FAST_BILINEAR & weird filters)
2662 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2663 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2664 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2665 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);