/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
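
/* Editor's note (hedged): with COMPILE_TEMPLATE_MMX2 set, MOVNTQ expands to
 * movntq, a non-temporal store that keeps the write-once scaler output from
 * polluting the cache; the plain-MMX fallback is an ordinary movq. Code that
 * uses these stores is expected to fence (sfence) before the output buffer
 * is consumed elsewhere. */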

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ASMALIGN(4) /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
        "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add $8, %%"REG_a" \n\t"\
        "cmp %2, %%"REG_a" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
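
/* Scalar sketch of what YSCALEYUV2YV12X computes per output byte (editor's
 * illustration, mirroring the C fallback yuv2yuvXinC; the ref_ name and the
 * guard are hypothetical, nothing in the build uses them - the guard exists
 * because this template is included once per CPU variant). Note the MMX loop
 * above truncates each product to (coeff*src)>>16 via pmulhw before
 * accumulating, while this sketch keeps full 32-bit sums; closing that gap
 * is the point of the pmaddwd-based _ACCURATE variant below. */
#ifndef REF_YSCALEYUV2YV12X_SKETCH
#define REF_YSCALEYUV2YV12X_SKETCH
static inline int ref_yscaleyuv2yv12x(const int16_t **src, const int16_t *filter,
                                      int filterSize, int i)
{
    int j, val = 1 << 18;              /* rounding term, cf. VROUNDER_OFFSET */
    for (j = 0; j < filterSize; j++)
        val += src[j][i] * filter[j];  /* 16-bit sample times 16-bit coeff */
    return av_clip_uint8(val >> 19);   /* >>16 coeff scale, >>3 as in psraw */
}
#endif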

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ASMALIGN(4) \
        "1: \n\t"\
        "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
        "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "psraw $3, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add $8, %%"REG_a" \n\t"\
        "cmp %2, %%"REG_a" \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ASMALIGN(4)\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ASMALIGN(4)\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ASMALIGN(4)\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ASMALIGN(4)\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

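/* Scalar sketch of the YUV->RGB arithmetic that YSCALEYUV2RGBX performs on
 * four pixel pairs at once (editor's illustration). The per-context tables
 * at Y_COEFF/UB_COEFF/... hold colorspace-dependent 16.16 fixed-point
 * factors (pmulhw keeps the integer part), so the constants below are only
 * the familiar BT.601 values scaled by 256, not necessarily what the context
 * contains; the ref_ name and guard are hypothetical and unused. */
#ifndef REF_YUV2RGB_SKETCH
#define REF_YUV2RGB_SKETCH
static inline void ref_yuv2rgb_pixel(int y, int u, int v,
                                     uint8_t *r, uint8_t *g, uint8_t *b)
{
    const int luma = 298 * (y - 16), d = u - 128, e = v - 128;
    *r = av_clip_uint8((luma           + 409 * e + 128) >> 8);
    *g = av_clip_uint8((luma - 100 * d - 208 * e + 128) >> 8);
    *b = av_clip_uint8((luma + 516 * d           + 128) >> 8);
}
#endif
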
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
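
/* Scalar view of the 16-bit packing above (editor's illustration; the ref_
 * names and guard are hypothetical and unused). WRITERGB16 masks B/G/R down
 * to 5/6/5 significant bits and merges them; WRITERGB15 uses 5/5/5 with the
 * top bit clear. The MMX code reaches the same layout for 8 pixels at a time
 * with masks, shifts, and interleaves. */
#ifndef REF_PACK_RGB16_SKETCH
#define REF_PACK_RGB16_SKETCH
static inline uint16_t ref_pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
}

static inline uint16_t ref_pack_rgb555(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3);
}
#endif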

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
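
/* Scalar view of WRITEYUY2 (editor's illustration; the ref_ name and guard
 * are hypothetical and unused): the macro saturates the 16-bit Y/U/V lanes
 * to bytes and interleaves them in YUYV order, one U/V pair per two luma
 * samples. */
#ifndef REF_WRITEYUY2_SKETCH
#define REF_WRITEYUY2_SKETCH
static inline void ref_write_yuy2(uint8_t *dst, int y0, int y1, int u, int v)
{
    dst[0] = av_clip_uint8(y0);
    dst[1] = av_clip_uint8(u);
    dst[2] = av_clip_uint8(y1);
    dst[3] = av_clip_uint8(v);
}
#endif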


static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if (!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if (!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
962 uint8_t *dst[4]= {aDest, dest, uDest, vDest};
963 x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
964
965 if (c->flags & SWS_ACCURATE_RND) {
966 while(p--) {
967 if (dst[p]) {
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
975 }
976 } else {
977 while(p--) {
978 if (dst[p]) {
979 __asm__ volatile(
980 YSCALEYUV2YV121
981 :: "r" (src[p]), "r" (dst[p] + counter[p]),
982 "g" (-counter[p])
983 : "%"REG_a
984 );
985 }
986 }
987 }
988 return;
989 }
990 #endif
991 for (i=0; i<dstW; i++) {
992 int val= (lumSrc[i]+64)>>7;
993
994 if (val&256) {
995 if (val<0) val=0;
996 else val=255;
997 }
998
999 dest[i]= val;
1000 }
1001
1002 if (uDest)
1003 for (i=0; i<chrDstW; i++) {
1004 int u=(chrSrc[i ]+64)>>7;
1005 int v=(chrSrc[i + VOFW]+64)>>7;
1006
1007 if ((u|v)&256) {
1008 if (u<0) u=0;
1009 else if (u>255) u=255;
1010 if (v<0) v=0;
1011 else if (v>255) v=255;
1012 }
1013
1014 uDest[i]= u;
1015 vDest[i]= v;
1016 }
1017
1018 if (CONFIG_SWSCALE_ALPHA && aDest)
1019 for (i=0; i<dstW; i++) {
1020 int val= (alpSrc[i]+64)>>7;
1021 aDest[i]= av_clip_uint8(val);
1022 }
1023 }
1024
1025
1026 /**
1027 * vertical scale YV12 to RGB
1028 */
1029 static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1032 {
1033 #if COMPILE_TEMPLATE_MMX
1034 x86_reg dummy=0;
1035 if(!(c->flags & SWS_BITEXACT)) {
1036 if (c->flags & SWS_ACCURATE_RND) {
1037 switch(c->dstFormat) {
1038 case PIX_FMT_RGB32:
1039 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040 YSCALEYUV2PACKEDX_ACCURATE
1041 YSCALEYUV2RGBX
1042 "movq %%mm2, "U_TEMP"(%0) \n\t"
1043 "movq %%mm4, "V_TEMP"(%0) \n\t"
1044 "movq %%mm5, "Y_TEMP"(%0) \n\t"
1045 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046 "movq "Y_TEMP"(%0), %%mm5 \n\t"
1047 "psraw $3, %%mm1 \n\t"
1048 "psraw $3, %%mm7 \n\t"
1049 "packuswb %%mm7, %%mm1 \n\t"
1050 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1051
1052 YSCALEYUV2PACKEDX_END
1053 } else {
1054 YSCALEYUV2PACKEDX_ACCURATE
1055 YSCALEYUV2RGBX
1056 "pcmpeqd %%mm7, %%mm7 \n\t"
1057 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1058
1059 YSCALEYUV2PACKEDX_END
1060 }
1061 return;
1062 case PIX_FMT_BGR24:
1063 YSCALEYUV2PACKEDX_ACCURATE
1064 YSCALEYUV2RGBX
1065 "pxor %%mm7, %%mm7 \n\t"
1066 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067 "add %4, %%"REG_c" \n\t"
1068 WRITEBGR24(%%REGc, %5, %%REGa)
1069
1070
1071 :: "r" (&c->redDither),
1072 "m" (dummy), "m" (dummy), "m" (dummy),
1073 "r" (dest), "m" (dstW)
1074 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1075 );
1076 return;
1077 case PIX_FMT_RGB555:
1078 YSCALEYUV2PACKEDX_ACCURATE
1079 YSCALEYUV2RGBX
1080 "pxor %%mm7, %%mm7 \n\t"
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082 #ifdef DITHER1XBPP
1083 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1086 #endif
1087
1088 WRITERGB15(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1090 return;
1091 case PIX_FMT_RGB565:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 YSCALEYUV2RGBX
1094 "pxor %%mm7, %%mm7 \n\t"
1095 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1096 #ifdef DITHER1XBPP
1097 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1100 #endif
1101
1102 WRITERGB16(%4, %5, %%REGa)
1103 YSCALEYUV2PACKEDX_END
1104 return;
1105 case PIX_FMT_YUYV422:
1106 YSCALEYUV2PACKEDX_ACCURATE
1107 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1108
1109 "psraw $3, %%mm3 \n\t"
1110 "psraw $3, %%mm4 \n\t"
1111 "psraw $3, %%mm1 \n\t"
1112 "psraw $3, %%mm7 \n\t"
1113 WRITEYUY2(%4, %5, %%REGa)
1114 YSCALEYUV2PACKEDX_END
1115 return;
1116 }
1117 } else {
1118 switch(c->dstFormat) {
1119 case PIX_FMT_RGB32:
1120 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1121 YSCALEYUV2PACKEDX
1122 YSCALEYUV2RGBX
1123 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124 "psraw $3, %%mm1 \n\t"
1125 "psraw $3, %%mm7 \n\t"
1126 "packuswb %%mm7, %%mm1 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128 YSCALEYUV2PACKEDX_END
1129 } else {
1130 YSCALEYUV2PACKEDX
1131 YSCALEYUV2RGBX
1132 "pcmpeqd %%mm7, %%mm7 \n\t"
1133 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134 YSCALEYUV2PACKEDX_END
1135 }
1136 return;
1137 case PIX_FMT_BGR24:
1138 YSCALEYUV2PACKEDX
1139 YSCALEYUV2RGBX
1140 "pxor %%mm7, %%mm7 \n\t"
1141 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1142 "add %4, %%"REG_c" \n\t"
1143 WRITEBGR24(%%REGc, %5, %%REGa)
1144
1145 :: "r" (&c->redDither),
1146 "m" (dummy), "m" (dummy), "m" (dummy),
1147 "r" (dest), "m" (dstW)
1148 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1149 );
1150 return;
1151 case PIX_FMT_RGB555:
1152 YSCALEYUV2PACKEDX
1153 YSCALEYUV2RGBX
1154 "pxor %%mm7, %%mm7 \n\t"
1155 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1156 #ifdef DITHER1XBPP
1157 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1158 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1159 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1160 #endif
1161
1162 WRITERGB15(%4, %5, %%REGa)
1163 YSCALEYUV2PACKEDX_END
1164 return;
1165 case PIX_FMT_RGB565:
1166 YSCALEYUV2PACKEDX
1167 YSCALEYUV2RGBX
1168 "pxor %%mm7, %%mm7 \n\t"
1169 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1170 #ifdef DITHER1XBPP
1171 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1172 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1173 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1174 #endif
1175
1176 WRITERGB16(%4, %5, %%REGa)
1177 YSCALEYUV2PACKEDX_END
1178 return;
1179 case PIX_FMT_YUYV422:
1180 YSCALEYUV2PACKEDX
1181 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1182
1183 "psraw $3, %%mm3 \n\t"
1184 "psraw $3, %%mm4 \n\t"
1185 "psraw $3, %%mm1 \n\t"
1186 "psraw $3, %%mm7 \n\t"
1187 WRITEYUY2(%4, %5, %%REGa)
1188 YSCALEYUV2PACKEDX_END
1189 return;
1190 }
1191 }
1192 }
1193 #endif /* COMPILE_TEMPLATE_MMX */
1194 #if COMPILE_TEMPLATE_ALTIVEC
1195 /* The following list of supported dstFormat values should
1196 match what's found in the body of ff_yuv2packedX_altivec() */
1197 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1199 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1201 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1203 dest, dstW, dstY);
1204 else
1205 #endif
1206 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207 chrFilter, chrSrc, chrFilterSize,
1208 alpSrc, dest, dstW, dstY);
1209 }
1210
1211 /**
1212 * vertical bilinear scale YV12 to RGB
1213 */
1214 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1216 {
1217 int yalpha1=4095- yalpha;
1218 int uvalpha1=4095-uvalpha;
1219 int i;
1220
1221 #if COMPILE_TEMPLATE_MMX
1222 if(!(c->flags & SWS_BITEXACT)) {
1223 switch(c->dstFormat) {
1224 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1225 case PIX_FMT_RGB32:
1226 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1227 #if ARCH_X86_64
1228 __asm__ volatile(
1229 YSCALEYUV2RGB(%%REGBP, %5)
1230 YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233 "packuswb %%mm7, %%mm1 \n\t"
1234 WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1235
1236 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1237 "a" (&c->redDither)
1238 ,"r" (abuf0), "r" (abuf1)
1239 : "%"REG_BP
1240 );
1241 #else
1242 *(uint16_t **)(&c->u_temp)=abuf0;
1243 *(uint16_t **)(&c->v_temp)=abuf1;
1244 __asm__ volatile(
1245 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1246 "mov %4, %%"REG_b" \n\t"
1247 "push %%"REG_BP" \n\t"
1248 YSCALEYUV2RGB(%%REGBP, %5)
1249 "push %0 \n\t"
1250 "push %1 \n\t"
1251 "mov "U_TEMP"(%5), %0 \n\t"
1252 "mov "V_TEMP"(%5), %1 \n\t"
1253 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256 "packuswb %%mm7, %%mm1 \n\t"
1257 "pop %1 \n\t"
1258 "pop %0 \n\t"
1259 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1262
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264 "a" (&c->redDither)
1265 );
1266 #endif
1267 } else {
1268 __asm__ volatile(
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 "pcmpeqd %%mm7, %%mm7 \n\t"
1274 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275 "pop %%"REG_BP" \n\t"
1276 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1277
1278 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1279 "a" (&c->redDither)
1280 );
1281 }
1282 return;
1283 case PIX_FMT_BGR24:
1284 __asm__ volatile(
1285 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1286 "mov %4, %%"REG_b" \n\t"
1287 "push %%"REG_BP" \n\t"
1288 YSCALEYUV2RGB(%%REGBP, %5)
1289 "pxor %%mm7, %%mm7 \n\t"
1290 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291 "pop %%"REG_BP" \n\t"
1292 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1293 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1294 "a" (&c->redDither)
1295 );
1296 return;
1297 case PIX_FMT_RGB555:
1298 __asm__ volatile(
1299 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1300 "mov %4, %%"REG_b" \n\t"
1301 "push %%"REG_BP" \n\t"
1302 YSCALEYUV2RGB(%%REGBP, %5)
1303 "pxor %%mm7, %%mm7 \n\t"
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1305 #ifdef DITHER1XBPP
1306 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1307 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1308 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1309 #endif
1310
1311 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312 "pop %%"REG_BP" \n\t"
1313 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1314
1315 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1316 "a" (&c->redDither)
1317 );
1318 return;
1319 case PIX_FMT_RGB565:
1320 __asm__ volatile(
1321 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1322 "mov %4, %%"REG_b" \n\t"
1323 "push %%"REG_BP" \n\t"
1324 YSCALEYUV2RGB(%%REGBP, %5)
1325 "pxor %%mm7, %%mm7 \n\t"
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327 #ifdef DITHER1XBPP
1328 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1329 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1330 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1331 #endif
1332
1333 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334 "pop %%"REG_BP" \n\t"
1335 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1336 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1337 "a" (&c->redDither)
1338 );
1339 return;
1340 case PIX_FMT_YUYV422:
1341 __asm__ volatile(
1342 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1343 "mov %4, %%"REG_b" \n\t"
1344 "push %%"REG_BP" \n\t"
1345 YSCALEYUV2PACKED(%%REGBP, %5)
1346 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347 "pop %%"REG_BP" \n\t"
1348 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1349 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1350 "a" (&c->redDither)
1351 );
1352 return;
1353 default: break;
1354 }
1355 }
1356 #endif //COMPILE_TEMPLATE_MMX
1357 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1358 }
1359
1360 /**
1361 * YV12 to RGB without scaling or interpolating
1362 */
1363 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1365 {
1366 const int yalpha1=0;
1367 int i;
1368
1369 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370 const int yalpha= 4096; //FIXME ...
1371
1372 if (flags&SWS_FULL_CHR_H_INT) {
1373 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1374 return;
1375 }
1376
1377 #if COMPILE_TEMPLATE_MMX
1378 if(!(flags & SWS_BITEXACT)) {
1379 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1380 switch(dstFormat) {
1381 case PIX_FMT_RGB32:
1382 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1383 __asm__ volatile(
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2RGB1(%%REGBP, %5)
1388 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390 "pop %%"REG_BP" \n\t"
1391 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1392
1393 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1394 "a" (&c->redDither)
1395 );
1396 } else {
1397 __asm__ volatile(
1398 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1399 "mov %4, %%"REG_b" \n\t"
1400 "push %%"REG_BP" \n\t"
1401 YSCALEYUV2RGB1(%%REGBP, %5)
1402 "pcmpeqd %%mm7, %%mm7 \n\t"
1403 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404 "pop %%"REG_BP" \n\t"
1405 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1406
1407 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1408 "a" (&c->redDither)
1409 );
1410 }
1411 return;
1412 case PIX_FMT_BGR24:
1413 __asm__ volatile(
1414 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1415 "mov %4, %%"REG_b" \n\t"
1416 "push %%"REG_BP" \n\t"
1417 YSCALEYUV2RGB1(%%REGBP, %5)
1418 "pxor %%mm7, %%mm7 \n\t"
1419 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420 "pop %%"REG_BP" \n\t"
1421 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1422
1423 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1424 "a" (&c->redDither)
1425 );
1426 return;
1427 case PIX_FMT_RGB555:
1428 __asm__ volatile(
1429 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_b" \n\t"
1431 "push %%"REG_BP" \n\t"
1432 YSCALEYUV2RGB1(%%REGBP, %5)
1433 "pxor %%mm7, %%mm7 \n\t"
1434 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1435 #ifdef DITHER1XBPP
1436 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1437 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1438 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1439 #endif
1440 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441 "pop %%"REG_BP" \n\t"
1442 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1443
1444 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1445 "a" (&c->redDither)
1446 );
1447 return;
1448 case PIX_FMT_RGB565:
1449 __asm__ volatile(
1450 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1451 "mov %4, %%"REG_b" \n\t"
1452 "push %%"REG_BP" \n\t"
1453 YSCALEYUV2RGB1(%%REGBP, %5)
1454 "pxor %%mm7, %%mm7 \n\t"
1455 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1456 #ifdef DITHER1XBPP
1457 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1458 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1459 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1460 #endif
1461
1462 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463 "pop %%"REG_BP" \n\t"
1464 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1465
1466 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1467 "a" (&c->redDither)
1468 );
1469 return;
1470 case PIX_FMT_YUYV422:
1471 __asm__ volatile(
1472 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1473 "mov %4, %%"REG_b" \n\t"
1474 "push %%"REG_BP" \n\t"
1475 YSCALEYUV2PACKED1(%%REGBP, %5)
1476 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1479
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481 "a" (&c->redDither)
1482 );
1483 return;
1484 }
1485 } else {
1486 switch(dstFormat) {
1487 case PIX_FMT_RGB32:
1488 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1489 __asm__ volatile(
1490 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1491 "mov %4, %%"REG_b" \n\t"
1492 "push %%"REG_BP" \n\t"
1493 YSCALEYUV2RGB1b(%%REGBP, %5)
1494 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496 "pop %%"REG_BP" \n\t"
1497 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1498
1499 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1500 "a" (&c->redDither)
1501 );
1502 } else {
1503 __asm__ volatile(
1504 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1505 "mov %4, %%"REG_b" \n\t"
1506 "push %%"REG_BP" \n\t"
1507 YSCALEYUV2RGB1b(%%REGBP, %5)
1508 "pcmpeqd %%mm7, %%mm7 \n\t"
1509 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510 "pop %%"REG_BP" \n\t"
1511 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1512
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "a" (&c->redDither)
1515 );
1516 }
1517 return;
1518 case PIX_FMT_BGR24:
1519 __asm__ volatile(
1520 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1521 "mov %4, %%"REG_b" \n\t"
1522 "push %%"REG_BP" \n\t"
1523 YSCALEYUV2RGB1b(%%REGBP, %5)
1524 "pxor %%mm7, %%mm7 \n\t"
1525 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526 "pop %%"REG_BP" \n\t"
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1528
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1530 "a" (&c->redDither)
1531 );
1532 return;
1533 case PIX_FMT_RGB555:
1534 __asm__ volatile(
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1536 "mov %4, %%"REG_b" \n\t"
1537 "push %%"REG_BP" \n\t"
1538 YSCALEYUV2RGB1b(%%REGBP, %5)
1539 "pxor %%mm7, %%mm7 \n\t"
1540 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1541 #ifdef DITHER1XBPP
1542 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1543 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1544 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1545 #endif
1546 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1549
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551 "a" (&c->redDither)
1552 );
1553 return;
1554 case PIX_FMT_RGB565:
1555 __asm__ volatile(
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1b(%%REGBP, %5)
1560 "pxor %%mm7, %%mm7 \n\t"
1561 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1562 #ifdef DITHER1XBPP
1563 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1564 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1565 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1566 #endif
1567
1568 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1571
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573 "a" (&c->redDither)
1574 );
1575 return;
1576 case PIX_FMT_YUYV422:
1577 __asm__ volatile(
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2PACKED1b(%%REGBP, %5)
1582 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583 "pop %%"REG_BP" \n\t"
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1585
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587 "a" (&c->redDither)
1588 );
1589 return;
1590 }
1591 }
1592 }
1593 #endif /* COMPILE_TEMPLATE_MMX */
1594 if (uvalpha < 2048) {
1595 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1596 } else {
1597 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1598 }
1599 }
1600
1601 //FIXME yuy2* can read up to 7 samples too much
1602
1603 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1604 {
1605 #if COMPILE_TEMPLATE_MMX
1606 __asm__ volatile(
1607 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1608 "mov %0, %%"REG_a" \n\t"
1609 "1: \n\t"
1610 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1611 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1612 "pand %%mm2, %%mm0 \n\t"
1613 "pand %%mm2, %%mm1 \n\t"
1614 "packuswb %%mm1, %%mm0 \n\t"
1615 "movq %%mm0, (%2, %%"REG_a") \n\t"
1616 "add $8, %%"REG_a" \n\t"
1617 " js 1b \n\t"
1618 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1619 : "%"REG_a
1620 );
1621 #else
1622 int i;
1623 for (i=0; i<width; i++)
1624 dst[i]= src[2*i];
1625 #endif
1626 }
1627
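/* Splits the chroma of a YUYV line: psrlw $8 drops the luma bytes, packuswb
 * leaves the interleaved UVUV... sequence, and the shift/mask pair below then
 * separates it into four U and four V samples per iteration. */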
1628 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1629 {
1630 #if COMPILE_TEMPLATE_MMX
1631 __asm__ volatile(
1632 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1633 "mov %0, %%"REG_a" \n\t"
1634 "1: \n\t"
1635 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1636 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1637 "psrlw $8, %%mm0 \n\t"
1638 "psrlw $8, %%mm1 \n\t"
1639 "packuswb %%mm1, %%mm0 \n\t"
1640 "movq %%mm0, %%mm1 \n\t"
1641 "psrlw $8, %%mm0 \n\t"
1642 "pand %%mm4, %%mm1 \n\t"
1643 "packuswb %%mm0, %%mm0 \n\t"
1644 "packuswb %%mm1, %%mm1 \n\t"
1645 "movd %%mm0, (%3, %%"REG_a") \n\t"
1646 "movd %%mm1, (%2, %%"REG_a") \n\t"
1647 "add $4, %%"REG_a" \n\t"
1648 " js 1b \n\t"
1649 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1650 : "%"REG_a
1651 );
1652 #else
1653 int i;
1654 for (i=0; i<width; i++) {
1655 dstU[i]= src1[4*i + 1];
1656 dstV[i]= src1[4*i + 3];
1657 }
1658 #endif
1659 assert(src1 == src2);
1660 }
1661
1662 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1663 {
1664 #if COMPILE_TEMPLATE_MMX
1665 __asm__ volatile(
1666 "mov %0, %%"REG_a" \n\t"
1667 "1: \n\t"
1668 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1669 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1670 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1671 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1672 "psrlw $8, %%mm0 \n\t"
1673 "psrlw $8, %%mm1 \n\t"
1674 "psrlw $8, %%mm2 \n\t"
1675 "psrlw $8, %%mm3 \n\t"
1676 "packuswb %%mm1, %%mm0 \n\t"
1677 "packuswb %%mm3, %%mm2 \n\t"
1678 "movq %%mm0, (%3, %%"REG_a") \n\t"
1679 "movq %%mm2, (%4, %%"REG_a") \n\t"
1680 "add $8, %%"REG_a" \n\t"
1681 " js 1b \n\t"
1682 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1683 : "%"REG_a
1684 );
1685 #else
1686 int i;
1687 for (i=0; i<width; i++) {
1688 dstU[i]= src1[2*i + 1];
1689 dstV[i]= src2[2*i + 1];
1690 }
1691 #endif
1692 }
1693
1694 /* This is almost identical to the previous function and exists only because
1695  * yuy2To(Y/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1696 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1697 {
1698 #if COMPILE_TEMPLATE_MMX
1699 __asm__ volatile(
1700 "mov %0, %%"REG_a" \n\t"
1701 "1: \n\t"
1702 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1703 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1704 "psrlw $8, %%mm0 \n\t"
1705 "psrlw $8, %%mm1 \n\t"
1706 "packuswb %%mm1, %%mm0 \n\t"
1707 "movq %%mm0, (%2, %%"REG_a") \n\t"
1708 "add $8, %%"REG_a" \n\t"
1709 " js 1b \n\t"
1710 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1711 : "%"REG_a
1712 );
1713 #else
1714 int i;
1715 for (i=0; i<width; i++)
1716 dst[i]= src[2*i+1];
1717 #endif
1718 }
1719
1720 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1721 {
1722 #if COMPILE_TEMPLATE_MMX
1723 __asm__ volatile(
1724 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1725 "mov %0, %%"REG_a" \n\t"
1726 "1: \n\t"
1727 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1728 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1729 "pand %%mm4, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm1, %%mm0 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "psrlw $8, %%mm0 \n\t"
1734 "pand %%mm4, %%mm1 \n\t"
1735 "packuswb %%mm0, %%mm0 \n\t"
1736 "packuswb %%mm1, %%mm1 \n\t"
1737 "movd %%mm0, (%3, %%"REG_a") \n\t"
1738 "movd %%mm1, (%2, %%"REG_a") \n\t"
1739 "add $4, %%"REG_a" \n\t"
1740 " js 1b \n\t"
1741 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1742 : "%"REG_a
1743 );
1744 #else
1745 int i;
1746 for (i=0; i<width; i++) {
1747 dstU[i]= src1[4*i + 0];
1748 dstV[i]= src1[4*i + 2];
1749 }
1750 #endif
1751 assert(src1 == src2);
1752 }
1753
1754 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1755 {
1756 #if COMPILE_TEMPLATE_MMX
1757 __asm__ volatile(
1758 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1759 "mov %0, %%"REG_a" \n\t"
1760 "1: \n\t"
1761 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1762 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1763 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1764 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1765 "pand %%mm4, %%mm0 \n\t"
1766 "pand %%mm4, %%mm1 \n\t"
1767 "pand %%mm4, %%mm2 \n\t"
1768 "pand %%mm4, %%mm3 \n\t"
1769 "packuswb %%mm1, %%mm0 \n\t"
1770 "packuswb %%mm3, %%mm2 \n\t"
1771 "movq %%mm0, (%3, %%"REG_a") \n\t"
1772 "movq %%mm2, (%4, %%"REG_a") \n\t"
1773 "add $8, %%"REG_a" \n\t"
1774 " js 1b \n\t"
1775 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1776 : "%"REG_a
1777 );
1778 #else
1779 int i;
1780 for (i=0; i<width; i++) {
1781 dstU[i]= src1[2*i];
1782 dstV[i]= src2[2*i];
1783 }
1784 #endif
1785 }
1786
1787 #if COMPILE_TEMPLATE_MMX
1788 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1789 {
1790
1791 if(srcFormat == PIX_FMT_BGR24) {
1792 __asm__ volatile(
1793 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1794 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1795 :
1796 );
1797 } else {
1798 __asm__ volatile(
1799 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1800 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1801 :
1802 );
1803 }
1804
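/* Each iteration consumes 12 bytes (four packed pixels) and stores four luma
 * bytes: the overlapping movd loads at offsets 0/2/6/8 let one pmaddwd with
 * %%mm5 and one with %%mm6 sum the weighted B, G and R of a pixel pair (the
 * coefficient quadwords presumably interleave zeros for the stray bytes),
 * with ff_bgr24toYOffset supplying bias and rounding before the >>15. */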
1805 __asm__ volatile(
1806 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1807 "mov %2, %%"REG_a" \n\t"
1808 "pxor %%mm7, %%mm7 \n\t"
1809 "1: \n\t"
1810 PREFETCH" 64(%0) \n\t"
1811 "movd (%0), %%mm0 \n\t"
1812 "movd 2(%0), %%mm1 \n\t"
1813 "movd 6(%0), %%mm2 \n\t"
1814 "movd 8(%0), %%mm3 \n\t"
1815 "add $12, %0 \n\t"
1816 "punpcklbw %%mm7, %%mm0 \n\t"
1817 "punpcklbw %%mm7, %%mm1 \n\t"
1818 "punpcklbw %%mm7, %%mm2 \n\t"
1819 "punpcklbw %%mm7, %%mm3 \n\t"
1820 "pmaddwd %%mm5, %%mm0 \n\t"
1821 "pmaddwd %%mm6, %%mm1 \n\t"
1822 "pmaddwd %%mm5, %%mm2 \n\t"
1823 "pmaddwd %%mm6, %%mm3 \n\t"
1824 "paddd %%mm1, %%mm0 \n\t"
1825 "paddd %%mm3, %%mm2 \n\t"
1826 "paddd %%mm4, %%mm0 \n\t"
1827 "paddd %%mm4, %%mm2 \n\t"
1828 "psrad $15, %%mm0 \n\t"
1829 "psrad $15, %%mm2 \n\t"
1830 "packssdw %%mm2, %%mm0 \n\t"
1831 "packuswb %%mm0, %%mm0 \n\t"
1832 "movd %%mm0, (%1, %%"REG_a") \n\t"
1833 "add $4, %%"REG_a" \n\t"
1834 " js 1b \n\t"
1835 : "+r" (src)
1836 : "r" (dst+width), "g" ((x86_reg)-width)
1837 : "%"REG_a
1838 );
1839 }
1840
1841 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1842 {
1843 __asm__ volatile(
1844 "movq 24+%4, %%mm6 \n\t"
1845 "mov %3, %%"REG_a" \n\t"
1846 "pxor %%mm7, %%mm7 \n\t"
1847 "1: \n\t"
1848 PREFETCH" 64(%0) \n\t"
1849 "movd (%0), %%mm0 \n\t"
1850 "movd 2(%0), %%mm1 \n\t"
1851 "punpcklbw %%mm7, %%mm0 \n\t"
1852 "punpcklbw %%mm7, %%mm1 \n\t"
1853 "movq %%mm0, %%mm2 \n\t"
1854 "movq %%mm1, %%mm3 \n\t"
1855 "pmaddwd %4, %%mm0 \n\t"
1856 "pmaddwd 8+%4, %%mm1 \n\t"
1857 "pmaddwd 16+%4, %%mm2 \n\t"
1858 "pmaddwd %%mm6, %%mm3 \n\t"
1859 "paddd %%mm1, %%mm0 \n\t"
1860 "paddd %%mm3, %%mm2 \n\t"
1861
1862 "movd 6(%0), %%mm1 \n\t"
1863 "movd 8(%0), %%mm3 \n\t"
1864 "add $12, %0 \n\t"
1865 "punpcklbw %%mm7, %%mm1 \n\t"
1866 "punpcklbw %%mm7, %%mm3 \n\t"
1867 "movq %%mm1, %%mm4 \n\t"
1868 "movq %%mm3, %%mm5 \n\t"
1869 "pmaddwd %4, %%mm1 \n\t"
1870 "pmaddwd 8+%4, %%mm3 \n\t"
1871 "pmaddwd 16+%4, %%mm4 \n\t"
1872 "pmaddwd %%mm6, %%mm5 \n\t"
1873 "paddd %%mm3, %%mm1 \n\t"
1874 "paddd %%mm5, %%mm4 \n\t"
1875
1876 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1877 "paddd %%mm3, %%mm0 \n\t"
1878 "paddd %%mm3, %%mm2 \n\t"
1879 "paddd %%mm3, %%mm1 \n\t"
1880 "paddd %%mm3, %%mm4 \n\t"
1881 "psrad $15, %%mm0 \n\t"
1882 "psrad $15, %%mm2 \n\t"
1883 "psrad $15, %%mm1 \n\t"
1884 "psrad $15, %%mm4 \n\t"
1885 "packssdw %%mm1, %%mm0 \n\t"
1886 "packssdw %%mm4, %%mm2 \n\t"
1887 "packuswb %%mm0, %%mm0 \n\t"
1888 "packuswb %%mm2, %%mm2 \n\t"
1889 "movd %%mm0, (%1, %%"REG_a") \n\t"
1890 "movd %%mm2, (%2, %%"REG_a") \n\t"
1891 "add $4, %%"REG_a" \n\t"
1892 " js 1b \n\t"
1893 : "+r" (src)
1894 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1895 : "%"REG_a
1896 );
1897 }
1898 #endif /* COMPILE_TEMPLATE_MMX */
1899
1900 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1901 {
1902 #if COMPILE_TEMPLATE_MMX
1903 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1904 #else
1905 int i;
1906 for (i=0; i<width; i++) {
1907 int b= src[i*3+0];
1908 int g= src[i*3+1];
1909 int r= src[i*3+2];
1910
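/* 33<<(RGB2YUV_SHIFT-1) == (16<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-1)):
 * the limited-range +16 luma offset plus 0.5 for rounding. */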
1911 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1912 }
1913 #endif /* COMPILE_TEMPLATE_MMX */
1914 }
1915
1916 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1917 {
1918 #if COMPILE_TEMPLATE_MMX
1919 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1920 #else
1921 int i;
1922 for (i=0; i<width; i++) {
1923 int b= src1[3*i + 0];
1924 int g= src1[3*i + 1];
1925 int r= src1[3*i + 2];
1926
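/* 257<<(RGB2YUV_SHIFT-1) == (128<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-1)):
 * the +128 chroma bias plus 0.5 for rounding. */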
1927 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1928 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1929 }
1930 #endif /* COMPILE_TEMPLATE_MMX */
1931 assert(src1 == src2);
1932 }
1933
1934 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1935 {
1936 int i;
1937 for (i=0; i<width; i++) {
1938 int b= src1[6*i + 0] + src1[6*i + 3];
1939 int g= src1[6*i + 1] + src1[6*i + 4];
1940 int r= src1[6*i + 2] + src1[6*i + 5];
1941
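/* r, g and b each sum two neighboring pixels here, so the bias doubles to
 * 257<<RGB2YUV_SHIFT and the shift grows by one to average them. */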
1942 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1943 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1944 }
1945 assert(src1 == src2);
1946 }
1947
1948 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1949 {
1950 #if COMPILE_TEMPLATE_MMX
1951 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1952 #else
1953 int i;
1954 for (i=0; i<width; i++) {
1955 int r= src[i*3+0];
1956 int g= src[i*3+1];
1957 int b= src[i*3+2];
1958
1959 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1960 }
1961 #endif
1962 }
1963
1964 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1965 {
1966 #if COMPILE_TEMPLATE_MMX
1967 assert(src1==src2);
1968 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1969 #else
1970 int i;
1971 assert(src1==src2);
1972 for (i=0; i<width; i++) {
1973 int r= src1[3*i + 0];
1974 int g= src1[3*i + 1];
1975 int b= src1[3*i + 2];
1976
1977 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1978 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1979 }
1980 #endif
1981 }
1982
1983 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1984 {
1985 int i;
1986 assert(src1==src2);
1987 for (i=0; i<width; i++) {
1988 int r= src1[6*i + 0] + src1[6*i + 3];
1989 int g= src1[6*i + 1] + src1[6*i + 4];
1990 int b= src1[6*i + 2] + src1[6*i + 5];
1991
1992 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1993 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1994 }
1995 }
1996
1997
1998 // bilinear / bicubic scaling
1999 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2000 const int16_t *filter, const int16_t *filterPos, long filterSize)
2001 {
2002 #if COMPILE_TEMPLATE_MMX
2003 assert(filterSize % 4 == 0 && filterSize>0);
2004 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2005 x86_reg counter= -2*dstW;
2006 filter-= counter*2;
2007 filterPos-= counter/2;
2008 dst-= counter/2;
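/* The loop below runs a negative byte counter up toward zero: filter,
 * filterPos and dst were pre-biased above so plain indexed addressing works,
 * and "jnc" falls through once the final "add" carries the counter past 0. */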
2009 __asm__ volatile(
2010 #if defined(PIC)
2011 "push %%"REG_b" \n\t"
2012 #endif
2013 "pxor %%mm7, %%mm7 \n\t"
2014 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2015 "mov %%"REG_a", %%"REG_BP" \n\t"
2016 ASMALIGN(4)
2017 "1: \n\t"
2018 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2019 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2020 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2021 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2022 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2023 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2024 "punpcklbw %%mm7, %%mm0 \n\t"
2025 "punpcklbw %%mm7, %%mm2 \n\t"
2026 "pmaddwd %%mm1, %%mm0 \n\t"
2027 "pmaddwd %%mm2, %%mm3 \n\t"
2028 "movq %%mm0, %%mm4 \n\t"
2029 "punpckldq %%mm3, %%mm0 \n\t"
2030 "punpckhdq %%mm3, %%mm4 \n\t"
2031 "paddd %%mm4, %%mm0 \n\t"
2032 "psrad $7, %%mm0 \n\t"
2033 "packssdw %%mm0, %%mm0 \n\t"
2034 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2035 "add $4, %%"REG_BP" \n\t"
2036 " jnc 1b \n\t"
2037
2038 "pop %%"REG_BP" \n\t"
2039 #if defined(PIC)
2040 "pop %%"REG_b" \n\t"
2041 #endif
2042 : "+a" (counter)
2043 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2044 #if !defined(PIC)
2045 : "%"REG_b
2046 #endif
2047 );
2048 } else if (filterSize==8) {
2049 x86_reg counter= -2*dstW;
2050 filter-= counter*4;
2051 filterPos-= counter/2;
2052 dst-= counter/2;
2053 __asm__ volatile(
2054 #if defined(PIC)
2055 "push %%"REG_b" \n\t"
2056 #endif
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2060 ASMALIGN(4)
2061 "1: \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2065 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2072
2073 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2074 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2075 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2076 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm4 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm4 \n\t"
2080 "pmaddwd %%mm2, %%mm5 \n\t"
2081 "paddd %%mm4, %%mm0 \n\t"
2082 "paddd %%mm5, %%mm3 \n\t"
2083 "movq %%mm0, %%mm4 \n\t"
2084 "punpckldq %%mm3, %%mm0 \n\t"
2085 "punpckhdq %%mm3, %%mm4 \n\t"
2086 "paddd %%mm4, %%mm0 \n\t"
2087 "psrad $7, %%mm0 \n\t"
2088 "packssdw %%mm0, %%mm0 \n\t"
2089 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2090 "add $4, %%"REG_BP" \n\t"
2091 " jnc 1b \n\t"
2092
2093 "pop %%"REG_BP" \n\t"
2094 #if defined(PIC)
2095 "pop %%"REG_b" \n\t"
2096 #endif
2097 : "+a" (counter)
2098 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2099 #if !defined(PIC)
2100 : "%"REG_b
2101 #endif
2102 );
2103 } else {
2104 const uint8_t *offset = src+filterSize;
2105 x86_reg counter= -2*dstW;
2106 //filter-= counter*filterSize/2;
2107 filterPos-= counter/2;
2108 dst-= counter/2;
2109 __asm__ volatile(
2110 "pxor %%mm7, %%mm7 \n\t"
2111 ASMALIGN(4)
2112 "1: \n\t"
2113 "mov %2, %%"REG_c" \n\t"
2114 "movzwl (%%"REG_c", %0), %%eax \n\t"
2115 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2116 "mov %5, %%"REG_c" \n\t"
2117 "pxor %%mm4, %%mm4 \n\t"
2118 "pxor %%mm5, %%mm5 \n\t"
2119 "2: \n\t"
2120 "movq (%1), %%mm1 \n\t"
2121 "movq (%1, %6), %%mm3 \n\t"
2122 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2123 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2124 "punpcklbw %%mm7, %%mm0 \n\t"
2125 "punpcklbw %%mm7, %%mm2 \n\t"
2126 "pmaddwd %%mm1, %%mm0 \n\t"
2127 "pmaddwd %%mm2, %%mm3 \n\t"
2128 "paddd %%mm3, %%mm5 \n\t"
2129 "paddd %%mm0, %%mm4 \n\t"
2130 "add $8, %1 \n\t"
2131 "add $4, %%"REG_c" \n\t"
2132 "cmp %4, %%"REG_c" \n\t"
2133 " jb 2b \n\t"
2134 "add %6, %1 \n\t"
2135 "movq %%mm4, %%mm0 \n\t"
2136 "punpckldq %%mm5, %%mm4 \n\t"
2137 "punpckhdq %%mm5, %%mm0 \n\t"
2138 "paddd %%mm0, %%mm4 \n\t"
2139 "psrad $7, %%mm4 \n\t"
2140 "packssdw %%mm4, %%mm4 \n\t"
2141 "mov %3, %%"REG_a" \n\t"
2142 "movd %%mm4, (%%"REG_a", %0) \n\t"
2143 "add $4, %0 \n\t"
2144 " jnc 1b \n\t"
2145
2146 : "+r" (counter), "+r" (filter)
2147 : "m" (filterPos), "m" (dst), "m"(offset),
2148 "m" (src), "r" ((x86_reg)filterSize*2)
2149 : "%"REG_a, "%"REG_c, "%"REG_d
2150 );
2151 }
2152 #else
2153 #if COMPILE_TEMPLATE_ALTIVEC
2154 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2155 #else
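/* Generic fallback; the coefficients are presumably Q14 fixed point, since
 * the >>7 below leaves results on the <<7 intermediate scale that the
 * vertical scaler expects. */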
2156 int i;
2157 for (i=0; i<dstW; i++) {
2158 int j;
2159 int srcPos= filterPos[i];
2160 int val=0;
2161 //printf("filterPos: %d\n", filterPos[i]);
2162 for (j=0; j<filterSize; j++) {
2163 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2164 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2165 }
2166 //filter += hFilterSize;
2167 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2168 //dst[i] = val>>7;
2169 }
2170 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2171 #endif /* COMPILE_TEMPLATE_MMX */
2172 }
2173
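/* Linear interpolation in 16.16 fixed point: %%edi/%%esi hold src[xx] and
 * src[xx+1], %%ecx the 16-bit fractional position; the final shr $9 leaves
 * the result on the <<7 intermediate scale, matching hyscale_fast() below. */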
2174 #define FAST_BILINEAR_X86 \
2175 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2176 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2177 "shll $16, %%edi \n\t" \
2178 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2179 "mov %1, %%"REG_D"\n\t" \
2180 "shrl $9, %%esi \n\t" \
2181
2182 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2183 int dstWidth, const uint8_t *src, int srcW,
2184 int xInc)
2185 {
2186 int i;
2187 unsigned int xpos=0;
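// xpos is a 16.16 fixed-point source position; xalpha keeps its top seven
// fractional bits, so every output sample carries a <<7 scale.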
2188 for (i=0;i<dstWidth;i++) {
2189 register unsigned int xx=xpos>>16;
2190 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2191 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2192 xpos+=xInc;
2193 }
2194 }
2195
2196 // *** horizontal scale Y line to temp buffer
2197 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2198 int flags, const int16_t *hLumFilter,
2199 const int16_t *hLumFilterPos, int hLumFilterSize,
2200 int srcFormat, uint8_t *formatConvBuffer,
2201 uint32_t *pal, int isAlpha)
2202 {
2203 int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2204 int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
2205 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2206 void av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
2207 void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
2208
2209 if (isAlpha) {
2210 if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32)
2211 src += 3;
2212 } else {
2213 if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2214 src += ALT32_CORR;
2215 }
2216
2217 if (srcFormat == PIX_FMT_RGB48LE)
2218 src++;
2219
2220 if (internal_func) {
2221 internal_func(formatConvBuffer, src, srcW, pal);
2222 src= formatConvBuffer;
2223 }
2224
2225 #if COMPILE_TEMPLATE_MMX
2226 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2227 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2228 #else
2229 if (!(flags&SWS_FAST_BILINEAR))
2230 #endif
2231 {
2232 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2233 } else { // fast bilinear upscale / crap downscale
2234 #if ARCH_X86 && CONFIG_GPL
2235 #if COMPILE_TEMPLATE_MMX2
2236 int i;
2237 #if defined(PIC)
2238 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2239 #endif
2240 if (canMMX2BeUsed) {
2241 __asm__ volatile(
2242 #if defined(PIC)
2243 "mov %%"REG_b", %5 \n\t"
2244 #endif
2245 "pxor %%mm7, %%mm7 \n\t"
2246 "mov %0, %%"REG_c" \n\t"
2247 "mov %1, %%"REG_D" \n\t"
2248 "mov %2, %%"REG_d" \n\t"
2249 "mov %3, %%"REG_b" \n\t"
2250 "xor %%"REG_a", %%"REG_a" \n\t" // i
2251 PREFETCH" (%%"REG_c") \n\t"
2252 PREFETCH" 32(%%"REG_c") \n\t"
2253 PREFETCH" 64(%%"REG_c") \n\t"
2254
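/* CALL_MMX2_FILTER_CODE runs the generated horizontal filter (operand %4)
 * once per chunk; the two variants differ only in how the next filterPos
 * entry is added to the source pointer, as a 32-bit addl into a 64-bit
 * pointer register would truncate it on x86-64. */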
2255 #if ARCH_X86_64
2256
2257 #define CALL_MMX2_FILTER_CODE \
2258 "movl (%%"REG_b"), %%esi \n\t"\
2259 "call *%4 \n\t"\
2260 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2261 "add %%"REG_S", %%"REG_c" \n\t"\
2262 "add %%"REG_a", %%"REG_D" \n\t"\
2263 "xor %%"REG_a", %%"REG_a" \n\t"\
2264
2265 #else
2266
2267 #define CALL_MMX2_FILTER_CODE \
2268 "movl (%%"REG_b"), %%esi \n\t"\
2269 "call *%4 \n\t"\
2270 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2271 "add %%"REG_a", %%"REG_D" \n\t"\
2272 "xor %%"REG_a", %%"REG_a" \n\t"\
2273
2274 #endif /* ARCH_X86_64 */
2275
2276 CALL_MMX2_FILTER_CODE
2277 CALL_MMX2_FILTER_CODE
2278 CALL_MMX2_FILTER_CODE
2279 CALL_MMX2_FILTER_CODE
2280 CALL_MMX2_FILTER_CODE
2281 CALL_MMX2_FILTER_CODE
2282 CALL_MMX2_FILTER_CODE
2283 CALL_MMX2_FILTER_CODE
2284
2285 #if defined(PIC)
2286 "mov %5, %%"REG_b" \n\t"
2287 #endif
2288 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2289 "m" (mmx2FilterCode)
2290 #if defined(PIC)
2291 ,"m" (ebxsave)
2292 #endif
2293 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2294 #if !defined(PIC)
2295 ,"%"REG_b
2296 #endif
2297 );
2298 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2299 } else {
2300 #endif /* COMPILE_TEMPLATE_MMX2 */
2301 x86_reg xInc_shr16 = xInc >> 16;
2302 uint16_t xInc_mask = xInc & 0xffff;
2303 //NO MMX just normal asm ...
2304 __asm__ volatile(
2305 "xor %%"REG_a", %%"REG_a" \n\t" // i
2306 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2307 "xorl %%ecx, %%ecx \n\t" // xalpha
2308 ASMALIGN(4)
2309 "1: \n\t"
2310 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2311 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2312 FAST_BILINEAR_X86
2313 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2314 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2315 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2316
2317 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2318 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2319 FAST_BILINEAR_X86
2320 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2321 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2322 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2323
2324
2325 "add $2, %%"REG_a" \n\t"
2326 "cmp %2, %%"REG_a" \n\t"
2327 " jb 1b \n\t"
2328
2329
2330 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2331 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2332 );
2333 #if COMPILE_TEMPLATE_MMX2
2334 } //if MMX2 can't be used
2335 #endif
2336 #else
2337 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2338 #endif /* ARCH_X86 */
2339 }
2340
2341 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2342 int i;
2343 //FIXME all pal and rgb srcFormats could do this conversion as well
2344 //FIXME all scalers more complex than bilinear could do half of this transform
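/* The constants below encode full<->limited range on the <<7 scale:
 * 14071/16384 ~= 219/255 with 33561947>>14 ~= 16<<7 for full->limited, and
 * 19077/16384 ~= 255/219 for the inverse, which first clamps near 235<<7 so
 * the multiply cannot overflow. */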
2345 if(c->srcRange) {
2346 for (i=0; i<dstWidth; i++)
2347 dst[i]= (dst[i]*14071 + 33561947)>>14;
2348 } else {
2349 for (i=0; i<dstWidth; i++)
2350 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2351 }
2352 }
2353 }
2354
2355 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2356 int dstWidth, const uint8_t *src1,
2357 const uint8_t *src2, int srcW, int xInc)
2358 {
2359 int i;
2360 unsigned int xpos=0;
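// For the 7-bit xalpha, (xalpha^127) == 127-xalpha, so the sums below are
// src[xx]*(127-xalpha) + src[xx+1]*xalpha: a cheap /127 blend instead of the
// exact <<7 variant kept in the "slower" comment.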
2361 for (i=0;i<dstWidth;i++) {
2362 register unsigned int xx=xpos>>16;
2363 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2364 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2365 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2366 /* slower
2367 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2368 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2369 */
2370 xpos+=xInc;
2371 }
2372 }
2373
2374 static inline void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2375 int srcW, int xInc, int flags, const int16_t *hChrFilter,
2376 const int16_t *hChrFilterPos, int hChrFilterSize,
2377 int srcFormat, uint8_t *formatConvBuffer,
2378 uint32_t *pal)
2379 {
2380 int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2381 int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
2382 int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
2383 void av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
2384
2385 if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2386 return;
2387
2388 if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
2389 src1 += ALT32_CORR;
2390 src2 += ALT32_CORR;
2391 }
2392
2393 if (srcFormat==PIX_FMT_RGB48LE) {
2394 src1++;
2395 src2++;
2396 }
2397
2398 if (c->hcscale_internal) {
2399 c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2400 src1= formatConvBuffer;
2401 src2= formatConvBuffer+VOFW;
2402 }
2403
2404 #if COMPILE_TEMPLATE_MMX
2405 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2406 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2407 #else
2408 if (!(flags&SWS_FAST_BILINEAR))
2409 #endif
2410 {
2411 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2412 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2413 } else { // fast bilinear upscale / crap downscale
2414 #if ARCH_X86 && CONFIG_GPL
2415 #if COMPILE_TEMPLATE_MMX2
2416 int i;
2417 #if defined(PIC)
2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2419 #endif
2420 if (canMMX2BeUsed) {
2421 __asm__ volatile(
2422 #if defined(PIC)
2423 "mov %%"REG_b", %6 \n\t"
2424 #endif
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2434
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2446
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2451
2452 #if defined(PIC)
2453 "mov %6, %%"REG_b" \n\t"
2454 #endif
2455 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2456 "m" (mmx2FilterCode), "m" (src2)
2457 #if defined(PIC)
2458 ,"m" (ebxsave)
2459 #endif
2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2461 #if !defined(PIC)
2462 ,"%"REG_b
2463 #endif
2464 );
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2469 }
2470 } else {
2471 #endif /* COMPILE_TEMPLATE_MMX2 */
2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2473 uint16_t xInc_mask = xInc & 0xffff;
2474 __asm__ volatile(
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
2478 ASMALIGN(4)
2479 "1: \n\t"
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2483 FAST_BILINEAR_X86
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2485
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2488 FAST_BILINEAR_X86
2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2490
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2495 " jb 1b \n\t"
2496
2497 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498 which is needed to support GCC 4.0. */
2499 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2501 #else
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2503 #endif
2504 "r" (src2)
2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2506 );
2507 #if COMPILE_TEMPLATE_MMX2
2508 } //if MMX2 can't be used
2509 #endif
2510 #else
2511 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2512 #endif /* ARCH_X86 */
2513 }
2514 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
2515 int i;
2516 //FIXME all pal and rgb srcFormats could do this conversion as well
2517 //FIXME all scalers more complex than bilinear could do half of this transform
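/* Chroma analogue of the luma conversion in hyscale(): 1799/2048 ~= 224/255
 * with an offset that re-centers on 128<<7, and 4663/4096 ~= 255/224 for the
 * inverse, clamped near 240<<7 to avoid overflow. */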
2518 if(c->srcRange) {
2519 for (i=0; i<dstWidth; i++) {
2520 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2521 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2522 }
2523 } else {
2524 for (i=0; i<dstWidth; i++) {
2525 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2526 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2527 }
2528 }
2529 }
2530 }
2531
2532 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2533 int srcSliceH, uint8_t* dst[], int dstStride[])
2534 {
2535 /* load a few things into local vars to make the code more readable and faster */
2536 const int srcW= c->srcW;
2537 const int dstW= c->dstW;
2538 const int dstH= c->dstH;
2539 const int chrDstW= c->chrDstW;
2540 const int chrSrcW= c->chrSrcW;
2541 const int lumXInc= c->lumXInc;
2542 const int chrXInc= c->chrXInc;
2543 const int dstFormat= c->dstFormat;
2544 const int srcFormat= c->srcFormat;
2545 const int flags= c->flags;
2546 int16_t *vLumFilterPos= c->vLumFilterPos;
2547 int16_t *vChrFilterPos= c->vChrFilterPos;
2548 int16_t *hLumFilterPos= c->hLumFilterPos;
2549 int16_t *hChrFilterPos= c->hChrFilterPos;
2550 int16_t *vLumFilter= c->vLumFilter;
2551 int16_t *vChrFilter= c->vChrFilter;
2552 int16_t *hLumFilter= c->hLumFilter;
2553 int16_t *hChrFilter= c->hChrFilter;
2554 int32_t *lumMmxFilter= c->lumMmxFilter;
2555 int32_t *chrMmxFilter= c->chrMmxFilter;
2556 int32_t *alpMmxFilter= c->alpMmxFilter;
2557 const int vLumFilterSize= c->vLumFilterSize;
2558 const int vChrFilterSize= c->vChrFilterSize;
2559 const int hLumFilterSize= c->hLumFilterSize;
2560 const int hChrFilterSize= c->hChrFilterSize;
2561 int16_t **lumPixBuf= c->lumPixBuf;
2562 int16_t **chrPixBuf= c->chrPixBuf;
2563 int16_t **alpPixBuf= c->alpPixBuf;
2564 const int vLumBufSize= c->vLumBufSize;
2565 const int vChrBufSize= c->vChrBufSize;
2566 uint8_t *formatConvBuffer= c->formatConvBuffer;
2567 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
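/* -((-x) >> s) rounds the shift up (ceil(x/2^s) for x >= 0), so slices whose
 * height is not a multiple of the chroma subsampling still cover their last
 * chroma line. */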
2568 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2569 int lastDstY;
2570 uint32_t *pal=c->pal_yuv;
2571
2572 /* vars which will change and which we need to store back in the context */
2573 int dstY= c->dstY;
2574 int lumBufIndex= c->lumBufIndex;
2575 int chrBufIndex= c->chrBufIndex;
2576 int lastInLumBuf= c->lastInLumBuf;
2577 int lastInChrBuf= c->lastInChrBuf;
2578
2579 if (isPacked(c->srcFormat)) {
2580 src[0]=
2581 src[1]=
2582 src[2]=
2583 src[3]= src[0];
2584 srcStride[0]=
2585 srcStride[1]=
2586 srcStride[2]=
2587 srcStride[3]= srcStride[0];
2588 }
2589 srcStride[1]<<= c->vChrDrop;
2590 srcStride[2]<<= c->vChrDrop;
2591
2592 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2593 // (int)dst[0], (int)dst[1], (int)dst[2]);
2594
2595 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2596 //dstStride[0],dstStride[1],dstStride[2]);
2597
2598 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2599 static int warnedAlready=0; //FIXME move this into the context perhaps
2600 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2601 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2602 " ->cannot do aligned memory accesses anymore\n");
2603 warnedAlready=1;
2604 }
2605 }
2606
2607 /* Note that the user might start scaling in the middle of the picture, so this
2608    will not get executed. This is not really intended but currently works,
2609    so people might do it. */
2610 if (srcSliceY ==0) {
2611 lumBufIndex=0;
2612 chrBufIndex=0;
2613 dstY=0;
2614 lastInLumBuf= -1;
2615 lastInChrBuf= -1;
2616 }
2617
2618 lastDstY= dstY;
2619
2620 for (;dstY < dstH; dstY++) {
2621 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2622 const int chrDstY= dstY>>c->chrDstVSubSample;
2623 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2624 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2625 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2626
2627 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2628 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2629 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2630 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2631 int enough_lines;
2632
2633 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2634 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2635 //handle holes (FAST_BILINEAR & weird filters)
2636 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2637 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2638 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2639 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2640 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2641
2642 // Do we have enough lines in this slice to output the dstY line?
2643 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2644 if (!enough_lines) {
2645 lastLumSrcY = srcSliceY + srcSliceH - 1;
2646 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2647 }
2648
2649 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2650 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2651 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2652 vChrBufSize, vLumBufSize);*/
2653
2654 //Do horizontal scaling
2655 while(lastInLumBuf < lastLumSrcY) {
2656 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2657 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2658 lumBufIndex++;
2659 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2660 assert(lumBufIndex < 2*vLumBufSize);
2661 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2662 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2663 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2664 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2665 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2666 c->srcFormat, formatConvBuffer,
2667 pal, 0);
2668 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2669 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2670 flags, hLumFilter, hLumFilterPos, hLumFilterSize,
2671 c->srcFormat, formatConvBuffer,
2672