Do not assume long is same width as x86 register.
[libav.git] / libswscale / swscale_template.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
/* Drop any earlier definitions of the instruction-selection macros that are
 * (re)defined further down, so that they can be defined again without a
 * redefinition conflict. */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29 #undef EMMS
30 #undef SFENCE
31
/* EMMS: mnemonic (as a string literal) of the instruction used to clear the
 * MMX state: "femms" when HAVE_AMD3DNOW is set, "emms" otherwise. */
32 #if HAVE_AMD3DNOW
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34 #define EMMS "femms"
35 #else
36 #define EMMS "emms"
37 #endif
38
/* PREFETCH / PREFETCHW: prefetch mnemonics as string literals.
 * 3DNow! builds use prefetch/prefetchw, MMX2 builds use prefetchnta/prefetcht0.
 * Without either, both expand to " # nop", i.e. text starting with '#'
 * (an assembler comment in the syntax used here) instead of an instruction. */
39 #if HAVE_AMD3DNOW
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif HAVE_MMX2
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
45 #else
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
48 #endif
49
/* SFENCE: "sfence" on MMX2 builds; otherwise " # nop" (comment text only),
 * matching the fact that MOVNTQ below is a plain movq without MMX2. */
50 #if HAVE_MMX2
51 #define SFENCE "sfence"
52 #else
53 #define SFENCE " # nop"
54 #endif
55
/* PAVGB(a,b): one line of asm text for a packed byte average of a into b,
 * using pavgb (MMX2) or pavgusb (3DNow!). Both arguments are stringified.
 * The macro is left undefined when neither HAVE_MMX2 nor HAVE_AMD3DNOW is set. */
56 #if HAVE_MMX2
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif HAVE_AMD3DNOW
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #endif
61
/* MOVNTQ(a,b): one line of asm text storing MMX register a to memory operand b,
 * using movntq on MMX2 builds and a plain movq otherwise.
 * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
 * before REAL_MOVNTQ stringifies them with '#'; callers below pass operands
 * containing REGa, which would otherwise be emitted literally. */
62 #if HAVE_MMX2
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64 #else
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66 #endif
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
68
69 #if HAVE_ALTIVEC
70 #include "swscale_altivec_template.c"
71 #endif
72
/* YSCALEYUV2YV12X(x, offset, dest, width)
 * Expands to a complete __asm__ volatile statement; a variable `c` must be in
 * scope at the point of use.
 *   x, offset : string literals pasted into the asm text as displacements
 *   %0 = &c->redDither (base for VROUNDER_OFFSET and `offset`)
 *   %1 = dest, %2 = width
 * REG_a is the output index: it starts at 0, advances by 8 per outer pass and
 * the outer loop continues while REG_a is below %2 (unsigned compare, jb).
 * For every group of 8 outputs the accumulators mm3/mm4 start from the value
 * at VROUNDER_OFFSET(%0). The code then walks a list starting at offset(%0)
 * whose entries are 16 bytes apart: a source pointer at +0 and a coefficient
 * quadword at +8. A null source pointer ends the list. For each entry, 8 words
 * read from x + pointer + 2*REG_a are multiplied by the coefficient (pmulhw,
 * high halves) and added to the accumulators. The sums are shifted right by 3,
 * packed to 8 bytes with unsigned saturation and stored at %1 + REG_a.
 * Both the inner (jnz) and the outer (jb) branch target the same label 1;
 * the outer path reloads accumulators and list pointer before branching.
 * Clobbers REG_a, REG_d and REG_S (MMX registers are used but not declared). */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
74 __asm__ volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t" /* flags consumed by the jnz below; the MMX ops in between do not touch them */\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t" /* flags consumed by the jb below; movq/lea/mov leave them intact */\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 * Same interface and loop structure as YSCALEYUV2YV12X (complete asm statement,
 * %0 = &c->redDither, %1 = dest, %2 = width, REG_a = output index stepping by
 * 8, shared label 1 for the inner and outer loop), but accumulates in 32 bits.
 * Each list entry is APCK_SIZE bytes and supplies two source pointers (at +0
 * and at APCK_PTR2) plus a coefficient quadword at APCK_COEF. The words of the
 * two sources are interleaved (punpcklwd/punpckhwd) and multiplied-and-added
 * pairwise with pmaddwd into the dword accumulators mm4..mm7. The list ends
 * when the pointer read at APCK_SIZE (first pointer of the next entry) is null.
 * Afterwards the sums are shifted right by 16, packed to words with signed
 * saturation, offset by the value at VROUNDER_OFFSET(%0), shifted right by 3,
 * packed to bytes with unsigned saturation and stored at %1 + REG_a.
 * Clobbers REG_a, REG_d and REG_S. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 __asm__ volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t" /* flags consumed by the jnz below */\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t" /* flags consumed by the jb below */\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
170
/* YSCALEYUV2YV121
 * Asm text fragment only: the user supplies the __asm__ wrapper, the operands
 * %0, %1, %2 and the clobber list.
 * Starting with REG_a = %2, each pass reads 16 bytes (8 words) from
 * %0 + 2*REG_a, shifts the words right arithmetically by 7, packs them to
 * 8 bytes with unsigned saturation and stores them at %1 + REG_a via MOVNTQ.
 * REG_a then advances by 8 and the loop repeats until that add sets the
 * carry flag (jnc).
 * NOTE(review): this presumably expects %2 to be a negative count with %0/%1
 * pointing past the end of the buffers - confirm at the call site. */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
183
/* YSCALEYUV2YV121_ACCURATE
 * Rounding variant of YSCALEYUV2YV121 (same fragment form and operands).
 * Before the loop mm7 is set to 64 in every word (all ones -> >>15 gives 1 ->
 * <<6 gives 64), i.e. half of the 1<<7 divisor. Inside the loop it is added
 * with signed saturation (paddsw) before the arithmetic shift right by 7.
 * Loop control is identical: REG_a starts at %2, steps by 8 and the loop ends
 * when the add sets the carry flag. */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
201
202 /*
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208 */
/* YSCALEYUV2PACKEDX_UV
 * Opens an `__asm__ volatile(` statement and emits the chroma part of the
 * vertical filter. The statement is NOT closed here: more asm text must
 * follow and YSCALEYUV2PACKEDX_END supplies the operands and the closing ");".
 * REG_a is zeroed once and serves as the output index. Label 1 is the head of
 * the per-output loop (the WRITE* macros in this file branch back to it with
 * `jb 1b`); label 2 is the filter-tap loop.
 * Per pass of loop 1: mm3 and mm4 start from the value at VROUNDER_OFFSET(%0).
 * The list at CHR_MMX_FILTER_OFFSET(%0) is walked in 16-byte entries (source
 * pointer at +0, coefficient at +8, null pointer terminates). For each entry
 * the words at pointer + REG_a are accumulated into mm3 and the words at
 * VOF + pointer + REG_a into mm4, each after pmulhw with the coefficient.
 * Result: mm3 = U sums, mm4 = V sums. Uses mm0, mm2, mm5, REG_d, REG_S. */
209 #define YSCALEYUV2PACKEDX_UV \
210 __asm__ volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
/* YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2)
 * Asm text fragment: vertical filter for one plane, with the registers chosen
 * by the caller.
 *   offset      : string literal, displacement of the filter list from %0
 *   coeff       : register receiving each coefficient (stringified)
 *   src1, src2  : scratch registers for the two groups of 4 source words
 *   dst1, dst2  : accumulators, initialised from VROUNDER_OFFSET(%0)
 * The list at offset(%0) is walked in 16-byte entries (source pointer at +0,
 * coefficient at +8) until a null pointer is read. For each entry the words at
 * pointer + 2*REG_a and pointer + 2*REG_a + 8 are multiplied by the
 * coefficient with pmulhw and added to dst1 and dst2 respectively.
 * REG_a is read but not modified; REG_d and REG_S are overwritten.
 * Defines local label 2, so it can follow YSCALEYUV2PACKEDX_UV, whose own
 * `jnz 2b` has already been emitted by then. */
233 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
234 "lea "offset"(%0), %%"REG_d" \n\t"\
235 "mov (%%"REG_d"), %%"REG_S" \n\t"\
236 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
237 "movq "#dst1", "#dst2" \n\t"\
238 ASMALIGN(4)\
239 "2: \n\t"\
240 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
241 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
242 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
243 "add $16, %%"REG_d" \n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pmulhw "#coeff", "#src1" \n\t"\
246 "pmulhw "#coeff", "#src2" \n\t"\
247 "paddw "#src1", "#dst1" \n\t"\
248 "paddw "#src2", "#dst2" \n\t"\
249 "test %%"REG_S", %%"REG_S" \n\t"\
250 " jnz 2b \n\t"\
251
/* YSCALEYUV2PACKEDX
 * Chroma pass followed by the luma pass over the list at
 * LUM_MMX_FILTER_OFFSET(%0), using mm0 for the coefficient, mm2/mm5 as scratch
 * and mm1/mm7 as accumulators. After expansion: mm1 and mm7 hold the two luma
 * sums, mm3 the U sums and mm4 the V sums. The asm statement opened by
 * YSCALEYUV2PACKEDX_UV is still open. */
252 #define YSCALEYUV2PACKEDX \
253 YSCALEYUV2PACKEDX_UV \
254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
255
/* YSCALEYUV2PACKEDX_END
 * Operand list, clobber list and closing ");" for an asm statement opened by
 * YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV. Requires `c`, `dummy`,
 * `dest` and `dstW` to be in scope.
 *   %0 = &c->redDither
 *   %1..%3 = dummy (none of the YSCALEYUV2PACKEDX* fragments reference them;
 *            they keep dest and dstW at positions %4 and %5)
 *   %4 = dest, %5 = dstW
 * Declares REG_a, REG_d and REG_S as clobbered. */
256 #define YSCALEYUV2PACKEDX_END \
257 :: "r" (&c->redDither), \
258 "m" (dummy), "m" (dummy), "m" (dummy),\
259 "r" (dest), "m" (dstW) \
260 : "%"REG_a, "%"REG_d, "%"REG_S \
261 );
262
/* YSCALEYUV2PACKEDX_ACCURATE_UV
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX_UV. It also opens an
 * `__asm__ volatile(` statement that a later macro must close, zeroes REG_a,
 * and defines label 1 (per-output loop head) and label 2 (tap loop).
 * The list at CHR_MMX_FILTER_OFFSET(%0) is walked in APCK_SIZE-byte entries,
 * each giving two source pointers (+0 and APCK_PTR2) and a coefficient at
 * APCK_COEF. The list ends when the pointer at APCK_SIZE is null. Words from
 * the two sources are interleaved and combined with pmaddwd: U into mm4/mm5,
 * V (read at displacement VOF) into mm6/mm7.
 * After the loop the sums are shifted right by 16, packed to words, offset by
 * the value at VROUNDER_OFFSET(%0) and written to U_TEMP(%0) and V_TEMP(%0).
 * They are spilled to memory because YSCALEYUV2PACKEDX_ACCURATE_YA overwrites
 * mm4 and mm6 and reloads the values itself. */
263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
264 __asm__ volatile(\
265 "xor %%"REG_a", %%"REG_a" \n\t"\
266 ASMALIGN(4)\
267 "nop \n\t"\
268 "1: \n\t"\
269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pxor %%mm4, %%mm4 \n\t"\
272 "pxor %%mm5, %%mm5 \n\t"\
273 "pxor %%mm6, %%mm6 \n\t"\
274 "pxor %%mm7, %%mm7 \n\t"\
275 ASMALIGN(4)\
276 "2: \n\t"\
277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
281 "movq %%mm0, %%mm3 \n\t"\
282 "punpcklwd %%mm1, %%mm0 \n\t"\
283 "punpckhwd %%mm1, %%mm3 \n\t"\
284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
285 "pmaddwd %%mm1, %%mm0 \n\t"\
286 "pmaddwd %%mm1, %%mm3 \n\t"\
287 "paddd %%mm0, %%mm4 \n\t"\
288 "paddd %%mm3, %%mm5 \n\t"\
289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
292 "test %%"REG_S", %%"REG_S" \n\t"\
293 "movq %%mm2, %%mm0 \n\t"\
294 "punpcklwd %%mm3, %%mm2 \n\t"\
295 "punpckhwd %%mm3, %%mm0 \n\t"\
296 "pmaddwd %%mm1, %%mm2 \n\t"\
297 "pmaddwd %%mm1, %%mm0 \n\t"\
298 "paddd %%mm2, %%mm6 \n\t"\
299 "paddd %%mm0, %%mm7 \n\t"\
300 " jnz 2b \n\t"\
301 "psrad $16, %%mm4 \n\t"\
302 "psrad $16, %%mm5 \n\t"\
303 "psrad $16, %%mm6 \n\t"\
304 "psrad $16, %%mm7 \n\t"\
305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
306 "packssdw %%mm5, %%mm4 \n\t"\
307 "packssdw %%mm7, %%mm6 \n\t"\
308 "paddw %%mm0, %%mm4 \n\t"\
309 "paddw %%mm0, %%mm6 \n\t"\
310 "movq %%mm4, "U_TEMP"(%0) \n\t"\
311 "movq %%mm6, "V_TEMP"(%0) \n\t"\
312
/* YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 * Asm text fragment: 32-bit-accumulating vertical filter for the plane whose
 * filter list lives at offset(%0) (`offset` is a string literal).
 * Entries are APCK_SIZE bytes with two source pointers (+0 and APCK_PTR2) and
 * a coefficient at APCK_COEF; a null pointer at APCK_SIZE ends the list.
 * Samples are read at pointer + 2*REG_a (first four words) and +8 (next four),
 * interleaved between the two sources and combined with pmaddwd into the
 * dword accumulators mm1/mm5 and mm7/mm6.
 * After the loop the sums are shifted right by 16, packed to words and offset
 * by the value at VROUNDER_OFFSET(%0). Result: mm1 and mm7 hold the two
 * groups of four filtered samples; mm3 and mm4 are reloaded from U_TEMP(%0)
 * and V_TEMP(%0). All eight MMX registers, REG_d and REG_S are overwritten.
 * Defines local label 2. */
313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
314 "lea "offset"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm1, %%mm1 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm7, %%mm7 \n\t"\
319 "pxor %%mm6, %%mm6 \n\t"\
320 ASMALIGN(4)\
321 "2: \n\t"\
322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
326 "movq %%mm0, %%mm3 \n\t"\
327 "punpcklwd %%mm4, %%mm0 \n\t"\
328 "punpckhwd %%mm4, %%mm3 \n\t"\
329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
330 "pmaddwd %%mm4, %%mm0 \n\t"\
331 "pmaddwd %%mm4, %%mm3 \n\t"\
332 "paddd %%mm0, %%mm1 \n\t"\
333 "paddd %%mm3, %%mm5 \n\t"\
334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
337 "test %%"REG_S", %%"REG_S" \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "punpcklwd %%mm3, %%mm2 \n\t"\
340 "punpckhwd %%mm3, %%mm0 \n\t"\
341 "pmaddwd %%mm4, %%mm2 \n\t"\
342 "pmaddwd %%mm4, %%mm0 \n\t"\
343 "paddd %%mm2, %%mm7 \n\t"\
344 "paddd %%mm0, %%mm6 \n\t"\
345 " jnz 2b \n\t"\
346 "psrad $16, %%mm1 \n\t"\
347 "psrad $16, %%mm5 \n\t"\
348 "psrad $16, %%mm7 \n\t"\
349 "psrad $16, %%mm6 \n\t"\
350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
351 "packssdw %%mm5, %%mm1 \n\t"\
352 "packssdw %%mm6, %%mm7 \n\t"\
353 "paddw %%mm0, %%mm1 \n\t"\
354 "paddw %%mm0, %%mm7 \n\t"\
355 "movq "U_TEMP"(%0), %%mm3 \n\t"\
356 "movq "V_TEMP"(%0), %%mm4 \n\t"\
357
/* YSCALEYUV2PACKEDX_ACCURATE
 * Accurate chroma pass followed by the accurate luma pass over the list at
 * LUM_MMX_FILTER_OFFSET(%0). Leaves the same register layout as
 * YSCALEYUV2PACKEDX (mm1/mm7 = luma, mm3 = U, mm4 = V) with the asm statement
 * still open. */
358 #define YSCALEYUV2PACKEDX_ACCURATE \
359 YSCALEYUV2PACKEDX_ACCURATE_UV \
360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
361
/* YSCALEYUV2RGBX
 * Asm text fragment: colour conversion using the offsets and coefficients
 * stored relative to %0 (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR_COEFF,
 * Y_COEFF).
 * Input : mm1 = first four luma words, mm7 = next four luma words,
 *         mm3 = four U words, mm4 = four V words.
 * Steps : subtract the offsets, multiply by the coefficients with pmulhw,
 *         sum the two green contributions, duplicate every chroma term for
 *         two adjacent pixels (punpcklwd/punpckhwd of a register with
 *         itself), add the luma terms and pack to bytes with unsigned
 *         saturation.
 * Output: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
 * mm0, mm1, mm3, mm6 and mm7 are overwritten as well. */
362 #define YSCALEYUV2RGBX \
363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
377 "paddw %%mm3, %%mm4 \n\t"\
378 "movq %%mm2, %%mm0 \n\t"\
379 "movq %%mm5, %%mm6 \n\t"\
380 "movq %%mm4, %%mm3 \n\t"\
381 "punpcklwd %%mm2, %%mm2 \n\t"\
382 "punpcklwd %%mm5, %%mm5 \n\t"\
383 "punpcklwd %%mm4, %%mm4 \n\t"\
384 "paddw %%mm1, %%mm2 \n\t"\
385 "paddw %%mm1, %%mm5 \n\t"\
386 "paddw %%mm1, %%mm4 \n\t"\
387 "punpckhwd %%mm0, %%mm0 \n\t"\
388 "punpckhwd %%mm6, %%mm6 \n\t"\
389 "punpckhwd %%mm3, %%mm3 \n\t"\
390 "paddw %%mm7, %%mm0 \n\t"\
391 "paddw %%mm7, %%mm6 \n\t"\
392 "paddw %%mm7, %%mm3 \n\t"\
393 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
394 "packuswb %%mm0, %%mm2 \n\t"\
395 "packuswb %%mm6, %%mm5 \n\t"\
396 "packuswb %%mm3, %%mm4 \n\t"\
397
/* REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 * Asm text fragment blending two luma lines (%0 = buf0, %1 = buf1) and two
 * chroma lines (%2 = uvbuf0, %3 = uvbuf1).
 *   index : register used as loop index (stringified); zeroed here
 *   c     : register holding the context base address (stringified)
 * Side effect: before the loop the quadwords at CHR_MMX_FILTER_OFFSET+8(c)
 * and LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to
 * memory, so the stored blend factors are modified on every expansion.
 * Per pass: result = (line1 >> 7) + pmulhw(line0 - line1, factor).
 * Output: mm1 = first four luma words, mm7 = next four, mm3 = U, mm4 = V.
 * The loop opened at label 1 is not closed here; a following fragment has to
 * advance `index` and branch back to 1.
 * YSCALEYUV2PACKED forwards to REAL_YSCALEYUV2PACKED so that its arguments are
 * macro-expanded before being stringified. */
398 #define REAL_YSCALEYUV2PACKED(index, c) \
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
406 ASMALIGN(4)\
407 "1: \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
430 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
433
434 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
435
/* REAL_YSCALEYUV2RGB_UV(index, c)
 * Asm text fragment: chroma half of the two-line blend for RGB output.
 * Zeroes `index`, opens the loop at label 1 and, per pass, blends the chroma
 * lines %2 (uvbuf0) and %3 (uvbuf1) as
 *   (uvbuf1 >> 4) + pmulhw(uvbuf0 - uvbuf1, factor at CHR_MMX_FILTER_OFFSET+8(c)).
 * It then subtracts U_OFFSET(c) / V_OFFSET(c) and starts the colour matrix.
 * Output: mm2 = U term, mm5 = V term (both before the UB/VR multiply),
 *         mm3 = U * UG_COEFF, mm4 = V * VG_COEFF (high halves).
 * The loop is left open for the fragments that follow. */
436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
437 "xor "#index", "#index" \n\t"\
438 ASMALIGN(4)\
439 "1: \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
460
/* REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 * Asm text fragment: blends two lines addressed by the operands b1 and b2
 * (stringified, e.g. %0 and %1) at element offset `index`:
 *   (b2 >> 4) + pmulhw(b1 - b2, factor at LUM_MMX_FILTER_OFFSET+8(c)).
 * Output: mm1 = first four words, mm7 = next four words.
 * mm0 and mm6 are used as scratch; `index` is only read. */
461 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
462 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
474
/* REAL_YSCALEYUV2RGB_COEFF(c)
 * Asm text fragment: second half of the colour matrix, with coefficients read
 * relative to the register named by `c`.
 * Input : mm1/mm7 = luma (first/next four words), mm2 = U term, mm5 = V term,
 *         mm3 = U green contribution, mm4 = V green contribution.
 * Steps : multiply mm2/mm5 by UB_COEFF/VR_COEFF, subtract Y_OFFSET from the
 *         luma and multiply by Y_COEFF, sum the green contributions, duplicate
 *         each chroma term for two adjacent pixels, add luma and pack to
 *         bytes with unsigned saturation.
 * Output: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
 * mm0, mm3 and mm6 are overwritten. */
475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
503
/* YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments are
 * macro-expanded before being stringified.
 * YSCALEYUV2RGB(index, c) chains the three fragments of the two-line blend
 * plus colour conversion: chroma blend (opens the loop at label 1), blend of
 * the lines at operands %0 and %1, then the colour matrix. It leaves
 * mm2 = B, mm4 = G, mm5 = R with the loop still open. */
504 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
505
506 #define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
508 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
509 REAL_YSCALEYUV2RGB_COEFF(c)
510
/* REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 * Asm text fragment for the single-line case: no blending, only %0 (luma) and
 * %2 (chroma) are read. Zeroes `index`, opens the loop at label 1 and shifts
 * every loaded word right arithmetically by 7.
 * Output: mm1 = first four luma words, mm7 = next four, mm3 = U, mm4 = V
 * (V is read at displacement VOF from the chroma pointer).
 * The parameter `c` is not used in the expansion. The loop is left open.
 * YSCALEYUV2PACKED1 forwards here so that arguments are macro-expanded before
 * being stringified. */
511 #define REAL_YSCALEYUV2PACKED1(index, c) \
512 "xor "#index", "#index" \n\t"\
513 ASMALIGN(4)\
514 "1: \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
523
524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
525
/* REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 * Asm text fragment for the single-line case with colour conversion.
 * Zeroes `index`, opens the loop at label 1, loads chroma from %2 (V at
 * displacement VOF) and luma from %0, shifts each right arithmetically by 4
 * and runs the same colour matrix as REAL_YSCALEYUV2RGB_COEFF, using the
 * offsets and coefficients stored relative to the register named by `c`.
 * Output: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
 * The loop is left open for a following WRITE* fragment.
 * YSCALEYUV2RGB1 forwards here so that arguments are macro-expanded before
 * being stringified. */
526 #define REAL_YSCALEYUV2RGB1(index, c) \
527 "xor "#index", "#index" \n\t"\
528 ASMALIGN(4)\
529 "1: \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
572
573 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
574
/* REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 * Asm text fragment: single luma line (%0) with the chroma taken as the sum
 * of the two chroma lines %2 and %3. Zeroes `index` and opens the loop at
 * label 1.
 * Chroma: uvbuf0 + uvbuf1, then a logical shift right by 8 (psrlw).
 * Luma  : arithmetic shift right by 7.
 * Output: mm1 = first four luma words, mm7 = next four, mm3 = U, mm4 = V.
 * The parameter `c` is not used in the expansion. The loop is left open.
 * YSCALEYUV2PACKED1b forwards here so that arguments are macro-expanded
 * before being stringified. */
575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
576 "xor "#index", "#index" \n\t"\
577 ASMALIGN(4)\
578 "1: \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
591 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
592
/* REAL_YSCALEYUV2RGB1b(index, c) / YSCALEYUV2RGB1b(index, c)
 * Asm text fragment: single luma line (%0) with chroma formed from the sum of
 * the two chroma lines %2 and %3, followed by colour conversion.
 * Zeroes `index` and opens the loop at label 1.
 * Chroma: uvbuf0 + uvbuf1, then a logical shift right by 5 (one bit more than
 *         the shift by 4 used for a single line, i.e. the average of the two
 *         lines). The existing FIXME notes that this step might overflow.
 * Luma  : arithmetic shift right by 4.
 * The colour matrix is the same sequence as in REAL_YSCALEYUV2RGB1, with the
 * offsets and coefficients read relative to the register named by `c`.
 * Output: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes. Loop left open.
 * YSCALEYUV2RGB1b forwards here so that arguments are macro-expanded before
 * being stringified. */
593 // do vertical chrominance interpolation
594 #define REAL_YSCALEYUV2RGB1b(index, c) \
595 "xor "#index", "#index" \n\t"\
596 ASMALIGN(4)\
597 "1: \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
644
645 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
646
/* REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) / WRITEBGR32(...)
 * Asm text fragment: interleaves four 8-byte planes into 8 four-byte pixels,
 * stores them and closes the per-output loop. All arguments are operand or
 * register names that get stringified.
 *   dst, index : base and index of the store address (scaled by 4)
 *   dstw       : loop bound; the loop repeats while index < dstw (unsigned)
 *   b, g, r, a : registers holding the four byte planes (b and r are
 *                overwritten)
 *   q0, q2, q3, t : scratch registers
 * Byte order in memory per pixel is b, g, r, a. 32 bytes are written per
 * pass via MOVNTQ at dst + 4*index + {0, 8, 16, 24}. `index` then advances
 * by 8 and the code branches back to the nearest preceding label 1, which a
 * preceding fragment must have defined.
 * WRITEBGR32 forwards here so that arguments are macro-expanded before being
 * stringified. */
647 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
648 "movq "#b", "#q2" \n\t" /* B */\
649 "movq "#r", "#t" \n\t" /* R */\
650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
660 \
661 MOVNTQ( q0, (dst, index, 4))\
662 MOVNTQ( b, 8(dst, index, 4))\
663 MOVNTQ( q2, 16(dst, index, 4))\
664 MOVNTQ( q3, 24(dst, index, 4))\
665 \
666 "add $8, "#index" \n\t"\
667 "cmp "#dstw", "#index" \n\t"\
668 " jb 1b \n\t"
669 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
670
/* REAL_WRITERGB16(dst, dstw, index) / WRITERGB16(dst, dstw, index)
 * Asm text fragment: packs 8 pixels from the byte planes mm2 (B), mm4 (G) and
 * mm5 (R) into 16-bit values, stores them and closes the per-output loop.
 * B and R are masked with the constant bF8, G with bFC (both referenced via
 * MANGLE and defined elsewhere). B is shifted right by 3, the planes are
 * interleaved bytewise and the widened G is shifted left by 3 and OR-ed in.
 * NOTE(review): G is widened by interleaving with mm7, so mm7 is presumably
 * expected to be zero on entry - confirm at the call sites.
 * 16 bytes are written per pass via MOVNTQ at dst + 2*index + {0, 8}.
 * `index` then advances by 8 and the loop repeats while index < dstw
 * (unsigned), branching to the nearest preceding label 1.
 * mm1 and mm3 are used as scratch.
 * WRITERGB16 forwards here so that arguments are macro-expanded before being
 * stringified. */
671 #define REAL_WRITERGB16(dst, dstw, index) \
672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
673 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
675 "psrlq $3, %%mm2 \n\t"\
676 \
677 "movq %%mm2, %%mm1 \n\t"\
678 "movq %%mm4, %%mm3 \n\t"\
679 \
680 "punpcklbw %%mm7, %%mm3 \n\t"\
681 "punpcklbw %%mm5, %%mm2 \n\t"\
682 "punpckhbw %%mm7, %%mm4 \n\t"\
683 "punpckhbw %%mm5, %%mm1 \n\t"\
684 \
685 "psllq $3, %%mm3 \n\t"\
686 "psllq $3, %%mm4 \n\t"\
687 \
688 "por %%mm3, %%mm2 \n\t"\
689 "por %%mm4, %%mm1 \n\t"\
690 \
691 MOVNTQ(%%mm2, (dst, index, 2))\
692 MOVNTQ(%%mm1, 8(dst, index, 2))\
693 \
694 "add $8, "#index" \n\t"\
695 "cmp "#dstw", "#index" \n\t"\
696 " jb 1b \n\t"
697 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
698
/* REAL_WRITERGB15(dst, dstw, index) / WRITERGB15(dst, dstw, index)
 * Same structure as REAL_WRITERGB16 with different masks and shifts: all
 * three planes are masked with the constant bF8, B is shifted right by 3,
 * R is shifted right by 1, and the widened G is shifted left by 2 before
 * being OR-ed in.
 * NOTE(review): G is widened by interleaving with mm7, so mm7 is presumably
 * expected to be zero on entry - confirm at the call sites.
 * 16 bytes are written per pass via MOVNTQ at dst + 2*index + {0, 8}.
 * `index` then advances by 8 and the loop repeats while index < dstw
 * (unsigned), branching to the nearest preceding label 1.
 * WRITERGB15 forwards here so that arguments are macro-expanded before being
 * stringified. */
699 #define REAL_WRITERGB15(dst, dstw, index) \
700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 "psrlq $3, %%mm2 \n\t"\
704 "psrlq $1, %%mm5 \n\t"\
705 \
706 "movq %%mm2, %%mm1 \n\t"\
707 "movq %%mm4, %%mm3 \n\t"\
708 \
709 "punpcklbw %%mm7, %%mm3 \n\t"\
710 "punpcklbw %%mm5, %%mm2 \n\t"\
711 "punpckhbw %%mm7, %%mm4 \n\t"\
712 "punpckhbw %%mm5, %%mm1 \n\t"\
713 \
714 "psllq $2, %%mm3 \n\t"\
715 "psllq $2, %%mm4 \n\t"\
716 \
717 "por %%mm3, %%mm2 \n\t"\
718 "por %%mm4, %%mm1 \n\t"\
719 \
720 MOVNTQ(%%mm2, (dst, index, 2))\
721 MOVNTQ(%%mm1, 8(dst, index, 2))\
722 \
723 "add $8, "#index" \n\t"\
724 "cmp "#dstw", "#index" \n\t"\
725 " jb 1b \n\t"
726 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
727
/* WRITEBGR24OLD(dst, dstw, index)
 * Asm text fragment: packs 8 pixels from the byte planes mm2 (B), mm4 (G) and
 * mm5 (R), with mm7 = 0, into 24 bytes of 3-byte pixels and closes the
 * per-output loop.
 * The planes are first interleaved into four quadwords of 0RGB pixels. The
 * zero bytes are then squeezed out with shifts and the mask constants
 * bm00000111, bm11111000 and bm00001111 (referenced via MANGLE, defined
 * elsewhere), producing three output quadwords.
 * Unlike the 16/32-bit writers the store address is not indexed: `dst` itself
 * is advanced by 24 per pass, so it must name a register the asm may modify.
 * `index` advances by 8 and the loop repeats while index < dstw (unsigned),
 * branching to the nearest preceding label 1.
 * mm0..mm6 are overwritten. */
728 #define WRITEBGR24OLD(dst, dstw, index) \
729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 "movq %%mm2, %%mm1 \n\t" /* B */\
731 "movq %%mm5, %%mm6 \n\t" /* R */\
732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
742 \
743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
751 \
752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
765 \
766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
774 \
775 MOVNTQ(%%mm0, (dst))\
776 MOVNTQ(%%mm2, 8(dst))\
777 MOVNTQ(%%mm3, 16(dst))\
778 "add $24, "#dst" \n\t"\
779 \
780 "add $8, "#index" \n\t"\
781 "cmp "#dstw", "#index" \n\t"\
782 " jb 1b \n\t"
783
/* WRITEBGR24MMX(dst, dstw, index)
 * Asm text fragment with the same contract as WRITEBGR24OLD (input mm2 = B,
 * mm4 = G, mm5 = R, mm7 = 0; 24 bytes written per pass; `dst` advanced by 24;
 * `index` advanced by 8; loop to label 1 while index < dstw, unsigned), but
 * the zero bytes are removed with shifts and punpckhdq only, without mask
 * constants from memory.
 * After the initial interleave each of the four 0RGB0RGB quadwords is turned
 * into 0RGBRGB0 (shift left by 40, then punpckhdq with its own copy). The
 * four results are then spliced into three output quadwords, each stored as
 * soon as it is complete.
 * All eight MMX registers are overwritten, including mm7. */
784 #define WRITEBGR24MMX(dst, dstw, index) \
785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 "movq %%mm2, %%mm1 \n\t" /* B */\
787 "movq %%mm5, %%mm6 \n\t" /* R */\
788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
798 \
799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
803 \
804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
808 \
809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
813 \
814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 MOVNTQ(%%mm0, (dst))\
819 \
820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 MOVNTQ(%%mm6, 8(dst))\
825 \
826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 MOVNTQ(%%mm5, 16(dst))\
830 \
831 "add $24, "#dst" \n\t"\
832 \
833 "add $8, "#index" \n\t"\
834 "cmp "#dstw", "#index" \n\t"\
835 " jb 1b \n\t"
836
/*
 * WRITEBGR24MMX2(dst, dstw, index): pshufw-based variant of the 24-bit store.
 *
 * Same contract as the plain-MMX version: mm2/mm4/mm5 hold 8 B/G/R bytes on
 * entry, three quadwords are written with MOVNTQ to dst, 8(dst) and 16(dst),
 * dst is advanced by 24, index by 8, and control returns to local label "1"
 * while index is (unsigned) below dstw.
 *
 * Each output quadword is assembled by replicating the needed source words
 * with pshufw, masking them with the ff_M24A / ff_M24B / ff_M24C constants
 * and OR-ing the three colour components together.
 *
 * Note that mm0 and mm7 are loaded with ff_M24A and ff_M24C right away, so
 * the "mm7=0" entry condition mentioned below is not relied upon here; mm4
 * is shifted in place and mm0, mm1, mm3, mm6, mm7 are overwritten.
 */
837 #define WRITEBGR24MMX2(dst, dstw, index) \
838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
844 \
845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
848 \
849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 "por %%mm1, %%mm6 \n\t"\
851 "por %%mm3, %%mm6 \n\t"\
852 MOVNTQ(%%mm6, (dst))\
853 \
854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
858 \
859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
862 \
863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 "por %%mm3, %%mm6 \n\t"\
865 MOVNTQ(%%mm6, 8(dst))\
866 \
867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
870 \
871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
874 \
875 "por %%mm1, %%mm3 \n\t"\
876 "por %%mm3, %%mm6 \n\t"\
877 MOVNTQ(%%mm6, 16(dst))\
878 \
879 "add $24, "#dst" \n\t"\
880 \
881 "add $8, "#index" \n\t"\
882 "cmp "#dstw", "#index" \n\t"\
883 " jb 1b \n\t"
884
/*
 * WRITEBGR24 is the name the converters use; bind it to the pshufw-based
 * implementation when HAVE_MMX2 is set and to the plain-MMX one otherwise.
 * The #undef drops any earlier definition of WRITEBGR24 before it is
 * redefined for this configuration.
 */
885 #if HAVE_MMX2
886 #undef WRITEBGR24
887 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
888 #else
889 #undef WRITEBGR24
890 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
891 #endif
892
/*
 * REAL_WRITEYUY2(dst, dstw, index): tail of a conversion loop that stores
 * 8 pixels (16 bytes) of interleaved output.
 *
 * The 16-bit words in mm3 and mm4 are saturated to bytes and interleaved
 * with each other; mm1 and mm7 are saturated into one register of 8 bytes
 * and then interleaved with that mm3/mm4 mix.  The two resulting quadwords
 * are written with MOVNTQ at dst + 2*index and dst + 2*index + 8.
 * NOTE(review): the YUY2 byte order implies mm1/mm7 carry luma and mm3/mm4
 * carry U/V - confirm against the YSCALEYUV2PACKED* macros that feed this.
 *
 * index is then advanced by 8 and the macro jumps back to local label "1"
 * while index is (unsigned) below dstw.
 *
 * WRITEYUY2 is a forwarding wrapper, so that macro arguments are expanded
 * before REAL_WRITEYUY2 stringifies them with '#'.
 */
893 #define REAL_WRITEYUY2(dst, dstw, index) \
894 "packuswb %%mm3, %%mm3 \n\t"\
895 "packuswb %%mm4, %%mm4 \n\t"\
896 "packuswb %%mm7, %%mm1 \n\t"\
897 "punpcklbw %%mm4, %%mm3 \n\t"\
898 "movq %%mm1, %%mm7 \n\t"\
899 "punpcklbw %%mm3, %%mm1 \n\t"\
900 "punpckhbw %%mm3, %%mm7 \n\t"\
901 \
902 MOVNTQ(%%mm1, (dst, index, 2))\
903 MOVNTQ(%%mm7, 8(dst, index, 2))\
904 \
905 "add $8, "#index" \n\t"\
906 "cmp "#dstw", "#index" \n\t"\
907 " jb 1b \n\t"
908 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
909
910
/**
 * Vertically filter several input lines into one line of planar output.
 *
 * With HAVE_MMX and without SWS_BITEXACT the work is done by the
 * YSCALEYUV2YV12X / YSCALEYUV2YV12X_ACCURATE macros (the latter when
 * SWS_ACCURATE_RND is set): once for U and once for V when uDest is
 * non-NULL, then once for luma, after which the function returns.
 * The macros are defined outside this excerpt; the filter arguments are not
 * named in this branch, so they are presumably picked up through the context
 * - confirm against the macro definitions.
 *
 * Otherwise all arguments except c are forwarded to
 * yuv2yuvX_altivec_real() (HAVE_ALTIVEC) or to yuv2yuvXinC().
 *
 * @param c      scaler context (flags select the code path)
 * @param uDest  U output line, or NULL to skip chroma in the MMX path
 */
911 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
912 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
913 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
914 {
915 #if HAVE_MMX
916 if(!(c->flags & SWS_BITEXACT)){
917 if (c->flags & SWS_ACCURATE_RND){
918 if (uDest){
919 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
920 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
921 }
922
923 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
924 }else{
925 if (uDest){
926 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
927 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
928 }
929
930 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
931 }
932 return;
933 }
934 #endif
935 #if HAVE_ALTIVEC
936 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
937 chrFilter, chrSrc, chrFilterSize,
938 dest, uDest, vDest, dstW, chrDstW);
939 #else //HAVE_ALTIVEC
/* Portable C fallback. */
940 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
941 chrFilter, chrSrc, chrFilterSize,
942 dest, uDest, vDest, dstW, chrDstW);
943 #endif //!HAVE_ALTIVEC
944 }
945
946 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
947 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
948 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
949 {
950 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
951 chrFilter, chrSrc, chrFilterSize,
952 dest, uDest, dstW, chrDstW, dstFormat);
953 }
954
955 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
956 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
957 {
958 int i;
959 #if HAVE_MMX
960 if(!(c->flags & SWS_BITEXACT)){
961 long p= uDest ? 3 : 1;
962 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
963 uint8_t *dst[3]= {dest, uDest, vDest};
964 x86_reg counter[3] = {dstW, chrDstW, chrDstW};
965
966 if (c->flags & SWS_ACCURATE_RND){
967 while(p--){
968 __asm__ volatile(
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 "g" (-counter[p])
972 : "%"REG_a
973 );
974 }
975 }else{
976 while(p--){
977 __asm__ volatile(
978 YSCALEYUV2YV121
979 :: "r" (src[p]), "r" (dst[p] + counter[p]),
980 "g" (-counter[p])
981 : "%"REG_a
982 );
983 }
984 }
985 return;
986 }
987 #endif
988 for (i=0; i<dstW; i++)
989 {
990 int val= (lumSrc[i]+64)>>7;
991
992 if (val&256){
993 if (val<0) val=0;
994 else val=255;
995 }
996
997 dest[i]= val;
998 }
999
1000 if (uDest)
1001 for (i=0; i<chrDstW; i++)
1002 {
1003 int u=(chrSrc[i ]+64)>>7;
1004 int v=(chrSrc[i + VOFW]+64)>>7;
1005
1006 if ((u|v)&256){
1007 if (u<0) u=0;
1008 else if (u>255) u=255;
1009 if (v<0) v=0;
1010 else if (v>255) v=255;
1011 }
1012
1013 uDest[i]= u;
1014 vDest[i]= v;
1015 }
1016 }
1017
1018
1019 /**
1020 * vertical scale YV12 to RGB
1021 */
1022 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1023 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1024 uint8_t *dest, long dstW, long dstY)
1025 {
1026 #if HAVE_MMX
1027 x86_reg dummy=0;
1028 if(!(c->flags & SWS_BITEXACT)){
1029 if (c->flags & SWS_ACCURATE_RND){
1030 switch(c->dstFormat){
1031 case PIX_FMT_RGB32:
1032 YSCALEYUV2PACKEDX_ACCURATE
1033 YSCALEYUV2RGBX
1034 "pcmpeqd %%mm7, %%mm7 \n\t"
1035 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1036
1037 YSCALEYUV2PACKEDX_END
1038 return;
1039 case PIX_FMT_BGR24:
1040 YSCALEYUV2PACKEDX_ACCURATE
1041 YSCALEYUV2RGBX
1042 "pxor %%mm7, %%mm7 \n\t"
1043 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1044 "add %4, %%"REG_c" \n\t"
1045 WRITEBGR24(%%REGc, %5, %%REGa)
1046
1047
1048 :: "r" (&c->redDither),
1049 "m" (dummy), "m" (dummy), "m" (dummy),
1050 "r" (dest), "m" (dstW)
1051 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1052 );
1053 return;
1054 case PIX_FMT_RGB555:
1055 YSCALEYUV2PACKEDX_ACCURATE
1056 YSCALEYUV2RGBX
1057 "pxor %%mm7, %%mm7 \n\t"
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1059 #ifdef DITHER1XBPP
1060 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1061 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1062 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1063 #endif
1064
1065 WRITERGB15(%4, %5, %%REGa)
1066 YSCALEYUV2PACKEDX_END
1067 return;
1068 case PIX_FMT_RGB565:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
1071 "pxor %%mm7, %%mm7 \n\t"
1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1073 #ifdef DITHER1XBPP
1074 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1075 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1076 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1077 #endif
1078
1079 WRITERGB16(%4, %5, %%REGa)
1080 YSCALEYUV2PACKEDX_END
1081 return;
1082 case PIX_FMT_YUYV422:
1083 YSCALEYUV2PACKEDX_ACCURATE
1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085
1086 "psraw $3, %%mm3 \n\t"
1087 "psraw $3, %%mm4 \n\t"
1088 "psraw $3, %%mm1 \n\t"
1089 "psraw $3, %%mm7 \n\t"
1090 WRITEYUY2(%4, %5, %%REGa)
1091 YSCALEYUV2PACKEDX_END
1092 return;
1093 }
1094 }else{
1095 switch(c->dstFormat)
1096 {
1097 case PIX_FMT_RGB32:
1098 YSCALEYUV2PACKEDX
1099 YSCALEYUV2RGBX
1100 "pcmpeqd %%mm7, %%mm7 \n\t"
1101 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1102 YSCALEYUV2PACKEDX_END
1103 return;
1104 case PIX_FMT_BGR24:
1105 YSCALEYUV2PACKEDX
1106 YSCALEYUV2RGBX
1107 "pxor %%mm7, %%mm7 \n\t"
1108 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1109 "add %4, %%"REG_c" \n\t"
1110 WRITEBGR24(%%REGc, %5, %%REGa)
1111
1112 :: "r" (&c->redDither),
1113 "m" (dummy), "m" (dummy), "m" (dummy),
1114 "r" (dest), "m" (dstW)
1115 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1116 );
1117 return;
1118 case PIX_FMT_RGB555:
1119 YSCALEYUV2PACKEDX
1120 YSCALEYUV2RGBX
1121 "pxor %%mm7, %%mm7 \n\t"
1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1123 #ifdef DITHER1XBPP
1124 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1125 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1126 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1127 #endif
1128
1129 WRITERGB15(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1131 return;
1132 case PIX_FMT_RGB565:
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "pxor %%mm7, %%mm7 \n\t"
1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1137 #ifdef DITHER1XBPP
1138 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1139 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1140 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
1141 #endif
1142
1143 WRITERGB16(%4, %5, %%REGa)
1144 YSCALEYUV2PACKEDX_END
1145 return;
1146 case PIX_FMT_YUYV422:
1147 YSCALEYUV2PACKEDX
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa)
1155 YSCALEYUV2PACKEDX_END
1156 return;
1157 }
1158 }
1159 }
1160 #endif /* HAVE_MMX */
1161 #if HAVE_ALTIVEC
1162 /* The following list of supported dstFormat values should
1163 match what's found in the body of ff_yuv2packedX_altivec() */
1164 if (!(c->flags & SWS_BITEXACT) &&
1165 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1166 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1167 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
1168 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1169 chrFilter, chrSrc, chrFilterSize,
1170 dest, dstW, dstY);
1171 else
1172 #endif
1173 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174 chrFilter, chrSrc, chrFilterSize,
1175 dest, dstW, dstY);
1176 }
1177
1178 /**
1179 * vertical bilinear scale YV12 to RGB
 *
 * Produces one packed output line from two luma lines (buf0, buf1) and two
 * chroma lines (uvbuf0, uvbuf1).
 *
 * With HAVE_MMX and without SWS_BITEXACT, the destination formats RGB32,
 * BGR24, RGB555, RGB565 and YUYV422 are handled by inline assembly and
 * return directly.  Every such asm statement follows the same pattern:
 *   - operands: buf0 in REG_c, buf1 in REG_d, uvbuf0 in REG_S, uvbuf1 in
 *     REG_D, dest as memory operand %4, &c->redDither in REG_a (%5);
 *   - REG_b is saved by hand at ESP_OFFSET(%5), loaded with dest, and
 *     restored at the end;
 *   - REG_BP is pushed, handed to the conversion/store macros as their
 *     index register, and popped again;
 *   - 8280(%5) is the width slot the store macros compare against (see the
 *     "8280 == DSTW_OFFSET" note in the body).
 * yalpha / uvalpha are not named in the asm; presumably the macros read
 * the blend weights from the context - confirm against their definitions.
 *
 * All other cases use the C macro YSCALE_YUV_2_ANYRGB_C; the locals
 * yalpha1, uvalpha1 and i are not used by any statement written out here,
 * so they are presumably consumed by that macro.
1180 */
1181 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1182 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1183 {
1184 int yalpha1=4095- yalpha;
1185 int uvalpha1=4095-uvalpha;
1186 int i;
1187
1188 #if HAVE_MMX
1189 if(!(c->flags & SWS_BITEXACT)){
1190 switch(c->dstFormat)
1191 {
1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193 case PIX_FMT_RGB32:
1194 __asm__ volatile(
1195 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1196 "mov %4, %%"REG_b" \n\t"
1197 "push %%"REG_BP" \n\t"
1198 YSCALEYUV2RGB(%%REGBP, %5)
1199 "pcmpeqd %%mm7, %%mm7 \n\t"
1200 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1201 "pop %%"REG_BP" \n\t"
1202 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1203
1204 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1205 "a" (&c->redDither)
1206 );
1207 return;
1208 case PIX_FMT_BGR24:
1209 __asm__ volatile(
1210 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1211 "mov %4, %%"REG_b" \n\t"
1212 "push %%"REG_BP" \n\t"
1213 YSCALEYUV2RGB(%%REGBP, %5)
1214 "pxor %%mm7, %%mm7 \n\t"
1215 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1216 "pop %%"REG_BP" \n\t"
1217 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1218 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1219 "a" (&c->redDither)
1220 );
1221 return;
1222 case PIX_FMT_RGB555:
1223 __asm__ volatile(
1224 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1225 "mov %4, %%"REG_b" \n\t"
1226 "push %%"REG_BP" \n\t"
1227 YSCALEYUV2RGB(%%REGBP, %5)
1228 "pxor %%mm7, %%mm7 \n\t"
1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1230 #ifdef DITHER1XBPP
1231 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1232 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1234 #endif
1235
1236 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1237 "pop %%"REG_BP" \n\t"
1238 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1239
1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1241 "a" (&c->redDither)
1242 );
1243 return;
1244 case PIX_FMT_RGB565:
1245 __asm__ volatile(
1246 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1247 "mov %4, %%"REG_b" \n\t"
1248 "push %%"REG_BP" \n\t"
1249 YSCALEYUV2RGB(%%REGBP, %5)
1250 "pxor %%mm7, %%mm7 \n\t"
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1252 #ifdef DITHER1XBPP
1253 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1254 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1255 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1256 #endif
1257
1258 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1259 "pop %%"REG_BP" \n\t"
1260 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1261 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1262 "a" (&c->redDither)
1263 );
1264 return;
1265 case PIX_FMT_YUYV422:
1266 __asm__ volatile(
1267 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1268 "mov %4, %%"REG_b" \n\t"
1269 "push %%"REG_BP" \n\t"
1270 YSCALEYUV2PACKED(%%REGBP, %5)
1271 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1272 "pop %%"REG_BP" \n\t"
1273 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 "a" (&c->redDither)
1276 );
1277 return;
1278 default: break;
1279 }
1280 }
1281 #endif //HAVE_MMX
/* Generic C path for every format / flag combination not handled above. */
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1283 }
1284
1285 /**
1286 * YV12 to RGB without scaling or interpolating
 *
 * Produces one packed output line from a single luma line (buf0) and two
 * chroma lines (uvbuf0, uvbuf1).
 *
 * Code paths, in order:
 *   1. flags & SWS_FULL_CHR_H_INT: delegate to yuv2packed2 with buf0 passed
 *      as both luma lines and a luma weight of 0.
 *   2. HAVE_MMX and SWS_BITEXACT not set: inline assembly for RGB32, BGR24,
 *      RGB555, RGB565 and YUYV422.  For uvalpha < 2048 the YSCALEYUV2RGB1 /
 *      YSCALEYUV2PACKED1 macros are used, otherwise the "1b" variants.  The
 *      register/operand convention is the same in every statement: buf0 in
 *      REG_c, buf1 in REG_d, uvbuf0 in REG_S, uvbuf1 in REG_D, dest as
 *      memory operand %4, &c->redDither in REG_a (%5); REG_b is saved at
 *      ESP_OFFSET(%5) and restored by hand, REG_BP is pushed/popped around
 *      its use as the index register, and 8280(%5) is the width slot the
 *      store macros compare against.
 *   3. Everything else: the C macro YSCALE_YUV_2_ANYRGB_C, again with a
 *      separate variant for uvalpha < 2048.
 *
 * The locals yalpha1, yalpha, buf1 and i exist for the macros (see the
 * FIXME notes on their declarations); buf1 is also an asm input operand.
1287 */
1288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290 {
1291 const int yalpha1=0;
1292 int i;
1293
1294 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1295 const int yalpha= 4096; //FIXME ...
1296
1297 if (flags&SWS_FULL_CHR_H_INT)
1298 {
1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300 return;
1301 }
1302
1303 #if HAVE_MMX
1304 if(!(flags & SWS_BITEXACT)){
1305 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1306 {
1307 switch(dstFormat)
1308 {
1309 case PIX_FMT_RGB32:
1310 __asm__ volatile(
1311 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1312 "mov %4, %%"REG_b" \n\t"
1313 "push %%"REG_BP" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP, %5)
1315 "pcmpeqd %%mm7, %%mm7 \n\t"
1316 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1317 "pop %%"REG_BP" \n\t"
1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1319
1320 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1321 "a" (&c->redDither)
1322 );
1323 return;
1324 case PIX_FMT_BGR24:
1325 __asm__ volatile(
1326 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1327 "mov %4, %%"REG_b" \n\t"
1328 "push %%"REG_BP" \n\t"
1329 YSCALEYUV2RGB1(%%REGBP, %5)
1330 "pxor %%mm7, %%mm7 \n\t"
1331 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1332 "pop %%"REG_BP" \n\t"
1333 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1334
1335 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336 "a" (&c->redDither)
1337 );
1338 return;
1339 case PIX_FMT_RGB555:
1340 __asm__ volatile(
1341 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1342 "mov %4, %%"REG_b" \n\t"
1343 "push %%"REG_BP" \n\t"
1344 YSCALEYUV2RGB1(%%REGBP, %5)
1345 "pxor %%mm7, %%mm7 \n\t"
1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1347 #ifdef DITHER1XBPP
1348 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1351 #endif
1352 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1353 "pop %%"REG_BP" \n\t"
1354 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1355
1356 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1357 "a" (&c->redDither)
1358 );
1359 return;
1360 case PIX_FMT_RGB565:
1361 __asm__ volatile(
1362 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1363 "mov %4, %%"REG_b" \n\t"
1364 "push %%"REG_BP" \n\t"
1365 YSCALEYUV2RGB1(%%REGBP, %5)
1366 "pxor %%mm7, %%mm7 \n\t"
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368 #ifdef DITHER1XBPP
1369 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1370 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1371 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1372 #endif
1373
1374 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1375 "pop %%"REG_BP" \n\t"
1376 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1377
1378 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379 "a" (&c->redDither)
1380 );
1381 return;
1382 case PIX_FMT_YUYV422:
1383 __asm__ volatile(
1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_b" \n\t"
1386 "push %%"REG_BP" \n\t"
1387 YSCALEYUV2PACKED1(%%REGBP, %5)
1388 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1389 "pop %%"REG_BP" \n\t"
1390 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1391
1392 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1393 "a" (&c->redDither)
1394 );
1395 return;
1396 }
1397 }
1398 else
1399 {
/* uvalpha >= 2048: same formats, using the "1b" conversion macros */
1400 switch(dstFormat)
1401 {
1402 case PIX_FMT_RGB32:
1403 __asm__ volatile(
1404 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1405 "mov %4, %%"REG_b" \n\t"
1406 "push %%"REG_BP" \n\t"
1407 YSCALEYUV2RGB1b(%%REGBP, %5)
1408 "pcmpeqd %%mm7, %%mm7 \n\t"
1409 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1410 "pop %%"REG_BP" \n\t"
1411 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1412
1413 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414 "a" (&c->redDither)
1415 );
1416 return;
1417 case PIX_FMT_BGR24:
1418 __asm__ volatile(
1419 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1420 "mov %4, %%"REG_b" \n\t"
1421 "push %%"REG_BP" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP, %5)
1423 "pxor %%mm7, %%mm7 \n\t"
1424 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1425 "pop %%"REG_BP" \n\t"
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1427
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429 "a" (&c->redDither)
1430 );
1431 return;
1432 case PIX_FMT_RGB555:
1433 __asm__ volatile(
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_b" \n\t"
1436 "push %%"REG_BP" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
1438 "pxor %%mm7, %%mm7 \n\t"
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1440 #ifdef DITHER1XBPP
1441 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1442 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1443 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1444 #endif
1445 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1446 "pop %%"REG_BP" \n\t"
1447 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1448
1449 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1450 "a" (&c->redDither)
1451 );
1452 return;
1453 case PIX_FMT_RGB565:
1454 __asm__ volatile(
1455 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1456 "mov %4, %%"REG_b" \n\t"
1457 "push %%"REG_BP" \n\t"
1458 YSCALEYUV2RGB1b(%%REGBP, %5)
1459 "pxor %%mm7, %%mm7 \n\t"
1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1461 #ifdef DITHER1XBPP
1462 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1463 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1464 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1465 #endif
1466
1467 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1468 "pop %%"REG_BP" \n\t"
1469 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1470
1471 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1472 "a" (&c->redDither)
1473 );
1474 return;
1475 case PIX_FMT_YUYV422:
1476 __asm__ volatile(
1477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1478 "mov %4, %%"REG_b" \n\t"
1479 "push %%"REG_BP" \n\t"
1480 YSCALEYUV2PACKED1b(%%REGBP, %5)
1481 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1482 "pop %%"REG_BP" \n\t"
1483 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1484
1485 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1486 "a" (&c->redDither)
1487 );
1488 return;
1489 }
1490 }
1491 }
1492 #endif /* HAVE_MMX */
/* Generic C path; mirrors the uvalpha split of the MMX code above. */
1493 if (uvalpha < 2048)
1494 {
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1496 }else{
1497 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1498 }
1499 }
1500
1501 //FIXME yuy2* can read up to 7 samples too much
1502
/**
 * Extract the luma bytes from a packed line whose even bytes are luma
 * (C path: dst[i] = src[2*i]).
 *
 * MMX path: src and dst are addressed from their ends with a counter that
 * starts at -width and is increased by 8 until it is no longer negative;
 * each iteration loads 16 source bytes, masks them with bm01010101, packs
 * the result and stores 8 bytes.  Because the loop works in whole groups of
 * 8, a width that is not a multiple of 8 is processed rounded up, and the
 * body runs at least once.
 *
 * @param dst     output, width bytes
 * @param src     packed input, 2 bytes per output sample
 * @param width   number of output samples
 * @param unused  not referenced
 */
1503 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1504 {
1505 #if HAVE_MMX
1506 __asm__ volatile(
1507 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1508 "mov %0, %%"REG_a" \n\t"
1509 "1: \n\t"
1510 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1511 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1512 "pand %%mm2, %%mm0 \n\t"
1513 "pand %%mm2, %%mm1 \n\t"
1514 "packuswb %%mm1, %%mm0 \n\t"
1515 "movq %%mm0, (%2, %%"REG_a") \n\t"
1516 "add $8, %%"REG_a" \n\t"
1517 " js 1b \n\t"
1518 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1519 : "%"REG_a
1520 );
1521 #else
1522 int i;
1523 for (i=0; i<width; i++)
1524 dst[i]= src[2*i];
1525 #endif
1526 }
1527
/**
 * Extract the two chroma planes from a packed line whose odd bytes
 * alternate U, V (C path: dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3]).
 *
 * MMX path: a counter runs from -width upwards in steps of 4; each
 * iteration loads 16 source bytes, keeps the odd bytes (psrlw $8), splits
 * them into the two components and stores 4 bytes to each destination.
 * The body runs at least once and processes whole groups of 4 samples.
 *
 * @param dstU    U output, width bytes
 * @param dstV    V output, width bytes
 * @param src1    packed input, 4 bytes per output sample pair
 * @param src2    only compared with src1 in the trailing assert
 * @param width   number of chroma samples per plane
 * @param unused  not referenced
 */
1528 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1529 {
1530 #if HAVE_MMX
1531 __asm__ volatile(
1532 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1533 "mov %0, %%"REG_a" \n\t"
1534 "1: \n\t"
1535 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1536 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1537 "psrlw $8, %%mm0 \n\t"
1538 "psrlw $8, %%mm1 \n\t"
1539 "packuswb %%mm1, %%mm0 \n\t"
1540 "movq %%mm0, %%mm1 \n\t"
1541 "psrlw $8, %%mm0 \n\t"
1542 "pand %%mm4, %%mm1 \n\t"
1543 "packuswb %%mm0, %%mm0 \n\t"
1544 "packuswb %%mm1, %%mm1 \n\t"
1545 "movd %%mm0, (%3, %%"REG_a") \n\t"
1546 "movd %%mm1, (%2, %%"REG_a") \n\t"
1547 "add $4, %%"REG_a" \n\t"
1548 " js 1b \n\t"
1549 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1550 : "%"REG_a
1551 );
1552 #else
1553 int i;
1554 for (i=0; i<width; i++)
1555 {
1556 dstU[i]= src1[4*i + 1];
1557 dstV[i]= src1[4*i + 3];
1558 }
1559 #endif
1560 assert(src1 == src2);
1561 }
1562
1563 /* This is almost identical to the previous, and exists only because
1564 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma bytes from a packed line whose odd bytes are luma
 * (C path: dst[i] = src[2*i + 1]).
 *
 * MMX path: a counter runs from -width upwards in steps of 8; each
 * iteration loads 16 source bytes, keeps the high byte of every 16-bit word
 * (psrlw $8), packs and stores 8 bytes.  The body runs at least once and
 * processes whole groups of 8 samples.
 *
 * @param dst     output, width bytes
 * @param src     packed input, 2 bytes per output sample
 * @param width   number of output samples
 * @param unused  not referenced
 */
1565 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1566 {
1567 #if HAVE_MMX
1568 __asm__ volatile(
1569 "mov %0, %%"REG_a" \n\t"
1570 "1: \n\t"
1571 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1572 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1573 "psrlw $8, %%mm0 \n\t"
1574 "psrlw $8, %%mm1 \n\t"
1575 "packuswb %%mm1, %%mm0 \n\t"
1576 "movq %%mm0, (%2, %%"REG_a") \n\t"
1577 "add $8, %%"REG_a" \n\t"
1578 " js 1b \n\t"
1579 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1580 : "%"REG_a
1581 );
1582 #else
1583 int i;
1584 for (i=0; i<width; i++)
1585 dst[i]= src[2*i+1];
1586 #endif
1587 }
1588
/**
 * Extract the two chroma planes from a packed line whose even bytes
 * alternate U, V (C path: dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2]).
 *
 * MMX path: a counter runs from -width upwards in steps of 4; each
 * iteration loads 16 source bytes, keeps the even bytes (pand with
 * bm01010101), splits them into the two components and stores 4 bytes to
 * each destination.  The body runs at least once and processes whole groups
 * of 4 samples.
 *
 * @param dstU    U output, width bytes
 * @param dstV    V output, width bytes
 * @param src1    packed input, 4 bytes per output sample pair
 * @param src2    only compared with src1 in the trailing assert
 * @param width   number of chroma samples per plane
 * @param unused  not referenced
 */
1589 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1590 {
1591 #if HAVE_MMX
1592 __asm__ volatile(
1593 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1594 "mov %0, %%"REG_a" \n\t"
1595 "1: \n\t"
1596 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1597 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%3, %%"REG_a") \n\t"
1607 "movd %%mm1, (%2, %%"REG_a") \n\t"
1608 "add $4, %%"REG_a" \n\t"
1609 " js 1b \n\t"
1610 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1611 : "%"REG_a
1612 );
1613 #else
1614 int i;
1615 for (i=0; i<width; i++)
1616 {
1617 dstU[i]= src1[4*i + 0];
1618 dstV[i]= src1[4*i + 2];
1619 }
1620 #endif
1621 assert(src1 == src2);
1622 }
1623
/*
 * BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 *
 * Generates RENAME(name)(dst, src, width, unused): a C-only converter from
 * a packed RGB line to luma.
 *
 *   type           integer type of one source pixel (src is cast to type *)
 *   shr/shg/shb    right shift applied to the pixel BEFORE masking
 *   maskr/g/b      mask applied after the shift to isolate each component
 *   RY/GY/BY       weights; the instantiations below pre-shift them so that
 *                  components left at different bit positions by the masks
 *                  end up on a common scale
 *   S              final right shift
 *
 * Per pixel: dst[i] = (RY*r + GY*g + BY*b + (33 << (S-1))) >> S.
 * The added constant equals 16.5 in output units, i.e. an offset of 16 plus
 * half a unit for rounding.
 */
1624 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1625 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1626 {\
1627 int i;\
1628 for (i=0; i<width; i++)\
1629 {\
1630 int b= (((type*)src)[i]>>shb)&maskb;\
1631 int g= (((type*)src)[i]>>shg)&maskg;\
1632 int r= (((type*)src)[i]>>shr)&maskr;\
1633 \
1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1635 }\
1636 }
1637
/* One luma converter per packed source layout (32, 16 and 15 bits per pixel, both component orders). */
1638 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1639 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1640 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1641 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1642 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1643 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1644
/*
 * BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 *
 * Generates two C-only converters from a packed RGB line to chroma:
 *
 *   RENAME(name)       one U and one V sample per source pixel
 *   RENAME(name_half)  one U and one V sample per PAIR of source pixels
 *                      (pixels 2*i and 2*i+1 are summed)
 *
 * Unlike BGR2Y, the components are masked first and shifted afterwards.
 * maska is accepted but not referenced by either generated function; the
 * dummy and unused parameters of the generated functions are not
 * referenced either.
 *
 * Full-resolution result:
 *   dstU[i] = (RU*r + GU*g + BU*b + (257 << (S-1))) >> S   (same for V)
 * The added constant equals 128.5 in output units (offset 128 plus
 * rounding).
 *
 * Half-resolution variant: g is obtained by clearing the red and blue bits
 * of both pixels and adding the remainders; red and blue are then isolated
 * from (pix0 + pix1 - g).  Each sum can be one bit wider than a single
 * component, hence the masks "m | (2*m)".  Because the values are sums of
 * two pixels, the rounding constant and the final shift are both one bit
 * larger than in the full-resolution case.
 */
1645 #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1647 {\
1648 int i;\
1649 for (i=0; i<width; i++)\
1650 {\
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
1654 \
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1657 }\
1658 }\
1659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1660 {\
1661 int i;\
1662 for (i=0; i<width; i++)\
1663 {\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
1666 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1669 g&= maskg|(2*maskg);\
1670 \
1671 g>>=shg;\
1672 \
1673 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1674 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1675 }\
1676 }
1677
/* One pair of chroma converters per packed source layout, matching the BGR2Y instantiations. */
1678 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1679 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1680 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1681 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1682 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1683 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1684
1685 #if HAVE_MMX
/**
 * MMX conversion of a packed 24-bit line to luma, shared by the BGR24 and
 * RGB24 entry points.
 *
 * The two coefficient quadwords are loaded into mm5/mm6 first - the
 * ff_bgr24toY*Coeff pair when srcFormat is PIX_FMT_BGR24, the
 * ff_rgb24toY*Coeff pair otherwise.  The main loop then converts 4 pixels
 * (12 source bytes) per iteration: bytes are widened to words against the
 * zeroed mm7, multiplied/accumulated with pmaddwd, offset by
 * ff_bgr24toYOffset, shifted right by 15 and packed to 4 output bytes.
 * The counter starts at -width and is raised by 4 until it is no longer
 * negative, so the body runs at least once and works in groups of 4.
 *
 * NOTE(review): mm5/mm6 are set up in a separate asm statement that
 * declares no outputs or clobbers; this relies on the compiler not touching
 * the MMX registers between the statements.
 *
 * @param dst        output, width bytes
 * @param src        packed input, 3 bytes per pixel (advanced by the asm)
 * @param width      number of pixels
 * @param srcFormat  PIX_FMT_BGR24 selects the BGR coefficients; any other
 *                   value selects the RGB ones
 */
1686 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1687 {
1688
1689 if(srcFormat == PIX_FMT_BGR24){
1690 __asm__ volatile(
1691 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1692 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1693 :
1694 );
1695 }else{
1696 __asm__ volatile(
1697 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1698 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1699 :
1700 );
1701 }
1702
1703 __asm__ volatile(
1704 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1705 "mov %2, %%"REG_a" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1707 "1: \n\t"
1708 PREFETCH" 64(%0) \n\t"
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "movd 6(%0), %%mm2 \n\t"
1712 "movd 8(%0), %%mm3 \n\t"
1713 "add $12, %0 \n\t"
1714 "punpcklbw %%mm7, %%mm0 \n\t"
1715 "punpcklbw %%mm7, %%mm1 \n\t"
1716 "punpcklbw %%mm7, %%mm2 \n\t"
1717 "punpcklbw %%mm7, %%mm3 \n\t"
1718 "pmaddwd %%mm5, %%mm0 \n\t"
1719 "pmaddwd %%mm6, %%mm1 \n\t"
1720 "pmaddwd %%mm5, %%mm2 \n\t"
1721 "pmaddwd %%mm6, %%mm3 \n\t"
1722 "paddd %%mm1, %%mm0 \n\t"
1723 "paddd %%mm3, %%mm2 \n\t"
1724 "paddd %%mm4, %%mm0 \n\t"
1725 "paddd %%mm4, %%mm2 \n\t"
1726 "psrad $15, %%mm0 \n\t"
1727 "psrad $15, %%mm2 \n\t"
1728 "packssdw %%mm2, %%mm0 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "movd %%mm0, (%1, %%"REG_a") \n\t"
1731 "add $4, %%"REG_a" \n\t"
1732 " js 1b \n\t"
1733 : "+r" (src)
1734 : "r" (dst+width), "g" ((x86_reg)-width)
1735 : "%"REG_a
1736 );
1737 }
1738
/**
 * MMX conversion of a packed 24-bit line to the two chroma planes, shared
 * by the BGR24 and RGB24 entry points.
 *
 * The coefficient set is chosen by indexing ff_bgr24toUV with
 * (srcFormat == PIX_FMT_RGB24) and is passed as memory operand %4; the asm
 * reads four quadwords from it at byte offsets 0, 8, 16 and 24 (the last
 * one is cached in mm6).  Each iteration converts 4 pixels (12 source
 * bytes): the same widened input words are multiplied with the first pair
 * of coefficient quadwords for one plane and with the second pair for the
 * other, ff_bgr24toUVOffset is added, the sums are shifted right by 15 and
 * packed, and 4 bytes are stored to each of dstU and dstV.
 * The counter starts at -width and is raised by 4 until it is no longer
 * negative, so the body runs at least once and works in groups of 4.
 *
 * @param dstU       first chroma output (written from mm0), width bytes
 * @param dstV       second chroma output (written from mm2), width bytes
 * @param src        packed input, 3 bytes per pixel (advanced by the asm)
 * @param width      number of pixels
 * @param srcFormat  PIX_FMT_RGB24 selects table row 1, anything else row 0
 */
1739 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1740 {
1741 __asm__ volatile(
1742 "movq 24+%4, %%mm6 \n\t"
1743 "mov %3, %%"REG_a" \n\t"
1744 "pxor %%mm7, %%mm7 \n\t"
1745 "1: \n\t"
1746 PREFETCH" 64(%0) \n\t"
1747 "movd (%0), %%mm0 \n\t"
1748 "movd 2(%0), %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm0 \n\t"
1750 "punpcklbw %%mm7, %%mm1 \n\t"
1751 "movq %%mm0, %%mm2 \n\t"
1752 "movq %%mm1, %%mm3 \n\t"
1753 "pmaddwd %4, %%mm0 \n\t"
1754 "pmaddwd 8+%4, %%mm1 \n\t"
1755 "pmaddwd 16+%4, %%mm2 \n\t"
1756 "pmaddwd %%mm6, %%mm3 \n\t"
1757 "paddd %%mm1, %%mm0 \n\t"
1758 "paddd %%mm3, %%mm2 \n\t"
1759
1760 "movd 6(%0), %%mm1 \n\t"
1761 "movd 8(%0), %%mm3 \n\t"
1762 "add $12, %0 \n\t"
1763 "punpcklbw %%mm7, %%mm1 \n\t"
1764 "punpcklbw %%mm7, %%mm3 \n\t"
1765 "movq %%mm1, %%mm4 \n\t"
1766 "movq %%mm3, %%mm5 \n\t"
1767 "pmaddwd %4, %%mm1 \n\t"
1768 "pmaddwd 8+%4, %%mm3 \n\t"
1769 "pmaddwd 16+%4, %%mm4 \n\t"
1770 "pmaddwd %%mm6, %%mm5 \n\t"
1771 "paddd %%mm3, %%mm1 \n\t"
1772 "paddd %%mm5, %%mm4 \n\t"
1773
1774 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1775 "paddd %%mm3, %%mm0 \n\t"
1776 "paddd %%mm3, %%mm2 \n\t"
1777 "paddd %%mm3, %%mm1 \n\t"
1778 "paddd %%mm3, %%mm4 \n\t"
1779 "psrad $15, %%mm0 \n\t"
1780 "psrad $15, %%mm2 \n\t"
1781 "psrad $15, %%mm1 \n\t"
1782 "psrad $15, %%mm4 \n\t"
1783 "packssdw %%mm1, %%mm0 \n\t"
1784 "packssdw %%mm4, %%mm2 \n\t"
1785 "packuswb %%mm0, %%mm0 \n\t"
1786 "packuswb %%mm2, %%mm2 \n\t"
1787 "movd %%mm0, (%1, %%"REG_a") \n\t"
1788 "movd %%mm2, (%2, %%"REG_a") \n\t"
1789 "add $4, %%"REG_a" \n\t"
1790 " js 1b \n\t"
1791 : "+r" (src)
1792 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1793 : "%"REG_a
1794 );
1795 }
1796 #endif
1797
1798 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1799 {
1800 #if HAVE_MMX
1801 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1802 #else
1803 int i;
1804 for (i=0; i<width; i++)
1805 {
1806 int b= src[i*3+0];
1807 int g= src[i*3+1];
1808 int r= src[i*3+2];
1809
1810 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1811 }
1812 #endif /* HAVE_MMX */
1813 }
1814
1815 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1816 {
1817 #if HAVE_MMX
1818 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1819 #else
1820 int i;
1821 for (i=0; i<width; i++)
1822 {
1823 int b= src1[3*i + 0];
1824 int g= src1[3*i + 1];
1825 int r= src1[3*i + 2];
1826
1827 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1828 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1829 }
1830 #endif /* HAVE_MMX */
1831 assert(src1 == src2);
1832 }
1833
1834 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1835 {
1836 int i;
1837 for (i=0; i<width; i++)
1838 {
1839 int b= src1[6*i + 0] + src1[6*i + 3];
1840 int g= src1[6*i + 1] + src1[6*i + 4];
1841 int r= src1[6*i + 2] + src1[6*i + 5];
1842
1843 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1844 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1845 }
1846 assert(src1 == src2);
1847 }
1848
1849 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1850 {
1851 #if HAVE_MMX
1852 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1853 #else
1854 int i;
1855 for (i=0; i<width; i++)
1856 {
1857 int r= src[i*3+0];
1858 int g= src[i*3+1];
1859 int b= src[i*3+2];
1860
1861 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1862 }
1863 #endif
1864 }
1865
1866 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1867 {
1868 #if HAVE_MMX
1869 assert(src1==src2);
1870 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1871 #else
1872 int i;
1873 assert(src1==src2);
1874 for (i=0; i<width; i++)
1875 {
1876 int r= src1[3*i + 0];
1877 int g= src1[3*i + 1];
1878 int b= src1[3*i + 2];
1879
1880 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1881 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1882 }
1883 #endif
1884 }
1885
1886 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1887 {
1888 int i;
1889 assert(src1==src2);
1890 for (i=0; i<width; i++)
1891 {
1892 int r= src1[6*i + 0] + src1[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4];
1894 int b= src1[6*i + 2] + src1[6*i + 5];
1895
1896 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1897 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1898 }
1899 }
1900
1901
/**
 * Look up luma for a row of palettized pixels.
 *
 * For every pixel the source byte indexes pal[], and the lowest 8 bits of
 * the palette entry are stored as the luma sample.
 *
 * @param dst   output row, one byte per pixel
 * @param src   input row of palette indices, one byte per pixel
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1912
/**
 * Look up chroma for a row of palettized pixels.
 *
 * For every pixel the source byte indexes pal[]; bits 8..15 of the palette
 * entry are stored as U and bits 16..23 as V.
 *
 * @param dstU  output row for U
 * @param dstV  output row for V
 * @param src1  input row of palette indices, one byte per pixel
 * @param src2  must be the same pointer as src1 (asserted); never read
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        /* the stores to uint8_t keep only the low byte of each shift */
        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1925
/**
 * Expand a row of 1 bit-per-pixel "mono white" data to 8-bit luma.
 *
 * Bits are consumed most-significant first. A cleared bit becomes 255 and a
 * set bit becomes 0 (the source byte is inverted before expansion).
 *
 * All width pixels are written, including the last width%8 pixels of a row
 * whose width is not a multiple of 8; those come from the high bits of the
 * final, partially used source byte. Nothing is written for width <= 0.
 *
 * @param dst    output row, one byte per pixel, at least width bytes
 * @param src    packed input row, at least (width+7)/8 bytes
 * @param width  number of pixels to convert
 * @param unused palette slot of the common converter signature; not read
 */
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    long x;

    for (x = 0; x < width; x++) {
        /* XOR with 0xFF inverts the byte without producing a negative int */
        const int bits = src[x >> 3] ^ 0xFF;

        dst[x] = ((bits >> (7 - (int)(x & 7))) & 1) * 255;
    }
}
1935
/**
 * Expand a row of 1 bit-per-pixel "mono black" data to 8-bit luma.
 *
 * Bits are consumed most-significant first. A set bit becomes 255 and a
 * cleared bit becomes 0.
 *
 * All width pixels are written, including the last width%8 pixels of a row
 * whose width is not a multiple of 8; those come from the high bits of the
 * final, partially used source byte. Nothing is written for width <= 0.
 *
 * @param dst    output row, one byte per pixel, at least width bytes
 * @param src    packed input row, at least (width+7)/8 bytes
 * @param width  number of pixels to convert
 * @param unused palette slot of the common converter signature; not read
 */
static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    long x;

    for (x = 0; x < width; x++) {
        const int bits = src[x >> 3];

        dst[x] = ((bits >> (7 - (int)(x & 7))) & 1) * 255;
    }
}
1945
1946 // bilinear / bicubic scaling
1947 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1948 int16_t *filter, int16_t *filterPos, long filterSize)
1949 {
1950 #if HAVE_MMX
1951 assert(filterSize % 4 == 0 && filterSize>0);
1952 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
1953 {
1954 x86_reg counter= -2*dstW;
1955 filter-= counter*2;
1956 filterPos-= counter/2;
1957 dst-= counter/2;
1958 __asm__ volatile(
1959 #if defined(PIC)
1960 "push %%"REG_b" \n\t"
1961 #endif
1962 "pxor %%mm7, %%mm7 \n\t"
1963 "push %%"REG_BP" \n\t" // we use 7 regs here ...
1964 "mov %%"REG_a", %%"REG_BP" \n\t"
1965 ASMALIGN(4)
1966 "1: \n\t"
1967 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
1968 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
1969 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
1970 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
1971 "movd (%3, %%"REG_a"), %%mm0 \n\t"
1972 "movd (%3, %%"REG_b"), %%mm2 \n\t"
1973 "punpcklbw %%mm7, %%mm0 \n\t"
1974 "punpcklbw %%mm7, %%mm2 \n\t"
1975 "pmaddwd %%mm1, %%mm0 \n\t"
1976 "pmaddwd %%mm2, %%mm3 \n\t"
1977 "movq %%mm0, %%mm4 \n\t"
1978 "punpckldq %%mm3, %%mm0 \n\t"
1979 "punpckhdq %%mm3, %%mm4 \n\t"
1980 "paddd %%mm4, %%mm0 \n\t"
1981 "psrad $7, %%mm0 \n\t"
1982 "packssdw %%mm0, %%mm0 \n\t"
1983 "movd %%mm0, (%4, %%"REG_BP") \n\t"
1984 "add $4, %%"REG_BP" \n\t"
1985 " jnc 1b \n\t"
1986
1987 "pop %%"REG_BP" \n\t"
1988 #if defined(PIC)
1989 "pop %%"REG_b" \n\t"
1990 #endif
1991 : "+a" (counter)
1992 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1993 #if !defined(PIC)
1994 : "%"REG_b
1995 #endif
1996 );
1997 }
1998 else if (filterSize==8)
1999 {
2000 x86_reg counter= -2*dstW;
2001 filter-= counter*4;
2002 filterPos-= counter/2;
2003 dst-= counter/2;
2004 __asm__ volatile(
2005 #if defined(PIC)
2006 "push %%"REG_b" \n\t"
2007 #endif
2008 "pxor %%mm7, %%mm7 \n\t"
2009 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2010 "mov %%"REG_a", %%"REG_BP" \n\t"
2011 ASMALIGN(4)
2012 "1: \n\t"
2013 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2014 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2015 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2016 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2017 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2018 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm0 \n\t"
2020 "punpcklbw %%mm7, %%mm2 \n\t"
2021 "pmaddwd %%mm1, %%mm0 \n\t"
2022 "pmaddwd %%mm2, %%mm3 \n\t"
2023
2024 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2025 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2026 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2027 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2028 "punpcklbw %%mm7, %%mm4 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "pmaddwd %%mm1, %%mm4 \n\t"
2031 "pmaddwd %%mm2, %%mm5 \n\t"
2032 "paddd %%mm4, %%mm0 \n\t"
2033 "paddd %%mm5, %%mm3 \n\t"
2034 "movq %%mm0, %%mm4 \n\t"
2035 "punpckldq %%mm3, %%mm0 \n\t"
2036 "punpckhdq %%mm3, %%mm4 \n\t"
2037 "paddd %%mm4, %%mm0 \n\t"
2038 "psrad $7, %%mm0 \n\t"
2039 "packssdw %%mm0, %%mm0 \n\t"
2040 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2041 "add $4, %%"REG_BP" \n\t"
2042 " jnc 1b \n\t"
2043
2044 "pop %%"REG_BP" \n\t"
2045 #if defined(PIC)
2046 "pop %%"REG_b" \n\t"
2047 #endif
2048 : "+a" (counter)
2049 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2050 #if !defined(PIC)
2051 : "%"REG_b
2052 #endif
2053 );
2054 }
2055 else
2056 {
2057 uint8_t *offset = src+filterSize;
2058 x86_reg counter= -2*dstW;
2059 //filter-= counter*filterSize/2;
2060 filterPos-= counter/2;
2061 dst-= counter/2;
2062 __asm__ volatile(
2063 "pxor %%mm7, %%mm7 \n\t"
2064 ASMALIGN(4)
2065 "1: \n\t"
2066 "mov %2, %%"REG_c" \n\t"
2067 "movzwl (%%"REG_c", %0), %%eax \n\t"
2068 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2069 "mov %5, %%"REG_c" \n\t"
2070 "pxor %%mm4, %%mm4 \n\t"
2071 "pxor %%mm5, %%mm5 \n\t"
2072 "2: \n\t"
2073 "movq (%1), %%mm1 \n\t"
2074 "movq (%1, %6), %%mm3 \n\t"
2075 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2076 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm0 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm0 \n\t"
2080 "pmaddwd %%mm2, %%mm3 \n\t"
2081 "paddd %%mm3, %%mm5 \n\t"
2082 "paddd %%mm0, %%mm4 \n\t"
2083 "add $8, %1 \n\t"
2084 "add $4, %%"REG_c" \n\t"
2085 "cmp %4, %%"REG_c" \n\t"
2086 " jb 2b \n\t"
2087 "add %6, %1 \n\t"
2088 "movq %%mm4, %%mm0 \n\t"
2089 "punpckldq %%mm5, %%mm4 \n\t"
2090 "punpckhdq %%mm5, %%mm0 \n\t"
2091 "paddd %%mm0, %%mm4 \n\t"
2092 "psrad $7, %%mm4 \n\t"
2093 "packssdw %%mm4, %%mm4 \n\t"
2094 "mov %3, %%"REG_a" \n\t"
2095 "movd %%mm4, (%%"REG_a", %0) \n\t"
2096 "add $4, %0 \n\t"
2097 " jnc 1b \n\t"
2098
2099 : "+r" (counter), "+r" (filter)
2100 : "m" (filterPos), "m" (dst), "m"(offset),
2101 "m" (src), "r" ((x86_reg)filterSize*2)
2102 : "%"REG_a, "%"REG_c, "%"REG_d
2103 );
2104 }
2105 #else
2106 #if HAVE_ALTIVEC
2107 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2108 #else
2109 int i;
2110 for (i=0; i<dstW; i++)
2111 {
2112 int j;
2113 int srcPos= filterPos[i];
2114 int val=0;
2115 //printf("filterPos: %d\n", filterPos[i]);
2116 for (j=0; j<filterSize; j++)
2117 {
2118 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2119 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2120 }
2121 //filter += hFilterSize;
2122 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2123 //dst[i] = val>>7;
2124 }
2125 #endif /* HAVE_ALTIVEC */
2126 #endif /* HAVE_MMX */
2127 }
2128 // *** horizontal scale Y line to temp buffer
2129 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2130 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2131 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2132 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2133 int32_t *mmx2FilterPos, uint32_t *pal)
2134 {
2135 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2136 {
2137 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2138 src= formatConvBuffer;
2139 }
2140 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2141 {
2142 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2143 src= formatConvBuffer;
2144 }
2145 else if (srcFormat==PIX_FMT_RGB32)
2146 {
2147 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2148 src= formatConvBuffer;
2149 }
2150 else if (srcFormat==PIX_FMT_RGB32_1)
2151 {
2152 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2153 src= formatConvBuffer;
2154 }
2155 else if (srcFormat==PIX_FMT_BGR24)
2156 {
2157 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2158 src= formatConvBuffer;
2159 }
2160 else if (srcFormat==PIX_FMT_BGR565)
2161 {
2162 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2163 src= formatConvBuffer;
2164 }
2165 else if (srcFormat==PIX_FMT_BGR555)
2166 {
2167 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2168 src= formatConvBuffer;
2169 }
2170 else if (srcFormat==PIX_FMT_BGR32)
2171 {
2172 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2173 src= formatConvBuffer;
2174 }
2175 else if (srcFormat==PIX_FMT_BGR32_1)
2176 {
2177 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2178 src= formatConvBuffer;
2179 }
2180 else if (srcFormat==PIX_FMT_RGB24)
2181 {
2182 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2183 src= formatConvBuffer;
2184 }
2185 else if (srcFormat==PIX_FMT_RGB565)
2186 {
2187 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2188 src= formatConvBuffer;
2189 }
2190 else if (srcFormat==PIX_FMT_RGB555)
2191 {
2192 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2193 src= formatConvBuffer;
2194 }
2195 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2196 {
2197 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2198 src= formatConvBuffer;
2199 }
2200 else if (srcFormat==PIX_FMT_MONOBLACK)
2201 {
2202 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2203 src= formatConvBuffer;
2204 }
2205 else if (srcFormat==PIX_FMT_MONOWHITE)
2206 {
2207 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2208 src= formatConvBuffer;
2209 }
2210
2211 #if HAVE_MMX
2212 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2213 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2214 #else
2215 if (!(flags&SWS_FAST_BILINEAR))
2216 #endif
2217 {
2218 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2219 }
2220 else // fast bilinear upscale / crap downscale
2221 {
2222 #if ARCH_X86 && CONFIG_GPL
2223 #if HAVE_MMX2
2224 int i;
2225 #if defined(PIC)
2226 uint64_t ebxsave __attribute__((aligned(8)));
2227 #endif
2228 if (canMMX2BeUsed)
2229 {
2230 __asm__ volatile(
2231 #if defined(PIC)
2232 "mov %%"REG_b", %5 \n\t"
2233 #endif
2234 "pxor %%mm7, %%mm7 \n\t"
2235 "mov %0, %%"REG_c" \n\t"
2236 "mov %1, %%"REG_D" \n\t"
2237 "mov %2, %%"REG_d" \n\t"
2238 "mov %3, %%"REG_b" \n\t"
2239 "xor %%"REG_a", %%"REG_a" \n\t" // i
2240 PREFETCH" (%%"REG_c") \n\t"
2241 PREFETCH" 32(%%"REG_c") \n\t"
2242 PREFETCH" 64(%%"REG_c") \n\t"
2243
2244 #if ARCH_X86_64
2245
2246 #define FUNNY_Y_CODE \
2247 "movl (%%"REG_b"), %%esi \n\t"\
2248 "call *%4 \n\t"\
2249 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2250 "add %%"REG_S", %%"REG_c" \n\t"\
2251 "add %%"REG_a", %%"REG_D" \n\t"\
2252 "xor %%"REG_a", %%"REG_a" \n\t"\
2253
2254 #else
2255
2256 #define FUNNY_Y_CODE \
2257 "movl (%%"REG_b"), %%esi \n\t"\
2258 "call *%4 \n\t"\
2259 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2260 "add %%"REG_a", %%"REG_D" \n\t"\
2261 "xor %%"REG_a", %%"REG_a" \n\t"\
2262
2263 #endif /* ARCH_X86_64 */
2264
2265 FUNNY_Y_CODE
2266 FUNNY_Y_CODE
2267 FUNNY_Y_CODE
2268 FUNNY_Y_CODE
2269 FUNNY_Y_CODE
2270 FUNNY_Y_CODE
2271 FUNNY_Y_CODE
2272 FUNNY_Y_CODE
2273
2274 #if defined(PIC)
2275 "mov %5, %%"REG_b" \n\t"
2276 #endif
2277 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2278 "m" (funnyYCode)
2279 #if defined(PIC)
2280 ,"m" (ebxsave)
2281 #endif
2282 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2283 #if !defined(PIC)
2284 ,"%"REG_b
2285 #endif
2286 );
2287 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2288 }
2289 else
2290 {
2291 #endif /* HAVE_MMX2 */
2292 x86_reg xInc_shr16 = xInc >> 16;
2293 uint16_t xInc_mask = xInc & 0xffff;
2294 //NO MMX just normal asm ...
2295 __asm__ volatile(
2296 "xor %%"REG_a", %%"REG_a" \n\t" // i
2297 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2298 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2299 ASMALIGN(4)
2300 "1: \n\t"
2301 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2302 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2303 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2304 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2305 "shll $16, %%edi \n\t"
2306 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2307 "mov %1, %%"REG_D" \n\t"
2308 "shrl $9, %%esi \n\t"
2309 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2310 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2311 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2312
2313 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2314 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2315 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2316 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2317 "shll $16, %%edi \n\t"
2318 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2319 "mov %1, %%"REG_D" \n\t"
2320 "shrl $9, %%esi \n\t"
2321 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2322 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2323 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2324
2325
2326 "add $2, %%"REG_a" \n\t"
2327 "cmp %2, %%"REG_a" \n\t"
2328 " jb 1b \n\t"
2329
2330
2331 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2332 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2333 );
2334 #if HAVE_MMX2
2335 } //if MMX2 can't be used
2336 #endif
2337 #else
2338 int i;
2339 unsigned int xpos=0;
2340 for (i=0;i<dstWidth;i++)
2341 {
2342 register unsigned int xx=xpos>>16;
2343 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2344 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2345 xpos+=xInc;
2346 }
2347 #endif /* ARCH_X86 */
2348 }
2349
2350 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2351 int i;
2352 //FIXME all pal and rgb srcFormats could do this convertion as well
2353 //FIXME all scalers more complex than bilinear could do half of this transform
2354 if(c->srcRange){
2355 for (i=0; i<dstWidth; i++)
2356 dst[i]= (dst[i]*14071 + 33561947)>>14;
2357 }else{
2358 for (i=0; i<dstWidth; i++)
2359 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2360 }
2361 }
2362 }
2363
2364 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2365 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2366 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2367 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2368 int32_t *mmx2FilterPos, uint32_t *pal)
2369 {
2370 if (srcFormat==PIX_FMT_YUYV422)
2371 {
2372 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2373 src1= formatConvBuffer;
2374 src2= formatConvBuffer+VOFW;
2375 }
2376 else if (srcFormat==PIX_FMT_UYVY422)
2377 {
2378 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2379 src1= formatConvBuffer;
2380 src2= formatConvBuffer+VOFW;
2381 }
2382 else if (srcFormat==PIX_FMT_RGB32)
2383 {
2384 if(c->chrSrcHSubSample)
2385 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2386 else
2387 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2388 src1= formatConvBuffer;
2389 src2= formatConvBuffer+VOFW;
2390 }
2391 else if (srcFormat==PIX_FMT_RGB32_1)
2392 {
2393 if(c->chrSrcHSubSample)
2394 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2395 else
2396 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+VOFW;
2399 }
2400 else if (srcFormat==PIX_FMT_BGR24)
2401 {
2402 if(c->chrSrcHSubSample)
2403 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2404 else
2405 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2406 src1= formatConvBuffer;
2407 src2= formatConvBuffer+VOFW;
2408 }
2409 else if (srcFormat==PIX_FMT_BGR565)
2410 {
2411 if(c->chrSrcHSubSample)
2412 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2413 else
2414 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2415 src1= formatConvBuffer;
2416 src2= formatConvBuffer+VOFW;
2417 }
2418 else if (srcFormat==PIX_FMT_BGR555)
2419 {
2420 if(c->chrSrcHSubSample)
2421 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2422 else
2423 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2424 src1= formatConvBuffer;
2425 src2= formatConvBuffer+VOFW;
2426 }
2427 else if (srcFormat==PIX_FMT_BGR32)
2428 {
2429 if(c->chrSrcHSubSample)
2430 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2431 else
2432 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2433 src1= formatConvBuffer;
2434 src2= formatConvBuffer+VOFW;
2435 }
2436 else if (srcFormat==PIX_FMT_BGR32_1)
2437 {
2438 if(c->chrSrcHSubSample)
2439 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2440 else
2441 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2442 src1= formatConvBuffer;
2443 src2= formatConvBuffer+VOFW;
2444 }
2445 else if (srcFormat==PIX_FMT_RGB24)
2446 {
2447 if(c->chrSrcHSubSample)
2448 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2449 else
2450 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2451 src1= formatConvBuffer;
2452 src2= formatConvBuffer+VOFW;
2453 }
2454 else if (srcFormat==PIX_FMT_RGB565)
2455 {
2456 if(c->chrSrcHSubSample)
2457 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2458 else
2459 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2460 src1= formatConvBuffer;
2461 src2= formatConvBuffer+VOFW;
2462 }
2463 else if (srcFormat==PIX_FMT_RGB555)
2464 {
2465 if(c->chrSrcHSubSample)
2466 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2467 else
2468 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2469 src1= formatConvBuffer;
2470 src2= formatConvBuffer+VOFW;
2471 }
2472 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2473 {
2474 return;
2475 }
2476 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2477 {
2478 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2479 src1= formatConvBuffer;
2480 src2= formatConvBuffer+VOFW;
2481 }
2482
2483 #if HAVE_MMX
2484 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2485 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2486 #else
2487 if (!(flags&SWS_FAST_BILINEAR))
2488 #endif
2489 {
2490 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2491 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2492 }
2493 else // fast bilinear upscale / crap downscale
2494 {
2495 #if ARCH_X86 && CONFIG_GPL
2496 #if HAVE_MMX2
2497 int i;
2498 #if defined(PIC)
2499 uint64_t ebxsave __attribute__((aligned(8)));
2500 #endif
2501 if (canMMX2BeUsed)
2502 {
2503 __asm__ volatile(
2504 #if defined(PIC)
2505 "mov %%"REG_b", %6 \n\t"
2506 #endif
2507 "pxor %%mm7, %%mm7 \n\t"
2508 "mov %0, %%"REG_c" \n\t"
2509 "mov %1, %%"REG_D" \n\t"
2510 "mov %2, %%"REG_d" \n\t"
2511 "mov %3, %%"REG_b" \n\t"
2512 "xor %%"REG_a", %%"REG_a" \n\t" // i
2513 PREFETCH" (%%"REG_c") \n\t"
2514 PREFETCH" 32(%%"REG_c") \n\t"
2515 PREFETCH" 64(%%"REG_c") \n\t"
2516
2517 #if ARCH_X86_64
2518
2519 #define FUNNY_UV_CODE \
2520 "movl (%%"REG_b"), %%esi \n\t"\
2521 "call *%4 \n\t"\
2522 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2523 "add %%"REG_S", %%"REG_c" \n\t"\
2524 "add %%"REG_a", %%"REG_D" \n\t"\
2525 "xor %%"REG_a", %%"REG_a" \n\t"\
2526
2527 #else
2528
2529 #define FUNNY_UV_CODE \
2530 "movl (%%"REG_b"), %%esi \n\t"\
2531 "call *%4 \n\t"\
2532 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2533 "add %%"REG_a", %%"REG_D" \n\t"\
2534 "xor %%"REG_a", %%"REG_a" \n\t"\
2535
2536 #endif /* ARCH_X86_64 */
2537
2538 FUNNY_UV_CODE
2539 FUNNY_UV_CODE
2540 FUNNY_UV_CODE
2541 FUNNY_UV_CODE
2542 "xor %%"REG_a", %%"REG_a" \n\t" // i
2543 "mov %5, %%"REG_c" \n\t" // src
2544 "mov %1, %%"REG_D" \n\t" // buf1
2545 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2546 PREFETCH" (%%"REG_c") \n\t"
2547 PREFETCH" 32(%%"REG_c") \n\t"
2548 PREFETCH" 64(%%"REG_c") \n\t"
2549
2550 FUNNY_UV_CODE
2551 FUNNY_UV_CODE
2552 FUNNY_UV_CODE
2553 FUNNY_UV_CODE
2554
2555 #if defined(PIC)
2556 "mov %6, %%"REG_b" \n\t"
2557 #endif
2558 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2559 "m" (funnyUVCode), "m" (src2)
2560 #if defined(PIC)
2561 ,"m" (ebxsave)
2562 #endif
2563 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2564 #if !defined(PIC)
2565 ,"%"REG_b
2566 #endif
2567 );
2568 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2569 {
2570 //printf("%d %d %d\n", dstWidth, i, srcW);
2571 dst[i] = src1[srcW-1]*128;
2572 dst[i+VOFW] = src2[srcW-1]*128;
2573 }
2574 }
2575 else
2576 {
2577 #endif /* HAVE_MMX2 */
2578 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2579 uint16_t xInc_mask = xInc & 0xffff;
2580 __asm__ volatile(
2581 "xor %%"REG_a", %%"REG_a" \n\t" // i
2582 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2583 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2584 ASMALIGN(4)
2585 "1: \n\t"
2586 "mov %0, %%"REG_S" \n\t"
2587 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2588 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2589 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2590 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2591 "shll $16, %%edi \n\t"
2592 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2593 "mov %1, %%"REG_D" \n\t"
2594 "shrl $9, %%esi \n\t"
2595 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2596
2597 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2598 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2599 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2600 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2601 "shll $16, %%edi \n\t"
2602 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2603 "mov %1, %%"REG_D" \n\t"
2604 "shrl $9, %%esi \n\t"
2605 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2606
2607 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2608 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2609 "add $1, %%"REG_a" \n\t"
2610 "cmp %2, %%"REG_a" \n\t"
2611 " jb 1b \n\t"
2612
2613 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2614 which is needed to support GCC 4.0. */
2615 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2616 :: "m" (src1), "m" (dst), "g" ((x86_reg)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2617 #else
2618 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2619 #endif
2620 "r" (src2)
2621 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2622 );
2623 #if HAVE_MMX2
2624 } //if MMX2 can't be used
2625 #endif
2626 #else
2627 int i;
2628 unsigned int xpos=0;
2629 for (i=0;i<dstWidth;i++)
2630 {
2631 register unsigned int xx=xpos>>16;
2632 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2633 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2634 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2635 /* slower
2636 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2637 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2638 */
2639 xpos+=xInc;
2640 }
2641 #endif /* ARCH_X86 */
2642 }
2643 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2644 int i;
2645 //FIXME all pal and rgb srcFormats could do this convertion as well
2646 //FIXME all scalers more complex than bilinear could do half of this transform
2647 if(c->srcRange){
2648 for (i=0; i<dstWidth; i++){
2649 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2650 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2651 }
2652 }else{
2653 for (i=0; i<dstWidth; i++){
2654 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2655 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2656 }
2657 }
2658 }
2659 }
2660
2661 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2662 int srcSliceH, uint8_t* dst[], int dstStride[]){
2663
2664 /* load a few things into local vars to make the code more readable? and faster */
2665 const int srcW= c->srcW;
2666 const int dstW= c->dstW;
2667 const int dstH= c->dstH;
2668 const int chrDstW= c->chrDstW;
2669 const int chrSrcW= c->chrSrcW;
2670 const int lumXInc= c->lumXInc;
2671 const int chrXInc= c->chrXInc;
2672 const int dstFormat= c->dstFormat;
2673 const int srcFormat= c->srcFormat;
2674 const int flags= c->flags;
2675 const int canMMX2BeUsed= c->canMMX2BeUsed;
2676 int16_t *vLumFilterPos= c->vLumFilterPos;
2677 int16_t *vChrFilterPos= c->vChrFilterPos;
2678 int16_t *hLumFilterPos= c->hLumFilterPos;
2679 int16_t *hChrFilterPos= c->hChrFilterPos;
2680 int16_t *vLumFilter= c->vLumFilter;
2681 int16_t *vChrFilter= c->vChrFilter;
2682 int16_t *hLumFilter= c->hLumFilter;
2683 int16_t *hChrFilter= c->hChrFilter;
2684 int32_t *lumMmxFilter= c->lumMmxFilter;
2685 int32_t *chrMmxFilter= c->chrMmxFilter;
2686 const int vLumFilterSize= c->vLumFilterSize;
2687 const int vChrFilterSize= c->vChrFilterSize;
2688 const int hLumFilterSize= c->hLumFilterSize;
2689 const int hChrFilterSize= c->hChrFilterSize;
2690 int16_t **lumPixBuf= c->lumPixBuf;
2691 int16_t **chrPixBuf= c->chrPixBuf;
2692 const int vLumBufSize= c->vLumBufSize;
2693 const int vChrBufSize= c->vChrBufSize;
2694 uint8_t *funnyYCode= c->funnyYCode;
2695 uint8_t *funnyUVCode= c->funnyUVCode;
2696 uint8_t *formatConvBuffer= c->formatConvBuffer;
2697 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2698 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2699 int lastDstY;
2700