b4451f2f90a722b3820bc95a5fe0036c61de1cd8
[libav.git] / libswscale / swscale_template.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
22 */
23
/* CPU-capability instruction aliases.
 * Each macro below expands to the best available instruction string for the
 * build target (3DNow! / MMX2 / plain MMX), falling back to a plain "nop"
 * comment or a non-temporal-free equivalent where the instruction does not
 * exist.  They are re-#undef'd first because this template is included more
 * than once with different HAVE_* settings. */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29 #undef EMMS
30 #undef SFENCE
31
32 #ifdef HAVE_3DNOW
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
34 #define EMMS "femms"
35 #else
36 #define EMMS "emms"
37 #endif
38
39 #ifdef HAVE_3DNOW
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined ( HAVE_MMX2 )
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
45 #else
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
48 #endif
49
50 #ifdef HAVE_MMX2
51 #define SFENCE "sfence"
52 #else
53 #define SFENCE " # nop"
54 #endif
55
/* Packed byte average: pavgb (MMX2) or pavgusb (3DNow!); undefined otherwise. */
56 #ifdef HAVE_MMX2
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #endif
61
/* Store a quadword; non-temporal (movntq) when MMX2 is available. */
62 #ifdef HAVE_MMX2
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64 #else
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66 #endif
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
68
69 #ifdef HAVE_ALTIVEC
70 #include "swscale_altivec_template.c"
71 #endif
72
/* Vertical scaling to planar 8-bit output.
 * Walks the filter list at `offset`(%0) (16 bytes per entry: source pointer +
 * coefficient), accumulating coefficient-weighted 16-bit source rows with
 * pmulhw/paddw on top of a rounding constant (VROUNDER_OFFSET), then >>3,
 * packs to unsigned bytes and stores 8 output pixels per outer iteration.
 * The inner loop terminates when the next source pointer in the list is NULL
 * (test/jnz); the outer loop runs until the pixel index reaches `width`. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
/* Higher-precision variant of YSCALEYUV2YV12X: processes the filter list two
 * taps at a time with pmaddwd into four 32-bit dword accumulators
 * (mm4..mm7), then >>16, packs back to words, adds the rounding constant,
 * >>3 and packs to unsigned bytes.  Accumulating in 32 bits avoids the
 * intermediate 16-bit overflow the pmulhw version can suffer. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
170
/* 1-tap special case: no filtering, just >>7 each 16-bit sample and pack to
 * unsigned bytes, 8 pixels per iteration.
 * NOTE(review): %2 appears to be loaded as a negative index so that the
 * "add $8"/"jnc" pair terminates when the index wraps past zero — confirm
 * against the caller's operand setup (not visible in this macro). */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
183
184 /*
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
190 */
/* Vertical filtering front-end for packed output formats.
 * First inner loop (label 2) accumulates chroma: U at (%%REG_S,%%REG_a) and V
 * 4096 bytes further, using the CHR_MMX_FILTER_OFFSET coefficient list;
 * second inner loop does the same for luma Y1/Y2 via LUM_MMX_FILTER_OFFSET.
 * Both lists are NULL-pointer terminated (test/jnz).  Results are left in
 * mm registers (chroma in mm3/mm4, luma in mm1/mm7) for a following
 * conversion/write macro.  Deliberately does NOT close the asm statement —
 * pair with YSCALEYUV2PACKEDX_END. */
191 #define YSCALEYUV2PACKEDX \
192 asm volatile(\
193 "xor %%"REG_a", %%"REG_a" \n\t"\
194 ASMALIGN(4)\
195 "nop \n\t"\
196 "1: \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
201 ASMALIGN(4)\
202 "2: \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
213 " jnz 2b \n\t"\
214 \
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE): supplies
 * the operand list (context, dest, width) and the register clobbers. */
233 #define YSCALEYUV2PACKEDX_END \
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW) \
237 : "%"REG_a, "%"REG_d, "%"REG_S \
238 );
239
/* Higher-precision variant of YSCALEYUV2PACKEDX: like the _ACCURATE planar
 * path it processes filter taps pairwise with pmaddwd into 32-bit
 * accumulators, then >>16, packs, rounds.  Chroma results are spilled to
 * U_TEMP/V_TEMP in the context (loaded back into mm3/mm4 at the end) because
 * the luma pass needs all eight mm registers.  Must be closed with
 * YSCALEYUV2PACKEDX_END. */
240 #define YSCALEYUV2PACKEDX_ACCURATE \
241 asm volatile(\
242 "xor %%"REG_a", %%"REG_a" \n\t"\
243 ASMALIGN(4)\
244 "nop \n\t"\
245 "1: \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
252 ASMALIGN(4)\
253 "2: \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
277 " jnz 2b \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
289 \
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
296 ASMALIGN(4)\
297 "2: \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
321 " jnz 2b \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
333
/* YUV -> RGB conversion stage for the PACKEDX paths.
 * Inputs: mm1/mm7 = Y1/Y2, mm3/mm4 = U/V (as produced by YSCALEYUV2PACKEDX).
 * Applies the per-context offset/coefficient table at %0 (U_OFFSET, Y_COEFF,
 * UB/UG/VG/VR coefficients, ...), interleaves even/odd pixels and packs to
 * unsigned bytes, leaving B in mm2, G in mm4, R in mm5 and mm7 = 0 for the
 * WRITE* macros that follow. */
334 #define YSCALEYUV2RGBX \
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/* Disabled (#if 0) legacy path: 2-tap interpolating YUV -> RGB conversion
 * using MANGLE()d global coefficient tables instead of the per-context table.
 * Kept for reference only; candidate for deletion once confirmed unused. */
370 #if 0
371 #define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
380 ASMALIGN(4)\
381 "1: \n\t"\
382 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
400 \
401 \
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
409 \
410 \
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
417 \
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
421 \
422 "packuswb %%mm1, %%mm1 \n\t"
423 #endif
424
/* 2-tap vertical interpolation (no RGB conversion) between buf0/buf1 (luma)
 * and uvbuf0/uvbuf1 (chroma), blending with the coefficients stored at
 * CHR/LUM_MMX_FILTER_OFFSET+8 in the context "c".  The coefficients are
 * pre-shifted >>3 in place at entry.  Leaves Y in mm1/mm7 and U/V in
 * mm3/mm4 for a packed-YUV write macro. */
425 #define REAL_YSCALEYUV2PACKED(index, c) \
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432 "xor "#index", "#index" \n\t"\
433 ASMALIGN(4)\
434 "1: \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
457 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
460
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
462
/* 2-tap vertical interpolation between buf0/buf1 (luma) and uvbuf0/uvbuf1
 * (chroma), followed by YUV -> RGB conversion via the per-context table at
 * "c".  Output layout matches YSCALEYUV2RGBX: B in mm2, G in mm4, R in mm5,
 * mm7 = 0, ready for the WRITE* macros. */
463 #define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
465 ASMALIGN(4)\
466 "1: \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
528
/* 1-tap (single input row, no interpolation) variant of YSCALEYUV2PACKED:
 * just >>7 the luma and chroma samples of buf0/uvbuf0 into mm1/mm7 and
 * mm3/mm4 for a packed-YUV write macro. */
529 #define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
531 ASMALIGN(4)\
532 "1: \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
541
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
543
/* 1-tap (single input row) YUV -> RGB: like REAL_YSCALEYUV2RGB but without
 * the buf0/buf1 blend — samples are taken from buf0/uvbuf0 only and >>4.
 * Output layout: B in mm2, G in mm4, R in mm5, mm7 = 0. */
544 #define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
546 ASMALIGN(4)\
547 "1: \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
592
/* 1-tap luma with chroma averaged between uvbuf0 and uvbuf1 (paddw then
 * >>8 combines the (uvbuf0+uvbuf1)/2 average with the >>7 sample shift).
 * Leaves Y in mm1/mm7 and U/V in mm3/mm4. */
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
595 ASMALIGN(4)\
596 "1: \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
610
611 // do vertical chrominance interpolation
/* 1-tap luma YUV -> RGB with chroma averaged between uvbuf0 and uvbuf1
 * (paddw then >>5 combines the /2 average with the >>4 sample shift; the
 * unsigned shift can overflow for extreme inputs — see FIXME below).
 * Output layout: B in mm2, G in mm4, R in mm5, mm7 = 0. */
612 #define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
614 ASMALIGN(4)\
615 "1: \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
664
/* Pack B (mm2), G (mm4), R (mm5), with mm7 = 0, into 8 32-bit 0RGB pixels
 * and store them at dst + index*4; advances index by 8 and loops to label 1
 * while index < dstw. */
665 #define REAL_WRITEBGR32(dst, dstw, index) \
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
679 \
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
684 \
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
687 " jb 1b \n\t"
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
689
/* Pack B/G/R byte registers into 8 RGB565 pixels: mask to 5/6/5 significant
 * bits (bF8/bFC), shift into field positions and OR together, then store 16
 * bytes at dst + index*2; loops to label 1 while index < dstw. */
690 #define REAL_WRITEBGR16(dst, dstw, index) \
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
695 \
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
698 \
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
703 \
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
706 \
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
709 \
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
712 \
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
715 " jb 1b \n\t"
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
717
/* Pack B/G/R byte registers into 8 RGB555 pixels: mask all channels to 5
 * significant bits (bF8), shift into 5-5-5 field positions and OR together,
 * then store 16 bytes at dst + index*2; loops to label 1 while index < dstw. */
718 #define REAL_WRITEBGR15(dst, dstw, index) \
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
724 \
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
727 \
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
732 \
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
735 \
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
738 \
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
741 \
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
744 " jb 1b \n\t"
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
746
/* Legacy 24-bit packed-RGB writer: expands B/G/R bytes to 0RGB dwords, then
 * squeezes out the zero bytes with shift/mask/or sequences to emit 24 bytes
 * (8 pixels) per iteration.  Superseded by WRITEBGR24MMX/WRITEBGR24MMX2 but
 * kept in the file; advances dst by 24 and index by 8 per loop. */
747 #define WRITEBGR24OLD(dst, dstw, index) \
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
761 \
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
770 \
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
784 \
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
793 \
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
798 \
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
801 " jb 1b \n\t"
802
/* Plain-MMX 24-bit packed-RGB writer: builds four 0RGBRGB0 quadwords via
 * psllq/punpckhdq, then shifts and ORs adjacent quadwords so the three
 * 8-byte stores contain 8 tightly packed RGB pixels (24 bytes).  Advances
 * dst by 24 and index by 8; loops to label 1 while index < dstw. */
803 #define WRITEBGR24MMX(dst, dstw, index) \
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
817 \
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
822 \
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
827 \
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
832 \
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
838 \
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
844 \
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
849 \
850 "add $24, "#dst" \n\t"\
851 \
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
854 " jb 1b \n\t"
855
/* Same contract as WRITEBGR24MMX (pack 8 pixels into 24 bytes at dst and
 * loop), but using MMX2's pshufw to replicate/reorder channel bytes and
 * the M24A/M24B/M24C masks to select them, needing fewer shift/OR steps. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
903
/* Select the 24bpp writer: the pshufw-based MMX2 variant when HAVE_MMX2,
 * otherwise the plain-MMX one. Both have identical macro interfaces. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
911
/* Interleave and store 8 pixels of packed YUY2 (16 bytes) at
 * dst + index*2, then advance index and loop to the caller's "1:" label.
 * NOTE(review): the register roles (mm3/mm4 chroma, mm1/mm7 luma words)
 * are established by the YSCALEYUV2PACKED* macros defined elsewhere in
 * this file — confirm there before relying on them. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
928
929
/**
 * Vertical multi-tap scaling pass for planar output: applies lumFilter
 * (lumFilterSize taps) over the luma source rows into dest and, when
 * uDest is non-NULL, chrFilter over the chroma rows into uDest/vDest.
 * The V samples live at a fixed +4096 byte offset from U in the MMX
 * kernels (== the +2048 int16 offset used by the C code in this file).
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    /* SWS_ACCURATE_RND selects the bit-exact (slower) MMX kernels. */
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
962
/**
 * Vertical scaling pass to NV12/NV21 (interleaved-chroma) output.
 * No SIMD variant exists in this template, so this always forwards to
 * the generic C implementation yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
971
/**
 * Unscaled vertical pass (1-tap "filter"): converts the 16-bit
 * intermediate rows back to 8-bit planes via >>7 with clamping to
 * [0,255]. A NULL uDest skips the chroma planes; the V samples are
 * read from chrSrc at the fixed +2048 int16 offset used in this file.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
        "g" (-dstW)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        /* fast clamp: lumSrc is int16_t so val is in [-256,255], and
           bit 8 is set exactly when val is outside [0,255] */
        if (val&256){
            if (val<0) val=0;
            else val=255;
        }

        dest[i]= val;
    }

    if (uDest != NULL)
        for (i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            /* same range argument as above, applied to both channels */
            if ((u|v)&256){
                if (u<0) u=0;
                else if (u>255) u=255;
                if (v<0) v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}
1031
1032
/**
 * Vertical scale YV12 to packed RGB/YUY2 (multi-tap vertical filter).
 * With MMX, dispatches on c->dstFormat to the matching writer macro;
 * SWS_ACCURATE_RND selects the bit-exact (slower) kernels. Formats with
 * no MMX case here fall through to the AltiVec or plain C paths below.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* 24bpp: REG_c = dest + 3*index, since the writer advances dst itself */
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)


            :: "r" (&c->redDither),
            "m" (dummy), "m" (dummy), "m" (dummy),
            "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
            "m" (dummy), "m" (dummy), "m" (dummy),
            "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
#endif
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
                             dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
1180
/**
 * Vertical bilinear scale YV12 to packed RGB/YUY2: blends two luma rows
 * (buf0/buf1, weighted by yalpha) and two chroma rows (uvbuf0/uvbuf1,
 * weighted by uvalpha) while converting to c->dstFormat.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int yalpha1=yalpha^4095;   /* complementary weight: yalpha1 + yalpha == 4095 */
    int uvalpha1=uvalpha^4095;
    int i;

    /* The whole #if 0 region below is dead code kept for reference
       (full-chroma-interpolation variants). */
#if 0 //isn't used
    if (flags&SWS_FULL_CHR_H_INT)
    {
        switch(dstFormat)
        {
#ifdef HAVE_MMX
        case PIX_FMT_RGB32:
            asm volatile(


                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%REGa, 4))
                MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
                "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case PIX_FMT_BGR24:
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
                "psrlq $8, %%mm3 \n\t" // GR0BGR00
                "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
                "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
                "por %%mm2, %%mm3 \n\t" // BGRBGR00
                "movq %%mm1, %%mm2 \n\t"
                "psllq $48, %%mm1 \n\t" // 000000BG
                "por %%mm1, %%mm3 \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
                "psrld $16, %%mm2 \n\t" // R000R000
                "psrlq $24, %%mm1 \n\t" // 0BGR0000
                "por %%mm2, %%mm1 \n\t" // RBGRR000

                "mov %4, %%"REG_b" \n\t"
                "add %%"REG_a", %%"REG_b" \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
                "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#else
                "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
                "psrlq $32, %%mm3 \n\t"
                "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
                "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a, "%"REG_b
            );
            break;
        case PIX_FMT_BGR555:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $2, %%mm1 \n\t"
                "psllw $7, %%mm0 \n\t"
                "pand "MANGLE(g15Mask)", %%mm1 \n\t"
                "pand "MANGLE(r15Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case PIX_FMT_BGR565:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $3, %%mm1 \n\t"
                "psllw $8, %%mm0 \n\t"
                "pand "MANGLE(g16Mask)", %%mm1 \n\t"
                "pand "MANGLE(r16Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
#endif
        case PIX_FMT_BGR32:
#ifndef HAVE_MMX
        case PIX_FMT_RGB32:
#endif
            if (dstFormat==PIX_FMT_RGB32)
            {
                int i;
#ifdef WORDS_BIGENDIAN
                dest++;
#endif
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 4;
                }
            }
            else if (dstFormat==PIX_FMT_BGR24)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 3;
                }
            }
            else if (dstFormat==PIX_FMT_BGR565)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                }
            }
            else if (dstFormat==PIX_FMT_BGR555)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                }
            }
    }//FULL_UV_IPOL
    else
    {
#endif // if 0
#ifdef HAVE_MMX
    switch(c->dstFormat)
    {
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    case PIX_FMT_RGB32:
        asm volatile(
            /* REG_b is preserved manually (saved into the context) because
               it cannot be listed as a clobber when it is the PIC register */
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR24:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR555:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR565:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
        );
        return;
    case PIX_FMT_YUYV422:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
        );
        return;
    default: break;
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
1496
/**
 * YV12 to packed RGB/YUY2 without vertical scaling or interpolation of
 * luma. When uvalpha < 2048 only uvbuf0 is used (a 0.5-pixel chroma
 * shift accepted for speed, see below); otherwise uvbuf0 and uvbuf1 are
 * averaged. SWS_FULL_CHR_H_INT falls back to yuv2packed2().
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;   /* consumed by the YSCALE_YUV_2_*1_C macros below */
    int i;

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    if ( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    {
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        }
    }
    else
    {
        /* uvalpha >= 2048: use the *1b kernels, which average the two chroma rows */
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        }
    }
#endif
    if ( uvalpha < 2048 )
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
1702
//FIXME the yuy2* input functions can read up to 7 samples too much
1704
/* Extract the luma bytes (even byte positions) from packed YUYV into dst. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    /* Negative-index loop: pointers point at the buffer ends and REG_a
       counts from -width up to 0. 8 output pixels per iteration. */
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1729
/* Deinterleave the chroma of packed YUYV: U bytes (offset 1) go to dstU,
 * V bytes (offset 3) to dstV. src2 must equal src1 (asserted). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    /* Negative-index loop, 4 U and 4 V output bytes per iteration. */
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1764
// This is almost identical to yuy2ToY; it exists only because calling
// yuy2ToY/UV(dst, src+1, ...) would make every access unaligned.
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    /* UYVY keeps luma in the odd byte positions, so shift instead of mask. */
    asm volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1789
/* Deinterleave the chroma of packed UYVY: U bytes (offset 0) go to dstU,
 * V bytes (offset 2) to dstV. src2 must equal src1 (asserted). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    /* Negative-index loop, 4 U and 4 V output bytes per iteration. */
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1824
1825 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1826 {
1827 int i;
1828 for (i=0; i<width; i++)
1829 {
1830 int b= ((uint32_t*)src)[i]&0xFF;
1831 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1832 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1833
1834 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1835 }
1836 }
1837
1838 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1839 {
1840 int i;
1841 assert(src1 == src2);
1842 for (i=0; i<width; i++)
1843 {
1844 const int a= ((uint32_t*)src1)[2*i+0];
1845 const int e= ((uint32_t*)src1)[2*i+1];
1846 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1847 const int h= (a&0x00FF00) + (e&0x00FF00);
1848 const int b= l&0x3FF;
1849 const int g= h>>8;
1850 const int r= l>>16;
1851
1852 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1853 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854 }
1855 }
1856
/* Convert packed 24-bit BGR pixels to luma bytes:
 * Y = (RY*R + GY*G + BY*B + 33<<(SHIFT-1)) >> SHIFT (offset + rounding). */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    /* 8 pixels per iteration: 24 input bytes (eight movd loads at byte
       offsets 0..21), 8 output luma bytes. Negative-index loop as in the
       other input converters; REG_d holds 3*index for the 3 bytes/pixel
       source stride. */
    asm volatile(
        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 64(%0, %%"REG_d") \n\t"
        "movd (%0, %%"REG_d"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

        "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_d" \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (src+width*3), "r" (dst+width), "g" (-width)
        : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
#endif
}
1941
/* Convert one row of packed 24-bit BGR to 8-bit chroma (U into dstU, V into
 * dstV), horizontally subsampled 2:1: each output sample is derived from the
 * sum (or pavgb average, scaled equivalently) of two adjacent pixels, hence
 * the extra +1 in the C path's shift.
 * The function only reads src1; the trailing assert documents that callers
 * must pass the same pointer for src1 and src2.
 * MMX path: 4 U and 4 V samples (8 source pixels) per loop iteration.
 * mm6 = bgr2UCoeff, bgr2VCoeff is loaded per group, mm5 = w1111, mm7 = 0.
 * REG_a counts up from -width to 0, REG_d indexes 6-byte pixel pairs. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov %3, %%"REG_a" \n\t"
    "movq "MANGLE(w1111)", %%mm5 \n\t"
    "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
    "add %%"REG_d", %%"REG_d" \n\t"
    ASMALIGN(4)
    "1: \n\t"
    PREFETCH" 64(%0, %%"REG_d") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /* average adjacent pixels with pavgb (pixel at +0/+3, +6/+9) */
    "movq (%0, %%"REG_d"), %%mm0 \n\t"
    "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "psrlq $24, %%mm0 \n\t"
    "psrlq $24, %%mm2 \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
#else
    /* no pavgb available: add pairs and halve with psrlw */
    "movd (%0, %%"REG_d"), %%mm0 \n\t"
    "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "paddw %%mm2, %%mm0 \n\t"
    "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
    "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm4 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "paddw %%mm4, %%mm2 \n\t"
    "psrlw $1, %%mm0 \n\t"
    "psrlw $1, %%mm2 \n\t"
#endif
    /* apply V coefficients (mm1/mm3) and U coefficients (mm6) to the
       averaged pixels */
    "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
    "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

    "pmaddwd %%mm0, %%mm1 \n\t"
    "pmaddwd %%mm2, %%mm3 \n\t"
    "pmaddwd %%mm6, %%mm0 \n\t"
    "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
    "psrad $8, %%mm0 \n\t"
    "psrad $8, %%mm1 \n\t"
    "psrad $8, %%mm2 \n\t"
    "psrad $8, %%mm3 \n\t"
#endif
    "packssdw %%mm2, %%mm0 \n\t"
    "packssdw %%mm3, %%mm1 \n\t"
    "pmaddwd %%mm5, %%mm0 \n\t"
    "pmaddwd %%mm5, %%mm1 \n\t"
    "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
    "psraw $7, %%mm0 \n\t"

    /* second pair group (pixels at +12..+23), same scheme into mm4 */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
    "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
    "movq %%mm4, %%mm1 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "psrlq $24, %%mm4 \n\t"
    "psrlq $24, %%mm2 \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw %%mm7, %%mm4 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
#else
    "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
    "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm4 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "paddw %%mm2, %%mm4 \n\t"
    "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
    "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm5 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "paddw %%mm5, %%mm2 \n\t"
    "movq "MANGLE(w1111)", %%mm5 \n\t"
    "psrlw $2, %%mm4 \n\t"
    "psrlw $2, %%mm2 \n\t"
#endif
    "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
    "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

    "pmaddwd %%mm4, %%mm1 \n\t"
    "pmaddwd %%mm2, %%mm3 \n\t"
    "pmaddwd %%mm6, %%mm4 \n\t"
    "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
    "psrad $8, %%mm4 \n\t"
    "psrad $8, %%mm1 \n\t"
    "psrad $8, %%mm2 \n\t"
    "psrad $8, %%mm3 \n\t"
#endif
    "packssdw %%mm2, %%mm4 \n\t"
    "packssdw %%mm3, %%mm1 \n\t"
    "pmaddwd %%mm5, %%mm4 \n\t"
    "pmaddwd %%mm5, %%mm1 \n\t"
    "add $24, %%"REG_d" \n\t"
    "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
    "psraw $7, %%mm4 \n\t"

    /* interleave the two groups, pack to bytes, add +128 chroma offset,
       then store U (low dword) and V (high dword) */
    "movq %%mm0, %%mm1 \n\t"
    "punpckldq %%mm4, %%mm0 \n\t"
    "punpckhdq %%mm4, %%mm1 \n\t"
    "packsswb %%mm1, %%mm0 \n\t"
    "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

    "movd %%mm0, (%1, %%"REG_a") \n\t"
    "punpckhdq %%mm0, %%mm0 \n\t"
    "movd %%mm0, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        /* sum of two adjacent pixels; the +1 in the shift divides by 2 */
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif
    assert(src1 == src2);
}
2075
2076 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2077 {
2078 int i;
2079 for (i=0; i<width; i++)
2080 {
2081 int d= ((uint16_t*)src)[i];
2082 int b= d&0x1F;
2083 int g= (d>>5)&0x3F;
2084 int r= (d>>11)&0x1F;
2085
2086 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2087 }
2088 }
2089
2090 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2091 {
2092 int i;
2093 assert(src1==src2);
2094 for (i=0; i<width; i++)
2095 {
2096 int d0= ((uint32_t*)src1)[i];
2097
2098 int dl= (d0&0x07E0F81F);
2099 int dh= ((d0>>5)&0x07C0F83F);
2100
2101 int dh2= (dh>>11) + (dh<<21);
2102 int d= dh2 + dl;
2103
2104 int b= d&0x7F;
2105 int r= (d>>11)&0x7F;
2106 int g= d>>21;
2107 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2108 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109 }
2110 }
2111
2112 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2113 {
2114 int i;
2115 for (i=0; i<width; i++)
2116 {
2117 int d= ((uint16_t*)src)[i];
2118 int b= d&0x1F;
2119 int g= (d>>5)&0x1F;
2120 int r= (d>>10)&0x1F;
2121
2122 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2123 }
2124 }
2125
2126 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2127 {
2128 int i;
2129 assert(src1==src2);
2130 for (i=0; i<width; i++)
2131 {
2132 int d0= ((uint32_t*)src1)[i];
2133
2134 int dl= (d0&0x03E07C1F);
2135 int dh= ((d0>>5)&0x03E0F81F);
2136
2137 int dh2= (dh>>11) + (dh<<21);
2138 int d= dh2 + dl;
2139
2140 int b= d&0x7F;
2141 int r= (d>>10)&0x7F;
2142 int g= d>>21;
2143 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2144 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145 }
2146 }
2147
2148
2149 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2150 {
2151 int i;
2152 for (i=0; i<width; i++)
2153 {
2154 int r= ((uint32_t*)src)[i]&0xFF;
2155 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2156 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2157
2158 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2159 }
2160 }
2161
2162 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2163 {
2164 int i;
2165 assert(src1==src2);
2166 for (i=0; i<width; i++)
2167 {
2168 const int a= ((uint32_t*)src1)[2*i+0];
2169 const int e= ((uint32_t*)src1)[2*i+1];
2170 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2171 const int h= (a&0x00FF00) + (e&0x00FF00);
2172 const int r= l&0x3FF;
2173 const int g= h>>8;
2174 const int b= l>>16;
2175
2176 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2177 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178 }
2179 }
2180
2181 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2182 {
2183 int i;
2184 for (i=0; i<width; i++)
2185 {
2186 int r= src[i*3+0];
2187 int g= src[i*3+1];
2188 int b= src[i*3+2];
2189
2190 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2191 }
2192 }
2193
2194 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2195 {
2196 int i;
2197 assert(src1==src2);
2198 for (i=0; i<width; i++)
2199 {
2200 int r= src1[6*i + 0] + src1[6*i + 3];
2201 int g= src1[6*i + 1] + src1[6*i + 4];
2202 int b= src1[6*i + 2] + src1[6*i + 5];
2203
2204 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2205 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206 }
2207 }
2208
2209 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2210 {
2211 int i;
2212 for (i=0; i<width; i++)
2213 {
2214 int d= ((uint16_t*)src)[i];
2215 int r= d&0x1F;
2216 int g= (d>>5)&0x3F;
2217 int b= (d>>11)&0x1F;
2218
2219 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2220 }
2221 }
2222
2223 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2224 {
2225 int i;
2226 assert(src1 == src2);
2227 for (i=0; i<width; i++)
2228 {
2229 int d0= ((uint32_t*)src1)[i];
2230
2231 int dl= (d0&0x07E0F81F);
2232 int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2233
2234 int r= d&0x3F;
2235 int b= (d>>11)&0x3F;
2236 int g= d>>21;
2237 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2238 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2239 }
2240 }
2241
2242 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2243 {
2244 int i;
2245 for (i=0; i<width; i++)
2246 {
2247 int d= ((uint16_t*)src)[i];
2248 int r= d&0x1F;
2249 int g= (d>>5)&0x1F;
2250 int b= (d>>10)&0x1F;
2251
2252 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2253 }
2254 }
2255
2256 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2257 {
2258 int i;
2259 assert(src1 == src2);
2260 for (i=0; i<width; i++)
2261 {
2262 int d0= ((uint32_t*)src1)[i];
2263
2264 int dl= (d0&0x03E07C1F);
2265 int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2266
2267 int r= d&0x3F;
2268 int b= (d>>10)&0x3F;
2269 int g= d>>21;
2270 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2271 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2272 }
2273 }
2274
2275 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2276 {
2277 int i;
2278 for (i=0; i<width; i++)
2279 {
2280 int d= src[i];
2281
2282 dst[i]= pal[d] & 0xFF;
2283 }
2284 }
2285
2286 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2287 {
2288 int i;
2289 assert(src1 == src2);
2290 for (i=0; i<width; i++)
2291 {
2292 int p= pal[src1[i]];
2293
2294 dstU[i]= p>>8;
2295 dstV[i]= p>>16;
2296 }
2297 }
2298
2299 // Bilinear / Bicubic scaling
/* Apply a horizontal FIR filter to one 8-bit input line, producing dstW
 * 16-bit intermediate samples:
 *     dst[i] = clip( sum_j src[filterPos[i]+j] * filter[i*filterSize+j] >> 7 )
 * MMX paths handle filterSize 4 and 8 (two output samples per loop, using
 * negative-counter addressing so the loop ends at 0) plus a generic
 * two-level loop for any other size; otherwise AltiVec or plain C is used.
 * The MMX code pushes/uses REG_BP and (under PIC) saves REG_b by hand. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        /* bias the pointers so indexing with the negative counter lands
           on element 0 at the start of the loop */
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push %%"REG_b" \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "movq "MANGLE(w02)", %%mm6 \n\t"
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        /* load two filter positions and the matching 4-tap coefficients */
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm3, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        " jnc 1b \n\t"

        "pop %%"REG_BP" \n\t"
#if defined(PIC)
        "pop %%"REG_b" \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push %%"REG_b" \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "movq "MANGLE(w02)", %%mm6 \n\t"
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        /* first 4 of the 8 taps for two output samples */
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"

        /* remaining 4 taps, accumulated into mm0/mm3 */
        "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
        "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
        "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm4 \n\t"
        "pmaddwd %%mm2, %%mm5 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm5, %%mm3 \n\t"

        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm3, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        " jnc 1b \n\t"

        "pop %%"REG_BP" \n\t"
#if defined(PIC)
        "pop %%"REG_b" \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic filter size: inner loop (label 2) walks the taps until
           the src cursor reaches 'offset' (= src+filterSize) */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movq "MANGLE(w02)", %%mm6 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "mov %2, %%"REG_c" \n\t"
        "movzwl (%%"REG_c", %0), %%eax \n\t"
        "movzwl 2(%%"REG_c", %0), %%edx \n\t"
        "mov %5, %%"REG_c" \n\t"
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t"
        "2: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %6), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "paddd %%mm3, %%mm5 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "add $8, %1 \n\t"
        "add $4, %%"REG_c" \n\t"
        "cmp %4, %%"REG_c" \n\t"
        " jb 2b \n\t"
        "add %6, %1 \n\t"
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm5 \n\t"
        "packssdw %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "packssdw %%mm4, %%mm4 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "movd %%mm4, (%%"REG_a", %0) \n\t"
        "add $4, %0 \n\t"
        " jnc 1b \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif
#endif
}
2482 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line: convert 'src' (srcW pixels, any of the
 * supported packed/RGB/palette formats) to a planar 8-bit line if needed,
 * then scale it into dst (dstWidth 16-bit samples, value = pixel<<7).
 * Fast-bilinear mode uses either the runtime-generated MMX2 code block
 * ('funnyYCode', invoked via FUNNY_Y_CODE) or a scalar x86 fixed-point
 * loop; all other modes go through the generic RENAME(hScale) filter. */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* Input conversion: reduce every supported source format to a plain
       8-bit luma line in formatConvBuffer. */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* Jump into the runtime-generated scaler ('funnyYCode') eight
               times; each FUNNY_Y_CODE expansion advances the source by the
               offset the generated code left in (%REG_b, %REG_a). */
            asm volatile(
#if defined(PIC)
            "mov %%"REG_b", %5 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif

            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE

#if defined(PIC)
            "mov %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyYCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            /* fix up the rightmost samples the generated code overran */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* scalar fixed-point bilinear: ecx = fractional position (2*xalpha),
           REG_d = integer source index; two output samples per iteration */
        asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ASMALIGN(4)
        "1: \n\t"
        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry

        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry


        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
    /* portable C fallback: 16.16 fixed-point bilinear interpolation,
       output scaled so a pixel value p maps to p<<7 */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif
    }
}
2684
/* Horizontally scale one chroma line pair: convert src1/src2 (U and V, or
 * a packed format that yields both) to planar 8-bit lines if needed, then
 * scale them into dst (U) and dst+2048 (V), each dstWidth 16-bit samples.
 * Gray input has no chroma and returns early. Structure mirrors
 * RENAME(hyscale): generic hScale path, runtime-generated MMX2 path
 * (FUNNY_UV_CODE, run once per plane), or scalar x86 loop. */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* Input conversion: U goes to formatConvBuffer, V to +2048. */
    if (srcFormat==PIX_FMT_YUYV422)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_UYVY422)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if (isGray(srcFormat))
    {
        return; /* gray has no chroma plane to produce */
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* run the generated scaler 4x for U, reset cursors, then 4x
               for V into dst+4096 bytes (= dst+2048 uint16 samples) */
            asm volatile(
#if defined(PIC)
            "mov %%"REG_b", %6 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif

            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            "mov %5, %%"REG_c" \n\t" // src
            "mov %1, %%"REG_D" \n\t" // buf1
            "add $4096, %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b" \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            /* fix up the rightmost samples the generated code overran */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+2048] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif
        long xInc_shr16 = (long) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        /* scalar fixed-point bilinear; one U and one V sample per
           iteration, V written 4096 bytes past the U buffer */
        asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ASMALIGN(4)
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

        "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 4096(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
        :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
           "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
    /* portable C fallback: 16.16 fixed-point bilinear for both planes */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif
    }
}
2923
2924 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2925 int srcSliceH, uint8_t* dst[], int dstStride[]){
2926
    /* load a few things into local vars to make the code more readable and faster */
2928 const int srcW= c->srcW;
2929 const int dstW= c->dstW;
2930 const int dstH= c->dstH;
2931 const int chrDstW= c->chrDstW;
2932 const int chrSrcW= c->chrSrcW;
2933 const int lumXInc= c->lumXInc;
2934 const int chrXInc= c->chrXInc;
2935 const int dstFormat= c->dstFormat;
2936 const int srcFormat= c->srcFormat;
2937 const int flags= c->flags;
2938 const int canMMX2BeUsed= c->canMMX2BeUsed;
2939 int16_t *vLumFilterPos= c->vLumFilterPos;
2940 int16_t *vChrFilterPos= c->vChrFilterPos;
2941 int16_t *hLumFilterPos= c->hLumFilterPos;
2942 int16_t *hChrFilterPos= c->hChrFilterPos;
2943 int16_t *vLumFilter= c->vLumFilter;
2944 int16_t *vChrFilter= c->vChrFilter;
2945 int16_t *hLumFilter= c->hLumFilter;
2946 int16_t *hChrFilter= c->hChrFilter;
2947 int32_t *lumMmxFilter= c->lumMmxFilter;
2948 int32_t *chrMmxFilter= c->chrMmxFilter;
2949 const int vLumFilterSize= c->vLumFilterSize;
2950 const int vChrFilterSize= c->vChrFilterSize;
2951 const int hLumFilterSize= c->hLumFilterSize;
2952 const int hChrFilterSize= c->hChrFilterSize;
2953 int16_t **lumPixBuf= c->lumPixBuf;
2954 int16_t **chrPixBuf= c->chrPixBuf;
2955 const int vLumBufSize= c->vLumBufSize;
2956 const int vChrBufSize= c->vChrBufSize;
2957 uint8_t *funnyYCode= c->funnyYCode;
2958 uint8_t *funnyUVCode= c->funnyUVCode;
2959 uint8_t *formatConvBuffer= c->formatConvBuffer;
2960 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2961 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2962 int lastDstY;
2963 uint8_t *pal=NULL;
2964
    /* vars which will change and which we need to store back in the context */
2966 int dstY= c->dstY;
2967 int lumBufIndex= c->lumBufIndex;
2968 int chrBufIndex= c->chrBufIndex;
2969 int lastInLumBuf= c->lastInLumBuf;
2970 int lastInChrBuf= c->lastInChrBuf;
2971
2972 if (isPacked(c->srcFormat)){
2973 pal= src[1];
2974 src[0]=
2975 src[1]=
2976 src[2]= src[0];
2977 srcStride[0]=
2978 srcStride[1]=
2979 srcStride[2]= srcStride[0];
2980 }
2981 srcStride[1]<<= c->vChrDrop;
2982 srcStride[2]<<= c->vChrDrop;
2983
2984 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2985 // (int)dst[0], (int)dst[1], (int)dst[2]);
2986
2987 #if 0 //self test FIXME move to a vfilter or something
2988 {
2989 static volatile int i=0;
2990 i++;
2991 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2992 selfTest(src, srcStride, c->srcW, c->srcH);
2993 i--;
2994 }
2995 #endif
2996
2997 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2998 //dstStride[0],dstStride[1],dstStride[2]);
2999
3000 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3001 {
3002 static int firstTime=1; //FIXME move this into the context perhaps
3003 if (flags & SWS_PRINT_INFO && firstTime)
3004 {
3005 av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
3006 "SwScaler: ->cannot do aligned memory acesses anymore\n");
3007 firstTime=0;
3008 }
3009 }
3010
3011     /* Note: the user might start scaling the picture in the middle, so this will not get executed.
3012        This is not really intended, but it works currently, so people might do it. */
3013 if (srcSliceY ==0){
3014 lumBufIndex=0;
3015 chrBufIndex=0;
3016 dstY=0;
3017 lastInLumBuf= -1;
3018 lastInChrBuf= -1;
3019 }
3020
3021 lastDstY= dstY;
3022
3023 for (;dstY < dstH; dstY++){
3024 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3025 const int chrDstY= dstY>>c->chrDstVSubSample;
3026 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3027 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3028
3029 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3030 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3031 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3032 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3033
3034 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3035 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3036 //handle holes (FAST_BILINEAR & weird filters)
3037 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3038 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3039 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3040 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3041 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3042
3043 // Do we have enough lines in this slice to output the dstY line
3044 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3045 {
3046 //Do horizontal scaling
3047 while(lastInLumBuf < lastLumSrcY)
3048 {
3049 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3050 lumBufIndex++;
3051 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3052 ASSERT(lumBufIndex < 2*vLumBufSize)
3053 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3054 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3055 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3056 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3057 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3058 funnyYCode, c->srcFormat, formatConvBuffer,
3059 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3060 lastInLumBuf++;
3061 }
3062 while(lastInChrBuf < lastChrSrcY)
3063 {
3064 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3065 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3066 chrBufIndex++;
3067 ASSERT(chrBufIndex < 2*vChrBufSize)
3068 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3069 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3070 //FIXME replace parameters through context struct (some at least)
3071
3072 if (!(isGray(srcFormat) || isGray(dstFormat)))
3073 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3074 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3075 funnyUVCode, c->srcFormat, formatConvBuffer,
3076 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3077 lastInChrBuf++;
3078 }
3079 //wrap buf index around to stay inside the ring buffer
3080 if