Add support for Intel Mac; mp3lib is not fixed yet.
[libav.git] / postproc / swscale_template.c
1 /*
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #include "asmalign.h"
20 
/* CPU-capability dispatch: select instruction spellings for the inline asm
   below based on the HAVE_3DNOW / HAVE_MMX2 feature macros supplied by the
   build system.  "/nop" expands to an assembler comment (no instruction). */
21 #undef REAL_MOVNTQ
22 #undef MOVNTQ
23 #undef PAVGB
24 #undef PREFETCH
25 #undef PREFETCHW
26 #undef EMMS
27 #undef SFENCE
28 
29 #ifdef HAVE_3DNOW
30 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
31 #define EMMS "femms"
32 #else
33 #define EMMS "emms"
34 #endif
35 
/* Prefetch hints: 3DNow! has its own opcodes; MMX2 (SSE) uses the NTA/T0
   variants; otherwise prefetching is disabled. */
36 #ifdef HAVE_3DNOW
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #elif defined ( HAVE_MMX2 )
40 #define PREFETCH "prefetchnta"
41 #define PREFETCHW "prefetcht0"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46 
/* Store fence, needed after non-temporal stores (movntq); no-op without MMX2. */
47 #ifdef HAVE_MMX2
48 #define SFENCE "sfence"
49 #else
50 #define SFENCE "/nop"
51 #endif
52 
/* Packed byte average: pavgb (MMX2) or pavgusb (3DNow!); undefined otherwise. */
53 #ifdef HAVE_MMX2
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
55 #elif defined (HAVE_3DNOW)
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #endif
58 
/* MOVNTQ: non-temporal (cache-bypassing) store with MMX2, plain movq
   otherwise.  The REAL_/wrapper pair forces argument macro expansion. */
59 #ifdef HAVE_MMX2
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
61 #else
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #endif
64 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 
66 #ifdef HAVE_ALTIVEC
67 #include "swscale_altivec_template.c"
68 #endif
69
/*
 * YSCALEYUV2YV12X(x, offset): vertical multi-tap filter for one plane.
 * Walks the MMX filter/source list starting at "offset"(%0) (each 16-byte
 * entry: source pointer + coefficient, NULL-terminated), accumulates
 * pmulhw products into a rounded accumulator, then shifts, packs and
 * stores 8 output pixels per iteration to (%1).  "x" is a byte offset
 * into each source line; %2 is the output width (loop bound).
 */
70 #define YSCALEYUV2YV12X(x, offset) \
71 "xor %%"REG_a", %%"REG_a" \n\t"\
72 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
73 "movq %%mm3, %%mm4 \n\t"\
74 "lea " offset "(%0), %%"REG_d" \n\t"\
75 "mov (%%"REG_d"), %%"REG_S" \n\t"\
76 ASMALIGN16 /* FIXME Unroll? */\
77 "1: \n\t"\
78 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
79 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
80 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
81 "add $16, %%"REG_d" \n\t"\
82 "mov (%%"REG_d"), %%"REG_S" \n\t"\
83 "test %%"REG_S", %%"REG_S" \n\t"\
84 "pmulhw %%mm0, %%mm2 \n\t"\
85 "pmulhw %%mm0, %%mm5 \n\t"\
86 "paddw %%mm2, %%mm3 \n\t"\
87 "paddw %%mm5, %%mm4 \n\t"\
88 " jnz 1b \n\t"\
89 "psraw $3, %%mm3 \n\t"\
90 "psraw $3, %%mm4 \n\t"\
91 "packuswb %%mm4, %%mm3 \n\t"\
92 MOVNTQ(%%mm3, (%1, %%REGa))\
93 "add $8, %%"REG_a" \n\t"\
94 "cmp %2, %%"REG_a" \n\t"\
95 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
96 "movq %%mm3, %%mm4 \n\t"\
97 "lea " offset "(%0), %%"REG_d" \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 "jb 1b \n\t"
100
/*
 * YSCALEYUV2YV121: 1-tap (copy) vertical scale.  Converts 16-bit
 * intermediate samples back to 8-bit with a >>7 shift plus unsigned
 * saturation (packuswb), 8 pixels per iteration.  Operands: %0 = source
 * end pointer, %1 = dest end pointer, %2 = negative width (counts up to 0).
 */
101 #define YSCALEYUV2YV121 \
102 "mov %2, %%"REG_a" \n\t"\
103 ASMALIGN16 /* FIXME Unroll? */\
104 "1: \n\t"\
105 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
106 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
107 "psraw $7, %%mm0 \n\t"\
108 "psraw $7, %%mm1 \n\t"\
109 "packuswb %%mm1, %%mm0 \n\t"\
110 MOVNTQ(%%mm0, (%1, %%REGa))\
111 "add $8, %%"REG_a" \n\t"\
112 "jnc 1b \n\t"
113
114 /*
115 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
116 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
117 "r" (dest), "m" (dstW),
118 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
119 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
120 */
/*
 * YSCALEYUV2PACKEDX: vertical multi-tap filtering for packed output.
 * First inner loop (label 2) accumulates the chroma taps from the
 * NULL-terminated CHR filter list into mm3 (U) / mm4 (V); the second
 * inner loop does the same for luma into mm1 (Y1) / mm7 (Y2).
 * %0 = &c->redDither (base for the per-context offset tables).
 */
121 #define YSCALEYUV2PACKEDX \
122 "xor %%"REG_a", %%"REG_a" \n\t"\
123 ASMALIGN16\
124 "nop \n\t"\
125 "1: \n\t"\
126 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
127 "mov (%%"REG_d"), %%"REG_S" \n\t"\
128 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
129 "movq %%mm3, %%mm4 \n\t"\
130 ASMALIGN16\
131 "2: \n\t"\
132 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
133 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
134 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
135 "add $16, %%"REG_d" \n\t"\
136 "mov (%%"REG_d"), %%"REG_S" \n\t"\
137 "pmulhw %%mm0, %%mm2 \n\t"\
138 "pmulhw %%mm0, %%mm5 \n\t"\
139 "paddw %%mm2, %%mm3 \n\t"\
140 "paddw %%mm5, %%mm4 \n\t"\
141 "test %%"REG_S", %%"REG_S" \n\t"\
142 " jnz 2b \n\t"\
143 \
144 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
147 "movq %%mm1, %%mm7 \n\t"\
148 ASMALIGN16\
149 "2: \n\t"\
150 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
151 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
152 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
153 "add $16, %%"REG_d" \n\t"\
154 "mov (%%"REG_d"), %%"REG_S" \n\t"\
155 "pmulhw %%mm0, %%mm2 \n\t"\
156 "pmulhw %%mm0, %%mm5 \n\t"\
157 "paddw %%mm2, %%mm1 \n\t"\
158 "paddw %%mm5, %%mm7 \n\t"\
159 "test %%"REG_S", %%"REG_S" \n\t"\
160 " jnz 2b \n\t"\
161 
162
/*
 * YSCALEYUV2RGBX: YSCALEYUV2PACKEDX followed by the YUV->RGB matrix.
 * Applies the per-context offsets/coefficients (U_OFFSET..Y_COEFF relative
 * to %0), interleaves the two luma halves with the shared chroma, and
 * leaves saturated bytes in mm2=B, mm4=G, mm5=R with mm7 zeroed for the
 * WRITE* macros that follow.
 */
163 #define YSCALEYUV2RGBX \
164 YSCALEYUV2PACKEDX\
165 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
166 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
167 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
168 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
169 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
170 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
171 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
172 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
173 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
174 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
175 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
176 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
177 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
178 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
179 "paddw %%mm3, %%mm4 \n\t"\
180 "movq %%mm2, %%mm0 \n\t"\
181 "movq %%mm5, %%mm6 \n\t"\
182 "movq %%mm4, %%mm3 \n\t"\
183 "punpcklwd %%mm2, %%mm2 \n\t"\
184 "punpcklwd %%mm5, %%mm5 \n\t"\
185 "punpcklwd %%mm4, %%mm4 \n\t"\
186 "paddw %%mm1, %%mm2 \n\t"\
187 "paddw %%mm1, %%mm5 \n\t"\
188 "paddw %%mm1, %%mm4 \n\t"\
189 "punpckhwd %%mm0, %%mm0 \n\t"\
190 "punpckhwd %%mm6, %%mm6 \n\t"\
191 "punpckhwd %%mm3, %%mm3 \n\t"\
192 "paddw %%mm7, %%mm0 \n\t"\
193 "paddw %%mm7, %%mm6 \n\t"\
194 "paddw %%mm7, %%mm3 \n\t"\
195 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
196 "packuswb %%mm0, %%mm2 \n\t"\
197 "packuswb %%mm6, %%mm5 \n\t"\
198 "packuswb %%mm3, %%mm4 \n\t"\
199 "pxor %%mm7, %%mm7 \n\t"
200 #if 0
/* Dead reference code: an older 2-tap YUV->RGB blend using global
 * MANGLE()d constants instead of the per-context tables.  Disabled via
 * the surrounding #if 0; kept only for reference. */
201 #define FULL_YSCALEYUV2RGB \
202 "pxor %%mm7, %%mm7 \n\t"\
203 "movd %6, %%mm6 \n\t" /*yalpha1*/\
204 "punpcklwd %%mm6, %%mm6 \n\t"\
205 "punpcklwd %%mm6, %%mm6 \n\t"\
206 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
207 "punpcklwd %%mm5, %%mm5 \n\t"\
208 "punpcklwd %%mm5, %%mm5 \n\t"\
209 "xor %%"REG_a", %%"REG_a" \n\t"\
210 ASMALIGN16\
211 "1: \n\t"\
212 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
213 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
214 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
215 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
216 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
217 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
218 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
219 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
220 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
221 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
222 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
223 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
224 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
225 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
226 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
227 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
228 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
229 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 \
231 \
232 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
233 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
234 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
235 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
236 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
237 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
238 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 \
240 \
241 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
242 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
243 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
244 "paddw %%mm1, %%mm3 \n\t" /* B*/\
245 "paddw %%mm1, %%mm0 \n\t" /* R*/\
246 "packuswb %%mm3, %%mm3 \n\t"\
247 \
248 "packuswb %%mm0, %%mm0 \n\t"\
249 "paddw %%mm4, %%mm2 \n\t"\
250 "paddw %%mm2, %%mm1 \n\t" /* G*/\
251 \
252 "packuswb %%mm1, %%mm1 \n\t"
253 #endif
254
/*
 * REAL_YSCALEYUV2PACKED(index, c): 2-tap vertical blend producing raw
 * Y/U/V words (no RGB conversion) for packed-YUV output (see WRITEYUY2).
 * Pre-scales the stored filter coefficients by >>3 once, then blends
 * buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) per pixel.
 * NOTE(review): the ">>4" in several comments below is stale — the actual
 * shifts are $7; the instructions are authoritative.
 */
255 #define REAL_YSCALEYUV2PACKED(index, c) \
256 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
257 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
258 "psraw $3, %%mm0 \n\t"\
259 "psraw $3, %%mm1 \n\t"\
260 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
261 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
262 "xor "#index", "#index" \n\t"\
263 ASMALIGN16\
264 "1: \n\t"\
265 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
266 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
267 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
268 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
269 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
270 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
271 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
272 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
273 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
274 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
275 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
276 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
277 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
278 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
279 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
280 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
281 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
282 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
283 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
284 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
285 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
286 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
287 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
288 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
290 
291 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
292
/*
 * REAL_YSCALEYUV2RGB(index, c): 2-tap vertical blend of two luma and two
 * chroma lines followed by the YUV->RGB matrix (coefficients read from
 * the context "c").  Leaves saturated bytes in mm2=B, mm4=G, mm5=R and
 * zeroes mm7 for the WRITE* macros.
 */
293 #define REAL_YSCALEYUV2RGB(index, c) \
294 "xor "#index", "#index" \n\t"\
295 ASMALIGN16\
296 "1: \n\t"\
297 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
298 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
299 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
300 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
301 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
302 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
303 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
304 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
305 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
309 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
310 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
311 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
312 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
313 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
314 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
315 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
316 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
317 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
318 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
319 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
320 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
321 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
322 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
323 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
324 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
325 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
326 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
327 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
328 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
329 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
330 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
331 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
332 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
333 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
334 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
335 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336 "paddw %%mm3, %%mm4 \n\t"\
337 "movq %%mm2, %%mm0 \n\t"\
338 "movq %%mm5, %%mm6 \n\t"\
339 "movq %%mm4, %%mm3 \n\t"\
340 "punpcklwd %%mm2, %%mm2 \n\t"\
341 "punpcklwd %%mm5, %%mm5 \n\t"\
342 "punpcklwd %%mm4, %%mm4 \n\t"\
343 "paddw %%mm1, %%mm2 \n\t"\
344 "paddw %%mm1, %%mm5 \n\t"\
345 "paddw %%mm1, %%mm4 \n\t"\
346 "punpckhwd %%mm0, %%mm0 \n\t"\
347 "punpckhwd %%mm6, %%mm6 \n\t"\
348 "punpckhwd %%mm3, %%mm3 \n\t"\
349 "paddw %%mm7, %%mm0 \n\t"\
350 "paddw %%mm7, %%mm6 \n\t"\
351 "paddw %%mm7, %%mm3 \n\t"\
352 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353 "packuswb %%mm0, %%mm2 \n\t"\
354 "packuswb %%mm6, %%mm5 \n\t"\
355 "packuswb %%mm3, %%mm4 \n\t"\
356 "pxor %%mm7, %%mm7 \n\t"
357 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
358
/*
 * REAL_YSCALEYUV2PACKED1(index, c): 1-tap (single source line) variant of
 * YSCALEYUV2PACKED — just shifts buf0/uvbuf0 down by 7, no blending.
 */
359 #define REAL_YSCALEYUV2PACKED1(index, c) \
360 "xor "#index", "#index" \n\t"\
361 ASMALIGN16\
362 "1: \n\t"\
363 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
364 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
365 "psraw $7, %%mm3 \n\t" \
366 "psraw $7, %%mm4 \n\t" \
367 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
368 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
369 "psraw $7, %%mm1 \n\t" \
370 "psraw $7, %%mm7 \n\t" \
371 
372 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
373
/*
 * REAL_YSCALEYUV2RGB1(index, c): 1-tap variant of YSCALEYUV2RGB — single
 * luma/chroma line, no vertical blend, then the same YUV->RGB matrix.
 * Output layout matches YSCALEYUV2RGB (mm2=B, mm4=G, mm5=R, mm7=0).
 */
374 #define REAL_YSCALEYUV2RGB1(index, c) \
375 "xor "#index", "#index" \n\t"\
376 ASMALIGN16\
377 "1: \n\t"\
378 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
379 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
380 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
381 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
382 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
383 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
384 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
385 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
386 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
387 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
388 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
389 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
390 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
391 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
392 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
393 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
394 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
395 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
396 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
397 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
398 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
399 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
400 "paddw %%mm3, %%mm4 \n\t"\
401 "movq %%mm2, %%mm0 \n\t"\
402 "movq %%mm5, %%mm6 \n\t"\
403 "movq %%mm4, %%mm3 \n\t"\
404 "punpcklwd %%mm2, %%mm2 \n\t"\
405 "punpcklwd %%mm5, %%mm5 \n\t"\
406 "punpcklwd %%mm4, %%mm4 \n\t"\
407 "paddw %%mm1, %%mm2 \n\t"\
408 "paddw %%mm1, %%mm5 \n\t"\
409 "paddw %%mm1, %%mm4 \n\t"\
410 "punpckhwd %%mm0, %%mm0 \n\t"\
411 "punpckhwd %%mm6, %%mm6 \n\t"\
412 "punpckhwd %%mm3, %%mm3 \n\t"\
413 "paddw %%mm7, %%mm0 \n\t"\
414 "paddw %%mm7, %%mm6 \n\t"\
415 "paddw %%mm7, %%mm3 \n\t"\
416 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
417 "packuswb %%mm0, %%mm2 \n\t"\
418 "packuswb %%mm6, %%mm5 \n\t"\
419 "packuswb %%mm3, %%mm4 \n\t"\
420 "pxor %%mm7, %%mm7 \n\t"
421 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
422
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like YSCALEYUV2PACKED1 but averages
 * the two chroma lines (add then >>8) instead of using only uvbuf0.
 */
423 #define REAL_YSCALEYUV2PACKED1b(index, c) \
424 "xor "#index", "#index" \n\t"\
425 ASMALIGN16\
426 "1: \n\t"\
427 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
428 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
429 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
430 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
431 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
432 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
433 "psrlw $8, %%mm3 \n\t" \
434 "psrlw $8, %%mm4 \n\t" \
435 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
436 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
437 "psraw $7, %%mm1 \n\t" \
438 "psraw $7, %%mm7 \n\t"
439 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
440
441 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c): like YSCALEYUV2RGB1 but averages the
 * two chroma lines (add then >>5) before the YUV->RGB matrix; luma is
 * still a single line.  Output layout matches the other RGB variants.
 */
442 #define REAL_YSCALEYUV2RGB1b(index, c) \
443 "xor "#index", "#index" \n\t"\
444 ASMALIGN16\
445 "1: \n\t"\
446 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
447 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
448 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
449 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
450 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
451 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
452 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
453 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
454 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
455 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
456 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
457 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
458 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
459 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
460 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
461 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
462 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
463 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
464 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
465 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
466 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
467 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
468 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
469 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
470 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
471 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
472 "paddw %%mm3, %%mm4 \n\t"\
473 "movq %%mm2, %%mm0 \n\t"\
474 "movq %%mm5, %%mm6 \n\t"\
475 "movq %%mm4, %%mm3 \n\t"\
476 "punpcklwd %%mm2, %%mm2 \n\t"\
477 "punpcklwd %%mm5, %%mm5 \n\t"\
478 "punpcklwd %%mm4, %%mm4 \n\t"\
479 "paddw %%mm1, %%mm2 \n\t"\
480 "paddw %%mm1, %%mm5 \n\t"\
481 "paddw %%mm1, %%mm4 \n\t"\
482 "punpckhwd %%mm0, %%mm0 \n\t"\
483 "punpckhwd %%mm6, %%mm6 \n\t"\
484 "punpckhwd %%mm3, %%mm3 \n\t"\
485 "paddw %%mm7, %%mm0 \n\t"\
486 "paddw %%mm7, %%mm6 \n\t"\
487 "paddw %%mm7, %%mm3 \n\t"\
488 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
489 "packuswb %%mm0, %%mm2 \n\t"\
490 "packuswb %%mm6, %%mm5 \n\t"\
491 "packuswb %%mm3, %%mm4 \n\t"\
492 "pxor %%mm7, %%mm7 \n\t"
493 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
494
/*
 * REAL_WRITEBGR32(dst, dstw, index): interleave mm2=B, mm4=G, mm5=R
 * (mm7 must be 0) into four 0RGB dwords and store 8 pixels (32 bytes),
 * then advance the loop counter and branch back to label 1.
 */
495 #define REAL_WRITEBGR32(dst, dstw, index) \
496 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
497 "movq %%mm2, %%mm1 \n\t" /* B */\
498 "movq %%mm5, %%mm6 \n\t" /* R */\
499 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
500 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
501 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
502 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
503 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
504 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
505 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
506 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
507 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
508 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
509 \
510 MOVNTQ(%%mm0, (dst, index, 4))\
511 MOVNTQ(%%mm2, 8(dst, index, 4))\
512 MOVNTQ(%%mm1, 16(dst, index, 4))\
513 MOVNTQ(%%mm3, 24(dst, index, 4))\
514 \
515 "add $8, "#index" \n\t"\
516 "cmp "#dstw", "#index" \n\t"\
517 " jb 1b \n\t"
518 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
519
/*
 * REAL_WRITEBGR16(dst, dstw, index): pack mm2=B, mm4=G, mm5=R into
 * RGB565 (masks bF8/bFC keep 5/6/5 significant bits) and store 8 pixels
 * (16 bytes), then loop back to label 1.
 */
520 #define REAL_WRITEBGR16(dst, dstw, index) \
521 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
522 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
523 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
524 "psrlq $3, %%mm2 \n\t"\
525 \
526 "movq %%mm2, %%mm1 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 \
529 "punpcklbw %%mm7, %%mm3 \n\t"\
530 "punpcklbw %%mm5, %%mm2 \n\t"\
531 "punpckhbw %%mm7, %%mm4 \n\t"\
532 "punpckhbw %%mm5, %%mm1 \n\t"\
533 \
534 "psllq $3, %%mm3 \n\t"\
535 "psllq $3, %%mm4 \n\t"\
536 \
537 "por %%mm3, %%mm2 \n\t"\
538 "por %%mm4, %%mm1 \n\t"\
539 \
540 MOVNTQ(%%mm2, (dst, index, 2))\
541 MOVNTQ(%%mm1, 8(dst, index, 2))\
542 \
543 "add $8, "#index" \n\t"\
544 "cmp "#dstw", "#index" \n\t"\
545 " jb 1b \n\t"
546 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
547
/*
 * REAL_WRITEBGR15(dst, dstw, index): like WRITEBGR16 but packs RGB555
 * (5 bits per channel; note the extra >>1 on R and the $2 shifts).
 */
548 #define REAL_WRITEBGR15(dst, dstw, index) \
549 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
550 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
551 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
552 "psrlq $3, %%mm2 \n\t"\
553 "psrlq $1, %%mm5 \n\t"\
554 \
555 "movq %%mm2, %%mm1 \n\t"\
556 "movq %%mm4, %%mm3 \n\t"\
557 \
558 "punpcklbw %%mm7, %%mm3 \n\t"\
559 "punpcklbw %%mm5, %%mm2 \n\t"\
560 "punpckhbw %%mm7, %%mm4 \n\t"\
561 "punpckhbw %%mm5, %%mm1 \n\t"\
562 \
563 "psllq $2, %%mm3 \n\t"\
564 "psllq $2, %%mm4 \n\t"\
565 \
566 "por %%mm3, %%mm2 \n\t"\
567 "por %%mm4, %%mm1 \n\t"\
568 \
569 MOVNTQ(%%mm2, (dst, index, 2))\
570 MOVNTQ(%%mm1, 8(dst, index, 2))\
571 \
572 "add $8, "#index" \n\t"\
573 "cmp "#dstw", "#index" \n\t"\
574 " jb 1b \n\t"
575 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
576
/*
 * WRITEBGR24OLD(dst, dstw, index): legacy shift/mask packer for 24-bit
 * RGB (8 pixels -> 24 bytes).  Superseded by WRITEBGR24MMX/MMX2 below;
 * kept for reference only — nothing in this file selects it.
 */
577 #define WRITEBGR24OLD(dst, dstw, index) \
578 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
579 "movq %%mm2, %%mm1 \n\t" /* B */\
580 "movq %%mm5, %%mm6 \n\t" /* R */\
581 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
582 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
583 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
584 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
585 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
586 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
587 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
588 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
589 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
590 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
591 \
592 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
593 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
594 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
595 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
596 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
597 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
598 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
599 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
600 \
601 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
602 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
603 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
604 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
605 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
606 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
607 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
608 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
609 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
610 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
611 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
612 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
613 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
614 \
615 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
616 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
617 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
618 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
619 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
620 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
621 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
622 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
623 \
624 MOVNTQ(%%mm0, (dst))\
625 MOVNTQ(%%mm2, 8(dst))\
626 MOVNTQ(%%mm3, 16(dst))\
627 "add $24, "#dst" \n\t"\
628 \
629 "add $8, "#index" \n\t"\
630 "cmp "#dstw", "#index" \n\t"\
631 " jb 1b \n\t"
632
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX 24-bit packer.  Expands
 * mm2=B/mm4=G/mm5=R into four 0RGB dwords, then shifts/ORs them into
 * three packed quadwords (24 bytes = 8 pixels) and advances dst by 24.
 * Note: clobbers mm7 (used as scratch), unlike the 32/16/15-bit writers.
 */
633 #define WRITEBGR24MMX(dst, dstw, index) \
634 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
635 "movq %%mm2, %%mm1 \n\t" /* B */\
636 "movq %%mm5, %%mm6 \n\t" /* R */\
637 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
638 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
639 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
640 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
641 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
642 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
643 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
644 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
645 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
646 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
647 \
648 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
649 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
650 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
651 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
652 \
653 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
654 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
655 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
656 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
657 \
658 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
659 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
660 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
661 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
662 \
663 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
664 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
665 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
666 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
667 MOVNTQ(%%mm0, (dst))\
668 \
669 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
670 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
671 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
672 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
673 MOVNTQ(%%mm6, 8(dst))\
674 \
675 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
676 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
677 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
678 MOVNTQ(%%mm5, 16(dst))\
679 \
680 "add $24, "#dst" \n\t"\
681 \
682 "add $8, "#index" \n\t"\
683 "cmp "#dstw", "#index" \n\t"\
684 " jb 1b \n\t"
685
/*
 * WRITEBGR24MMX2(dst, dstw, index): MMX2 24-bit packer using pshufw to
 * replicate/reorder channel bytes, masked with the M24A/M24B/M24C
 * constants and OR-merged into three quadwords (24 bytes = 8 pixels).
 * The #ifdef below selects this or the plain-MMX version as WRITEBGR24.
 */
686 #define WRITEBGR24MMX2(dst, dstw, index) \
687 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
688 "movq "MANGLE(M24A)", %%mm0 \n\t"\
689 "movq "MANGLE(M24C)", %%mm7 \n\t"\
690 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
691 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
692 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
693 \
694 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
695 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
696 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
697 \
698 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
699 "por %%mm1, %%mm6 \n\t"\
700 "por %%mm3, %%mm6 \n\t"\
701 MOVNTQ(%%mm6, (dst))\
702 \
703 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
704 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
705 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
706 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
707 \
708 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
709 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
710 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
711 \
712 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
713 "por %%mm3, %%mm6 \n\t"\
714 MOVNTQ(%%mm6, 8(dst))\
715 \
716 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
717 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
718 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
719 \
720 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
721 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
722 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
723 \
724 "por %%mm1, %%mm3 \n\t"\
725 "por %%mm3, %%mm6 \n\t"\
726 MOVNTQ(%%mm6, 16(dst))\
727 \
728 "add $24, "#dst" \n\t"\
729 \
730 "add $8, "#index" \n\t"\
731 "cmp "#dstw", "#index" \n\t"\
732 " jb 1b \n\t"
733 
734 #ifdef HAVE_MMX2
735 #undef WRITEBGR24
736 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #else
738 #undef WRITEBGR24
739 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
740 #endif
741
/*
 * REAL_WRITEYUY2(dst, dstw, index): pack mm1=Y (with mm7 as high half),
 * mm3=U, mm4=V into interleaved YUYV (YUY2) and store 8 pixels
 * (16 bytes), then loop back to label 1.
 */
742 #define REAL_WRITEYUY2(dst, dstw, index) \
743 "packuswb %%mm3, %%mm3 \n\t"\
744 "packuswb %%mm4, %%mm4 \n\t"\
745 "packuswb %%mm7, %%mm1 \n\t"\
746 "punpcklbw %%mm4, %%mm3 \n\t"\
747 "movq %%mm1, %%mm7 \n\t"\
748 "punpcklbw %%mm3, %%mm1 \n\t"\
749 "punpckhbw %%mm3, %%mm7 \n\t"\
750 \
751 MOVNTQ(%%mm1, (dst, index, 2))\
752 MOVNTQ(%%mm7, 8(dst, index, 2))\
753 \
754 "add $8, "#index" \n\t"\
755 "cmp "#dstw", "#index" \n\t"\
756 " jb 1b \n\t"
757 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758
759
/**
 * Vertical filtering stage for planar YV12 output: combines lumFilterSize
 * (resp. chrFilterSize) intermediate lines into one destination line per
 * plane.  MMX path runs YSCALEYUV2YV12X once per plane (U and V only when
 * uDest is non-NULL, i.e. chroma is wanted); otherwise falls back to the
 * AltiVec or plain-C implementation.
 * NOTE(review): the "p" constraint on chrDstW/dstW is unusual for a loop
 * bound (normally "m"/"g"); confirm it matches the "cmp %2" use in the asm.
 */
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
761 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
762 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
763 {
764 #ifdef HAVE_MMX
765 if(uDest != NULL)
766 {
767 asm volatile(
768 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
769 :: "r" (&c->redDither),
770 "r" (uDest), "p" (chrDstW)
771 : "%"REG_a, "%"REG_d, "%"REG_S
772 );
773 
/* V plane: same filter list, source lines offset by 4096 bytes. */
774 asm volatile(
775 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
776 :: "r" (&c->redDither),
777 "r" (vDest), "p" (chrDstW)
778 : "%"REG_a, "%"REG_d, "%"REG_S
779 );
780 }
781 
782 asm volatile(
783 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
784 :: "r" (&c->redDither),
785 "r" (dest), "p" (dstW)
786 : "%"REG_a, "%"REG_d, "%"REG_S
787 );
788 #else
789 #ifdef HAVE_ALTIVEC
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
791 chrFilter, chrSrc, chrFilterSize,
792 dest, uDest, vDest, dstW, chrDstW);
793 #else //HAVE_ALTIVEC
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
795 chrFilter, chrSrc, chrFilterSize,
796 dest, uDest, vDest, dstW, chrDstW);
797 #endif //!HAVE_ALTIVEC
798 #endif
799 }
800
/**
 * Vertical filtering for NV12/NV21 output (interleaved chroma).
 * No SIMD implementation exists here; this simply forwards to the
 * generic C routine regardless of CPU capabilities.
 */
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
802 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
803 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
804 {
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
806 chrFilter, chrSrc, chrFilterSize,
807 dest, uDest, dstW, chrDstW, dstFormat);
808 }
809
/**
 * 1-tap (unfiltered) vertical pass: convert one line of 16-bit
 * intermediate samples back to 8-bit planar output with a >>7 shift and
 * clipping to [0,255].  chrSrc holds U at offset 0 and V at offset 2048
 * (int16_t units).  uDest==NULL skips chroma.  The MMX path passes end
 * pointers plus a negative count so the loop index runs up to zero.
 */
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
811 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
812 {
813 #ifdef HAVE_MMX
814 if(uDest != NULL)
815 {
816 asm volatile(
817 YSCALEYUV2YV121
818 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
819 "g" (-chrDstW)
820 : "%"REG_a
821 );
822 
823 asm volatile(
824 YSCALEYUV2YV121
825 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
826 "g" (-chrDstW)
827 : "%"REG_a
828 );
829 }
830 
831 asm volatile(
832 YSCALEYUV2YV121
833 :: "r" (lumSrc + dstW), "r" (dest + dstW),
834 "g" (-dstW)
835 : "%"REG_a
836 );
837 #else
838 int i;
839 for(i=0; i<dstW; i++)
840 {
841 int val= lumSrc[i]>>7;
842 
/* val is in [-256,255] after >>7; bit 8 set means out of byte range. */
843 if(val&256){
844 if(val<0) val=0;
845 else val=255;
846 }
847 
848 dest[i]= val;
849 }
850 
851 if(uDest != NULL)
852 for(i=0; i<chrDstW; i++)
853 {
854 int u=chrSrc[i]>>7;
855 int v=chrSrc[i + 2048]>>7;
856 
857 if((u|v)&256){
858 if(u<0) u=0;
859 else if (u>255) u=255;
860 if(v<0) v=0;
861 else if (v>255) v=255;
862 }
863 
864 uDest[i]= u;
865 vDest[i]= v;
866 }
867 #endif
868 }
869
870
871 /**
872 * vertical scale YV12 to RGB
873 */
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
875 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
876 uint8_t *dest, int dstW, int dstY)
877 {
878 int dummy=0;
879 switch(c->dstFormat)
880 {
881 #ifdef HAVE_MMX
882 case IMGFMT_BGR32:
883 {
884 asm volatile(
885 YSCALEYUV2RGBX
886 WRITEBGR32(%4, %5, %%REGa)
887
888 :: "r" (&c->redDither),
889 "m" (dummy), "m" (dummy), "m" (dummy),
890 "r" (dest), "m" (dstW)
891 : "%"REG_a, "%"REG_d, "%"REG_S
892 );
893 }
894 break;
895 case IMGFMT_BGR24:
896 {
897 asm volatile(
898 YSCALEYUV2RGBX
899 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
900 "add %4, %%"REG_b" \n\t"
901 WRITEBGR24(%%REGb, %5, %%REGa)
902
903 :: "r" (&c->redDither),
904 "m" (dummy), "m" (dummy), "m" (dummy),
905 "r" (dest), "m" (dstW)
906 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
907 );
908 }
909 break;
910 case IMGFMT_BGR15:
911 {
912 asm volatile(
913 YSCALEYUV2RGBX
914 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
915 #ifdef DITHER1XBPP
916 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
917 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
918 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919 #endif
920
921 WRITEBGR15(%4, %5, %%REGa)
922
923 :: "r" (&c->redDither),
924 "m" (dummy), "m" (dummy), "m" (dummy),
925 "r" (dest), "m" (dstW)
926 : "%"REG_a, "%"REG_d, "%"REG_S
927 );
928 }
929 break;
930 case IMGFMT_BGR16:
931 {
932 asm volatile(
933 YSCALEYUV2RGBX
934 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
935 #ifdef DITHER1XBPP
936 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
937 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
938 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939 #endif
940
941 WRITEBGR16(%4, %5, %%REGa)
942
943 :: "r" (&c->redDither),
944 "m" (dummy), "m" (dummy), "m" (dummy),
945 "r" (dest), "m" (dstW)
946 : "%"REG_a, "%"REG_d, "%"REG_S
947 );
948 }
949 break;
950 case IMGFMT_YUY2:
951 {
952 asm volatile(
953 YSCALEYUV2PACKEDX
954 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
955
956 "psraw $3, %%mm3 \n\t"
957 "psraw $3, %%mm4 \n\t"
958 "psraw $3, %%mm1 \n\t"
959 "psraw $3, %%mm7 \n\t"
960 WRITEYUY2(%4, %5, %%REGa)
961
962 :: "r" (&c->redDither),
963 "m" (dummy), "m" (dummy), "m" (dummy),
964 "r" (dest), "m" (dstW)
965 : "%"REG_a, "%"REG_d, "%"REG_S
966 );
967 }
968 break;
969 #endif
970 default:
971 #ifdef HAVE_ALTIVEC
972 /* The following list of supported dstFormat values should
973 match what's found in the body of altivec_yuv2packedX() */
974 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
975 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
976 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
977 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
979 dest, dstW, dstY);
980 else
981 #endif
982 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
983 chrFilter, chrSrc, chrFilterSize,
984 dest, dstW, dstY);
985 break;
986 }
987 }
988
989 /**
990 * vertical bilinear scale YV12 to RGB
991 */
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
993 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
994 {
995 int yalpha1=yalpha^4095;
996 int uvalpha1=uvalpha^4095;
997 int i;
998
999 #if 0 //isn't used
1000 if(flags&SWS_FULL_CHR_H_INT)
1001 {
1002 switch(dstFormat)
1003 {
1004 #ifdef HAVE_MMX
1005 case IMGFMT_BGR32:
1006 asm volatile(
1007
1008
1009 FULL_YSCALEYUV2RGB
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1012
1013 "movq %%mm3, %%mm1 \n\t"
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1016
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1019
1020 "add $4, %%"REG_a" \n\t"
1021 "cmp %5, %%"REG_a" \n\t"
1022 " jb 1b \n\t"
1023
1024
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1026 "m" (yalpha1), "m" (uvalpha1)
1027 : "%"REG_a
1028 );
1029 break;
1030 case IMGFMT_BGR24:
1031 asm volatile(
1032
1033 FULL_YSCALEYUV2RGB
1034
1035 // lsb ... msb
1036 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1037 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1038
1039 "movq %%mm3, %%mm1 \n\t"
1040 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1041 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1042
1043 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1044 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1045 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1046 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1047 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1048 "movq %%mm1, %%mm2 \n\t"
1049 "psllq $48, %%mm1 \n\t" // 000000BG
1050 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1051
1052 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1053 "psrld $16, %%mm2 \n\t" // R000R000
1054 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1055 "por %%mm2, %%mm1 \n\t" // RBGRR000
1056
1057 "mov %4, %%"REG_b" \n\t"
1058 "add %%"REG_a", %%"REG_b" \n\t"
1059
1060 #ifdef HAVE_MMX2
1061 //FIXME Alignment
1062 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1063 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1064 #else
1065 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1066 "psrlq $32, %%mm3 \n\t"
1067 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1068 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1069 #endif
1070 "add $4, %%"REG_a" \n\t"
1071 "cmp %5, %%"REG_a" \n\t"
1072 " jb 1b \n\t"
1073
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075 "m" (yalpha1), "m" (uvalpha1)
1076 : "%"REG_a, "%"REG_b
1077 );
1078 break;
1079 case IMGFMT_BGR15:
1080 asm volatile(
1081
1082 FULL_YSCALEYUV2RGB
1083 #ifdef DITHER1XBPP
1084 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1086 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1087 #endif
1088 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1089 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1090 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1091
1092 "psrlw $3, %%mm3 \n\t"
1093 "psllw $2, %%mm1 \n\t"
1094 "psllw $7, %%mm0 \n\t"
1095 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1096 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1097
1098 "por %%mm3, %%mm1 \n\t"
1099 "por %%mm1, %%mm0 \n\t"
1100
1101 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1102
1103 "add $4, %%"REG_a" \n\t"
1104 "cmp %5, %%"REG_a" \n\t"
1105 " jb 1b \n\t"
1106
1107 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1108 "m" (yalpha1), "m" (uvalpha1)
1109 : "%"REG_a
1110 );
1111 break;
1112 case IMGFMT_BGR16:
1113 asm volatile(
1114
1115 FULL_YSCALEYUV2RGB
1116 #ifdef DITHER1XBPP
1117 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1118 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1119 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1120 #endif
1121 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1122 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1123 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1124
1125 "psrlw $3, %%mm3 \n\t"
1126 "psllw $3, %%mm1 \n\t"
1127 "psllw $8, %%mm0 \n\t"
1128 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1129 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1130
1131 "por %%mm3, %%mm1 \n\t"
1132 "por %%mm1, %%mm0 \n\t"
1133
1134 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1135
1136 "add $4, %%"REG_a" \n\t"
1137 "cmp %5, %%"REG_a" \n\t"
1138 " jb 1b \n\t"
1139
1140 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1141 "m" (yalpha1), "m" (uvalpha1)
1142 : "%"REG_a
1143 );
1144 break;
1145 #endif
1146 case IMGFMT_RGB32:
1147 #ifndef HAVE_MMX
1148 case IMGFMT_BGR32:
1149 #endif
1150 if(dstFormat==IMGFMT_BGR32)
1151 {
1152 int i;
1153 #ifdef WORDS_BIGENDIAN
1154 dest++;
1155 #endif
1156 for(i=0;i<dstW;i++){
1157 // vertical linear interpolation && yuv2rgb in a single step:
1158 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1159 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1160 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1161 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1162 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1163 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1164 dest+= 4;
1165 }
1166 }
1167 else if(dstFormat==IMGFMT_BGR24)
1168 {
1169 int i;
1170 for(i=0;i<dstW;i++){
1171 // vertical linear interpolation && yuv2rgb in a single step:
1172 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1173 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1174 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1175 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1176 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1177 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1178 dest+= 3;
1179 }
1180 }
1181 else if(dstFormat==IMGFMT_BGR16)
1182 {
1183 int i;
1184 for(i=0;i<dstW;i++){
1185 // vertical linear interpolation && yuv2rgb in a single step:
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1189
1190 ((uint16_t*)dest)[i] =
1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1194 }
1195 }
1196 else if(dstFormat==IMGFMT_BGR15)
1197 {
1198 int i;
1199 for(i=0;i<dstW;i++){
1200 // vertical linear interpolation && yuv2rgb in a single step:
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1204
1205 ((uint16_t*)dest)[i] =
1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1209 }
1210 }
1211 }//FULL_UV_IPOL
1212 else
1213 {
1214 #endif // if 0
1215 #ifdef HAVE_MMX
1216 switch(c->dstFormat)
1217 {
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219 case IMGFMT_BGR32:
1220 asm volatile(
1221 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1222 "mov %4, %%"REG_SP" \n\t"
1223 YSCALEYUV2RGB(%%REGa, %5)
1224 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1225 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1226
1227 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1228 "r" (&c->redDither)
1229 : "%"REG_a
1230 );
1231 return;
1232 case IMGFMT_BGR24:
1233 asm volatile(
1234 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1235 "mov %4, %%"REG_SP" \n\t"
1236 YSCALEYUV2RGB(%%REGa, %5)
1237 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1238 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1239 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1240 "r" (&c->redDither)
1241 : "%"REG_a
1242 );
1243 return;
1244 case IMGFMT_BGR15:
1245 asm volatile(
1246 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1247 "mov %4, %%"REG_SP" \n\t"
1248 YSCALEYUV2RGB(%%REGa, %5)
1249 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1250 #ifdef DITHER1XBPP
1251 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1252 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1253 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1254 #endif
1255
1256 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1257 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1258
1259 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1260 "r" (&c->redDither)
1261 : "%"REG_a
1262 );
1263 return;
1264 case IMGFMT_BGR16:
1265 asm volatile(
1266 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1267 "mov %4, %%"REG_SP" \n\t"
1268 YSCALEYUV2RGB(%%REGa, %5)
1269 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1270 #ifdef DITHER1XBPP
1271 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1272 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1273 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1274 #endif
1275
1276 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1277 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1278 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1279 "r" (&c->redDither)
1280 : "%"REG_a
1281 );
1282 return;
1283 case IMGFMT_YUY2:
1284 asm volatile(
1285 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1286 "mov %4, %%"REG_SP" \n\t"
1287 YSCALEYUV2PACKED(%%REGa, %5)
1288 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1289 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1291 "r" (&c->redDither)
1292 : "%"REG_a
1293 );
1294 return;
1295 default: break;
1296 }
1297 #endif //HAVE_MMX
1298 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1299 }
1300
1301 /**
1302 * YV12 to RGB without scaling or interpolating
1303 */
1304 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1305 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1306 {
1307 const int yalpha1=0;
1308 int i;
1309
1310 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1311 const int yalpha= 4096; //FIXME ...
1312
1313 if(flags&SWS_FULL_CHR_H_INT)
1314 {
1315 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1316 return;
1317 }
1318
1319 #ifdef HAVE_MMX
1320 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1321 {
1322 switch(dstFormat)
1323 {
1324 case IMGFMT_BGR32:
1325 asm volatile(
1326 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1327 "mov %4, %%"REG_SP" \n\t"
1328 YSCALEYUV2RGB1(%%REGa, %5)
1329 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1330 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1331
1332 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1333 "r" (&c->redDither)
1334 : "%"REG_a
1335 );
1336 return;
1337 case IMGFMT_BGR24:
1338 asm volatile(
1339 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1340 "mov %4, %%"REG_SP" \n\t"
1341 YSCALEYUV2RGB1(%%REGa, %5)
1342 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1343 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1344
1345 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1346 "r" (&c->redDither)
1347 : "%"REG_a
1348 );
1349 return;
1350 case IMGFMT_BGR15:
1351 asm volatile(
1352 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1353 "mov %4, %%"REG_SP" \n\t"
1354 YSCALEYUV2RGB1(%%REGa, %5)
1355 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1356 #ifdef DITHER1XBPP
1357 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1358 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1359 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1360 #endif
1361 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1362 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1363
1364 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1365 "r" (&c->redDither)
1366 : "%"REG_a
1367 );
1368 return;
1369 case IMGFMT_BGR16:
1370 asm volatile(
1371 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1372 "mov %4, %%"REG_SP" \n\t"
1373 YSCALEYUV2RGB1(%%REGa, %5)
1374 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1375 #ifdef DITHER1XBPP
1376 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1377 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1378 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1379 #endif
1380
1381 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1382 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1383
1384 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1385 "r" (&c->redDither)
1386 : "%"REG_a
1387 );
1388 return;
1389 case IMGFMT_YUY2:
1390 asm volatile(
1391 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1392 "mov %4, %%"REG_SP" \n\t"
1393 YSCALEYUV2PACKED1(%%REGa, %5)
1394 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1395 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1396
1397 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1398 "r" (&c->redDither)
1399 : "%"REG_a
1400 );
1401 return;
1402 }
1403 }
1404 else
1405 {
1406 switch(dstFormat)
1407 {
1408 case IMGFMT_BGR32:
1409 asm volatile(
1410 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1411 "mov %4, %%"REG_SP" \n\t"
1412 YSCALEYUV2RGB1b(%%REGa, %5)
1413 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1414 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1415
1416 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1417 "r" (&c->redDither)
1418 : "%"REG_a
1419 );
1420 return;
1421 case IMGFMT_BGR24:
1422 asm volatile(
1423 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1424 "mov %4, %%"REG_SP" \n\t"
1425 YSCALEYUV2RGB1b(%%REGa, %5)
1426 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1427 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1428
1429 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1430 "r" (&c->redDither)
1431 : "%"REG_a
1432 );
1433 return;
1434 case IMGFMT_BGR15:
1435 asm volatile(
1436 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_SP" \n\t"
1438 YSCALEYUV2RGB1b(%%REGa, %5)
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1440 #ifdef DITHER1XBPP
1441 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1442 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1443 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1444 #endif
1445 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1446 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1447
1448 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1449 "r" (&c->redDither)
1450 : "%"REG_a
1451 );
1452 return;
1453 case IMGFMT_BGR16:
1454 asm volatile(
1455 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1456 "mov %4, %%"REG_SP" \n\t"
1457 YSCALEYUV2RGB1b(%%REGa, %5)
1458 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1459 #ifdef DITHER1XBPP
1460 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1461 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1462 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1463 #endif
1464
1465 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1466 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1467
1468 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1469 "r" (&c->redDither)
1470 : "%"REG_a
1471 );
1472 return;
1473 case IMGFMT_YUY2:
1474 asm volatile(
1475 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1476 "mov %4, %%"REG_SP" \n\t"
1477 YSCALEYUV2PACKED1b(%%REGa, %5)
1478 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1479 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1480
1481 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1482 "r" (&c->redDither)
1483 : "%"REG_a
1484 );
1485 return;
1486 }
1487 }
1488 #endif
1489 if( uvalpha < 2048 )
1490 {
1491 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1492 }else{
1493 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1494 }
1495 }
1496
//FIXME the yuy2* input readers below may read up to 7 samples too many

/**
 * Extract the Y samples from packed YUY2 (Y U Y V byte order):
 * dst[i] = src[2*i].  The MMX path masks out every odd byte and packs.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm2\n\t"  /* 0x00FF mask per word */
        "mov %0, %%"REG_a" \n\t"                /* REG_a = -width .. 0 */
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"                /* keep Y (even) bytes */
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"            /* 16 bytes -> 8 Y's */
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1523
/**
 * Extract and deinterleave U/V from two packed YUY2 lines, averaging the
 * two lines: dstU[i] = (src1[4i+1]+src2[4i+1])>>1, same for V at 4i+3.
 * NOTE(review): C path truncates the average while PAVGB/PAVGUSB rounds
 * up — a known +-1 LSB difference between the paths.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"  /* 0x00FF mask per word */
        "mov %0, %%"REG_a" \n\t"                /* REG_a = -width .. 0 */
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)                     /* average the two lines */
        PAVGB(%%mm3, %%mm1)
        "psrlw $8, %%mm0 \n\t"                  /* keep the chroma bytes */
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"            /* UVUVUVUV */
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"                  /* V in low bytes */
        "pand %%mm4, %%mm1 \n\t"                /* U in low bytes */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
        dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
    }
#endif
}
1561
// This is almost identical to yuy2ToY above; it exists only because calling
// yuy2ToY/UV(dst, src+1, ...) would make every access unaligned.
/**
 * Extract the Y samples from packed UYVY (U Y V Y byte order):
 * dst[i] = src[2*i+1].  The MMX path shifts the chroma bytes away.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %0, %%"REG_a" \n\t"                /* REG_a = -width .. 0 */
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"                  /* keep Y (odd) bytes */
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1586
/**
 * Extract and deinterleave U/V from two packed UYVY lines, averaging the
 * two lines: dstU[i] = (src1[4i+0]+src2[4i+0])>>1, V at 4i+2.
 * Same rounding caveat as yuy2ToUV (PAVGB rounds, C truncates).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"  /* 0x00FF mask per word */
        "mov %0, %%"REG_a" \n\t"                /* REG_a = -width .. 0 */
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)                     /* average the two lines */
        PAVGB(%%mm3, %%mm1)
        "pand %%mm4, %%mm0 \n\t"                /* keep the chroma bytes */
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"            /* UVUVUVUV */
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"                  /* V in low bytes */
        "pand %%mm4, %%mm1 \n\t"                /* U in low bytes */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
        dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
    }
#endif
}
1624
1625 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1626 {
1627 int i;
1628 for(i=0; i<width; i++)
1629 {
1630 int b= ((uint32_t*)src)[i]&0xFF;
1631 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1632 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1633
1634 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1635 }
1636 }
1637
1638 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1639 {
1640 int i;
1641 for(i=0; i<width; i++)
1642 {
1643 const int a= ((uint32_t*)src1)[2*i+0];
1644 const int e= ((uint32_t*)src1)[2*i+1];
1645 const int c= ((uint32_t*)src2)[2*i+0];
1646 const int d= ((uint32_t*)src2)[2*i+1];
1647 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1648 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1649 const int b= l&0x3FF;
1650 const int g= h>>8;
1651 const int r= l>>16;
1652
1653 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1654 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1655 }
1656 }
1657
/**
 * Compute the Y (luma) plane from packed 24-bit BGR input.
 * MMX path: processes 8 pixels per iteration; REG_a counts -width..0 over
 * the output, REG_b = 3*REG_a indexes the 3-byte-per-pixel input.
 * Each pixel's B,G,R words are dotted with bgr2YCoeff via pmaddwd, the
 * partial sums combined with a pmaddwd against w1111, and bgr2YOffset
 * (the +16 luma offset) added at the end.
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"  /* REG_b = 3*REG_a */
        ASMALIGN16
        "1: \n\t"
        PREFETCH" 64(%0, %%"REG_b") \n\t"
        /* pixels 0..3 */
        "movd (%0, %%"REG_b"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        /* extra precision: drop 8 bits before repacking */
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"   /* horizontal add of partial sums */
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

        /* pixels 4..7 */
        "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_b" \n\t"     /* advance input by 8 pixels */
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"  /* add luma offset */

        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (src+width*3), "r" (dst+width), "g" (-width)
        : "%"REG_a, "%"REG_b
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
#endif
}
1742
/**
 * Compute 2x2-subsampled U/V planes from two packed 24-bit BGR lines.
 * MMX path: 4 output chroma samples (8 input pixels per line) per
 * iteration; REG_b = 6*REG_a indexes the input (2 pixels of 3 bytes per
 * output sample). With MMX2/3DNow the 2x2 box average is done bytewise
 * with PAVGB; otherwise the four pixels are summed as words and >>2.
 * The U (bgr2UCoeff) and V (bgr2VCoeff) dot products are computed in
 * parallel, packed as U..U V..V, offset by bgr2UVOffset (+128) and the
 * low/high dwords stored to dstU/dstV respectively.
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %4, %%"REG_a" \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"  /* REG_b = 6*REG_a */
        "add %%"REG_b", %%"REG_b" \n\t"
        ASMALIGN16
        "1: \n\t"
        PREFETCH" 64(%0, %%"REG_b") \n\t"
        PREFETCH" 64(%1, %%"REG_b") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /* box-average samples 0/1 via PAVGB (vertical, then horizontal) */
        "movq (%0, %%"REG_b"), %%mm0 \n\t"
        "movq (%1, %%"REG_b"), %%mm1 \n\t"
        "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
        "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
        PAVGB(%%mm1, %%mm0)
        PAVGB(%%mm3, %%mm2)
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm0 \n\t"       /* shift to the neighbouring pixel */
        "psrlq $24, %%mm2 \n\t"
        PAVGB(%%mm1, %%mm0)
        PAVGB(%%mm3, %%mm2)
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
#else
        /* plain MMX: sum the four pixels as words, then >>2 */
        "movd (%0, %%"REG_b"), %%mm0 \n\t"
        "movd (%1, %%"REG_b"), %%mm1 \n\t"
        "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
        "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
        "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm2 \n\t"
#endif
        "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

        /* mm1/mm3 = V dot products, mm0/mm2 = U dot products */
        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm2, %%mm0 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
        "psraw $7, %%mm0 \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /* same again for samples 2/3 */
        "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
        "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
        "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
        "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
        PAVGB(%%mm1, %%mm4)
        PAVGB(%%mm3, %%mm2)
        "movq %%mm4, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm4 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB(%%mm1, %%mm4)
        PAVGB(%%mm3, %%mm2)
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
#else
        "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
        "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
        "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm4 \n\t"
        "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
        "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
        "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm5 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"  /* mm5 was clobbered above */
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm2 \n\t"
#endif
        "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

        "pmaddwd %%mm4, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm2, %%mm4 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "add $24, %%"REG_b" \n\t"
        "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
        "psraw $7, %%mm4 \n\t"

        /* interleave to U3..U0 / V3..V0, add +128 offset, store */
        "movq %%mm0, %%mm1 \n\t"
        "punpckldq %%mm4, %%mm0 \n\t"
        "punpckhdq %%mm4, %%mm1 \n\t"
        "packsswb %%mm1, %%mm0 \n\t"
        "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

        "movd %%mm0, (%2, %%"REG_a") \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
        : "%"REG_a, "%"REG_b
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
    }
#endif
}
1908
1909 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1910 {
1911 int i;
1912 for(i=0; i<width; i++)
1913 {
1914 int d= ((uint16_t*)src)[i];
1915 int b= d&0x1F;
1916 int g= (d>>5)&0x3F;
1917 int r= (d>>11)&0x1F;
1918
1919 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1920 }
1921 }
1922
/* Convert two lines of packed BGR16 (5:6:5) pixels to one line of 8-bit
 * chroma (U and V), averaging a 2x2 pixel block per output sample.
 * Two horizontally adjacent pixels are loaded at once as a uint32_t and
 * the channel sums of all 4 pixels are computed in parallel with masked
 * (SWAR-style) additions. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];	// two adjacent pixels, upper line
		int d1= ((uint32_t*)src2)[i];	// the same two pixels, lower line

		// dl: r+b of the low pixel and g of the high pixel;
		// dh (pre-shifted by 5): g of the low pixel, b and r of the high one
		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		// realign dh so its fields land on top of dl's: after the add,
		// d holds the 4-pixel sums of b, r and g in disjoint bit ranges
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;		// sum of 4 blue  samples (5 bit each, max 124)
		int r= (d>>11)&0x7F;	// sum of 4 red   samples (5 bit each)
		int g= d>>21;		// sum of 4 green samples (6 bit each)
		// 2* on r and b matches green's 6-bit scale; shift is +2 for the
		// 4-pixel sum, -2 for the reduced channel depth
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
1944
1945 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1946 {
1947 int i;
1948 for(i=0; i<width; i++)
1949 {
1950 int d= ((uint16_t*)src)[i];
1951 int b= d&0x1F;
1952 int g= (d>>5)&0x1F;
1953 int r= (d>>10)&0x1F;
1954
1955 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1956 }
1957 }
1958
/* Convert two lines of packed BGR15 (5:5:5) pixels to one line of 8-bit
 * chroma (U and V), averaging a 2x2 pixel block per output sample.
 * Same SWAR scheme as bgr16ToUV, with 5-bit green masks. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];	// two adjacent pixels, upper line
		int d1= ((uint32_t*)src2)[i];	// the same two pixels, lower line

		// dl: r+b of the low pixel and g of the high pixel;
		// dh (pre-shifted by 5): g of the low pixel, b and r of the high one
		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		// realign dh onto dl's fields; d then holds the 4-pixel sums of
		// b, r and g in disjoint bit ranges
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;		// sum of 4 blue  samples (5 bit each, max 124)
		int r= (d>>10)&0x7F;	// sum of 4 red   samples
		int g= d>>21;		// sum of 4 green samples
		// shift is +2 for the 4-pixel sum, -3 for the 5-bit channel depth
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}
1980
1981
1982 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1983 {
1984 int i;
1985 for(i=0; i<width; i++)
1986 {
1987 int r= ((uint32_t*)src)[i]&0xFF;
1988 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1989 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1990
1991 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1992 }
1993 }
1994
/* Convert two lines of packed RGB32 pixels (R in the low byte of each
 * 32-bit word) to one line of 8-bit chroma, averaging a 2x2 pixel block
 * per output sample. The byte channels of the 4 pixels are summed in
 * parallel: each channel sum is at most 4*255 = 1020 (10 bits), so the
 * fields inside l and h cannot overflow into each other. */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];	// left pixel, upper line
		const int e= ((uint32_t*)src1)[2*i+1];	// right pixel, upper line
		const int c= ((uint32_t*)src2)[2*i+0];	// left pixel, lower line
		const int d= ((uint32_t*)src2)[2*i+1];	// right pixel, lower line
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);	// r and b sums
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);	// g sums
		const int r= l&0x3FF;	// sum of 4 red samples (fits in 10 bits)
		const int g= h>>8;	// sum of 4 green samples
		const int b= l>>16;	// sum of 4 blue samples

		// the extra +2 in the shift averages the 4 summed pixels
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}
2014
2015 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2016 {
2017 int i;
2018 for(i=0; i<width; i++)
2019 {
2020 int r= src[i*3+0];
2021 int g= src[i*3+1];
2022 int b= src[i*3+2];
2023
2024 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2025 }
2026 }
2027
2028 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2029 {
2030 int i;
2031 for(i=0; i<width; i++)
2032 {
2033 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2034 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2035 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2036
2037 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2038 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2039 }
2040 }
2041
2042
2043 // Bilinear / Bicubic scaling
/*
 * Horizontal FIR scaler. For each output pixel i the C reference does:
 *   dst[i] = clip( (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7,
 *                  0, (1<<15)-1 )
 * i.e. 8-bit input samples filtered into clipped 15-bit output.
 * The MMX build has dedicated loops for filterSize 4 and 8 plus a generic
 * loop; AltiVec and plain C fallbacks implement the same operation.
 * xInc and srcW are unused here (kept for a uniform scaler signature).
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
	assert(filterSize % 4 == 0 && filterSize>0);
	if(filterSize==4) // always true for upscaling, sometimes for down too
	{
		/* counter runs from -2*dstW up to 0; the pointers are biased so
		   that indexing with the counter starts at element 0 and the
		   loop can terminate on the sign flag ("jnc") */
		long counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
			ASMALIGN16
			"1:				\n\t"
			/* load filterPos for two output pixels, the 4+4 filter
			   taps, and 4+4 source bytes; multiply-accumulate with
			   pmaddwd and horizontally reduce via the w02 constant */
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 4), %%mm1\n\t"
			"movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
			" jnc 1b			\n\t"

			"pop %%"REG_BP"			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%"REG_b
		);
	}
	else if(filterSize==8)
	{
		long counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
			ASMALIGN16
			"1:				\n\t"
			/* same as the filterSize==4 loop, but with a second
			   round of 4 taps accumulated before the reduction */
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"

			"movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
			"movd 4(%3, %%"REG_a"), %%mm4	\n\t"
			"movd 4(%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm4		\n\t"
			"pmaddwd %%mm2, %%mm5		\n\t"
			"paddd %%mm4, %%mm0		\n\t"
			"paddd %%mm5, %%mm3		\n\t"

			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
			" jnc 1b			\n\t"

			"pop %%"REG_BP"			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%"REG_b
		);
	}
	else
	{
		/* generic filter size: an inner loop ("2:") walks the taps in
		   groups of 4 until the source cursor reaches 'offset' */
		uint8_t *offset = src+filterSize;
		long counter= -2*dstW;
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			ASMALIGN16
			"1:				\n\t"
			"mov %2, %%"REG_c"		\n\t"
			"movzwl (%%"REG_c", %0), %%eax	\n\t"
			"movzwl 2(%%"REG_c", %0), %%ebx	\n\t"
			"mov %5, %%"REG_c"		\n\t"
			"pxor %%mm4, %%mm4		\n\t"
			"pxor %%mm5, %%mm5		\n\t"
			"2:				\n\t"
			"movq (%1), %%mm1		\n\t"
			"movq (%1, %6), %%mm3		\n\t"
			"movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
			"movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"paddd %%mm3, %%mm5		\n\t"
			"paddd %%mm0, %%mm4		\n\t"
			"add $8, %1			\n\t"
			"add $4, %%"REG_c"		\n\t"
			"cmp %4, %%"REG_c"		\n\t"
			" jb 2b				\n\t"
			"add %6, %1			\n\t"
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm5		\n\t"
			"packssdw %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"packssdw %%mm4, %%mm4		\n\t"
			"mov %3, %%"REG_a"		\n\t"
			"movd %%mm4, (%%"REG_a", %0)	\n\t"
			"add $4, %0			\n\t"
			" jnc 1b			\n\t"

			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(offset),
			  "m" (src), "r" (filterSize*2)
			: "%"REG_b, "%"REG_a, "%"REG_c
		);
	}
#else
#ifdef HAVE_ALTIVEC
	hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
	/* plain C reference implementation */
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
//		printf("filterPos: %d\n", filterPos[i]);
		for(j=0; j<filterSize; j++)
		{
//			printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
//		filter += hFilterSize;
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//		dst[i] = val>>7;
	}
#endif
#endif
}
2210 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma (Y) line into the 16-bit temp buffer dst.
 * Packed YUV / RGB source formats are first converted to an 8-bit gray
 * line in formatConvBuffer. If SWS_FAST_BILINEAR is off (or MMX is built
 * but MMX2 can't be used), the accurate hScale() filter runs; otherwise a
 * fast-bilinear path is taken: the code pointed to by funnyYCode (MMX2),
 * hand-written x86 asm, or a plain C loop.
 * Output samples are input<<7 fixed point (see the C fallback).
 */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert non-planar inputs to an 8-bit luma line first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
    	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

#ifdef ARCH_X86_64

/* one segment: load the source advance from mmx2FilterPos, call the code
   at funnyYCode, then advance the source and destination pointers */
#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c"	\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#else

#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* rightmost pixels would read past srcW-1: fill with the last sample */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	int xInc_shr16 = xInc >> 16;
	int xInc_mask = xInc & 0xffff;
	//NO MMX just normal asm ...
	/* two interpolations per iteration; the x position is kept as
	   integer part in REG_b and 16-bit fraction in cx ("adc" folds the
	   fraction carry into the integer step) */
	asm volatile(
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		ASMALIGN16
		"1:				\n\t"
		"movzbl (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry


		"add $2, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
		" jb 1b				\n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* portable C fallback: 16.16 fixed-point source position */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
    }
}
2382
/*
 * Horizontally scale the two chroma lines (src1 = U source, src2 = V
 * source) into the 16-bit temp buffer: U goes to dst[0..], V to
 * dst+2048 (the asm addresses it as a 4096 byte offset).
 * Packed YUV / RGB inputs are first split into two 8-bit chroma lines in
 * formatConvBuffer (+2048); gray input has no chroma, so it returns early.
 * Path selection mirrors hyscale(): hScale() for the accurate mode,
 * otherwise MMX2 funnyUVCode, hand-written x86 asm, or plain C.
 */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
    	return;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
    	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

#ifdef ARCH_X86_64

/* one segment: load the source advance from mmx2FilterPos, call the code
   at funnyUVCode, then advance the source and destination pointers */
#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c"	\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#else

#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#endif

/* first the U plane ... */
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			/* ... then restart with the V source and dst+4096 bytes */
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			"mov %5, %%"REG_c"		\n\t" // src
			"mov %1, %%"REG_D"		\n\t" // buf1
			"add $4096, %%"REG_D"		\n\t"
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* rightmost pixels would read past srcW-1: fill with the last sample */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
	long xInc_shr16 = (long) (xInc >> 16);
	int xInc_mask = xInc & 0xffff;
	/* interpolate one U and one V sample per iteration; position kept
	   as integer part in REG_b plus 16-bit fraction in cx */
	asm volatile(
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		ASMALIGN16
		"1:				\n\t"
		"mov %0, %%"REG_S"		\n\t"
		"movzbl (%%"REG_S", %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%%"REG_S", %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

		"movzbl (%5, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%5, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry
		"add $1, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
		" jb 1b				\n\t"

/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
		:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
		:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
		"r" (src2)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* portable C fallback: 16.16 fixed-point source position */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
	/* slower
	   dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	   dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
	*/
		xpos+=xInc;
	}
#endif
   }
}
2588
2589 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2590 int srcSliceH, uint8_t* dst[], int dstStride[]){
2591
2592 /* load a few things into local vars to make the code more readable? and faster */
2593 const int srcW= c->srcW;
2594 const int dstW= c->dstW;
2595 const int dstH= c->dstH;
2596 const int chrDstW= c->chrDstW;
2597 const int chrSrcW= c->chrSrcW;
2598 const int lumXInc= c->lumXInc;
2599 const int chrXInc= c->chrXInc;
2600 const int dstFormat= c->dstFormat;
2601 const int srcFormat= c->srcFormat;
2602 const int flags= c->flags;
2603 const int canMMX2BeUsed= c->canMMX2BeUsed;
2604 int16_t *vLumFilterPos= c->vLumFilterPos;
2605 int16_t *vChrFilterPos= c->vChrFilterPos;
2606 int16_t *hLumFilterPos= c->hLumFilterPos;
2607 int16_t *hChrFilterPos= c->hChrFilterPos;
2608 int16_t *vLumFilter= c->vLumFilter;
2609 int16_t *vChrFilter= c->vChrFilter;
2610 int16_t *hLumFilter= c->hLumFilter;
2611 int16_t *hChrFilter= c->hChrFilter;
2612 int32_t *lumMmxFilter= c->lumMmxFilter;
2613 int32_t *chrMmxFilter= c->chrMmxFilter;
2614 const int vLumFilterSize= c->vLumFilterSize;
2615 const int vChrFilterSize= c->vChrFilterSize;
2616 const int hLumFilterSize= c->hLumFilterSize;
2617 const int hChrFilterSize= c->hChrFilterSize;
2618 int16_t **lumPixBuf= c->lumPixBuf;
2619 int16_t **chrPixBuf= c->chrPixBuf;
2620 const int vLumBufSize= c->vLumBufSize;
2621 const int vChrBufSize= c->vChrBufSize;
2622 uint8_t *funnyYCode= c->funnyYCode;
2623 uint8_t *funnyUVCode= c->funnyUVCode;
2624 uint8_t *formatConvBuffer= c->formatConvBuffer;
2625 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2626 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2627 int lastDstY;
2628
2629 /* vars whch will change and which we need to storw back in the context */
2630 int dstY= c->dstY;
2631 int lumBufIndex= c->lumBufIndex;
2632 int chrBufIndex= c->chrBufIndex;
2633 int lastInLumBuf= c->lastInLumBuf;
2634 int lastInChrBuf= c->lastInChrBuf;
2635
2636 if(isPacked(c->srcFormat)){
2637 src[0]=
2638 src[1]=
2639 src[2]= src[0];
2640 srcStride[0]=
2641 srcStride[1]=
2642 srcStride[2]= srcStride[0];
2643 }
2644 srcStride[1]<<= c->vChrDrop;
2645 srcStride[2]<<= c->vChrDrop;
2646
2647 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2648 // (int)dst[0], (int)dst[1], (int)dst[2]);
2649
2650 #if 0 //self test FIXME move to a vfilter or something
2651 {
2652 static volatile int i=0;
2653 i++;
2654 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2655 selfTest(src, srcStride, c->srcW, c->srcH);
2656 i--;
2657 }
2658 #endif
2659
2660 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2661 //dstStride[0],dstStride[1],dstStride[2]);
2662
2663 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2664 {
2665 static int firstTime=1; //FIXME move this into the context perhaps
2666 if(flags & SWS_PRINT_INFO && firstTime)
2667 {
2668 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2669 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2670 firstTime=0;
2671 }
2672 }
2673
2674 /* Note the user might start scaling the picture in the middle so this will not get executed
2675 this is not really intended but works currently, so ppl might do it */
2676 if(srcSliceY ==0){
2677 lumBufIndex=0;
2678 chrBufIndex=0;
2679 dstY=0;
2680 lastInLumBuf= -1;
2681 lastInChrBuf= -1;
2682 }
2683
2684 lastDstY= dstY;
2685
2686 for(;dstY < dstH; dstY++){
2687 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2688 const int chrDstY= dstY>>c->chrDstVSubSample;
2689 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2690 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2691
2692 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2693 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2694 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input