postproc/swscale_template.c

/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
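/* MOVNTQ is routed through REAL_MOVNTQ so that its arguments are macro-expanded
   before the "#a"/"#b" stringification inside REAL_MOVNTQ takes place; calling
   the stringifying macro directly would paste the unexpanded argument text. */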

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset) \
"xor %%"REG_a", %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"cmp %2, %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"

#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"

/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
"xor %%"REG_a", %%"REG_a" \n\t"\
".balign 16 \n\t"\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
\
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\


#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
"psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
"psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)

#define REAL_WRITEBGR16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
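
/* For reference, the 5-6-5 packing done above amounts to, per pixel
   (a scalar sketch, not part of the original code):

       uint16_t px = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
*/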

#define REAL_WRITEBGR15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (dst))\
MOVNTQ(%%mm2, 8(dst))\
MOVNTQ(%%mm3, 16(dst))\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (dst))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(dst))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(M24A)", %%mm0 \n\t"\
"movq "MANGLE(M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (dst))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(dst))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
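
/* WRITEYUY2 interleaves the packed luma bytes (mm1/mm7) with the U/V bytes
   (mm3/mm4) so that memory receives the YUY2 byte order
   Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... */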


static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
            :: "r" (&c->redDither),
               "r" (uDest), "p" ((long)chrDstW)
            : "%"REG_a, "%"REG_d, "%"REG_S
        );

        asm volatile(
            YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
            :: "r" (&c->redDither),
               "r" (vDest), "p" ((long)chrDstW)
            : "%"REG_a, "%"REG_d, "%"REG_S
        );
    }

    asm volatile(
        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
        :: "r" (&c->redDither),
           "r" (dest), "p" ((long)dstW)
        : "%"REG_a, "%"REG_d, "%"REG_S
    );
#else
#ifdef HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
               "g" ((long)-chrDstW)
            : "%"REG_a
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
               "g" ((long)-chrDstW)
            : "%"REG_a
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
           "g" ((long)-dstW)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        if(val&256){
            if(val<0) val=0;
            else      val=255;
        }

        dest[i]= val;
    }

    if(uDest != NULL)
        for(i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            if((u|v)&256){
                if(u<0)         u=0;
                else if (u>255) u=255;
                if(v<0)         v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, int dstW, int dstY)
{
    int dummy=0;
    switch(c->dstFormat)
    {
#ifdef HAVE_MMX
    case IMGFMT_BGR32:
        {
            asm volatile(
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_BGR24:
        {
            asm volatile(
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                "add %4, %%"REG_b" \n\t"
                WRITEBGR24(%%REGb, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
            );
        }
        break;
    case IMGFMT_BGR15:
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR15(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_BGR16:
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_YUY2:
        {
            asm volatile(
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
#endif
    default:
#ifdef HAVE_ALTIVEC
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
                             dest, dstW, dstY);
#else
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
#endif
        break;
    }
}

/**
 * vertical bilinear scale YV12 to RGB
 */
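/* The bilinear blend computed below (MMX and C paths alike) is, per sample,
   approximately

       Y = (buf0[i]*yalpha1   + buf1[i]*yalpha  ) >> 19
       U = (uvbuf0[i]*uvalpha1 + uvbuf1[i]*uvalpha) >> 19

   with yalpha1 = 4095 - yalpha (written as yalpha^4095 below), i.e. a weighted
   average of the two source lines before the YUV->RGB step. */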
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;
    int i;

#if 0 //isn't used
    if(flags&SWS_FULL_CHR_H_INT)
    {
        switch(dstFormat)
        {
#ifdef HAVE_MMX
        case IMGFMT_BGR32:
            asm volatile(

                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%REGa, 4))
                MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case IMGFMT_BGR24:
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
                "psrlq $8, %%mm3 \n\t" // GR0BGR00
                "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
                "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
                "por %%mm2, %%mm3 \n\t" // BGRBGR00
                "movq %%mm1, %%mm2 \n\t"
                "psllq $48, %%mm1 \n\t" // 000000BG
                "por %%mm1, %%mm3 \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
                "psrld $16, %%mm2 \n\t" // R000R000
                "psrlq $24, %%mm1 \n\t" // 0BGR0000
                "por %%mm2, %%mm1 \n\t" // RBGRR000

                "mov %4, %%"REG_b" \n\t"
                "add %%"REG_a", %%"REG_b" \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
                "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
                "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
                "psrlq $32, %%mm3 \n\t"
                "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
                "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a, "%"REG_b
            );
            break;
        case IMGFMT_BGR15:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $2, %%mm1 \n\t"
                "psllw $7, %%mm0 \n\t"
                "pand "MANGLE(g15Mask)", %%mm1 \n\t"
                "pand "MANGLE(r15Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case IMGFMT_BGR16:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3 \n\t"
                "psllw $3, %%mm1 \n\t"
                "psllw $8, %%mm0 \n\t"
                "pand "MANGLE(g16Mask)", %%mm1 \n\t"
                "pand "MANGLE(r16Mask)", %%mm0 \n\t"

                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a" \n\t"
                "cmp %5, %%"REG_a" \n\t"
                " jb 1b \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
#endif
        case IMGFMT_RGB32:
#ifndef HAVE_MMX
        case IMGFMT_BGR32:
#endif
            if(dstFormat==IMGFMT_BGR32)
            {
                int i;
#ifdef WORDS_BIGENDIAN
                dest++;
#endif
                for(i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 4;
                }
            }
            else if(dstFormat==IMGFMT_BGR24)
            {
                int i;
                for(i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 3;
                }
            }
            else if(dstFormat==IMGFMT_BGR16)
            {
                int i;
                for(i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                }
            }
            else if(dstFormat==IMGFMT_BGR15)
            {
                int i;
                for(i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                }
            }
    }//FULL_UV_IPOL
    else
    {
#endif // if 0
#ifdef HAVE_MMX
    switch(c->dstFormat)
    {
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    case IMGFMT_BGR32:
        asm volatile(
            "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_SP" \n\t"
            YSCALEYUV2RGB(%%REGa, %5)
            WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
            "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
        );
        return;
    case IMGFMT_BGR24:
        asm volatile(
            "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_SP" \n\t"
            YSCALEYUV2RGB(%%REGa, %5)
            WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
            "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
        );
        return;
    case IMGFMT_BGR15:
        asm volatile(
            "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_SP" \n\t"
            YSCALEYUV2RGB(%%REGa, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
            "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
        );
        return;
    case IMGFMT_BGR16:
        asm volatile(
            "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_SP" \n\t"
            YSCALEYUV2RGB(%%REGa, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
            "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
        );
        return;
    case IMGFMT_YUY2:
        asm volatile(
            "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_SP" \n\t"
            YSCALEYUV2PACKED(%%REGa, %5)
            WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
            "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
        );
        return;
    default: break;
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    if(flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels), but it is a bit faster
    {
        switch(dstFormat)
        {
        case IMGFMT_BGR32:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR24:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR15:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR16:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_YUY2:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2PACKED1(%%REGa, %5)
                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        }
    }
    else
    {
        switch(dstFormat)
        {
        case IMGFMT_BGR32:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR24:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR15:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_BGR16:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        case IMGFMT_YUY2:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2PACKED1b(%%REGa, %5)
                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                   "r" (&c->redDither)
                : "%"REG_a
            );
            return;
        }
    }
#endif
    if( uvalpha < 2048 )
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}

//FIXME yuy2* can read up to 7 samples too many

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm2\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)
        PAVGB(%%mm3, %%mm1)
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
        dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
    }
#endif
}

// This is almost identical to the previous function and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)
        PAVGB(%%mm3, %%mm1)
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
        dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
    }
#endif
}

static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int b= ((uint32_t*)src)[i]&0xFF;
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
        int r= (((uint32_t*)src)[i]>>16)&0xFF;

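        /* 33<<(RGB2YUV_SHIFT-1) equals 16.5<<RGB2YUV_SHIFT: it folds the +16
           luma offset and the +0.5 rounding term into a single constant. */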
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int c= ((uint32_t*)src2)[2*i+0];
        const int d= ((uint32_t*)src2)[2*i+1];
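        /* l and h accumulate the 2x2 block sums of the (B,R) and G channels in
           packed form; each per-channel sum is at most 4*255 = 1020, so the
           blue sum fits in the low 10 bits (hence the 0x3FF mask) without
           spilling into the red field. The >>(RGB2YUV_SHIFT+2) below then
           divides the four-pixel sums by 4. */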
        const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
        const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
        const int b= l&0x3FF;
        const int g= h>>8;
        const int r= l>>16;

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
    }
}

static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0, %%"REG_b") \n\t"
        "movd (%0, %%"REG_b"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

        "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_b" \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
        : "%"REG_a, "%"REG_b
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
#endif
}
1735
1736 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1737 {
1738 #ifdef HAVE_MMX
1739 asm volatile(
1740 "mov %4, %%"REG_a" \n\t"
1741 "movq "MANGLE(w1111)", %%mm5 \n\t"
1742 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1743 "pxor %%mm7, %%mm7 \n\t"
1744 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1745 "add %%"REG_b", %%"REG_b" \n\t"
1746 ".balign 16 \n\t"
1747 "1: \n\t"
1748 PREFETCH" 64(%0, %%"REG_b") \n\t"
1749 PREFETCH" 64(%1, %%"REG_b") \n\t"
1750 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1751 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1752 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1753 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1754 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
1755 PAVGB(%%mm1, %%mm0)
1756 PAVGB(%%mm3, %%mm2)
1757 "movq %%mm0, %%mm1 \n\t"
1758 "movq %%mm2, %%mm3 \n\t"
1759 "psrlq $24, %%mm0 \n\t"
1760 "psrlq $24, %%mm2 \n\t"
1761 PAVGB(%%mm1, %%mm0)
1762 PAVGB(%%mm3, %%mm2)
1763 "punpcklbw %%mm7, %%mm0 \n\t"
1764 "punpcklbw %%mm7, %%mm2 \n\t"
1765 #else
1766 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1767 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1768 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1769 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1770 "punpcklbw %%mm7, %%mm0 \n\t"
1771 "punpcklbw %%mm7, %%mm1 \n\t"
1772 "punpcklbw %%mm7, %%mm2 \n\t"
1773 "punpcklbw %%mm7, %%mm3 \n\t"
1774 "paddw %%mm1, %%mm0 \n\t"
1775 "paddw %%mm3, %%mm2 \n\t"
1776 "paddw %%mm2, %%mm0 \n\t"
1777 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1778 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1779 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1780 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1781 "punpcklbw %%mm7, %%mm4 \n\t"
1782 "punpcklbw %%mm7, %%mm1 \n\t"
1783 "punpcklbw %%mm7, %%mm2 \n\t"
1784 "punpcklbw %%mm7, %%mm3 \n\t"
1785 "paddw %%mm1, %%mm4 \n\t"
1786 "paddw %%mm3, %%mm2 \n\t"
1787 "paddw %%mm4, %%mm2 \n\t"
1788 "psrlw $2, %%mm0 \n\t"
1789 "psrlw $2, %%mm2 \n\t"
1790 #endif
1791 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1792 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1793
1794 "pmaddwd %%mm0, %%mm1 \n\t"
1795 "pmaddwd %%mm2, %%mm3 \n\t"
1796 "pmaddwd %%mm6, %%mm0 \n\t"
1797 "pmaddwd %%mm6, %%mm2 \n\t"
1798 #ifndef FAST_BGR2YV12
1799 "psrad $8, %%mm0 \n\t"
1800 "psrad $8, %%mm1 \n\t"
1801 "psrad $8, %%mm2 \n\t"
1802 "psrad $8, %%mm3 \n\t"
1803 #endif
1804 "packssdw %%mm2, %%mm0 \n\t"
1805 "packssdw %%mm3, %%mm1 \n\t"
1806 "pmaddwd %%mm5, %%mm0 \n\t"
1807 "pmaddwd %%mm5, %%mm1 \n\t"
1808 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1809 "psraw $7, %%mm0 \n\t"
1810
1811 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1812 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1813 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1814 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1815 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1816 PAVGB(%%mm1, %%mm4)
1817 PAVGB(%%mm3, %%mm2)
1818 "movq %%mm4, %%mm1 \n\t"
1819 "movq %%mm2, %%mm3 \n\t"
1820 "psrlq $24, %%mm4 \n\t"
1821 "psrlq $24, %%mm2 \n\t"
1822 PAVGB(%%mm1, %%mm4)
1823 PAVGB(%%mm3, %%mm2)
1824 "punpcklbw %%mm7, %%mm4 \n\t"
1825 "punpcklbw %%mm7, %%mm2 \n\t"
1826 #else
1827 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1828 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1829 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1830 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "punpcklbw %%mm7, %%mm1 \n\t"
1833 "punpcklbw %%mm7, %%mm2 \n\t"
1834 "punpcklbw %%mm7, %%mm3 \n\t"
1835 "paddw %%mm1, %%mm4 \n\t"
1836 "paddw %%mm3, %%mm2 \n\t"
1837 "paddw %%mm2, %%mm4 \n\t"
1838 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1839 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1840 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1841 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1842 "punpcklbw %%mm7, %%mm5 \n\t"
1843 "punpcklbw %%mm7, %%mm1 \n\t"
1844 "punpcklbw %%mm7, %%mm2 \n\t"
1845 "punpcklbw %%mm7, %%mm3 \n\t"
1846 "paddw %%mm1, %%mm5 \n\t"
1847 "paddw %%mm3, %%mm2 \n\t"
1848 "paddw %%mm5, %%mm2 \n\t"
1849 "movq "MANGLE(w1111)", %%mm5 \n\t"
1850 "psrlw $2, %%mm4 \n\t"
1851 "psrlw $2, %%mm2 \n\t"
1852 #endif
1853 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1854 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1855
1856 "pmaddwd %%mm4, %%mm1 \n\t"
1857 "pmaddwd %%mm2, %%mm3 \n\t"
1858 "pmaddwd %%mm6, %%mm4 \n\t"
1859 "pmaddwd %%mm6, %%mm2 \n\t"
1860 #ifndef FAST_BGR2YV12
1861 "psrad $8, %%mm4 \n\t"
1862 "psrad $8, %%mm1 \n\t"
1863 "psrad $8, %%mm2 \n\t"
1864 "psrad $8, %%mm3 \n\t"
1865 #endif
1866 "packssdw %%mm2, %%mm4 \n\t"
1867 "packssdw %%mm3, %%mm1 \n\t"
1868 "pmaddwd %%mm5, %%mm4 \n\t"
1869 "pmaddwd %%mm5, %%mm1 \n\t"
1870 "add $24, %%"REG_b" \n\t"
1871 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1872 "psraw $7, %%mm4 \n\t"
1873
1874 "movq %%mm0, %%mm1 \n\t"
1875 "punpckldq %%mm4, %%mm0 \n\t"
1876 "punpckhdq %%mm4, %%mm1 \n\t"
1877 "packsswb %%mm1, %%mm0 \n\t"
1878 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1879
1880 "movd %%mm0, (%2, %%"REG_a") \n\t"
1881 "punpckhdq %%mm0, %%mm0 \n\t"
1882 "movd %%mm0, (%3, %%"REG_a") \n\t"
1883 "add $4, %%"REG_a" \n\t"
1884 " js 1b \n\t"
1885 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1886 : "%"REG_a, "%"REG_b
1887 );
1888 #else
1889 int i;
1890 for(i=0; i<width; i++)
1891 {
1892 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1894 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1895
1896 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1897 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1898 }
1899 #endif
1900 }
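/* Note: the MMX2/3DNow path above uses PAVGB (rounded (a+b+1)>>1) to average
   first vertically (the two source lines) and then horizontally (the pixel
   3 bytes on), while the plain-MMX path and the C reference sum all four
   samples and divide by 4 through the >>(RGB2YUV_SHIFT+2); the two variants
   can therefore differ by a rounding step. */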

static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int d= ((uint16_t*)src)[i];
        int b= d&0x1F;
        int g= (d>>5)&0x3F;
        int r= (d>>11)&0x1F;

        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
    }
}

static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];
        int d1= ((uint32_t*)src2)[i];

        int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;
        int r= (d>>11)&0x7F;
        int g= d>>21;
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
    }
}
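/* RGB565 SWAR above: each 32-bit load covers two 16bpp pixels. The mask
   0x07E0F81F keeps R and B of the low pixel and G of the high pixel, and
   (d>>5)&0x07C0F83F keeps the remaining fields, so two adds sum all four
   pixels per channel without unpacking; dh2 realigns the shifted half so
   that b, r and g come out as 4-pixel sums of the 5/6-bit components. The
   shift (RGB2YUV_SHIFT+2-2) combines /4 for the averaging with the *4
   needed to rescale 6-bit G towards 8 bits; the extra factor for the
   5-bit R and B lives in the 2* on their coefficients. */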

static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int d= ((uint16_t*)src)[i];
        int b= d&0x1F;
        int g= (d>>5)&0x1F;
        int r= (d>>10)&0x1F;

        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
    }
}

static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];
        int d1= ((uint32_t*)src2)[i];

        int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
        int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;
        int r= (d>>10)&0x7F;
        int g= d>>21;
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
    }
}
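/* Same scheme for RGB555: here all components are 5 bits wide, so no channel
   needs the extra 2* coefficient factor and the shift becomes
   (RGB2YUV_SHIFT+2-3), i.e. /4 for the 4-pixel sum combined with the *8
   that rescales 5-bit values towards 8 bits. */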


static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int r= ((uint32_t*)src)[i]&0xFF;
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
        int b= (((uint32_t*)src)[i]>>16)&0xFF;

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
}

static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int c= ((uint32_t*)src2)[2*i+0];
        const int d= ((uint32_t*)src2)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
        const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
        const int r= l&0x3FF;
        const int g= h>>8;
        const int b= l>>16;

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
    }
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
    }
}
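/* The rgb* variants above mirror the bgr* ones with the roles of the R and B
   channels swapped, so all the fixed-point reasoning carries over unchanged. */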


// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if(filterSize==4) // always true for upscaling, sometimes for downscaling too
    {
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm3, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%"REG_b
        );
    }
    else if(filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"

            "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"

            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm3, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%"REG_b
        );
    }
    else
    {
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        // filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
            "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm5 \n\t"
            "packssdw %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((long)filterSize*2)
            : "%"REG_b, "%"REG_a, "%"REG_c
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        // printf("filterPos: %d\n", filterPos[i]);
        for(j=0; j<filterSize; j++)
        {
            // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        // filter += hFilterSize;
        dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
        // dst[i] = val>>7;
    }
#endif
#endif
}
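/* hScale output: dst[] holds the scaled line as 15-bit unsigned fixed point,
   i.e. source value << 7 (the unscaled paths below produce the same scale
   directly via *128). The horizontal filters are presumably normalized
   elsewhere to sum to 1<<14, so the C path's val>>7 and the MMX paths'
   psrad $8 followed by the pmaddwd with w02 (which appears to hold words
   of 2, doubling while summing pairs) land on the same scale, up to one
   rounding bit. */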
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
    int i;
    if(canMMX2BeUsed)
    {
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif

            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE
            FUNNY_Y_CODE

            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyYCode)
            : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
        );
        for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    }
    else
    {
#endif
    int xInc_shr16 = xInc >> 16;
    int xInc_mask = xInc & 0xffff;
    // no MMX, just plain asm ...
    asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_b", %%"REG_b" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ".balign 16 \n\t"
        "1: \n\t"
        "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry

        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
    );
#ifdef HAVE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for(i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif
    }
}
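/* Fast bilinear fixed-point recap: xpos advances in 16.16 steps, xalpha is
   the top 7 bits of the fraction, and
       dst[i] = (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha
              = src[xx]*(128-xalpha) + src[xx+1]*xalpha.
   E.g. src[xx]=100, src[xx+1]=200, xalpha=64 (halfway) gives
   12800 + 6400 = 19200 = 150<<7, again on the <<7 scale used by hScale. */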

inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
        return;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
    int i;
    if(canMMX2BeUsed)
    {
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif

            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            "mov %5, %%"REG_c" \n\t" // src
            "mov %1, %%"REG_D" \n\t" // buf1
            "add $4096, %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE
            FUNNY_UV_CODE

            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyUVCode), "m" (src2)
            : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
        );
        for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        {
            // printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+2048] = src2[srcW-1]*128;
        }
    }
    else
    {
#endif
    long xInc_shr16 = (long) (xInc >> 16);
    int xInc_mask = xInc & 0xffff;
    asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_b", %%"REG_b" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ".balign 16 \n\t"
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

        "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

        :: "m" (src1), "m" (dst), "mp" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
           "r" (src2)
        : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
    );
#ifdef HAVE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for(i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif
    }
}
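/* hcscale writes U to dst[0..] and V to dst[2048..], i.e. 2048 int16_t
   (4096 bytes) apart, which is where the "add $4096" / "4096(...)" offsets
   in the asm above come from; similarly the 8-bit format converters place
   their second plane at formatConvBuffer+2048. */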

static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // = ceil(srcSliceH / 2^chrSrcVSubSample)
    int lastDstY;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if(isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //        (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //       dstStride[0],dstStride[1],dstStride[2]);

    if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if(flags & SWS_PRINT_INFO && firstTime)
        {
            MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                     "SwScaler: -> cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note: the user might start scaling in the middle of the picture, so this
       will not always get executed. That is not really intended, but it works
       currently, so people might rely on it. */
    if(srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for(;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        //       dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
        ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

        // Do we have enough lines in this slice to output the dstY line?
        if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                // printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                    flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                    funnyYCode, c->srcFormat, formatConvBuffer,
                    c->lumMmx2Filter, c->lumMmx2FilterPos);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                //FIXME pass the parameters through the context struct (at least some of them)

                if(!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                        flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                        funnyUVCode, c->srcFormat, formatConvBuffer,
                        c->chrMmx2Filter, c->chrMmx2FilterPos);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest into the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
               firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
               lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
               vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                    flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                    funnyYCode, c->srcFormat, formatConvBuffer,
                    c->lumMmx2Filter, c->lumMmx2FilterPos);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                if(!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                        flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                        funnyUVCode, c->srcFormat, formatConvBuffer,
                        c->chrMmx2Filter, c->chrMmx2FilterPos);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        b5Dither= dither8[dstY&1];
        g6Dither= dither4[dstY&1];
        g5Dither= dither8[dstY&1];
        r5Dither= dither8[(dstY+1)&1];
#endif
        if(dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
            for(i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for(i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
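            /* lumMmxFilter/chrMmxFilter layout (4 int32 per vertical tap):
               slot 0 holds the source line pointer, slot 1 is left untouched
               here, and slots 2/3 hold the 16-bit coefficient duplicated into
               both halves of a dword via *0x10001, so the vertical scaler can
               fetch one 8-byte qword of four identical coefficient words per
               tap. */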
#endif
            if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
                if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                }
                else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                }
                else //General RGB
                {
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
        else // looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions into luma / chroma
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions into luma / chroma
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
            }
        }
    }

#ifdef HAVE_MMX
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY; /* number of output lines written for this slice */
}