Commit | Line | Data |
---|---|---|
fe8054c0 | 1 | /* |
d026b45e DB |
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with FFmpeg; if not, write to the Free Software | |
b19bcbaa | 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
d026b45e | 19 | * |
8a322796 DB |
20 | * The C code (not assembly, MMX, ...) of this file can be used |
21 | * under the LGPL license. | |
d026b45e | 22 | */ |
783e9cc9 | 23 | |
/* Undefine any per-variant helper macros so this template can be included
 * several times with different optimization settings (MMX, MMX2, 3DNow!). */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31 | ||
/* Instruction used to leave MMX state before FPU code runs again. */
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38 | ||
/* Read/write prefetch hints for the target CPU; plain comments ("# nop")
 * when no prefetch instruction is available, so the asm strings stay valid. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49 | ||
/* Store fence, needed after non-temporal stores (movntq); only exists with
 * MMX2/SSE, otherwise an asm comment that assembles to nothing. */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
d3f41512 | 55 | |
/* Packed unsigned byte average: pavgb on MMX2, pavgusb on 3DNow!.
 * Intentionally undefined for plain MMX (callers are guarded). */
#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
d3f41512 | 61 | |
/* 64-bit store: non-temporal movntq on MMX2, plain movq otherwise.
 * MOVNTQ is a one-level wrapper so arguments are macro-expanded before
 * REAL_MOVNTQ stringifies them with #a / #b. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
d604bab9 | 68 | |
/* Pull in the AltiVec implementations on PPC builds. */
#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
72 | ||
/* Vertical scaling to planar YV12: for each group of 8 output pixels,
 * walk the vertical filter list (a sequence of {src pointer, coefficient}
 * pairs terminated by a NULL pointer — hence the test/jnz on REG_S),
 * accumulate pmulhw products on top of the rounder, then shift, pack and
 * store 8 bytes.  x = byte offset into each source line, offset = filter
 * list offset inside the context (%0), dest/width = output pointer/width.
 * NOTE(review): the movq/lea/mov between "cmp" and "jb" do not modify
 * EFLAGS, so the cmp result survives to the branch. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
bca11e75 MN |
108 | |
/* Higher-precision variant of YSCALEYUV2YV12X: processes two filter taps
 * per iteration using the packed APCK_* filter layout (ptr1 at 0,
 * ptr2 at APCK_PTR2, shared coefficient pair at APCK_COEF), accumulating
 * 32-bit sums with pmaddwd in mm4..mm7 before shifting back to 16 bits,
 * rounding, packing and storing 8 output bytes. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
c1b0bfb4 MN |
170 | |
/* Unscaled 1:1 output: take 16-bit intermediate samples, shift them down
 * by 7 (truncating, no rounding) and pack 16 of them to 8 bytes per loop.
 * Loop counter REG_a starts negative (set from %2) and counts up to 0. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
c1b0bfb4 | 183 | |
bf2bdde6 MN |
/* Like YSCALEYUV2YV121 but with rounding: mm7 is built as 0x0040 in each
 * word lane (pcmpeqw -> all ones, psrlw 15 -> 1, psllw 6 -> 64) and added
 * with saturation before the >>7, so the shift rounds to nearest. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
201 | ||
c1b0bfb4 | 202 | /* |
2da0d70d DB |
203 | :: "m" (-lumFilterSize), "m" (-chrFilterSize), |
204 | "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
205 | "r" (dest), "m" (dstW), | |
206 | "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
207 | : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
c1b0bfb4 | 208 | */ |
/* Chroma part of the packed-output vertical scaler: walks the NULL-
 * terminated chroma filter list, accumulating U into mm3 and V into mm4
 * (V lives VOF bytes after U in each source line).  Leaves the results
 * for a following *_YA / conversion macro; labels 1:/2: form the outer
 * pixel loop and inner filter loop. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"
/* Luma part of the packed-output vertical scaler: same filter-list walk
 * as the _UV half, but the working registers are macro parameters so the
 * macro can be reused for the alpha plane as well (offset selects the
 * luma or alpha filter list inside the context). */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"
df57ab14 CS |
252 | #define YSCALEYUV2PACKEDX \ |
253 | YSCALEYUV2PACKEDX_UV \ | |
fe91924d | 254 | YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ |
df57ab14 | 255 | |
2da0d70d DB |
256 | #define YSCALEYUV2PACKEDX_END \ |
257 | :: "r" (&c->redDither), \ | |
258 | "m" (dummy), "m" (dummy), "m" (dummy),\ | |
259 | "r" (dest), "m" (dstW) \ | |
260 | : "%"REG_a, "%"REG_d, "%"REG_S \ | |
261 | ); | |
8422aa88 | 262 | |
/* High-precision chroma pass (pmaddwd over APCK_* packed taps); the
 * finished 16-bit U/V rows are parked in the context's U_TEMP/V_TEMP
 * scratch slots because the following luma pass needs all MMX registers. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"
df57ab14 CS |
312 | |
/* High-precision luma pass: mirrors the _UV pass, accumulating Y1 in
 * mm1/mm5 and Y2 in mm7/mm6, then reloads the parked U/V results from
 * U_TEMP/V_TEMP into mm3/mm4 ready for the RGB conversion macro. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"
bca11e75 | 357 | |
/* Full high-precision packed scaler: chroma pass then luma pass. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/* YUV -> RGB conversion for the packed-X path.  Expects Y1/Y2 in mm1/mm7
 * and U/V in mm3/mm4; applies the per-context offsets and coefficients
 * (U_OFFSET..Y_COEFF live at fixed offsets inside the context %0) and
 * produces packed bytes: B in mm2, R in mm5, G in mm4. */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
d604bab9 | 397 | |
/* Two-line vertical blend for packed YUV output: interpolates between
 * buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) using the alpha values at
 * *_MMX_FILTER_OFFSET+8 inside the context c (pre-shifted >>3 at entry).
 * NOTE(review): psraw $7 comments below previously claimed ">>4". */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
6a4970ab | 435 | |
/* Chroma half of the two-line RGB path: blends uvbuf0/uvbuf1 with the
 * chroma alpha, removes the 128 offset and multiplies by the green
 * coefficients, leaving mm2/mm5 = scaled (U-128)/(V-128), mm3/mm4 = ug/vg. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
786dcfef CS |
461 | #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ |
462 | "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
463 | "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
464 | "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
465 | "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
2da0d70d DB |
466 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
467 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
468 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
469 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
470 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
471 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
472 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
473 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
df57ab14 CS |
474 | |
/* Final coefficient/packing stage shared by the RGB paths: combines the
 * prepared Y/U/V terms into B (mm2), G (mm4), R (mm5) packed bytes. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-line vertical interpolation + YUV->RGB conversion
 * (buf0/buf1 are asm operands %0/%1). */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
6a4970ab | 510 | |
/* Single-source packed path (no vertical interpolation): just shift the
 * 16-bit intermediates of buf0/uvbuf0 down by 7 into Y/U/V registers. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
6a4970ab | 525 | |
/* Single-source RGB path: no vertical blend, buf0/uvbuf0 only (>>4 keeps
 * the extra precision used by the coefficient stage), then the same
 * offset/coefficient/packing sequence as the interpolating path. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
497d4f99 | 574 | |
/* Packed path averaging the two chroma lines (uvbuf0+uvbuf1 then >>8,
 * i.e. mean of the two >>7 values); luma still comes from buf0 only. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
6a4970ab | 592 | |
// do vertical chrominance interpolation
/* RGB path with averaged chroma lines (uvbuf0+uvbuf1 >>5) and luma from
 * buf0 only; otherwise identical to REAL_YSCALEYUV2RGB1. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
d604bab9 | 646 | |
/* Interleave packed B/G/R/A byte registers into 32-bit BGRA pixels and
 * store 8 pixels (32 bytes); q0/q2/q3/t are scratch registers.  Loops
 * back to label 1: (defined by the preceding YSCALEYUV2* macro) until
 * index reaches dstw. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
d604bab9 | 670 | |
/* Convert packed B (mm2), G (mm4), R (mm5) bytes to RGB565 and store
 * 8 pixels (16 bytes): mask to 5/6/5 bits (bF8/bFC tables), then
 * interleave and shift into position; mm7 must be zero on entry. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
d604bab9 | 698 | |
/* Pack B (mm2), G (mm4), R (mm5) bytes into RGB555 (15bpp) pixels and store
 * 8 pixels per iteration at dst + index*2: bF8 keeps 5 bits of each channel
 * (note G gets bF8 here vs. bFC in WRITERGB16, and R is shifted right by 1).
 * Expects mm7 == 0 for the unpacks; loops back to the caller's "1:" label. */
27a90b04 | 699 | #define REAL_WRITERGB15(dst, dstw, index) \
2da0d70d DB |
700 | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 | "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 | "psrlq $3, %%mm2 \n\t"\
704 | "psrlq $1, %%mm5 \n\t"\
d604bab9 | 705 | \
2da0d70d DB |
706 | "movq %%mm2, %%mm1 \n\t"\
707 | "movq %%mm4, %%mm3 \n\t"\
d604bab9 | 708 | \
2da0d70d DB |
709 | "punpcklbw %%mm7, %%mm3 \n\t"\
710 | "punpcklbw %%mm5, %%mm2 \n\t"\
711 | "punpckhbw %%mm7, %%mm4 \n\t"\
712 | "punpckhbw %%mm5, %%mm1 \n\t"\
d604bab9 | 713 | \
2da0d70d DB |
714 | "psllq $2, %%mm3 \n\t"\
715 | "psllq $2, %%mm4 \n\t"\
d604bab9 | 716 | \
2da0d70d DB |
717 | "por %%mm3, %%mm2 \n\t"\
718 | "por %%mm4, %%mm1 \n\t"\
d604bab9 | 719 | \
2da0d70d DB |
720 | MOVNTQ(%%mm2, (dst, index, 2))\
721 | MOVNTQ(%%mm1, 8(dst, index, 2))\
d604bab9 | 722 | \
2da0d70d DB |
723 | "add $8, "#index" \n\t"\
724 | "cmp "#dstw", "#index" \n\t"\
725 | " jb 1b \n\t"
/* Forwarding wrapper so macro arguments are expanded before substitution. */
27a90b04 | 726 | #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
f62255fb | 727 | |
/* Legacy 24bpp packer (superseded by WRITEBGR24MMX/MMX2 below, kept for
 * reference): expands B/G/R bytes to 0RGB dwords, then uses shift/mask
 * (bm* constants) to squeeze 8 pixels into three 8-byte MOVNTQ stores
 * (24 bytes). Expects mm2=B, mm4=G, mm5=R, mm7=0; advances dst by 24 and
 * index by 8, looping to the caller's "1:" label. */
6542b44e | 728 | #define WRITEBGR24OLD(dst, dstw, index) \
2da0d70d DB |
729 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 | "movq %%mm2, %%mm1 \n\t" /* B */\
731 | "movq %%mm5, %%mm6 \n\t" /* R */\
732 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
d604bab9 | 742 | \
2da0d70d DB |
743 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 | "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 | "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 | "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 | "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 | "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
d604bab9 | 751 | \
2da0d70d DB |
752 | "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 | "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 | "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 | "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 | "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 | "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 | "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 | "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 | "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 | "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 | "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 | "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
d604bab9 | 765 | \
2da0d70d DB |
766 | "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 | "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 | "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 | "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 | "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 | "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 | "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 | "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
d604bab9 | 774 | \
2da0d70d DB |
775 | MOVNTQ(%%mm0, (dst))\
776 | MOVNTQ(%%mm2, 8(dst))\
777 | MOVNTQ(%%mm3, 16(dst))\
778 | "add $24, "#dst" \n\t"\
d604bab9 | 779 | \
2da0d70d DB |
780 | "add $8, "#index" \n\t"\
781 | "cmp "#dstw", "#index" \n\t"\
782 | " jb 1b \n\t"
d604bab9 | 783 | |
/* Plain-MMX 24bpp packer: expand B/G/R bytes to 0RGB dwords, fold each into
 * a 0RGBRGB0 pair via psllq+punpckhdq, then shift/OR neighbouring pixels
 * together into three contiguous MOVNTQ stores (8 pixels = 24 bytes).
 * Expects mm2=B, mm4=G, mm5=R, mm7=0 on entry; clobbers mm0-mm7 (including
 * the mm7 zero). Advances dst by 24 and index by 8, looping to "1:". */
6542b44e | 784 | #define WRITEBGR24MMX(dst, dstw, index) \
2da0d70d DB |
785 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 | "movq %%mm2, %%mm1 \n\t" /* B */\
787 | "movq %%mm5, %%mm6 \n\t" /* R */\
788 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 | 798 | \
2da0d70d DB |
799 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 | "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 | "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 | "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
99d2cb72 | 803 | \
2da0d70d DB |
804 | "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 | "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 | "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 | "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
99d2cb72 | 808 | \
2da0d70d DB |
809 | "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 | "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 | "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 | "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
99d2cb72 | 813 | \
2da0d70d DB |
814 | "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 | "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 | "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 | MOVNTQ(%%mm0, (dst))\
99d2cb72 | 819 | \
2da0d70d DB |
820 | "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 | "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 | "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 | "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 | MOVNTQ(%%mm6, 8(dst))\
99d2cb72 | 825 | \
2da0d70d DB |
826 | "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 | "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 | "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 | MOVNTQ(%%mm5, 16(dst))\
99d2cb72 | 830 | \
2da0d70d | 831 | "add $24, "#dst" \n\t"\
99d2cb72 | 832 | \
2da0d70d DB |
833 | "add $8, "#index" \n\t"\
834 | "cmp "#dstw", "#index" \n\t"\
835 | " jb 1b \n\t"
99d2cb72 | 836 | |
/* MMX2 24bpp packer: uses pshufw to broadcast channel word pairs, then masks
 * with the ff_M24A/B/C byte-selection constants and ORs the three channels
 * directly into the interleaved 24-byte layout (three MOVNTQ stores per
 * 8 pixels). Expects mm2=B, mm4=G, mm5=R on entry; mm0/mm7 are loaded with
 * masks (the incoming mm7 zero is overwritten). Advances dst by 24 and
 * index by 8, looping to the caller's "1:" label. */
6542b44e | 837 | #define WRITEBGR24MMX2(dst, dstw, index) \
2da0d70d | 838 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
5802683a RD |
839 | "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 | "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
2da0d70d DB |
841 | "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 | "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 | "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
99d2cb72 | 844 | \
2da0d70d DB |
845 | "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 | "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 | "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
99d2cb72 | 848 | \
2da0d70d DB |
849 | "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 | "por %%mm1, %%mm6 \n\t"\
851 | "por %%mm3, %%mm6 \n\t"\
852 | MOVNTQ(%%mm6, (dst))\
99d2cb72 | 853 | \
2da0d70d DB |
854 | "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 | "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 | "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 | "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
99d2cb72 | 858 | \
5802683a | 859 | "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
2da0d70d DB |
860 | "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 | "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
99d2cb72 | 862 | \
2da0d70d DB |
863 | "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 | "por %%mm3, %%mm6 \n\t"\
865 | MOVNTQ(%%mm6, 8(dst))\
99d2cb72 | 866 | \
2da0d70d DB |
867 | "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
868 | "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 | "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
99d2cb72 | 870 | \
2da0d70d DB |
871 | "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 | "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
5802683a | 873 | "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
99d2cb72 | 874 | \
2da0d70d DB |
875 | "por %%mm1, %%mm3 \n\t"\
876 | "por %%mm3, %%mm6 \n\t"\
877 | MOVNTQ(%%mm6, 16(dst))\
99d2cb72 | 878 | \
2da0d70d | 879 | "add $24, "#dst" \n\t"\
99d2cb72 | 880 | \
2da0d70d DB |
881 | "add $8, "#index" \n\t"\
882 | "cmp "#dstw", "#index" \n\t"\
883 | " jb 1b \n\t"
99d2cb72 | 884 | |
/* Select the 24bpp writer at preprocessing time: the pshufw-based MMX2
 * variant when available, otherwise the plain-MMX shift/unpack variant. */
b63f641e | 885 | #if HAVE_MMX2
7630f2e0 | 886 | #undef WRITEBGR24
6e1c66bc | 887 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
99d2cb72 | 888 | #else
7630f2e0 | 889 | #undef WRITEBGR24
6e1c66bc | 890 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
99d2cb72 MN |
891 | #endif
892 | ||
/* Pack Y (mm1/mm7), U (mm3) and V (mm4) words into YUYV (YUY2) byte order
 * and store 8 output pixels (16 bytes) per iteration at dst + index*2.
 * Loops via add/cmp/jb back to the caller's "1:" label until index == dstw. */
6e1c66bc | 893 | #define REAL_WRITEYUY2(dst, dstw, index) \
2da0d70d DB |
894 | "packuswb %%mm3, %%mm3 \n\t"\
895 | "packuswb %%mm4, %%mm4 \n\t"\
896 | "packuswb %%mm7, %%mm1 \n\t"\
897 | "punpcklbw %%mm4, %%mm3 \n\t"\
898 | "movq %%mm1, %%mm7 \n\t"\
899 | "punpcklbw %%mm3, %%mm1 \n\t"\
900 | "punpckhbw %%mm3, %%mm7 \n\t"\
25593e29 | 901 | \
2da0d70d DB |
902 | MOVNTQ(%%mm1, (dst, index, 2))\
903 | MOVNTQ(%%mm7, 8(dst, index, 2))\
25593e29 | 904 | \
2da0d70d DB |
905 | "add $8, "#index" \n\t"\
906 | "cmp "#dstw", "#index" \n\t"\
907 | " jb 1b \n\t"
/* Forwarding wrapper so macro arguments are expanded before substitution. */
6e1c66bc | 908 | #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29 MN |
909 | |
910 | ||
/* Vertical scaling to planar YV12: applies lumFilter/chrFilter across the
 * source line arrays and writes the Y, U and V planes. Fast paths: MMX
 * (unless SWS_BITEXACT is set; SWS_ACCURATE_RND selects the higher-precision
 * variant) and AltiVec; otherwise falls through to the C implementation.
 * uDest == NULL skips chroma output (luma-only / grayscale). */
77a49659 | 911 | static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d DB |
912 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
913 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
38858470 | 914 | {
b63f641e | 915 | #if HAVE_MMX
f433c8ab | 916 | if(!(c->flags & SWS_BITEXACT)){
14014d47 MN |
917 | if (c->flags & SWS_ACCURATE_RND){
918 | if (uDest){
919 | YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
920 | YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
921 | }
bca11e75 | 922 |
14014d47 MN |
923 | YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
924 | }else{
925 | if (uDest){
926 | YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
927 | YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
928 | }
2da0d70d | 929 |
14014d47 MN |
930 | YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
931 | }
f433c8ab MN |
932 | return;
933 | }
934 | #endif
b63f641e | 935 | #if HAVE_ALTIVEC
a2faa401 | 936 | yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
2da0d70d DB |
937 | chrFilter, chrSrc, chrFilterSize,
938 | dest, uDest, vDest, dstW, chrDstW);
a2faa401 | 939 | #else //HAVE_ALTIVEC
5859233b | 940 | yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d DB |
941 | chrFilter, chrSrc, chrFilterSize,
942 | dest, uDest, vDest, dstW, chrDstW);
a2faa401 | 943 | #endif //!HAVE_ALTIVEC
c1b0bfb4 | 944 | }
2add307d | 945 | |
/* Vertical scaling to NV12/NV21 (interleaved chroma): no SIMD fast path
 * exists for this layout, so this simply forwards to the C implementation.
 * dstFormat distinguishes the UV vs. VU interleave order downstream. */
6118e52e | 946 | static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d DB |
947 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
948 | uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
6118e52e VS |
949 | {
950 | yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
2da0d70d DB |
951 | chrFilter, chrSrc, chrFilterSize,
952 | dest, uDest, dstW, chrDstW, dstFormat);
6118e52e VS |
953 | }
954 | ||
/* 1:1 vertical pass (single source line, no filtering): rounds the 16-bit
 * intermediate samples ((x+64)>>7) down to 8 bits and stores Y, U, V planes.
 * MMX fast path (skipped for SWS_BITEXACT) processes the up-to-3 planes in a
 * loop; the C fallback clips explicitly. uDest == NULL skips chroma. */
bf2bdde6 | 955 | static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
2da0d70d | 956 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
c1b0bfb4 | 957 | {
f433c8ab | 958 | int i;
b63f641e | 959 | #if HAVE_MMX
f433c8ab | 960 | if(!(c->flags & SWS_BITEXACT)){
14014d47 MN |
961 | long p= uDest ? 3 : 1;
962 | uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
963 | uint8_t *dst[3]= {dest, uDest, vDest};
d0ce212a | 964 | x86_reg counter[3] = {dstW, chrDstW, chrDstW};
2da0d70d | 965 |
14014d47 MN |
966 | if (c->flags & SWS_ACCURATE_RND){
967 | while(p--){
7ad6469e | 968 | __asm__ volatile(
14014d47 MN |
969 | YSCALEYUV2YV121_ACCURATE
970 | :: "r" (src[p]), "r" (dst[p] + counter[p]),
971 | "g" (-counter[p])
972 | : "%"REG_a
973 | );
974 | }
975 | }else{
976 | while(p--){
7ad6469e | 977 | __asm__ volatile(
14014d47 MN |
978 | YSCALEYUV2YV121
979 | :: "r" (src[p]), "r" (dst[p] + counter[p]),
980 | "g" (-counter[p])
981 | : "%"REG_a
982 | );
983 | }
984 | }
f433c8ab MN |
985 | return;
986 | }
987 | #endif
/* C fallback: the `&256` test is a fast "needs clipping" check — NOTE(review):
 * it assumes val stays within roughly [-256, 511]; values >= 512 would slip
 * through unclipped. Presumably the scaler guarantees that range — confirm. */
2da0d70d DB |
988 | for (i=0; i<dstW; i++)
989 | {
a1f3ffa3 | 990 | int val= (lumSrc[i]+64)>>7;
2da0d70d DB |
991 |
992 | if (val&256){
993 | if (val<0) val=0;
994 | else val=255;
995 | }
996 |
997 | dest[i]= val;
998 | }
999 |
1b0a4572 | 1000 | if (uDest)
2da0d70d DB |
1001 | for (i=0; i<chrDstW; i++)
1002 | {
a1f3ffa3 MN |
1003 | int u=(chrSrc[i ]+64)>>7;
1004 | int v=(chrSrc[i + VOFW]+64)>>7;
2da0d70d DB |
1005 |
1006 | if ((u|v)&256){
1007 | if (u<0) u=0;
1008 | else if (u>255) u=255;
1009 | if (v<0) v=0;
1010 | else if (v>255) v=255;
1011 | }
1012 |
1013 | uDest[i]= u;
1014 | vDest[i]= v;
1015 | }
38858470 MN |
1016 | }
1017 | ||
c1b0bfb4 | 1018 | |
d604bab9 MN |
1019 | /** |
1020 | * vertical scale YV12 to RGB | |
1021 | */ | |
/* Vertical scaling straight to a packed output format (RGB32/BGR24/RGB555/
 * RGB565/YUYV). MMX fast paths (skipped for SWS_BITEXACT) pick either the
 * accurate-rounding or fast YSCALEYUV2PACKEDX* variant per dstFormat, apply
 * optional ordered dither for the 15/16bpp targets, and write via the
 * WRITE* macros; AltiVec handles the formats its packed writer supports;
 * everything else falls back to yuv2packedXinC. */
25593e29 | 1022 | static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
2da0d70d DB |
1023 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1024 | uint8_t *dest, long dstW, long dstY)
c1b0bfb4 | 1025 | {
b63f641e | 1026 | #if HAVE_MMX
d0ce212a | 1027 | x86_reg dummy=0;
f433c8ab | 1028 | if(!(c->flags & SWS_BITEXACT)){
14014d47 MN |
1029 | if (c->flags & SWS_ACCURATE_RND){
1030 | switch(c->dstFormat){
1031 | case PIX_FMT_RGB32:
1032 | YSCALEYUV2PACKEDX_ACCURATE
1033 | YSCALEYUV2RGBX
f8a138be | 1034 | "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b | 1035 | WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d | 1036 |
14014d47 MN |
1037 | YSCALEYUV2PACKEDX_END
1038 | return;
1039 | case PIX_FMT_BGR24:
1040 | YSCALEYUV2PACKEDX_ACCURATE
1041 | YSCALEYUV2RGBX
40494418 | 1042 | "pxor %%mm7, %%mm7 \n\t"
14014d47 MN |
1043 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1044 | "add %4, %%"REG_c" \n\t"
1045 | WRITEBGR24(%%REGc, %5, %%REGa)
2da0d70d DB |
1046 |
1047 |
14014d47 MN |
1048 | :: "r" (&c->redDither),
1049 | "m" (dummy), "m" (dummy), "m" (dummy),
1050 | "r" (dest), "m" (dstW)
1051 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1052 | );
1053 | return;
1054 | case PIX_FMT_RGB555:
1055 | YSCALEYUV2PACKEDX_ACCURATE
1056 | YSCALEYUV2RGBX
40494418 | 1057 | "pxor %%mm7, %%mm7 \n\t"
14014d47 | 1058 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 | 1059 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1060 | "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1061 | "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1062 | "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d DB |
1063 | #endif
1064 |
14014d47 MN |
1065 | WRITERGB15(%4, %5, %%REGa)
1066 | YSCALEYUV2PACKEDX_END
1067 | return;
1068 | case PIX_FMT_RGB565:
1069 | YSCALEYUV2PACKEDX_ACCURATE
1070 | YSCALEYUV2RGBX
40494418 | 1071 | "pxor %%mm7, %%mm7 \n\t"
14014d47 | 1072 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
bca11e75 | 1073 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1074 | "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1075 | "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1076 | "paddusb "RED_DITHER"(%0), %%mm5\n\t"
2da0d70d DB |
1077 | #endif
1078 |
14014d47 MN |
1079 | WRITERGB16(%4, %5, %%REGa)
1080 | YSCALEYUV2PACKEDX_END
1081 | return;
1082 | case PIX_FMT_YUYV422:
1083 | YSCALEYUV2PACKEDX_ACCURATE
1084 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085 |
1086 | "psraw $3, %%mm3 \n\t"
1087 | "psraw $3, %%mm4 \n\t"
1088 | "psraw $3, %%mm1 \n\t"
1089 | "psraw $3, %%mm7 \n\t"
1090 | WRITEYUY2(%4, %5, %%REGa)
1091 | YSCALEYUV2PACKEDX_END
1092 | return;
1093 | }
1094 | }else{
1095 | switch(c->dstFormat)
1096 | {
1097 | case PIX_FMT_RGB32:
1098 | YSCALEYUV2PACKEDX
1099 | YSCALEYUV2RGBX
f8a138be | 1100 | "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b | 1101 | WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
14014d47 MN |
1102 | YSCALEYUV2PACKEDX_END
1103 | return;
1104 | case PIX_FMT_BGR24:
1105 | YSCALEYUV2PACKEDX
1106 | YSCALEYUV2RGBX
40494418 | 1107 | "pxor %%mm7, %%mm7 \n\t"
14014d47 MN |
1108 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1109 | "add %4, %%"REG_c" \n\t"
1110 | WRITEBGR24(%%REGc, %5, %%REGa)
1111 |
1112 | :: "r" (&c->redDither),
1113 | "m" (dummy), "m" (dummy), "m" (dummy),
1114 | "r" (dest), "m" (dstW)
1115 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1116 | );
1117 | return;
1118 | case PIX_FMT_RGB555:
1119 | YSCALEYUV2PACKEDX
1120 | YSCALEYUV2RGBX
40494418 | 1121 | "pxor %%mm7, %%mm7 \n\t"
14014d47 | 1122 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 | 1123 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1124 | "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1125 | "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1126 | "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d DB |
1127 | #endif
1128 |
14014d47 MN |
1129 | WRITERGB15(%4, %5, %%REGa)
1130 | YSCALEYUV2PACKEDX_END
1131 | return;
1132 | case PIX_FMT_RGB565:
1133 | YSCALEYUV2PACKEDX
1134 | YSCALEYUV2RGBX
40494418 | 1135 | "pxor %%mm7, %%mm7 \n\t"
14014d47 | 1136 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
c1b0bfb4 | 1137 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1138 | "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
1139 | "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
1140 | "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
2da0d70d DB |
1141 | #endif
1142 |
14014d47 MN |
1143 | WRITERGB16(%4, %5, %%REGa)
1144 | YSCALEYUV2PACKEDX_END
1145 | return;
1146 | case PIX_FMT_YUYV422:
1147 | YSCALEYUV2PACKEDX
1148 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149 |
1150 | "psraw $3, %%mm3 \n\t"
1151 | "psraw $3, %%mm4 \n\t"
1152 | "psraw $3, %%mm1 \n\t"
1153 | "psraw $3, %%mm7 \n\t"
1154 | WRITEYUY2(%4, %5, %%REGa)
1155 | YSCALEYUV2PACKEDX_END
1156 | return;
1157 | }
bca11e75 MN |
1158 | }
1159 | }
bc279024 | 1160 | #endif /* HAVE_MMX */
b63f641e | 1161 | #if HAVE_ALTIVEC
2da0d70d | 1162 | /* The following list of supported dstFormat values should
780daf2b | 1163 | match what's found in the body of ff_yuv2packedX_altivec() */
12794f73 KS |
1164 | if (!(c->flags & SWS_BITEXACT) &&
1165 | (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
2da0d70d | 1166 | c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
12794f73 | 1167 | c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
780daf2b DB |
1168 | ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1169 | chrFilter, chrSrc, chrFilterSize,
1170 | dest, dstW, dstY);
2da0d70d DB |
1171 | else
1172 | #endif
1173 | yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174 | chrFilter, chrSrc, chrFilterSize,
1175 | dest, dstW, dstY);
c1b0bfb4 MN |
1176 | }
1177 | ||
c1b0bfb4 MN |
1178 | /** |
1179 | * vertical bilinear scale YV12 to RGB | |
1180 | */ | |
/* Vertical bilinear blend of two source lines (weights yalpha/uvalpha out of
 * 4096) directly to a packed output line. MMX fast paths per dstFormat
 * (skipped for SWS_BITEXACT) save/restore REG_b and REG_BP around the asm
 * because the YSCALEYUV2* macros need both; 8280(%5) is DSTW_OFFSET inside
 * SwsContext (the literal is used because the preprocessor cannot expand the
 * symbol there). Unhandled formats fall through to the C blend. */
25593e29 | 1181 | static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
2da0d70d | 1182 | uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
d604bab9 | 1183 | {
ac0ad729 MN |
1184 | int yalpha1=4095- yalpha;
1185 | int uvalpha1=4095-uvalpha;
2da0d70d | 1186 | int i;
d604bab9 | 1187 |
b63f641e | 1188 | #if HAVE_MMX
f433c8ab | 1189 | if(!(c->flags & SWS_BITEXACT)){
2da0d70d DB |
1190 | switch(c->dstFormat)
1191 | {
1192 | //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193 | case PIX_FMT_RGB32:
7ad6469e | 1194 | __asm__ volatile(
2da0d70d DB |
1195 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1196 | "mov %4, %%"REG_b" \n\t"
1197 | "push %%"REG_BP" \n\t"
1198 | YSCALEYUV2RGB(%%REGBP, %5)
f8a138be | 1199 | "pcmpeqd %%mm7, %%mm7 \n\t"
9c77b26b | 1200 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
2da0d70d DB |
1201 | "pop %%"REG_BP" \n\t"
1202 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1203 |
1204 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1205 | "a" (&c->redDither)
1206 | );
1207 | return;
1208 | case PIX_FMT_BGR24:
7ad6469e | 1209 | __asm__ volatile(
2da0d70d DB |
1210 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1211 | "mov %4, %%"REG_b" \n\t"
1212 | "push %%"REG_BP" \n\t"
1213 | YSCALEYUV2RGB(%%REGBP, %5)
40494418 | 1214 | "pxor %%mm7, %%mm7 \n\t"
2da0d70d DB |
1215 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1216 | "pop %%"REG_BP" \n\t"
1217 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1218 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1219 | "a" (&c->redDither)
1220 | );
1221 | return;
27a90b04 | 1222 | case PIX_FMT_RGB555:
7ad6469e | 1223 | __asm__ volatile(
2da0d70d DB |
1224 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1225 | "mov %4, %%"REG_b" \n\t"
1226 | "push %%"REG_BP" \n\t"
1227 | YSCALEYUV2RGB(%%REGBP, %5)
40494418 | 1228 | "pxor %%mm7, %%mm7 \n\t"
2da0d70d | 1229 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 | 1230 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1231 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1232 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1233 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d DB |
1234 | #endif
1235 |
27a90b04 | 1236 | WRITERGB15(%%REGb, 8280(%5), %%REGBP)
2da0d70d DB |
1237 | "pop %%"REG_BP" \n\t"
1238 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1239 |
1240 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1241 | "a" (&c->redDither)
1242 | );
1243 | return;
27a90b04 | 1244 | case PIX_FMT_RGB565:
7ad6469e | 1245 | __asm__ volatile(
2da0d70d DB |
1246 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1247 | "mov %4, %%"REG_b" \n\t"
1248 | "push %%"REG_BP" \n\t"
1249 | YSCALEYUV2RGB(%%REGBP, %5)
40494418 | 1250 | "pxor %%mm7, %%mm7 \n\t"
2da0d70d | 1251 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
d604bab9 | 1252 | #ifdef DITHER1XBPP
88e2a9ae CEH |
1253 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1254 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1255 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
2da0d70d DB |
1256 | #endif
1257 |
27a90b04 | 1258 | WRITERGB16(%%REGb, 8280(%5), %%REGBP)
2da0d70d DB |
1259 | "pop %%"REG_BP" \n\t"
1260 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1261 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1262 | "a" (&c->redDither)
1263 | );
1264 | return;
1265 | case PIX_FMT_YUYV422:
7ad6469e | 1266 | __asm__ volatile(
2da0d70d DB |
1267 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1268 | "mov %4, %%"REG_b" \n\t"
1269 | "push %%"REG_BP" \n\t"
1270 | YSCALEYUV2PACKED(%%REGBP, %5)
1271 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1272 | "pop %%"REG_BP" \n\t"
1273 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1274 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275 | "a" (&c->redDither)
1276 | );
1277 | return;
1278 | default: break;
1279 | }
f433c8ab | 1280 | }
cf7d1c1a | 1281 | #endif //HAVE_MMX
ec1bca2a | 1282 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
d604bab9 MN |
1283 | }
1284 | ||
1285 | /** | |
1286 | * YV12 to RGB without scaling or interpolating | |
1287 | */ | |
25593e29 | 1288 | static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
2da0d70d | 1289 | uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) |
d604bab9 | 1290 | { |
2da0d70d DB |
1291 | const int yalpha1=0; |
1292 | int i; | |
6a4970ab | 1293 | |
8a322796 | 1294 | uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
2da0d70d | 1295 | const int yalpha= 4096; //FIXME ... |
96034638 | 1296 | |
2da0d70d DB |
1297 | if (flags&SWS_FULL_CHR_H_INT) |
1298 | { | |
1299 | RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1300 | return; | |
1301 | } | |
397c035e | 1302 | |
b63f641e | 1303 | #if HAVE_MMX |
f433c8ab | 1304 | if(!(flags & SWS_BITEXACT)){ |
14014d47 | 1305 | if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
2da0d70d | 1306 | { |
14014d47 MN |
1307 | switch(dstFormat) |
1308 | { | |
1309 | case PIX_FMT_RGB32: | |
7ad6469e | 1310 | __asm__ volatile( |
14014d47 MN |
1311 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1312 | "mov %4, %%"REG_b" \n\t" | |
1313 | "push %%"REG_BP" \n\t" | |
1314 | YSCALEYUV2RGB1(%%REGBP, %5) | |
f8a138be | 1315 | "pcmpeqd %%mm7, %%mm7 \n\t" |
9c77b26b | 1316 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
14014d47 MN |
1317 | "pop %%"REG_BP" \n\t" |
1318 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1319 | ||
1320 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1321 | "a" (&c->redDither) | |
1322 | ); | |
1323 | return; | |
1324 | case PIX_FMT_BGR24: | |
7ad6469e | 1325 | __asm__ volatile( |
14014d47 MN |
1326 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1327 | "mov %4, %%"REG_b" \n\t" | |
1328 | "push %%"REG_BP" \n\t" | |
1329 | YSCALEYUV2RGB1(%%REGBP, %5) | |
40494418 | 1330 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 MN |
1331 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) |
1332 | "pop %%"REG_BP" \n\t" | |
1333 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1334 | ||
1335 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1336 | "a" (&c->redDither) | |
1337 | ); | |
1338 | return; | |
1339 | case PIX_FMT_RGB555: | |
7ad6469e | 1340 | __asm__ volatile( |
14014d47 MN |
1341 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1342 | "mov %4, %%"REG_b" \n\t" | |
1343 | "push %%"REG_BP" \n\t" | |
1344 | YSCALEYUV2RGB1(%%REGBP, %5) | |
40494418 | 1345 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 | 1346 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
d604bab9 | 1347 | #ifdef DITHER1XBPP |
88e2a9ae CEH |
1348 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1349 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1350 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
2da0d70d | 1351 | #endif |
14014d47 MN |
1352 | WRITERGB15(%%REGb, 8280(%5), %%REGBP) |
1353 | "pop %%"REG_BP" \n\t" | |
1354 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
2da0d70d | 1355 | |
14014d47 MN |
1356 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), |
1357 | "a" (&c->redDither) | |
1358 | ); | |
1359 | return; | |
1360 | case PIX_FMT_RGB565: | |
7ad6469e | 1361 | __asm__ volatile( |
14014d47 MN |
1362 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1363 | "mov %4, %%"REG_b" \n\t" | |
1364 | "push %%"REG_BP" \n\t" | |
1365 | YSCALEYUV2RGB1(%%REGBP, %5) | |
40494418 | 1366 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 | 1367 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
d604bab9 | 1368 | #ifdef DITHER1XBPP |
88e2a9ae CEH |
1369 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1370 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1371 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
2da0d70d DB |
1372 | #endif |
1373 | ||
14014d47 MN |
1374 | WRITERGB16(%%REGb, 8280(%5), %%REGBP) |
1375 | "pop %%"REG_BP" \n\t" | |
1376 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
2da0d70d | 1377 | |
14014d47 MN |
1378 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), |
1379 | "a" (&c->redDither) | |
1380 | ); | |
1381 | return; | |
1382 | case PIX_FMT_YUYV422: | |
7ad6469e | 1383 | __asm__ volatile( |
14014d47 MN |
1384 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1385 | "mov %4, %%"REG_b" \n\t" | |
1386 | "push %%"REG_BP" \n\t" | |
1387 | YSCALEYUV2PACKED1(%%REGBP, %5) | |
1388 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1389 | "pop %%"REG_BP" \n\t" | |
1390 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1391 | ||
1392 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1393 | "a" (&c->redDither) | |
1394 | ); | |
1395 | return; | |
1396 | } | |
2da0d70d | 1397 | } |
14014d47 | 1398 | else |
2da0d70d | 1399 | { |
14014d47 MN |
1400 | switch(dstFormat) |
1401 | { | |
1402 | case PIX_FMT_RGB32: | |
7ad6469e | 1403 | __asm__ volatile( |
14014d47 MN |
1404 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1405 | "mov %4, %%"REG_b" \n\t" | |
1406 | "push %%"REG_BP" \n\t" | |
1407 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
f8a138be | 1408 | "pcmpeqd %%mm7, %%mm7 \n\t" |
9c77b26b | 1409 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
14014d47 MN |
1410 | "pop %%"REG_BP" \n\t" |
1411 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1412 | ||
1413 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1414 | "a" (&c->redDither) | |
1415 | ); | |
1416 | return; | |
1417 | case PIX_FMT_BGR24: | |
7ad6469e | 1418 | __asm__ volatile( |
14014d47 MN |
1419 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1420 | "mov %4, %%"REG_b" \n\t" | |
1421 | "push %%"REG_BP" \n\t" | |
1422 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
40494418 | 1423 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 MN |
1424 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) |
1425 | "pop %%"REG_BP" \n\t" | |
1426 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1427 | ||
1428 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1429 | "a" (&c->redDither) | |
1430 | ); | |
1431 | return; | |
1432 | case PIX_FMT_RGB555: | |
7ad6469e | 1433 | __asm__ volatile( |
14014d47 MN |
1434 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1435 | "mov %4, %%"REG_b" \n\t" | |
1436 | "push %%"REG_BP" \n\t" | |
1437 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
40494418 | 1438 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 | 1439 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
497d4f99 | 1440 | #ifdef DITHER1XBPP |
88e2a9ae CEH |
1441 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1442 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1443 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
2da0d70d | 1444 | #endif |
14014d47 MN |
1445 | WRITERGB15(%%REGb, 8280(%5), %%REGBP) |
1446 | "pop %%"REG_BP" \n\t" | |
1447 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
2da0d70d | 1448 | |
14014d47 MN |
1449 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), |
1450 | "a" (&c->redDither) | |
1451 | ); | |
1452 | return; | |
1453 | case PIX_FMT_RGB565: | |
7ad6469e | 1454 | __asm__ volatile( |
14014d47 MN |
1455 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1456 | "mov %4, %%"REG_b" \n\t" | |
1457 | "push %%"REG_BP" \n\t" | |
1458 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
40494418 | 1459 | "pxor %%mm7, %%mm7 \n\t" |
14014d47 | 1460 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
497d4f99 | 1461 | #ifdef DITHER1XBPP |
88e2a9ae CEH |
1462 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1463 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |
1464 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |
2da0d70d DB |
1465 | #endif |
1466 | ||
14014d47 MN |
1467 | WRITERGB16(%%REGb, 8280(%5), %%REGBP) |
1468 | "pop %%"REG_BP" \n\t" | |
1469 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
2da0d70d | 1470 | |
14014d47 MN |
1471 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), |
1472 | "a" (&c->redDither) | |
1473 | ); | |
1474 | return; | |
1475 | case PIX_FMT_YUYV422: | |
7ad6469e | 1476 | __asm__ volatile( |
14014d47 MN |
1477 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1478 | "mov %4, %%"REG_b" \n\t" | |
1479 | "push %%"REG_BP" \n\t" | |
1480 | YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1481 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1482 | "pop %%"REG_BP" \n\t" | |
1483 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1484 | ||
1485 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1486 | "a" (&c->redDither) | |
1487 | ); | |
1488 | return; | |
1489 | } | |
2da0d70d DB |
1490 | } |
1491 | } | |
bc279024 | 1492 | #endif /* HAVE_MMX */ |
e5091488 | 1493 | if (uvalpha < 2048) |
2da0d70d | 1494 | { |
ec1bca2a | 1495 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) |
2da0d70d | 1496 | }else{ |
ec1bca2a | 1497 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) |
2da0d70d | 1498 | } |
d604bab9 MN |
1499 | } |
1500 | ||
8a322796 | 1501 | //FIXME yuy2* can read up to 7 samples too much |
6ff0ad6b | 1502 | |
/* Extract the luma (Y) plane from packed YUYV422 input.
 * Copies every even source byte (the Y samples) into dst; width is in
 * luma samples.  The MMX path processes 8 output bytes per iteration by
 * masking out the odd (chroma) bytes and packing the words back down. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t" // per-word mask keeping the low (Y) byte
    "mov %0, %%"REG_a"                          \n\t" // REG_a counts from -width up toward 0
    "1:                                         \n\t"
    "movq (%1, %%"REG_a",2), %%mm0              \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "pand %%mm2, %%mm0                          \n\t" // drop the chroma bytes
    "pand %%mm2, %%mm1                          \n\t"
    "packuswb %%mm1, %%mm0                      \n\t" // 8 words -> 8 luma bytes
    "movq %%mm0, (%2, %%"REG_a")                \n\t"
    "add $8, %%"REG_a"                          \n\t"
    " js 1b                                     \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1527 | ||
/* De-interleave the chroma from packed YUYV422 input into separate U and V
 * planes (U at byte offset 1, V at offset 3 of each 4-byte Y0 U Y1 V group).
 * width is in chroma samples; src2 must alias src1 (asserted below).
 * The MMX path emits 4 U and 4 V bytes per iteration. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // per-word low-byte mask
    "mov %0, %%"REG_a"                          \n\t" // counter: -width .. 0
    "1:                                         \n\t"
    "movq (%1, %%"REG_a",4), %%mm0              \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1             \n\t"
    "psrlw $8, %%mm0                            \n\t" // keep the U/V bytes of each word
    "psrlw $8, %%mm1                            \n\t"
    "packuswb %%mm1, %%mm0                      \n\t" // mm0 = U0 V0 U1 V1 ...
    "movq %%mm0, %%mm1                          \n\t"
    "psrlw $8, %%mm0                            \n\t" // mm0 -> V samples
    "pand %%mm4, %%mm1                          \n\t" // mm1 -> U samples
    "packuswb %%mm0, %%mm0                      \n\t"
    "packuswb %%mm1, %%mm1                      \n\t"
    "movd %%mm0, (%3, %%"REG_a")                \n\t" // store V (%3 = dstV)
    "movd %%mm1, (%2, %%"REG_a")                \n\t" // store U (%2 = dstU)
    "add $4, %%"REG_a"                          \n\t"
    " js 1b                                     \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1562 | ||
4cf16bbe DB |
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma (Y) plane from packed UYVY422 input: Y samples sit at
 * the odd byte positions, so the word shift keeps them directly.
 * width is in luma samples. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "mov %0, %%"REG_a"                          \n\t" // counter: -width .. 0
    "1:                                         \n\t"
    "movq (%1, %%"REG_a",2), %%mm0              \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw $8, %%mm0                            \n\t" // keep the high (Y) byte of each word
    "psrlw $8, %%mm1                            \n\t"
    "packuswb %%mm1, %%mm0                      \n\t" // 8 words -> 8 luma bytes
    "movq %%mm0, (%2, %%"REG_a")                \n\t"
    "add $8, %%"REG_a"                          \n\t"
    " js 1b                                     \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1588 | ||
/* De-interleave the chroma from packed UYVY422 input (U at byte offset 0,
 * V at offset 2 of each 4-byte U Y0 V Y1 group) into separate U/V planes.
 * width is in chroma samples; src2 must alias src1 (asserted below). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // per-word low-byte mask
    "mov %0, %%"REG_a"                          \n\t" // counter: -width .. 0
    "1:                                         \n\t"
    "movq (%1, %%"REG_a",4), %%mm0              \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1             \n\t"
    "pand %%mm4, %%mm0                          \n\t" // keep the U/V bytes (low byte of each word)
    "pand %%mm4, %%mm1                          \n\t"
    "packuswb %%mm1, %%mm0                      \n\t" // mm0 = U0 V0 U1 V1 ...
    "movq %%mm0, %%mm1                          \n\t"
    "psrlw $8, %%mm0                            \n\t" // mm0 -> V samples
    "pand %%mm4, %%mm1                          \n\t" // mm1 -> U samples
    "packuswb %%mm0, %%mm0                      \n\t"
    "packuswb %%mm1, %%mm1                      \n\t"
    "movd %%mm0, (%3, %%"REG_a")                \n\t" // store V (%3 = dstV)
    "movd %%mm1, (%2, %%"REG_a")                \n\t" // store U (%2 = dstU)
    "add $4, %%"REG_a"                          \n\t"
    " js 1b                                     \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1623 | ||
/* Generate a per-format packed-RGB/BGR -> Y (luma) row converter.
 *   type          storage type of one pixel (uint16_t or uint32_t)
 *   shr/shg/shb   right shift applied to the pixel before masking out R/G/B
 *   maskr/g/b     component masks applied after shifting
 *   RY/GY/BY      luma coefficients, pre-shifted as needed so each product
 *                 lands on the same fixed-point scale
 *   S             final shift; (33<<((S)-1)) = (1<<((S)-1)) + (32<<((S)-1)),
 *                 i.e. 0.5 rounding plus a +16 output offset after >>S
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}

BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1644 | ||
/* Generate a pair of packed-RGB/BGR -> chroma (U/V) row converters:
 * RENAME(name) converts one pixel per output sample, RENAME(name##_half)
 * averages two horizontally adjacent pixels per output sample (for 2:1
 * horizontally subsampled chroma).
 * Parameters mirror BGR2Y, plus RU/GU/BU and RV/GV/BV chroma coefficients.
 * (257<<((S)-1)) = (1<<((S)-1)) + (256<<((S)-1)): 0.5 rounding plus a +128
 * chroma bias after >>S.  NOTE: the maska parameter is currently unused by
 * the macro body.
 *
 * The _half variant sums the two pixels component-wise without unpacking:
 * g first collects the summed pixels with R and B masked away, then
 * pix0+pix1-g isolates the R+B sums; each mask is widened to mask|(2*mask)
 * because a two-pixel sum needs one extra bit per component. */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}

BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
a0baa07a | 1684 | |
b63f641e | 1685 | #if HAVE_MMX |
/* MMX 24-bit RGB/BGR -> Y (luma) row converter.
 * srcFormat selects the coefficient order (BGR24 vs RGB24); the coefficient
 * pairs are preloaded into mm5/mm6 and the shared rounding/offset constant
 * into mm4.  The main loop consumes 4 pixels (12 bytes) and produces 4 luma
 * bytes per iteration using pmaddwd on overlapping 4-byte loads. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
        "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
        "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
        :
        );
    }else{
        __asm__ volatile(
        "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
        "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
        :
        );
    }

    __asm__ volatile(
    "movq "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t" // rounding / offset constant
    "mov %2, %%"REG_a"                         \n\t" // counter: -width .. 0
    "pxor %%mm7, %%mm7                         \n\t" // zero, for byte->word unpacking
    "1:                                        \n\t"
    PREFETCH" 64(%0)                           \n\t"
    "movd (%0), %%mm0                          \n\t" // pixels 0/1, staggered 2-byte
    "movd 2(%0), %%mm1                         \n\t" // offsets so pmaddwd pairs line up
    "movd 6(%0), %%mm2                         \n\t" // pixels 2/3
    "movd 8(%0), %%mm3                         \n\t"
    "add $12, %0                               \n\t" // advance 4 packed 24-bit pixels
    "punpcklbw %%mm7, %%mm0                    \n\t"
    "punpcklbw %%mm7, %%mm1                    \n\t"
    "punpcklbw %%mm7, %%mm2                    \n\t"
    "punpcklbw %%mm7, %%mm3                    \n\t"
    "pmaddwd %%mm5, %%mm0                      \n\t"
    "pmaddwd %%mm6, %%mm1                      \n\t"
    "pmaddwd %%mm5, %%mm2                      \n\t"
    "pmaddwd %%mm6, %%mm3                      \n\t"
    "paddd %%mm1, %%mm0                        \n\t"
    "paddd %%mm3, %%mm2                        \n\t"
    "paddd %%mm4, %%mm0                        \n\t"
    "paddd %%mm4, %%mm2                        \n\t"
    "psrad $15, %%mm0                          \n\t" // back to 8-bit scale
    "psrad $15, %%mm2                          \n\t"
    "packssdw %%mm2, %%mm0                     \n\t"
    "packuswb %%mm0, %%mm0                     \n\t"
    "movd %%mm0, (%1, %%"REG_a")               \n\t" // store 4 luma bytes
    "add $4, %%"REG_a"                         \n\t"
    " js 1b                                    \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1738 | ||
/* MMX 24-bit RGB/BGR -> chroma (U/V) row converter.
 * The U and V coefficient quads are taken from the ff_bgr24toUV table row
 * selected by srcFormat (memory operand %4; offsets 0/8 feed U, 16/24 feed
 * V, with the 24-byte entry cached in mm6).  Processes 4 pixels (12 bytes)
 * per iteration, producing 4 U and 4 V bytes. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
    "movq 24+%4, %%mm6                         \n\t" // cache the 4th coefficient quad
    "mov %3, %%"REG_a"                         \n\t" // counter: -width .. 0
    "pxor %%mm7, %%mm7                         \n\t" // zero, for byte->word unpacking
    "1:                                        \n\t"
    PREFETCH" 64(%0)                           \n\t"
    "movd (%0), %%mm0                          \n\t" // pixels 0/1 (staggered loads)
    "movd 2(%0), %%mm1                         \n\t"
    "punpcklbw %%mm7, %%mm0                    \n\t"
    "punpcklbw %%mm7, %%mm1                    \n\t"
    "movq %%mm0, %%mm2                         \n\t"
    "movq %%mm1, %%mm3                         \n\t"
    "pmaddwd %4, %%mm0                         \n\t" // U accumulation
    "pmaddwd 8+%4, %%mm1                       \n\t"
    "pmaddwd 16+%4, %%mm2                      \n\t" // V accumulation
    "pmaddwd %%mm6, %%mm3                      \n\t"
    "paddd %%mm1, %%mm0                        \n\t"
    "paddd %%mm3, %%mm2                        \n\t"

    "movd 6(%0), %%mm1                         \n\t" // pixels 2/3
    "movd 8(%0), %%mm3                         \n\t"
    "add $12, %0                               \n\t" // advance 4 packed 24-bit pixels
    "punpcklbw %%mm7, %%mm1                    \n\t"
    "punpcklbw %%mm7, %%mm3                    \n\t"
    "movq %%mm1, %%mm4                         \n\t"
    "movq %%mm3, %%mm5                         \n\t"
    "pmaddwd %4, %%mm1                         \n\t"
    "pmaddwd 8+%4, %%mm3                       \n\t"
    "pmaddwd 16+%4, %%mm4                      \n\t"
    "pmaddwd %%mm6, %%mm5                      \n\t"
    "paddd %%mm3, %%mm1                        \n\t"
    "paddd %%mm5, %%mm4                        \n\t"

    "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3  \n\t" // rounding / chroma bias
    "paddd %%mm3, %%mm0                        \n\t"
    "paddd %%mm3, %%mm2                        \n\t"
    "paddd %%mm3, %%mm1                        \n\t"
    "paddd %%mm3, %%mm4                        \n\t"
    "psrad $15, %%mm0                          \n\t"
    "psrad $15, %%mm2                          \n\t"
    "psrad $15, %%mm1                          \n\t"
    "psrad $15, %%mm4                          \n\t"
    "packssdw %%mm1, %%mm0                     \n\t"
    "packssdw %%mm4, %%mm2                     \n\t"
    "packuswb %%mm0, %%mm0                     \n\t"
    "packuswb %%mm2, %%mm2                     \n\t"
    "movd %%mm0, (%1, %%"REG_a")               \n\t" // store 4 U bytes
    "movd %%mm2, (%2, %%"REG_a")               \n\t" // store 4 V bytes
    "add $4, %%"REG_a"                         \n\t"
    " js 1b                                    \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1796 | #endif | |
1797 | ||
/* BGR24 -> Y row converter: dispatches to the MMX helper when available,
 * otherwise uses the scalar fixed-point formula (B,G,R byte order). */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        // rounding (1<<(SHIFT-1)) plus a +16 offset (32<<(SHIFT-1))
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
1814 | ||
/* BGR24 -> U/V row converter (one chroma sample per pixel): dispatches to
 * the MMX helper when available, otherwise uses the scalar fixed-point
 * formula.  src2 must alias src1 (asserted below). */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        // rounding (1<<(SHIFT-1)) plus a +128 chroma bias (256<<(SHIFT-1))
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
1833 | ||
896a22b8 | 1834 | static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused) |
2f60f629 MN |
1835 | { |
1836 | int i; | |
1837 | for (i=0; i<width; i++) | |
1838 | { | |
1839 | int b= src1[6*i + 0] + src1[6*i + 3]; | |
1840 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
1841 | int r= src1[6*i + 2] + src1[6*i + 5]; | |
1842 | ||
1843 | dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
1844 | dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
1845 | } | |
1846 | assert(src1 == src2); | |
1847 | } | |
1848 | ||
/* RGB24 -> Y row converter: same as bgr24ToY but with R,G,B byte order;
 * the MMX helper is told via PIX_FMT_RGB24 to use the swapped coefficients. */
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        // rounding (1<<(SHIFT-1)) plus a +16 offset (32<<(SHIFT-1))
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}
1865 | ||
/* RGB24 -> U/V row converter (one chroma sample per pixel); R,G,B byte
 * order.  src2 must alias src1 (asserted in both branches). */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        // rounding (1<<(SHIFT-1)) plus a +128 chroma bias (256<<(SHIFT-1))
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}
1885 | ||
896a22b8 | 1886 | static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused) |
2f60f629 MN |
1887 | { |
1888 | int i; | |
1889 | assert(src1==src2); | |
1890 | for (i=0; i<width; i++) | |
1891 | { | |
e09d7eef MN |
1892 | int r= src1[6*i + 0] + src1[6*i + 3]; |
1893 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
1894 | int b= src1[6*i + 2] + src1[6*i + 5]; | |
2f60f629 MN |
1895 | |
1896 | dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
1897 | dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
1898 | } | |
1899 | } | |
1900 | ||
1e621b18 | 1901 | |
97b93389 | 1902 | static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal) |
e28630fc | 1903 | { |
2da0d70d DB |
1904 | int i; |
1905 | for (i=0; i<width; i++) | |
1906 | { | |
1907 | int d= src[i]; | |
e28630fc | 1908 | |
2da0d70d DB |
1909 | dst[i]= pal[d] & 0xFF; |
1910 | } | |
e28630fc MN |
1911 | } |
1912 | ||
97b93389 | 1913 | static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal) |
e28630fc | 1914 | { |
2da0d70d DB |
1915 | int i; |
1916 | assert(src1 == src2); | |
1917 | for (i=0; i<width; i++) | |
1918 | { | |
1919 | int p= pal[src1[i]]; | |
1920 | ||
1921 | dstU[i]= p>>8; | |
1922 | dstV[i]= p>>16; | |
1923 | } | |
e28630fc MN |
1924 | } |
1925 | ||
896a22b8 | 1926 | static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) |
3d05e078 MN |
1927 | { |
1928 | int i, j; | |
1929 | for (i=0; i<width/8; i++){ | |
3a5ba0c3 LB |
1930 | int d= ~src[i]; |
1931 | for(j=0; j<8; j++) | |
1932 | dst[8*i+j]= ((d>>(7-j))&1)*255; | |
1933 | } | |
1934 | } | |
1935 | ||
896a22b8 | 1936 | static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) |
3a5ba0c3 LB |
1937 | { |
1938 | int i, j; | |
1939 | for (i=0; i<width/8; i++){ | |
1940 | int d= src[i]; | |
78454dfc MN |
1941 | for(j=0; j<8; j++) |
1942 | dst[8*i+j]= ((d>>(7-j))&1)*255; | |
3d05e078 MN |
1943 | } |
1944 | } | |
1945 | ||
// bilinear / bicubic scaling
/* Horizontal FIR scaling: for each of the dstW output samples, apply a
 * filterSize-tap filter to the 8-bit source starting at filterPos[i] and
 * store the result >>7 as int16 (see the C fallback for the reference
 * behavior; results are clamped to 2^15-1 there).
 * filter holds dstW*filterSize 16-bit coefficients.  The MMX paths have
 * specializations for filterSize 4 and 8 (two output samples per loop
 * iteration — counter starts at -2*dstW and the pointers are rebased so
 * the loop index can count up to 0) plus a generic inner-loop version.
 * srcW and xInc are unused here. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#if HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push %%"REG_b"                         \n\t" // REG_b is the PIC register; preserve it
#endif
        "pxor %%mm7, %%mm7                      \n\t"
        "push %%"REG_BP"                        \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP"              \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax         \n\t" // filterPos of two adjacent
        "movzwl 2(%2, %%"REG_BP"), %%ebx        \n\t" // output samples
        "movq (%1, %%"REG_BP", 4), %%mm1        \n\t" // 4 coefficients each
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd (%3, %%"REG_a"), %%mm0            \n\t" // 4 source bytes each
        "movd (%3, %%"REG_b"), %%mm2            \n\t"
        "punpcklbw %%mm7, %%mm0                 \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t"
        "pmaddwd %%mm1, %%mm0                   \n\t"
        "pmaddwd %%mm2, %%mm3                   \n\t"
        "movq %%mm0, %%mm4                      \n\t" // horizontal add of the two
        "punpckldq %%mm3, %%mm0                 \n\t" // partial sums per sample
        "punpckhdq %%mm3, %%mm4                 \n\t"
        "paddd %%mm4, %%mm0                     \n\t"
        "psrad $7, %%mm0                        \n\t"
        "packssdw %%mm0, %%mm0                  \n\t"
        "movd %%mm0, (%4, %%"REG_BP")           \n\t" // store 2 int16 results
        "add $4, %%"REG_BP"                     \n\t"
        " jnc 1b                                \n\t"

        "pop %%"REG_BP"                         \n\t"
#if defined(PIC)
        "pop %%"REG_b"                          \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push %%"REG_b"                         \n\t"
#endif
        "pxor %%mm7, %%mm7                      \n\t"
        "push %%"REG_BP"                        \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP"              \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax         \n\t" // filterPos of two samples
        "movzwl 2(%2, %%"REG_BP"), %%ebx        \n\t"
        "movq (%1, %%"REG_BP", 8), %%mm1        \n\t" // taps 0-3
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd (%3, %%"REG_a"), %%mm0            \n\t"
        "movd (%3, %%"REG_b"), %%mm2            \n\t"
        "punpcklbw %%mm7, %%mm0                 \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t"
        "pmaddwd %%mm1, %%mm0                   \n\t"
        "pmaddwd %%mm2, %%mm3                   \n\t"

        "movq 8(%1, %%"REG_BP", 8), %%mm1       \n\t" // taps 4-7
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd 4(%3, %%"REG_a"), %%mm4           \n\t"
        "movd 4(%3, %%"REG_b"), %%mm2           \n\t"
        "punpcklbw %%mm7, %%mm4                 \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t"
        "pmaddwd %%mm1, %%mm4                   \n\t"
        "pmaddwd %%mm2, %%mm5                   \n\t"
        "paddd %%mm4, %%mm0                     \n\t"
        "paddd %%mm5, %%mm3                     \n\t"
        "movq %%mm0, %%mm4                      \n\t" // horizontal add per sample
        "punpckldq %%mm3, %%mm0                 \n\t"
        "punpckhdq %%mm3, %%mm4                 \n\t"
        "paddd %%mm4, %%mm0                     \n\t"
        "psrad $7, %%mm0                        \n\t"
        "packssdw %%mm0, %%mm0                  \n\t"
        "movd %%mm0, (%4, %%"REG_BP")           \n\t" // store 2 int16 results
        "add $4, %%"REG_BP"                     \n\t"
        " jnc 1b                                \n\t"

        "pop %%"REG_BP"                         \n\t"
#if defined(PIC)
        "pop %%"REG_b"                          \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        // generic filterSize: inner loop (label 2) walks the taps 4 at a
        // time for two output samples in parallel, stopping at 'offset'
        uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov %2, %%"REG_c"                      \n\t"
        "movzwl (%%"REG_c", %0), %%eax          \n\t" // filterPos of two samples
        "movzwl 2(%%"REG_c", %0), %%edx         \n\t"
        "mov %5, %%"REG_c"                      \n\t" // REG_c walks the source taps
        "pxor %%mm4, %%mm4                      \n\t" // accumulators for the two samples
        "pxor %%mm5, %%mm5                      \n\t"
        "2:                                     \n\t"
        "movq (%1), %%mm1                       \n\t" // coefficients, sample 0
        "movq (%1, %6), %%mm3                   \n\t" // coefficients, sample 1 (%6 = filterSize*2 bytes)
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw %%mm7, %%mm0                 \n\t"
        "punpcklbw %%mm7, %%mm2                 \n\t"
        "pmaddwd %%mm1, %%mm0                   \n\t"
        "pmaddwd %%mm2, %%mm3                   \n\t"
        "paddd %%mm3, %%mm5                     \n\t"
        "paddd %%mm0, %%mm4                     \n\t"
        "add $8, %1                             \n\t"
        "add $4, %%"REG_c"                      \n\t"
        "cmp %4, %%"REG_c"                      \n\t"
        " jb 2b                                 \n\t"
        "add %6, %1                             \n\t" // skip sample 1's coefficient row
        "movq %%mm4, %%mm0                      \n\t" // horizontal add per sample
        "punpckldq %%mm5, %%mm4                 \n\t"
        "punpckhdq %%mm5, %%mm0                 \n\t"
        "paddd %%mm0, %%mm4                     \n\t"
        "psrad $7, %%mm4                        \n\t"
        "packssdw %%mm4, %%mm4                  \n\t"
        "mov %3, %%"REG_a"                      \n\t"
        "movd %%mm4, (%%"REG_a", %0)            \n\t" // store 2 int16 results
        "add $4, %0                             \n\t"
        " jnc 1b                                \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" ((x86_reg)filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    // reference C implementation
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2ff198c1 | 2128 | // *** horizontal scale Y line to temp buffer |
6bc0c792 | 2129 | static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, |
2da0d70d DB |
2130 | int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
2131 | int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | |
2132 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
e48a79c9 | 2133 | int32_t *mmx2FilterPos, uint32_t *pal) |
077ea8a7 | 2134 | { |
2da0d70d | 2135 | if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE) |
1e621b18 | 2136 | { |
896a22b8 | 2137 | RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2138 | src= formatConvBuffer; |
1e621b18 | 2139 | } |
2da0d70d | 2140 | else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE) |
7322a67c | 2141 | { |
896a22b8 | 2142 | RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2143 | src= formatConvBuffer; |
7322a67c | 2144 | } |
2da0d70d | 2145 | else if (srcFormat==PIX_FMT_RGB32) |
1e621b18 | 2146 | { |
896a22b8 | 2147 | RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2148 | src= formatConvBuffer; |
1e621b18 | 2149 | } |
9990e426 MN |
2150 | else if (srcFormat==PIX_FMT_RGB32_1) |
2151 | { | |
896a22b8 | 2152 | RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); |
9990e426 MN |
2153 | src= formatConvBuffer; |
2154 | } | |
2da0d70d | 2155 | else if (srcFormat==PIX_FMT_BGR24) |
1e621b18 | 2156 | { |
896a22b8 | 2157 | RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2158 | src= formatConvBuffer; |
1e621b18 | 2159 | } |
2da0d70d | 2160 | else if (srcFormat==PIX_FMT_BGR565) |
6af250ea | 2161 | { |
896a22b8 | 2162 | RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2163 | src= formatConvBuffer; |
6af250ea | 2164 | } |
2da0d70d | 2165 | else if (srcFormat==PIX_FMT_BGR555) |
b72034dd | 2166 | { |
896a22b8 | 2167 | RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2168 | src= formatConvBuffer; |
b72034dd | 2169 | } |
2da0d70d | 2170 | else if (srcFormat==PIX_FMT_BGR32) |
a861d4d7 | 2171 | { |
896a22b8 | 2172 | RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2173 | src= formatConvBuffer; |
a861d4d7 | 2174 | } |
9990e426 MN |
2175 | else if (srcFormat==PIX_FMT_BGR32_1) |
2176 | { | |
896a22b8 | 2177 | RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal); |
9990e426 MN |
2178 | src= formatConvBuffer; |
2179 | } | |
2da0d70d | 2180 | else if (srcFormat==PIX_FMT_RGB24) |
a861d4d7 | 2181 | { |
896a22b8 | 2182 | RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2183 | src= formatConvBuffer; |
a861d4d7 | 2184 | } |
2da0d70d | 2185 | else if (srcFormat==PIX_FMT_RGB565) |
a43fb6b3 | 2186 | { |
896a22b8 | 2187 | RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2188 | src= formatConvBuffer; |
a43fb6b3 | 2189 | } |
2da0d70d | 2190 | else if (srcFormat==PIX_FMT_RGB555) |
a43fb6b3 | 2191 | { |
896a22b8 | 2192 | RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2193 | src= formatConvBuffer; |
a43fb6b3 | 2194 | } |
2da0d70d | 2195 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
e28630fc | 2196 | { |
e48a79c9 | 2197 | RENAME(palToY)(formatConvBuffer, src, srcW, pal); |
2da0d70d | 2198 | src= formatConvBuffer; |
e28630fc | 2199 | } |
3a5ba0c3 LB |
2200 | else if (srcFormat==PIX_FMT_MONOBLACK) |
2201 | { | |
896a22b8 | 2202 | RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal); |
3a5ba0c3 LB |
2203 | src= formatConvBuffer; |
2204 | } | |
2205 | else if (srcFormat==PIX_FMT_MONOWHITE) | |
3d05e078 | 2206 | { |
896a22b8 | 2207 | RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal); |
3d05e078 MN |
2208 | src= formatConvBuffer; |
2209 | } | |
1e621b18 | 2210 | |
b63f641e | 2211 | #if HAVE_MMX |
8a322796 | 2212 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). |
2da0d70d | 2213 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
e3d2500f | 2214 | #else |
2da0d70d | 2215 | if (!(flags&SWS_FAST_BILINEAR)) |
e3d2500f | 2216 | #endif |
077ea8a7 | 2217 | { |
2da0d70d | 2218 | RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); |
077ea8a7 | 2219 | } |
8a322796 | 2220 | else // fast bilinear upscale / crap downscale |
077ea8a7 | 2221 | { |
57f9a560 | 2222 | #if ARCH_X86 && CONFIG_GPL |
b63f641e | 2223 | #if HAVE_MMX2 |
2da0d70d | 2224 | int i; |
83c89c78 | 2225 | #if defined(PIC) |
2da0d70d | 2226 | uint64_t ebxsave __attribute__((aligned(8))); |
83c89c78 | 2227 | #endif |
2da0d70d DB |
2228 | if (canMMX2BeUsed) |
2229 | { | |
7ad6469e | 2230 | __asm__ volatile( |
83c89c78 | 2231 | #if defined(PIC) |
2da0d70d DB |
2232 | "mov %%"REG_b", %5 \n\t" |
2233 | #endif | |
2234 | "pxor %%mm7, %%mm7 \n\t" | |
2235 | "mov %0, %%"REG_c" \n\t" | |
2236 | "mov %1, %%"REG_D" \n\t" | |
2237 | "mov %2, %%"REG_d" \n\t" | |
2238 | "mov %3, %%"REG_b" \n\t" | |
2239 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2240 | PREFETCH" (%%"REG_c") \n\t" | |
2241 | PREFETCH" 32(%%"REG_c") \n\t" | |
2242 | PREFETCH" 64(%%"REG_c") \n\t" | |
99cefd0b | 2243 | |
b63f641e | 2244 | #if ARCH_X86_64 |
6d606c4f AJ |
2245 | |
2246 | #define FUNNY_Y_CODE \ | |
2da0d70d DB |
2247 | "movl (%%"REG_b"), %%esi \n\t"\ |
2248 | "call *%4 \n\t"\ | |
2249 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |
2250 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2251 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2252 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
6d606c4f AJ |
2253 | |
2254 | #else | |
2255 | ||
2ff198c1 | 2256 | #define FUNNY_Y_CODE \ |
2da0d70d DB |
2257 | "movl (%%"REG_b"), %%esi \n\t"\ |
2258 | "call *%4 \n\t"\ | |
2259 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |
2260 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2261 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
99cefd0b | 2262 | |
bc279024 | 2263 | #endif /* ARCH_X86_64 */ |
6d606c4f | 2264 | |
2ff198c1 MN |
2265 | FUNNY_Y_CODE |
2266 | FUNNY_Y_CODE | |
2267 | FUNNY_Y_CODE | |
2268 | FUNNY_Y_CODE | |
2269 | FUNNY_Y_CODE | |
2270 | FUNNY_Y_CODE | |
2271 | FUNNY_Y_CODE | |
2272 | FUNNY_Y_CODE | |
2273 | ||
83c89c78 | 2274 | #if defined(PIC) |
2da0d70d | 2275 | "mov %5, %%"REG_b" \n\t" |
83c89c78 | 2276 | #endif |
2da0d70d DB |
2277 | :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2278 | "m" (funnyYCode) | |
83c89c78 | 2279 | #if defined(PIC) |
2da0d70d | 2280 | ,"m" (ebxsave) |
83c89c78 | 2281 | #endif |
2da0d70d | 2282 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
83c89c78 | 2283 | #if !defined(PIC) |
2da0d70d DB |
2284 | ,"%"REG_b |
2285 | #endif | |
2286 | ); | |
2287 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | |
2288 | } | |
2289 | else | |
2290 | { | |
bc279024 | 2291 | #endif /* HAVE_MMX2 */ |
d0ce212a | 2292 | x86_reg xInc_shr16 = xInc >> 16; |
2da0d70d DB |
2293 | uint16_t xInc_mask = xInc & 0xffff; |
2294 | //NO MMX just normal asm ... | |
7ad6469e | 2295 | __asm__ volatile( |
2da0d70d DB |
2296 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
2297 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | |
2298 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2299 | ASMALIGN(4) | |
2300 | "1: \n\t" | |
2301 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | |
2302 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2303 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2304 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2305 | "shll $16, %%edi \n\t" | |
2306 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2307 | "mov %1, %%"REG_D" \n\t" | |
2308 | "shrl $9, %%esi \n\t" | |
2309 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | |
2310 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2311 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2312 | ||
2313 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | |
2314 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2315 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2316 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2317 | "shll $16, %%edi \n\t" | |
2318 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2319 | "mov %1, %%"REG_D" \n\t" | |
2320 | "shrl $9, %%esi \n\t" | |
2321 | "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t" | |
2322 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2323 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2324 | ||
2325 | ||
2326 | "add $2, %%"REG_a" \n\t" | |
2327 | "cmp %2, %%"REG_a" \n\t" | |
2328 | " jb 1b \n\t" | |
2329 | ||
2330 | ||
2331 | :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | |
2332 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | |
2333 | ); | |
b63f641e | 2334 | #if HAVE_MMX2 |
2da0d70d | 2335 | } //if MMX2 can't be used |
2ff198c1 MN |
2336 | #endif |
2337 | #else | |
2da0d70d DB |
2338 | int i; |
2339 | unsigned int xpos=0; | |
2340 | for (i=0;i<dstWidth;i++) | |
2341 | { | |
2342 | register unsigned int xx=xpos>>16; | |
2343 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2344 | dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2345 | xpos+=xInc; | |
2346 | } | |
b63f641e | 2347 | #endif /* ARCH_X86 */ |
077ea8a7 | 2348 | } |
6bc0c792 MN |
2349 | |
2350 | if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ | |
2351 | int i; | |
2352 | //FIXME all pal and rgb srcFormats could do this convertion as well | |
2353 | //FIXME all scalers more complex than bilinear could do half of this transform | |
2354 | if(c->srcRange){ | |
2355 | for (i=0; i<dstWidth; i++) | |
2356 | dst[i]= (dst[i]*14071 + 33561947)>>14; | |
2357 | }else{ | |
2358 | for (i=0; i<dstWidth; i++) | |
aa13b0fc | 2359 | dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14; |
6bc0c792 MN |
2360 | } |
2361 | } | |
2ff198c1 MN |
2362 | } |
2363 | ||
6bc0c792 | 2364 | inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, |
2da0d70d DB |
2365 | int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
2366 | int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | |
2367 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
e48a79c9 | 2368 | int32_t *mmx2FilterPos, uint32_t *pal) |
2ff198c1 | 2369 | { |
2da0d70d | 2370 | if (srcFormat==PIX_FMT_YUYV422) |
1e621b18 | 2371 | { |
896a22b8 | 2372 | RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2373 | src1= formatConvBuffer; |
8b2fce0d | 2374 | src2= formatConvBuffer+VOFW; |
1e621b18 | 2375 | } |
2da0d70d | 2376 | else if (srcFormat==PIX_FMT_UYVY422) |
7322a67c | 2377 | { |
896a22b8 | 2378 | RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2379 | src1= formatConvBuffer; |
8b2fce0d | 2380 | src2= formatConvBuffer+VOFW; |
7322a67c | 2381 | } |
2da0d70d | 2382 | else if (srcFormat==PIX_FMT_RGB32) |
1e621b18 | 2383 | { |
2f60f629 | 2384 | if(c->chrSrcHSubSample) |
896a22b8 | 2385 | RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2386 | else |
896a22b8 | 2387 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2388 | src1= formatConvBuffer; |
8b2fce0d | 2389 | src2= formatConvBuffer+VOFW; |
1e621b18 | 2390 | } |
9990e426 MN |
2391 | else if (srcFormat==PIX_FMT_RGB32_1) |
2392 | { | |
2f60f629 | 2393 | if(c->chrSrcHSubSample) |
896a22b8 | 2394 | RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal); |
2f60f629 | 2395 | else |
896a22b8 | 2396 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal); |
9990e426 MN |
2397 | src1= formatConvBuffer; |
2398 | src2= formatConvBuffer+VOFW; | |
2399 | } | |
2da0d70d | 2400 | else if (srcFormat==PIX_FMT_BGR24) |
1e621b18 | 2401 | { |
2f60f629 | 2402 | if(c->chrSrcHSubSample) |
896a22b8 | 2403 | RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2404 | else |
896a22b8 | 2405 | RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2406 | src1= formatConvBuffer; |
8b2fce0d | 2407 | src2= formatConvBuffer+VOFW; |
1e621b18 | 2408 | } |
2da0d70d | 2409 | else if (srcFormat==PIX_FMT_BGR565) |
6af250ea | 2410 | { |
2f60f629 | 2411 | if(c->chrSrcHSubSample) |
896a22b8 | 2412 | RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2413 | else |
896a22b8 | 2414 | RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2415 | src1= formatConvBuffer; |
8b2fce0d | 2416 | src2= formatConvBuffer+VOFW; |
6af250ea | 2417 | } |
2da0d70d | 2418 | else if (srcFormat==PIX_FMT_BGR555) |
b72034dd | 2419 | { |
2f60f629 | 2420 | if(c->chrSrcHSubSample) |
896a22b8 | 2421 | RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2422 | else |
896a22b8 | 2423 | RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2424 | src1= formatConvBuffer; |
8b2fce0d | 2425 | src2= formatConvBuffer+VOFW; |
b72034dd | 2426 | } |
2da0d70d | 2427 | else if (srcFormat==PIX_FMT_BGR32) |
a861d4d7 | 2428 | { |
2f60f629 | 2429 | if(c->chrSrcHSubSample) |
896a22b8 | 2430 | RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2431 | else |
896a22b8 | 2432 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2433 | src1= formatConvBuffer; |
8b2fce0d | 2434 | src2= formatConvBuffer+VOFW; |
a861d4d7 | 2435 | } |
9990e426 MN |
2436 | else if (srcFormat==PIX_FMT_BGR32_1) |
2437 | { | |
2f60f629 | 2438 | if(c->chrSrcHSubSample) |
896a22b8 | 2439 | RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal); |
2f60f629 | 2440 | else |
896a22b8 | 2441 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal); |
9990e426 MN |
2442 | src1= formatConvBuffer; |
2443 | src2= formatConvBuffer+VOFW; | |
2444 | } | |
2da0d70d | 2445 | else if (srcFormat==PIX_FMT_RGB24) |
a861d4d7 | 2446 | { |
2f60f629 | 2447 | if(c->chrSrcHSubSample) |
896a22b8 | 2448 | RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2449 | else |
896a22b8 | 2450 | RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2451 | src1= formatConvBuffer; |
8b2fce0d | 2452 | src2= formatConvBuffer+VOFW; |
a861d4d7 | 2453 | } |
2da0d70d | 2454 | else if (srcFormat==PIX_FMT_RGB565) |
a43fb6b3 | 2455 | { |
2f60f629 | 2456 | if(c->chrSrcHSubSample) |
896a22b8 | 2457 | RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2458 | else |
896a22b8 | 2459 | RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2460 | src1= formatConvBuffer; |
8b2fce0d | 2461 | src2= formatConvBuffer+VOFW; |
a43fb6b3 | 2462 | } |
2da0d70d | 2463 | else if (srcFormat==PIX_FMT_RGB555) |
a43fb6b3 | 2464 | { |
2f60f629 | 2465 | if(c->chrSrcHSubSample) |
896a22b8 | 2466 | RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2f60f629 | 2467 | else |
896a22b8 | 2468 | RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2469 | src1= formatConvBuffer; |
8b2fce0d | 2470 | src2= formatConvBuffer+VOFW; |
a43fb6b3 | 2471 | } |
4bb9adcf | 2472 | else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE) |
6ff0ad6b | 2473 | { |
2da0d70d | 2474 | return; |
6ff0ad6b | 2475 | } |
2da0d70d | 2476 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
e28630fc | 2477 | { |
e48a79c9 | 2478 | RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); |
2da0d70d | 2479 | src1= formatConvBuffer; |
8b2fce0d | 2480 | src2= formatConvBuffer+VOFW; |
e28630fc | 2481 | } |
1e621b18 | 2482 | |
b63f641e | 2483 | #if HAVE_MMX |
8a322796 | 2484 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). |
2da0d70d | 2485 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
e3d2500f | 2486 | #else |
2da0d70d | 2487 | if (!(flags&SWS_FAST_BILINEAR)) |
e3d2500f | 2488 | #endif |
077ea8a7 | 2489 | { |
2da0d70d | 2490 | RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
8b2fce0d | 2491 | RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
077ea8a7 | 2492 | } |
8a322796 | 2493 | else // fast bilinear upscale / crap downscale |
077ea8a7 | 2494 | { |
57f9a560 | 2495 | #if ARCH_X86 && CONFIG_GPL |
b63f641e | 2496 | #if HAVE_MMX2 |
2da0d70d | 2497 | int i; |
83c89c78 | 2498 | #if defined(PIC) |
2da0d70d | 2499 | uint64_t ebxsave __attribute__((aligned(8))); |
83c89c78 | 2500 | #endif |
2da0d70d DB |
2501 | if (canMMX2BeUsed) |
2502 | { | |
7ad6469e | 2503 | __asm__ volatile( |
83c89c78 | 2504 | #if defined(PIC) |
2da0d70d DB |
2505 | "mov %%"REG_b", %6 \n\t" |
2506 | #endif | |
2507 | "pxor %%mm7, %%mm7 \n\t" | |
2508 | "mov %0, %%"REG_c" \n\t" | |
2509 | "mov %1, %%"REG_D" \n\t" | |
2510 | "mov %2, %%"REG_d" \n\t" | |
2511 | "mov %3, %%"REG_b" \n\t" | |
2512 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2513 | PREFETCH" (%%"REG_c") \n\t" | |
2514 | PREFETCH" 32(%%"REG_c") \n\t" | |
2515 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 | 2516 | |
b63f641e | 2517 | #if ARCH_X86_64 |
6d606c4f AJ |
2518 | |
2519 | #define FUNNY_UV_CODE \ | |
2da0d70d DB |
2520 | "movl (%%"REG_b"), %%esi \n\t"\ |
2521 | "call *%4 \n\t"\ | |
2522 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |
2523 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2524 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2525 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
6d606c4f AJ |
2526 | |
2527 | #else | |
2528 | ||
b7dc6f66 | 2529 | #define FUNNY_UV_CODE \ |
2da0d70d DB |
2530 | "movl (%%"REG_b"), %%esi \n\t"\ |
2531 | "call *%4 \n\t"\ | |
2532 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |
2533 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2534 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
b7dc6f66 | 2535 | |
bc279024 | 2536 | #endif /* ARCH_X86_64 */ |
6d606c4f | 2537 | |
b7dc6f66 MN |
2538 | FUNNY_UV_CODE |
2539 | FUNNY_UV_CODE | |
2540 | FUNNY_UV_CODE | |
2541 | FUNNY_UV_CODE | |
2da0d70d DB |
2542 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
2543 | "mov %5, %%"REG_c" \n\t" // src | |
2544 | "mov %1, %%"REG_D" \n\t" // buf1 | |
8b2fce0d | 2545 | "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t" |
2da0d70d DB |
2546 | PREFETCH" (%%"REG_c") \n\t" |
2547 | PREFETCH" 32(%%"REG_c") \n\t" | |
2548 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 MN |
2549 | |
2550 | FUNNY_UV_CODE | |
2551 | FUNNY_UV_CODE | |
2552 | FUNNY_UV_CODE | |
2553 | FUNNY_UV_CODE | |
2554 | ||
83c89c78 | 2555 | #if defined(PIC) |
2da0d70d | 2556 | "mov %6, %%"REG_b" \n\t" |
83c89c78 | 2557 | #endif |
2da0d70d DB |
2558 | :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2559 | "m" (funnyUVCode), "m" (src2) | |
83c89c78 | 2560 | #if defined(PIC) |
2da0d70d | 2561 | ,"m" (ebxsave) |
83c89c78 | 2562 | #endif |
2da0d70d | 2563 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
83c89c78 | 2564 | #if !defined(PIC) |
2da0d70d DB |
2565 | ,"%"REG_b |
2566 | #endif | |
2567 | ); | |
2568 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
2569 | { | |
2570 | //printf("%d %d %d\n", dstWidth, i, srcW); | |
2571 | dst[i] = src1[srcW-1]*128; | |
8b2fce0d | 2572 | dst[i+VOFW] = src2[srcW-1]*128; |
2da0d70d DB |
2573 | } |
2574 | } | |
2575 | else | |
2576 | { | |
bc279024 | 2577 | #endif /* HAVE_MMX2 */ |
d0ce212a | 2578 | x86_reg xInc_shr16 = (x86_reg) (xInc >> 16); |
2da0d70d | 2579 | uint16_t xInc_mask = xInc & 0xffff; |
7ad6469e | 2580 | __asm__ volatile( |
2da0d70d DB |
2581 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
2582 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | |
2583 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2584 | ASMALIGN(4) | |
2585 | "1: \n\t" | |
2586 | "mov %0, %%"REG_S" \n\t" | |
2587 | "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] | |
2588 | "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2589 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2590 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2591 | "shll $16, %%edi \n\t" | |
2592 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2593 | "mov %1, %%"REG_D" \n\t" | |
2594 | "shrl $9, %%esi \n\t" | |
2595 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | |
2596 | ||
2597 | "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] | |
2598 | "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2599 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2600 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2601 | "shll $16, %%edi \n\t" | |
2602 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2603 | "mov %1, %%"REG_D" \n\t" | |
2604 | "shrl $9, %%esi \n\t" | |
8b2fce0d | 2605 | "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t" |
2da0d70d DB |
2606 | |
2607 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2608 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2609 | "add $1, %%"REG_a" \n\t" | |
2610 | "cmp %2, %%"REG_a" \n\t" | |
2611 | " jb 1b \n\t" | |
2ff198c1 | 2612 | |
8a322796 DB |
2613 | /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, |
2614 | which is needed to support GCC 4.0. */ | |
b63f641e | 2615 | #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) |
e29c3f93 | 2616 | :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
dc77ef7f | 2617 | #else |
e29c3f93 | 2618 | :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
dc77ef7f | 2619 | #endif |
2da0d70d DB |
2620 | "r" (src2) |
2621 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | |
2622 | ); | |
b63f641e | 2623 | #if HAVE_MMX2 |
2da0d70d | 2624 | } //if MMX2 can't be used |
2ff198c1 MN |
2625 | #endif |
2626 | #else | |
2da0d70d DB |
2627 | int i; |
2628 | unsigned int xpos=0; | |
2629 | for (i=0;i<dstWidth;i++) | |
2630 | { | |
2631 | register unsigned int xx=xpos>>16; | |
2632 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2633 | dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
8b2fce0d | 2634 | dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); |
2da0d70d DB |
2635 | /* slower |
2636 | dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
8b2fce0d | 2637 | dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; |
2da0d70d DB |
2638 | */ |
2639 | xpos+=xInc; | |
2640 | } | |
b63f641e | 2641 | #endif /* ARCH_X86 */ |
2da0d70d | 2642 | } |
6bc0c792 MN |
2643 | if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ |
2644 | int i; | |
2645 | //FIXME all pal and rgb srcFormats could do this convertion as well | |
2646 | //FIXME all scalers more complex than bilinear could do half of this transform | |
2647 | if(c->srcRange){ | |
2648 | for (i=0; i<dstWidth; i++){ | |
2649 | dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469 | |
2650 | dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469 | |
2651 | } | |
2652 | }else{ | |
2653 | for (i=0; i<dstWidth; i++){ | |
aa13b0fc MN |
2654 | dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264 |
2655 | dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 | |
6bc0c792 MN |
2656 | } |
2657 | } | |
2658 | } | |
077ea8a7 MN |
2659 | } |
2660 | ||
3e499f53 | 2661 | static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2da0d70d DB |
2662 | int srcSliceH, uint8_t* dst[], int dstStride[]){ |
2663 | ||
2664 | /* load a few things into local vars to make the code more readable? and faster */ | |
2665 | const int srcW= c->srcW; | |
2666 | const int dstW= c->dstW; | |
2667 | const int dstH= c->dstH; | |
2668 | const int chrDstW= c->chrDstW; | |
2669 | const int chrSrcW= c->chrSrcW; | |
2670 | const int lumXInc= c->lumXInc; | |
2671 | const int chrXInc= c->chrXInc; | |
2672 | const int dstFormat= c->dstFormat; | |
2673 | const int srcFormat= c->srcFormat; | |
2674 | const int flags= c->flags; | |
2675 | const int canMMX2BeUsed= c->canMMX2BeUsed; | |
2676 | int16_t *vLumFilterPos= c->vLumFilterPos; | |
2677 | int16_t *vChrFilterPos= c->vChrFilterPos; | |
2678 | int16_t *hLumFilterPos= c->hLumFilterPos; | |
2679 | int16_t *hChrFilterPos= c->hChrFilterPos; | |
2680 | int16_t *vLumFilter= c->vLumFilter; | |
2681 | int16_t *vChrFilter= c->vChrFilter; | |
2682 | int16_t *hLumFilter= c->hLumFilter; | |
2683 | int16_t *hChrFilter= c->hChrFilter; | |
2684 | int32_t *lumMmxFilter= c->lumMmxFilter; | |
2685 | int32_t *chrMmxFilter= c->chrMmxFilter; | |
2686 | const int vLumFilterSize= c->vLumFilterSize; | |
2687 | const int vChrFilterSize= c->vChrFilterSize; | |
2688 | const int hLumFilterSize= c->hLumFilterSize; | |
2689 | const int hChrFilterSize= c->hChrFilterSize; | |
2690 | int16_t **lumPixBuf= c->lumPixBuf; | |
2691 | int16_t **chrPixBuf= c->chrPixBuf; | |
2692 | const int vLumBufSize= c->vLumBufSize; | |
2693 | const int vChrBufSize= c->vChrBufSize; | |
2694 | uint8_t *funnyYCode= c->funnyYCode; | |
2695 | uint8_t *funnyUVCode= c->funnyUVCode; | |
2696 | uint8_t *formatConvBuffer= c->formatConvBuffer; | |
2697 | const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | |
2698 | const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
2699 | int lastDstY; | |
e150ef8d | 2700 | uint32_t *pal=c->pal_yuv; |
2da0d70d | 2701 | |
8a322796 | 2702 | /* vars which will change and which we need to store back in the context */ |
2da0d70d DB |
2703 | int dstY= c->dstY; |
2704 | int lumBufIndex= c->lumBufIndex; | |
2705 | int chrBufIndex= c->chrBufIndex; | |
2706 | int lastInLumBuf= c->lastInLumBuf; | |
2707 | int lastInChrBuf= c->lastInChrBuf; | |
2708 | ||
2709 | if (isPacked(c->srcFormat)){ | |
2da0d70d DB |
2710 | src[0]= |
2711 | src[1]= | |
2712 | src[2]= src[0]; | |
2713 | srcStride[0]= | |
2714 | srcStride[1]= | |
2715 | srcStride[2]= srcStride[0]; | |
2716 | } | |
2717 | srcStride[1]<<= c->vChrDrop; | |
2718 | srcStride[2]<<= c->vChrDrop; | |
2719 | ||
2720 | //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | |
2721 | // (int)dst[0], (int)dst[1], (int)dst[2]); | |
c7a810cc MN |
2722 | |
2723 | #if 0 //self test FIXME move to a vfilter or something | |
2da0d70d DB |
2724 | { |
2725 | static volatile int i=0; | |
2726 | i++; | |
2727 | if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) | |
2728 | selfTest(src, srcStride, c->srcW, c->srcH); | |
2729 | i--; | |
2730 | } | |
c7a810cc | 2731 | #endif |
37079906 | 2732 | |
2da0d70d DB |
2733 | //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], |
2734 | //dstStride[0],dstStride[1],dstStride[2]); | |
2735 | ||
2736 | if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2737 | { | |
6683a37f DP |
2738 | static int warnedAlready=0; //FIXME move this into the context perhaps |
2739 | if (flags & SWS_PRINT_INFO && !warnedAlready) | |
2da0d70d | 2740 | { |
4b0c30b7 | 2741 | av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" |
8a322796 | 2742 | " ->cannot do aligned memory accesses anymore\n"); |
6683a37f | 2743 | warnedAlready=1; |
2da0d70d DB |
2744 | } |
2745 | } | |
2746 | ||
8a322796 DB |
2747 | /* Note the user might start scaling the picture in the middle so this |
2748 | will not get executed. This is not really intended but works | |
2749 | currently, so people might do it. */ | |
2da0d70d DB |
2750 | if (srcSliceY ==0){ |
2751 | lumBufIndex=0; | |
2752 | chrBufIndex=0; | |
2753 | dstY=0; | |
2754 | lastInLumBuf= -1; | |
2755 | lastInChrBuf= -1; | |
2756 | } | |
2757 | ||
2758 | lastDstY= dstY; | |
2759 | ||
2760 | for (;dstY < dstH; dstY++){ | |
2761 | unsigned char *dest =dst[0]+dstStride[0]*dstY; | |
2762 | const int chrDstY= dstY>>c->chrDstVSubSample; | |
2763 | unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2764 | unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
2765 | ||
2766 | const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2767 | const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2768 | const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2769 | const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2770 | ||
2771 | //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | |
2772 | // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
2773 | //handle holes (FAST_BILINEAR & weird filters) | |
2774 | if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
2775 | if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
2776 | //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
fcc402b1 LB |
2777 | assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); |
2778 | assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); | |
2da0d70d DB |
2779 | |
2780 | // Do we have enough lines in this slice to output the dstY line | |
2781 | if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | |
2782 | { | |
2783 | //Do horizontal scaling | |
2784 | while(lastInLumBuf < lastLumSrcY) | |
2785 | { | |
2786 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
2787 | lumBufIndex++; | |
2788 | //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | |
fcc402b1 LB |
2789 | assert(lumBufIndex < 2*vLumBufSize); |
2790 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | |
2791 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | |
2da0d70d | 2792 | //printf("%d %d\n", lumBufIndex, vLumBufSize); |
6bc0c792 | 2793 | RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2da0d70d DB |
2794 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
2795 | funnyYCode, c->srcFormat, formatConvBuffer, | |
2796 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | |
2797 | lastInLumBuf++; | |
2798 | } | |
2799 | while(lastInChrBuf < lastChrSrcY) | |
2800 | { | |
2801 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
2802 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
2803 | chrBufIndex++; | |
fcc402b1 LB |
2804 | assert(chrBufIndex < 2*vChrBufSize); |
2805 | assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); | |
2806 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | |
2da0d70d DB |
2807 | //FIXME replace parameters through context struct (some at least) |
2808 | ||
2809 | if (!(isGray(srcFormat) || isGray(dstFormat))) | |
6bc0c792 | 2810 | RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
2da0d70d DB |
2811 | flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
2812 | funnyUVCode, c->srcFormat, formatConvBuffer, | |
2813 | c->chrMmx2Filter, c->chrMmx2FilterPos, pal); | |
2814 | lastInChrBuf++; | |
2815 | } | |
2816 | //wrap buf index around to stay inside the ring buffer | |
e5091488 BF |
2817 | if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; |
2818 | if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; | |
2da0d70d DB |
2819 | } |
2820 | else // not enough lines left in this slice -> load the rest in the buffer | |
2821 | { | |
2822 | /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | |
2823 | firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2824 | lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
2825 | vChrBufSize, vLumBufSize);*/ | |
2826 | ||
2827 | //Do horizontal scaling | |
2828 | while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2829 | { | |
2830 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
2831 | lumBufIndex++; | |
fcc402b1 LB |
2832 | assert(lumBufIndex < 2*vLumBufSize); |
2833 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | |
2834 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | |
6bc0c792 | 2835 | RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2da0d70d DB |
2836 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
2837 | funnyYCode, c->srcFormat, formatConvBuffer, | |
2838 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | |
2839 | lastInLumBuf++; | |
2840 | } | |
2841 | while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | |
2842 | { | |
2843 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
2844 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
2845 | chrBufIndex++; | |
fcc402b1 LB |
2846 | assert(chrBufIndex < 2*vChrBufSize); |
2847 | assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH); | |
2848 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | |
2da0d70d DB |
2849 | |
2850 | if (!(isGray(srcFormat) || isGray(dstFormat))) | |
6bc0c792 | 2851 | RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
2da0d70d DB |
2852 | flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
2853 | funnyUVCode, c->srcFormat, formatConvBuffer, | |
2854 | c->chrMmx2Filter, c->chrMmx2FilterPos, pal); | |
2855 | lastInChrBuf++; | |
2856 | } | |
2857 | //wrap buf index around to stay inside the ring buffer | |
e5091488 BF |
2858 | if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; |
2859 | if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; | |
2da0d70d DB |
2860 | break; //we can't output a dstY line so let's try with the next slice |
2861 | } | |
d3f41512 | 2862 | |
b63f641e | 2863 | #if HAVE_MMX |
88e2a9ae | 2864 | c->blueDither= ff_dither8[dstY&1]; |
92c7b471 | 2865 | if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555) |
88e2a9ae | 2866 | c->greenDither= ff_dither8[dstY&1]; |
92c7b471 | 2867 | else |
88e2a9ae CEH |
2868 | c->greenDither= ff_dither4[dstY&1]; |
2869 | c->redDither= ff_dither8[(dstY+1)&1]; | |
2da0d70d DB |
2870 | #endif |
2871 | if (dstY < dstH-2) | |
2872 | { | |
2873 | int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2874 | int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
b63f641e | 2875 | #if HAVE_MMX |
2da0d70d DB |
2876 | int i; |
2877 | if (flags & SWS_ACCURATE_RND){ | |
1625216e | 2878 | int s= APCK_SIZE / 8; |
2da0d70d | 2879 | for (i=0; i<vLumFilterSize; i+=2){ |
1625216e MN |
2880 | *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; |
2881 | *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; | |
2882 | lumMmxFilter[s*i+APCK_COEF/4 ]= | |
2883 | lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] | |
2da0d70d DB |
2884 | + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); |
2885 | } | |
2886 | for (i=0; i<vChrFilterSize; i+=2){ | |
1625216e MN |
2887 | *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ]; |
2888 | *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)]; | |
2889 | chrMmxFilter[s*i+APCK_COEF/4 ]= | |
2890 | chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] | |
2da0d70d | 2891 | + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); |
bca11e75 | 2892 | } |
2da0d70d DB |
2893 | }else{ |
2894 | for (i=0; i<vLumFilterSize; i++) | |
2895 | { | |
2896 | lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
2897 | lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; | |
2898 | lumMmxFilter[4*i+2]= | |
2899 | lumMmxFilter[4*i+3]= | |
2900 | ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
2901 | } | |
2902 | for (i=0; i<vChrFilterSize; i++) | |
2903 | { | |
2904 | chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
2905 | chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; | |
2906 | chrMmxFilter[4*i+2]= | |
2907 | chrMmxFilter[4*i+3]= | |
2908 | ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
2909 | } | |
2910 | } | |
6542b44e | 2911 | #endif |
2da0d70d DB |
2912 | if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ |
2913 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2914 | if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2915 | RENAME(yuv2nv12X)(c, | |
2916 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2917 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2918 | dest, uDest, dstW, chrDstW, dstFormat); | |
e3d2500f | 2919 | } |
b0880d5d | 2920 | else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like |
2da0d70d DB |
2921 | { |
2922 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2923 | if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
8a322796 | 2924 | if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12 |
2da0d70d DB |
2925 | { |
2926 | int16_t *lumBuf = lumPixBuf[0]; | |
2927 | int16_t *chrBuf= chrPixBuf[0]; | |
bf2bdde6 | 2928 | RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); |
2da0d70d DB |
2929 | } |
2930 | else //General YV12 | |
2931 | { | |
2932 | RENAME(yuv2yuvX)(c, | |
2933 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2934 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2935 | dest, uDest, vDest, dstW, chrDstW); | |
2936 | } | |
2937 | } | |
2938 | else | |
2939 | { | |
fcc402b1 LB |
2940 | assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); |
2941 | assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
8a322796 | 2942 | if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB |
2da0d70d DB |
2943 | { |
2944 | int chrAlpha= vChrFilter[2*dstY+1]; | |
f0faee4c MN |
2945 | if(flags & SWS_FULL_CHR_H_INT){ |
2946 | yuv2rgbXinC_full(c, //FIXME write a packed1_full function | |
2947 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2948 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2949 | dest, dstW, dstY); | |
2950 | }else{ | |
14014d47 MN |
2951 | RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), |
2952 | dest, dstW, chrAlpha, dstFormat, flags, dstY); | |
f0faee4c | 2953 | } |
2da0d70d | 2954 | } |
8a322796 | 2955 | else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB |
2da0d70d DB |
2956 | { |
2957 | int lumAlpha= vLumFilter[2*dstY+1]; | |
2958 | int chrAlpha= vChrFilter[2*dstY+1]; | |
2959 | lumMmxFilter[2]= | |
2960 | lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; | |
2961 | chrMmxFilter[2]= | |
2962 | chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; | |
f0faee4c MN |
2963 | if(flags & SWS_FULL_CHR_H_INT){ |
2964 | yuv2rgbXinC_full(c, //FIXME write a packed2_full function | |
2965 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2966 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2967 | dest, dstW, dstY); | |
2968 | }else{ | |
14014d47 MN |
2969 | RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
2970 | dest, dstW, lumAlpha, chrAlpha, dstY); | |
f0faee4c | 2971 | } |
2da0d70d | 2972 | } |
8a322796 | 2973 | else //general RGB |
2da0d70d | 2974 | { |
f0faee4c MN |
2975 | if(flags & SWS_FULL_CHR_H_INT){ |
2976 | yuv2rgbXinC_full(c, | |
2977 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2978 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2979 | dest, dstW, dstY); | |
2980 | }else{ | |
14014d47 MN |
2981 | RENAME(yuv2packedX)(c, |
2982 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2983 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2984 | dest, dstW, dstY); | |
f0faee4c | 2985 | } |
2da0d70d DB |
2986 | } |
2987 | } | |
2988 | } | |
2989 | else // hmm looks like we can't use MMX here without overwriting this array's tail | |
2990 | { | |
2991 | int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2992 | int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2993 | if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ | |
2994 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2995 | if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2996 | yuv2nv12XinC( | |
2997 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2998 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2999 | dest, uDest, dstW, chrDstW, dstFormat); | |
3000 | } | |
b0880d5d | 3001 | else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 |
2da0d70d DB |
3002 | { |
3003 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3004 | if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3005 | yuv2yuvXinC( | |
3006 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3007 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3008 | dest, uDest, vDest, dstW, chrDstW); | |
3009 | } | |
3010 | else | |
3011 | { | |
fcc402b1 LB |
3012 | assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); |
3013 | assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
f0faee4c MN |
3014 | if(flags & SWS_FULL_CHR_H_INT){ |
3015 | yuv2rgbXinC_full(c, | |
3016 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3017 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3018 | dest, dstW, dstY); | |
3019 | }else{ | |
14014d47 MN |
3020 | yuv2packedXinC(c, |
3021 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3022 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3023 | dest, dstW, dstY); | |
f0faee4c | 3024 | } |
2da0d70d DB |
3025 | } |
3026 | } | |
3027 | } | |
17f715fa | 3028 | |
b63f641e | 3029 | #if HAVE_MMX |
7ad6469e DP |
3030 | __asm__ volatile(SFENCE:::"memory"); |
3031 | __asm__ volatile(EMMS:::"memory"); | |
17f715fa | 3032 | #endif |
2da0d70d DB |
3033 | /* store changed local vars back in the context */ |
3034 | c->dstY= dstY; | |
3035 | c->lumBufIndex= lumBufIndex; | |
3036 | c->chrBufIndex= chrBufIndex; | |
3037 | c->lastInLumBuf= lastInLumBuf; | |
3038 | c->lastInChrBuf= lastInChrBuf; | |
d4e24275 | 3039 | |
2da0d70d | 3040 | return dstY - lastDstY; |
627690b5 | 3041 | } |