Commit | Line | Data |
---|---|---|
fe8054c0 | 1 | /* |
d026b45e DB |
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with FFmpeg; if not, write to the Free Software | |
b19bcbaa | 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
d026b45e | 19 | * |
8a322796 DB |
20 | * The C code (not assembly, MMX, ...) of this file can be used |
21 | * under the LGPL license. | |
d026b45e | 22 | */ |
783e9cc9 | 23 | |
6e1c66bc | 24 | #undef REAL_MOVNTQ /* discard any earlier definitions: every macro undefined here except PAVGB is unconditionally redefined below from the HAVE_* feature macros (PAVGB only with HAVE_MMX2 or HAVE_3DNOW) */ |
541c4eb9 | 25 | #undef MOVNTQ |
7d7f78b5 | 26 | #undef PAVGB |
48a05cec MN |
27 | #undef PREFETCH |
28 | #undef PREFETCHW | |
29 | #undef EMMS | |
30 | #undef SFENCE | |
31 | ||
32 | #ifdef HAVE_3DNOW /* EMMS: mnemonic string of the instruction used to clear the MMX state */ | |
8a322796 | 33 | /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */ |
48a05cec MN |
34 | #define EMMS "femms" |
35 | #else | |
36 | #define EMMS "emms" | |
37 | #endif | |
38 | ||
39 | #ifdef HAVE_3DNOW /* PREFETCH / PREFETCHW: prefetch mnemonic strings for the available instruction set */ | |
40 | #define PREFETCH "prefetch" | |
41 | #define PREFETCHW "prefetchw" | |
e5091488 | 42 | #elif defined (HAVE_MMX2) |
48a05cec MN |
43 | #define PREFETCH "prefetchnta" |
44 | #define PREFETCHW "prefetcht0" | |
45 | #else /* neither instruction set: the strings start with '#', presumably so the assembler treats the whole line as a comment */ | |
d904b5fc NP |
46 | #define PREFETCH " # nop" |
47 | #define PREFETCHW " # nop" | |
48a05cec MN |
48 | #endif |
49 | ||
50 | #ifdef HAVE_MMX2 /* SFENCE: store-fence mnemonic string; a real instruction only when HAVE_MMX2 is defined */ | |
51 | #define SFENCE "sfence" | |
52 | #else /* otherwise a string starting with '#', presumably an assembler comment */ | |
d904b5fc | 53 | #define SFENCE " # nop" |
48a05cec | 54 | #endif |
d3f41512 | 55 | |
d604bab9 MN |
56 | #ifdef HAVE_MMX2 /* PAVGB(a,b): asm text for a packed byte average with operands a, b (both are stringified) */ |
57 | #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
58 | #elif defined (HAVE_3DNOW) | |
59 | #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
60 | #endif /* PAVGB is left undefined when neither HAVE_MMX2 nor HAVE_3DNOW is defined */ | |
d3f41512 | 61 | |
d604bab9 | 62 | #ifdef HAVE_MMX2 /* REAL_MOVNTQ(a,b): asm text for a 64-bit move of a to b; movntq with HAVE_MMX2, plain movq otherwise */ |
6e1c66bc | 63 | #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
d604bab9 | 64 | #else |
6e1c66bc | 65 | #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
d604bab9 | 66 | #endif |
6e1c66bc | 67 | #define MOVNTQ(a,b) /* extra macro level so that a and b are macro-expanded before REAL_MOVNTQ stringifies them */ REAL_MOVNTQ(a,b) |
d604bab9 | 68 | |
a2faa401 RD |
69 | #ifdef HAVE_ALTIVEC |
70 | #include "swscale_altivec_template.c" | |
71 | #endif | |
72 | ||
bca11e75 | 73 | #define YSCALEYUV2YV12X(x, offset, dest, width) /* Complete asm statement. For every 8 output bytes: two word accumulators start from the rounder at VROUNDER_OFFSET(%0); the 16-byte {source pointer, coefficient} entries starting at offset(%0) are walked until a NULL source pointer is read, adding pmulhw(source words, coefficient) for each entry; the sums are shifted right by 3, packed with unsigned saturation and stored at dest + index. x and offset are string literals spliced into the addressing (x displaces every source pointer). The outer loop runs while the index is below width (unsigned compare). Label 1 is the target of both the inner jnz and the outer jb. */ \ |
2da0d70d DB |
74 | asm volatile(\ |
75 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
76 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | |
77 | "movq %%mm3, %%mm4 \n\t"\ | |
78 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
79 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
80 | ASMALIGN(4) /* FIXME Unroll? */\ | |
81 | "1: \n\t"\ | |
82 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff (bytes 8..15 of the current 16-byte list entry) */\ | |
8b2fce0d MN |
83 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData, first 4 words */\ |
84 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData, next 4 words */\ | |
2da0d70d DB |
85 | "add $16, %%"REG_d" \n\t"\ |
86 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
87 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
88 | "pmulhw %%mm0, %%mm2 \n\t"\ | |
89 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
90 | "paddw %%mm2, %%mm3 \n\t"\ | |
91 | "paddw %%mm5, %%mm4 \n\t"\ | |
92 | " jnz 1b \n\t"\ | |
93 | "psraw $3, %%mm3 \n\t"\ | |
94 | "psraw $3, %%mm4 \n\t"\ | |
95 | "packuswb %%mm4, %%mm3 \n\t"\ | |
96 | MOVNTQ(%%mm3, (%1, %%REGa))\ | |
97 | "add $8, %%"REG_a" \n\t"\ | |
98 | "cmp %2, %%"REG_a" \n\t"\ | |
99 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | |
100 | "movq %%mm3, %%mm4 \n\t"\ | |
101 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
102 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
103 | "jb 1b \n\t"\ | |
104 | :: /* %0: base address for the offset and VROUNDER_OFFSET displacements */ "r" (&c->redDither),\ | |
105 | "r" (dest), "g" (width)\ | |
106 | : "%"REG_a, "%"REG_d, "%"REG_S\ | |
107 | ); | |
bca11e75 MN |
108 | |
109 | #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) /* Complete asm statement taking the same arguments as YSCALEYUV2YV12X but accumulating in 32 bits. Each inner step reads two source pointers from the filter list at offset(%0) (at displacements 0 and APCK_PTR2), interleaves their words, multiplies with the coefficient quadword at APCK_COEF using pmaddwd and sums dwords in mm4-mm7; the list pointer advances by APCK_SIZE until a NULL pointer is read. The sums are shifted right by 16 and packed to words, the rounder at VROUNDER_OFFSET(%0) is added, and the result is shifted right by 3, packed to unsigned bytes and stored at dest + index. The outer loop runs while the index is below width (unsigned compare); label 1 serves both loops. */ \ | |
2da0d70d DB |
110 | asm volatile(\ |
111 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
112 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
113 | "pxor %%mm4, %%mm4 \n\t"\ | |
114 | "pxor %%mm5, %%mm5 \n\t"\ | |
115 | "pxor %%mm6, %%mm6 \n\t"\ | |
116 | "pxor %%mm7, %%mm7 \n\t"\ | |
117 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
118 | ASMALIGN(4) \ | |
119 | "1: \n\t"\ | |
8b2fce0d MN |
120 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData via the first pointer, first 4 words */\ |
121 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData via the first pointer, next 4 words */\ | |
1625216e | 122 | "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ |
8b2fce0d | 123 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData via the second pointer, first 4 words */\ |
2da0d70d DB |
124 | "movq %%mm0, %%mm3 \n\t"\ |
125 | "punpcklwd %%mm1, %%mm0 \n\t"\ | |
126 | "punpckhwd %%mm1, %%mm3 \n\t"\ | |
1625216e | 127 | "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
2da0d70d DB |
128 | "pmaddwd %%mm1, %%mm0 \n\t"\ |
129 | "pmaddwd %%mm1, %%mm3 \n\t"\ | |
130 | "paddd %%mm0, %%mm4 \n\t"\ | |
131 | "paddd %%mm3, %%mm5 \n\t"\ | |
8b2fce0d | 132 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData via the second pointer, next 4 words */\ |
1625216e MN |
133 | "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ |
134 | "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ | |
2da0d70d DB |
135 | "test %%"REG_S", %%"REG_S" \n\t"\ |
136 | "movq %%mm2, %%mm0 \n\t"\ | |
137 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
138 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
139 | "pmaddwd %%mm1, %%mm2 \n\t"\ | |
140 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
141 | "paddd %%mm2, %%mm6 \n\t"\ | |
142 | "paddd %%mm0, %%mm7 \n\t"\ | |
143 | " jnz 1b \n\t"\ | |
144 | "psrad $16, %%mm4 \n\t"\ | |
145 | "psrad $16, %%mm5 \n\t"\ | |
146 | "psrad $16, %%mm6 \n\t"\ | |
147 | "psrad $16, %%mm7 \n\t"\ | |
148 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | |
149 | "packssdw %%mm5, %%mm4 \n\t"\ | |
150 | "packssdw %%mm7, %%mm6 \n\t"\ | |
151 | "paddw %%mm0, %%mm4 \n\t"\ | |
152 | "paddw %%mm0, %%mm6 \n\t"\ | |
153 | "psraw $3, %%mm4 \n\t"\ | |
154 | "psraw $3, %%mm6 \n\t"\ | |
155 | "packuswb %%mm6, %%mm4 \n\t"\ | |
156 | MOVNTQ(%%mm4, (%1, %%REGa))\ | |
157 | "add $8, %%"REG_a" \n\t"\ | |
158 | "cmp %2, %%"REG_a" \n\t"\ | |
159 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
160 | "pxor %%mm4, %%mm4 \n\t"\ | |
161 | "pxor %%mm5, %%mm5 \n\t"\ | |
162 | "pxor %%mm6, %%mm6 \n\t"\ | |
163 | "pxor %%mm7, %%mm7 \n\t"\ | |
164 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
165 | "jb 1b \n\t"\ | |
166 | :: /* %0: base address for the offset and VROUNDER_OFFSET displacements */ "r" (&c->redDither),\ | |
167 | "r" (dest), "g" (width)\ | |
168 | : "%"REG_a, "%"REG_d, "%"REG_S\ | |
169 | ); | |
c1b0bfb4 MN |
170 | |
171 | #define YSCALEYUV2YV121 /* Asm text fragment only; the asm statement and its operands are supplied by the user of the macro. Starting from the index loaded from %2, each iteration reads 8 words at (%0, index, 2), shifts them right arithmetically by 7, packs them to 8 bytes with unsigned saturation, stores them at (%1, index) and adds 8 to the index; the loop repeats until that addition sets the carry flag. NOTE(review): this looks like it expects %2 to be a negative count with %0 and %1 biased to the end of the data - confirm at the call site. */ \ |
2da0d70d DB |
172 | "mov %2, %%"REG_a" \n\t"\ |
173 | ASMALIGN(4) /* FIXME Unroll? */\ | |
174 | "1: \n\t"\ | |
175 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | |
176 | "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ | |
177 | "psraw $7, %%mm0 \n\t"\ | |
178 | "psraw $7, %%mm1 \n\t"\ | |
179 | "packuswb %%mm1, %%mm0 \n\t"\ | |
180 | MOVNTQ(%%mm0, (%1, %%REGa))\ | |
181 | "add $8, %%"REG_a" \n\t"\ | |
182 | "jnc 1b \n\t" | |
c1b0bfb4 | 183 | |
bf2bdde6 MN |
184 | #define YSCALEYUV2YV121_ACCURATE /* Asm text fragment only; the asm statement and its operands are supplied by the user of the macro. mm7 is first built as 64 in every word (all ones, logical shift right by 15, shift left by 6), i.e. half of the 1 << 7 divisor. Starting from the index loaded from %2, each iteration reads 8 words at (%0, index, 2), adds mm7, shifts right arithmetically by 7, packs to 8 bytes with unsigned saturation, stores them at (%1, index) and adds 8 to the index until that addition sets the carry flag. NOTE(review): this looks like it expects %2 to be a negative count with %0 and %1 biased to the end of the data - confirm at the call site. */ \ |
185 | "mov %2, %%"REG_a" \n\t"\ | |
186 | "pcmpeqw %%mm7, %%mm7 \n\t"\ | |
187 | "psrlw $15, %%mm7 \n\t"\ | |
188 | "psllw $6, %%mm7 \n\t"\ | |
189 | ASMALIGN(4) /* FIXME Unroll? */\ | |
190 | "1: \n\t"\ | |
191 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | |
192 | "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ | |
193 | "paddw %%mm7, %%mm0 \n\t"\ | |
194 | "paddw %%mm7, %%mm1 \n\t"\ | |
195 | "psraw $7, %%mm0 \n\t"\ | |
196 | "psraw $7, %%mm1 \n\t"\ | |
197 | "packuswb %%mm1, %%mm0 \n\t"\ | |
198 | MOVNTQ(%%mm0, (%1, %%REGa))\ | |
199 | "add $8, %%"REG_a" \n\t"\ | |
200 | "jnc 1b \n\t" | |
201 | ||
c1b0bfb4 | 202 | /* |
2da0d70d DB |
203 | :: "m" (-lumFilterSize), "m" (-chrFilterSize), |
204 | "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
205 | "r" (dest), "m" (dstW), | |
206 | "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
207 | : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
c1b0bfb4 | 208 | */ |
25593e29 | 209 | #define YSCALEYUV2PACKEDX /* Opens an asm volatile statement and leaves it open: this macro ends in the middle of the asm text, so further asm text, the operand list and the closing parenthesis must follow (YSCALEYUV2PACKEDX_END provides the latter two). Label 1 is the head of the outer loop; no branch back to it is emitted here. Per outer iteration, the first label-2 loop walks the 16-byte {pointer, coefficient} list at CHR_MMX_FILTER_OFFSET(%0) until a NULL pointer is read and accumulates pmulhw products on top of the rounder into mm3 (U) and mm4 (V, read at displacement VOF from the same pointer); the second label-2 loop does the same with the list at LUM_MMX_FILTER_OFFSET(%0), accumulating into mm1 (Y1) and mm7 (Y2). mm0, mm2 and mm5 are scratch. */ \ |
2da0d70d DB |
210 | asm volatile(\ |
211 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
212 | ASMALIGN(4)\ | |
213 | "nop \n\t"\ | |
214 | "1: \n\t"\ | |
215 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | |
216 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
217 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | |
218 | "movq %%mm3, %%mm4 \n\t"\ | |
219 | ASMALIGN(4)\ | |
220 | "2: \n\t"\ | |
221 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
222 | "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | |
8b2fce0d | 223 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ |
2da0d70d DB |
224 | "add $16, %%"REG_d" \n\t"\ |
225 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
226 | "pmulhw %%mm0, %%mm2 \n\t"\ | |
227 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
228 | "paddw %%mm2, %%mm3 \n\t"\ | |
229 | "paddw %%mm5, %%mm4 \n\t"\ | |
230 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
231 | " jnz 2b \n\t"\ | |
c1b0bfb4 | 232 | /* luma: same kind of loop over the list at LUM_MMX_FILTER_OFFSET, accumulating into mm1 and mm7 */ \ |
2da0d70d DB |
233 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ |
234 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
235 | "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\ | |
236 | "movq %%mm1, %%mm7 \n\t"\ | |
237 | ASMALIGN(4)\ | |
238 | "2: \n\t"\ | |
239 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
240 | "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ | |
241 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ | |
242 | "add $16, %%"REG_d" \n\t"\ | |
243 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
244 | "pmulhw %%mm0, %%mm2 \n\t"\ | |
245 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
246 | "paddw %%mm2, %%mm1 \n\t"\ | |
247 | "paddw %%mm5, %%mm7 \n\t"\ | |
248 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
249 | " jnz 2b \n\t"\ | |
250 | ||
251 | #define YSCALEYUV2PACKEDX_END /* Operand list, clobber list and closing parenthesis for an asm statement left open by YSCALEYUV2PACKEDX or YSCALEYUV2PACKEDX_ACCURATE. It has no outputs; it expects the identifiers c, dummy, dest and dstW to be in scope where it is expanded. */ \ | |
252 | :: "r" (&c->redDither), /* %0 */ \ | |
253 | /* %1, %2, %3 */ "m" (dummy), "m" (dummy), "m" (dummy),\ | |
254 | "r" (dest), "m" (dstW) /* %4, %5 */ \ | |
255 | : "%"REG_a, "%"REG_d, "%"REG_S \ | |
256 | ); | |
8422aa88 | 257 | |
bca11e75 | 258 | #define YSCALEYUV2PACKEDX_ACCURATE /* Opens an asm volatile statement and leaves it open: this macro ends in the middle of the asm text, so further asm text, the operand list and the closing parenthesis must follow (YSCALEYUV2PACKEDX_END provides the latter two). Label 1 is the head of the outer loop; no branch back to it is emitted here. Chroma is filtered first: each label-2 step reads two pointers from the list at CHR_MMX_FILTER_OFFSET(%0) (displacements 0 and APCK_PTR2), interleaves their words and applies the coefficients at APCK_COEF with pmaddwd, summing dwords in mm4-mm7 until a NULL pointer is read; the sums are shifted right by 16, packed to words, the rounder is added and U and V are written to U_TEMP(%0) and V_TEMP(%0). Luma is then filtered the same way from LUM_MMX_FILTER_OFFSET(%0). On exit mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V. */ \ |
2da0d70d DB |
259 | asm volatile(\ |
260 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
261 | ASMALIGN(4)\ | |
262 | "nop \n\t"\ | |
263 | "1: \n\t"\ | |
264 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | |
265 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
266 | "pxor %%mm4, %%mm4 \n\t"\ | |
267 | "pxor %%mm5, %%mm5 \n\t"\ | |
268 | "pxor %%mm6, %%mm6 \n\t"\ | |
269 | "pxor %%mm7, %%mm7 \n\t"\ | |
270 | ASMALIGN(4)\ | |
271 | "2: \n\t"\ | |
272 | "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ | |
8b2fce0d | 273 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ |
1625216e | 274 | "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ |
2da0d70d DB |
275 | "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData via the second pointer */\ |
276 | "movq %%mm0, %%mm3 \n\t"\ | |
277 | "punpcklwd %%mm1, %%mm0 \n\t"\ | |
278 | "punpckhwd %%mm1, %%mm3 \n\t"\ | |
1625216e | 279 | "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ |
2da0d70d DB |
280 | "pmaddwd %%mm1, %%mm0 \n\t"\ |
281 | "pmaddwd %%mm1, %%mm3 \n\t"\ | |
282 | "paddd %%mm0, %%mm4 \n\t"\ | |
283 | "paddd %%mm3, %%mm5 \n\t"\ | |
8b2fce0d | 284 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData via the second pointer */\ |
1625216e MN |
285 | "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ |
286 | "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ | |
2da0d70d DB |
287 | "test %%"REG_S", %%"REG_S" \n\t"\ |
288 | "movq %%mm2, %%mm0 \n\t"\ | |
289 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
290 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
291 | "pmaddwd %%mm1, %%mm2 \n\t"\ | |
292 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
293 | "paddd %%mm2, %%mm6 \n\t"\ | |
294 | "paddd %%mm0, %%mm7 \n\t"\ | |
295 | " jnz 2b \n\t"\ | |
296 | "psrad $16, %%mm4 \n\t"\ | |
297 | "psrad $16, %%mm5 \n\t"\ | |
298 | "psrad $16, %%mm6 \n\t"\ | |
299 | "psrad $16, %%mm7 \n\t"\ | |
300 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | |
301 | "packssdw %%mm5, %%mm4 \n\t"\ | |
302 | "packssdw %%mm7, %%mm6 \n\t"\ | |
303 | "paddw %%mm0, %%mm4 \n\t"\ | |
304 | "paddw %%mm0, %%mm6 \n\t"\ | |
305 | "movq %%mm4, "U_TEMP"(%0) \n\t"\ | |
306 | "movq %%mm6, "V_TEMP"(%0) \n\t"\ | |
bca11e75 | 307 | /* luma: mm4-mm7 are reused below, which is why U and V were just stored to memory */ \ |
2da0d70d DB |
308 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ |
309 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
310 | "pxor %%mm1, %%mm1 \n\t"\ | |
311 | "pxor %%mm5, %%mm5 \n\t"\ | |
312 | "pxor %%mm7, %%mm7 \n\t"\ | |
313 | "pxor %%mm6, %%mm6 \n\t"\ | |
314 | ASMALIGN(4)\ | |
315 | "2: \n\t"\ | |
316 | "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ | |
317 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ | |
1625216e | 318 | "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ |
2da0d70d DB |
319 | "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData via the second pointer */\ |
320 | "movq %%mm0, %%mm3 \n\t"\ | |
321 | "punpcklwd %%mm4, %%mm0 \n\t"\ | |
322 | "punpckhwd %%mm4, %%mm3 \n\t"\ | |
1625216e | 323 | "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ |
2da0d70d DB |
324 | "pmaddwd %%mm4, %%mm0 \n\t"\ |
325 | "pmaddwd %%mm4, %%mm3 \n\t"\ | |
326 | "paddd %%mm0, %%mm1 \n\t"\ | |
327 | "paddd %%mm3, %%mm5 \n\t"\ | |
328 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData via the second pointer */\ | |
1625216e MN |
329 | "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ |
330 | "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ | |
2da0d70d DB |
331 | "test %%"REG_S", %%"REG_S" \n\t"\ |
332 | "movq %%mm2, %%mm0 \n\t"\ | |
333 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
334 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
335 | "pmaddwd %%mm4, %%mm2 \n\t"\ | |
336 | "pmaddwd %%mm4, %%mm0 \n\t"\ | |
337 | "paddd %%mm2, %%mm7 \n\t"\ | |
338 | "paddd %%mm0, %%mm6 \n\t"\ | |
339 | " jnz 2b \n\t"\ | |
340 | "psrad $16, %%mm1 \n\t"\ | |
341 | "psrad $16, %%mm5 \n\t"\ | |
342 | "psrad $16, %%mm7 \n\t"\ | |
343 | "psrad $16, %%mm6 \n\t"\ | |
344 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | |
345 | "packssdw %%mm5, %%mm1 \n\t"\ | |
346 | "packssdw %%mm6, %%mm7 \n\t"\ | |
347 | "paddw %%mm0, %%mm1 \n\t"\ | |
348 | "paddw %%mm0, %%mm7 \n\t"\ | |
349 | "movq "U_TEMP"(%0), %%mm3 \n\t"\ | |
350 | "movq "V_TEMP"(%0), %%mm4 \n\t"\ | |
bca11e75 | 351 | |
8422aa88 | 352 | #define YSCALEYUV2RGBX /* Asm text fragment converting YUV words to packed RGB bytes. Input: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V; offsets and coefficients are read at *_OFFSET and *_COEFF displacements from %0. Output: mm2 = B, mm4 = G, mm5 = R as 8 unsigned-saturated bytes each, and mm7 = 0; mm0, mm1, mm3 and mm6 are overwritten. */ \ |
2da0d70d DB |
353 | "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
354 | "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | |
355 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
356 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
357 | "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ | |
358 | "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | |
359 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
360 | "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ | |
361 | "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | |
362 | "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | |
363 | "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | |
364 | "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | |
365 | "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | |
366 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
367 | "paddw %%mm3, %%mm4 \n\t"\ | |
368 | "movq %%mm2, %%mm0 \n\t"\ | |
369 | "movq %%mm5, %%mm6 \n\t"\ | |
370 | "movq %%mm4, %%mm3 \n\t"\ | |
371 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
372 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
373 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
374 | "paddw %%mm1, %%mm2 \n\t"\ | |
375 | "paddw %%mm1, %%mm5 \n\t"\ | |
376 | "paddw %%mm1, %%mm4 \n\t"\ | |
377 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
378 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
379 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
380 | "paddw %%mm7, %%mm0 \n\t"\ | |
381 | "paddw %%mm7, %%mm6 \n\t"\ | |
382 | "paddw %%mm7, %%mm3 \n\t"\ | |
383 | /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 (1: low chroma half + Y1, 2: high chroma half + Y2) */\ | |
384 | "packuswb %%mm0, %%mm2 \n\t"\ | |
385 | "packuswb %%mm6, %%mm5 \n\t"\ | |
386 | "packuswb %%mm3, %%mm4 \n\t"\ | |
387 | "pxor %%mm7, %%mm7 \n\t" | |
77a49659 | 388 | #if 0 /* disabled: FULL_YSCALEYUV2RGB below is never compiled */ |
d604bab9 | 389 | #define FULL_YSCALEYUV2RGB /* Asm text fragment: blends two luma rows (%0, %1) and two chroma rows (%2, %3) with the weights taken from operands %6 (yalpha1) and %7 (uvalpha1), then converts to RGB using MANGLE()d global constants. Opens loop label 1 without branching back to it. Leaves B in mm3, R in mm0 and G in mm1, each packed to unsigned bytes. */ \ |
2da0d70d DB |
390 | "pxor %%mm7, %%mm7 \n\t"\ |
391 | "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
392 | "punpcklwd %%mm6, %%mm6 \n\t"\ | |
393 | "punpcklwd %%mm6, %%mm6 \n\t"\ | |
394 | "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
395 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
396 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
397 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
398 | ASMALIGN(4)\ | |
399 | "1: \n\t"\ | |
400 | "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\ | |
401 | "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\ | |
402 | "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
403 | "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
404 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
405 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
406 | "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
407 | "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
408 | "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
8b2fce0d | 409 | "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
2da0d70d DB |
410 | "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ |
411 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
8b2fce0d | 412 | "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ |
2da0d70d DB |
413 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
414 | "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
415 | "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
416 | "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
417 | "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
d604bab9 MN |
418 | \ |
419 | \ | |
2da0d70d DB |
420 | "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
421 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
422 | "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\ | |
423 | "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
424 | "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\ | |
425 | "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
426 | "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ | |
d604bab9 MN |
427 | \ |
428 | \ | |
2da0d70d DB |
429 | "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ |
430 | "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\ | |
431 | "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\ | |
432 | "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
433 | "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
434 | "packuswb %%mm3, %%mm3 \n\t"\ | |
d604bab9 | 435 | \ |
2da0d70d DB |
436 | "packuswb %%mm0, %%mm0 \n\t"\ |
437 | "paddw %%mm4, %%mm2 \n\t"\ | |
438 | "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
d604bab9 | 439 | \ |
2da0d70d | 440 | "packuswb %%mm1, %%mm1 \n\t" |
77a49659 | 441 | #endif /* 0 */ |
d604bab9 | 442 | |
6e1c66bc | 443 | #define REAL_YSCALEYUV2PACKED(index, c) /* Asm text fragment: two-row blend without colour conversion; index and c are stringified into the instruction text. Before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to memory. Then index is zeroed and loop label 1 opens (the branch back is not part of this macro). Per iteration: chroma = (row %3 >> 7) + pmulhw(row %2 - row %3, chroma weight) into mm3 (displacement 0) and mm4 (displacement VOF); luma = (row %1 >> 7) + pmulhw(row %0 - row %1, luma weight) into mm1 and mm7. NOTE(review): the stored weights are modified in place each time this fragment executes - confirm the caller rewrites them beforehand. */ \ |
2da0d70d DB |
444 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ |
445 | "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ | |
446 | "psraw $3, %%mm0 \n\t"\ | |
447 | "psraw $3, %%mm1 \n\t"\ | |
448 | "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ | |
449 | "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ | |
450 | "xor "#index", "#index" \n\t"\ | |
451 | ASMALIGN(4)\ | |
452 | "1: \n\t"\ | |
453 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
454 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
8b2fce0d MN |
455 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
456 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
2da0d70d DB |
457 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
458 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
459 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ | |
460 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
461 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
462 | "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\ | |
463 | "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\ | |
464 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
465 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
466 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
467 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
468 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
469 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
470 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
471 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
472 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
473 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
474 | "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\ | |
475 | "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\ | |
476 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
477 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
6a4970ab | 478 | |
6e1c66bc | 479 | #define YSCALEYUV2PACKED(index, c) /* extra macro level so that index and c are macro-expanded before being stringified */ REAL_YSCALEYUV2PACKED(index, c) |
6a4970ab | 480 | |
6e1c66bc | 481 | #define REAL_YSCALEYUV2RGB(index, c) /* Asm text fragment: two-row blend followed by YUV to RGB conversion; index and c are stringified into the instruction text. Zeroes index and opens loop label 1 (the branch back is not part of this macro). Chroma = (row %3 >> 4) + pmulhw(row %2 - row %3, weight at CHR_MMX_FILTER_OFFSET+8(c)); luma = (row %1 >> 4) + pmulhw(row %0 - row %1, weight at LUM_MMX_FILTER_OFFSET+8(c)). Offsets and coefficients are then applied from the *_OFFSET and *_COEFF displacements of c. Output: mm2 = B, mm4 = G, mm5 = R as 8 unsigned-saturated bytes each, and mm7 = 0. */ \ |
2da0d70d DB |
482 | "xor "#index", "#index" \n\t"\ |
483 | ASMALIGN(4)\ | |
484 | "1: \n\t"\ | |
485 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
486 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
8b2fce0d MN |
487 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
488 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
2da0d70d DB |
489 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
490 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
491 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ | |
492 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
493 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
494 | "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ | |
495 | "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
496 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
497 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
498 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
499 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
500 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
501 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
502 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | |
503 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | |
504 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
505 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
506 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
507 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
508 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
509 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
510 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
511 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
512 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
513 | "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
514 | "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\ | |
515 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
516 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
517 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | |
518 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | |
519 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
520 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
521 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
522 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
523 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
524 | "paddw %%mm3, %%mm4 \n\t"\ | |
525 | "movq %%mm2, %%mm0 \n\t"\ | |
526 | "movq %%mm5, %%mm6 \n\t"\ | |
527 | "movq %%mm4, %%mm3 \n\t"\ | |
528 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
529 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
530 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
531 | "paddw %%mm1, %%mm2 \n\t"\ | |
532 | "paddw %%mm1, %%mm5 \n\t"\ | |
533 | "paddw %%mm1, %%mm4 \n\t"\ | |
534 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
535 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
536 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
537 | "paddw %%mm7, %%mm0 \n\t"\ | |
538 | "paddw %%mm7, %%mm6 \n\t"\ | |
539 | "paddw %%mm7, %%mm3 \n\t"\ | |
540 | /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 (1: low chroma half + Y1, 2: high chroma half + Y2) */\ | |
541 | "packuswb %%mm0, %%mm2 \n\t"\ | |
542 | "packuswb %%mm6, %%mm5 \n\t"\ | |
543 | "packuswb %%mm3, %%mm4 \n\t"\ | |
544 | "pxor %%mm7, %%mm7 \n\t" | |
6e1c66bc | 545 | #define YSCALEYUV2RGB(index, c) /* extra macro level so that index and c are macro-expanded before being stringified */ REAL_YSCALEYUV2RGB(index, c) |
6a4970ab | 546 | |
6e1c66bc | 547 | #define REAL_YSCALEYUV2PACKED1(index, c) /* Asm text fragment: single-row variant without blending or colour conversion; index is stringified into the instruction text and c is not used. Zeroes index and opens loop label 1 (the branch back is not part of this macro). Per iteration it loads chroma from row %2 at displacements 0 and VOF into mm3 and mm4, and 8 luma words from row %0 into mm1 and mm7, shifting each right arithmetically by 7. */ \ |
2da0d70d DB |
548 | "xor "#index", "#index" \n\t"\ |
549 | ASMALIGN(4)\ | |
550 | "1: \n\t"\ | |
551 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
8b2fce0d | 552 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
2da0d70d DB |
553 | "psraw $7, %%mm3 \n\t" \ |
554 | "psraw $7, %%mm4 \n\t" \ | |
555 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
556 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
557 | "psraw $7, %%mm1 \n\t" \ | |
558 | "psraw $7, %%mm7 \n\t" \ | |
6a4970ab | 559 | |
6e1c66bc | 560 | #define YSCALEYUV2PACKED1(index, c) /* extra macro level so that index and c are macro-expanded before being stringified */ REAL_YSCALEYUV2PACKED1(index, c) |
6a4970ab | 561 | |
6e1c66bc | 562 | #define REAL_YSCALEYUV2RGB1(index, c) /* Asm text fragment: single-row variant (no blending) followed by YUV to RGB conversion; index and c are stringified into the instruction text. Zeroes index and opens loop label 1 (the branch back is not part of this macro). Chroma is read from row %2 at displacements 0 and VOF and luma from row %0, each shifted right arithmetically by 4; offsets and coefficients are then applied from the *_OFFSET and *_COEFF displacements of c. Output: mm2 = B, mm4 = G, mm5 = R as 8 unsigned-saturated bytes each, and mm7 = 0. */ \ |
2da0d70d DB |
563 | "xor "#index", "#index" \n\t"\ |
564 | ASMALIGN(4)\ | |
565 | "1: \n\t"\ | |
566 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
8b2fce0d | 567 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
2da0d70d DB |
568 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\ |
569 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\ | |
570 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
571 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
572 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
573 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
574 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | |
575 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | |
576 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
577 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
578 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
579 | "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ | |
580 | "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\ | |
581 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | |
582 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | |
583 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
584 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
585 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
586 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
587 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
588 | "paddw %%mm3, %%mm4 \n\t"\ | |
589 | "movq %%mm2, %%mm0 \n\t"\ | |
590 | "movq %%mm5, %%mm6 \n\t"\ | |
591 | "movq %%mm4, %%mm3 \n\t"\ | |
592 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
593 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
594 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
595 | "paddw %%mm1, %%mm2 \n\t"\ | |
596 | "paddw %%mm1, %%mm5 \n\t"\ | |
597 | "paddw %%mm1, %%mm4 \n\t"\ | |
598 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
599 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
600 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
601 | "paddw %%mm7, %%mm0 \n\t"\ | |
602 | "paddw %%mm7, %%mm6 \n\t"\ | |
603 | "paddw %%mm7, %%mm3 \n\t"\ | |
604 | /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 (1: low chroma half + Y1, 2: high chroma half + Y2) */\ | |
605 | "packuswb %%mm0, %%mm2 \n\t"\ | |
606 | "packuswb %%mm6, %%mm5 \n\t"\ | |
607 | "packuswb %%mm3, %%mm4 \n\t"\ | |
608 | "pxor %%mm7, %%mm7 \n\t" | |
6e1c66bc | 609 | #define YSCALEYUV2RGB1(index, c) /* extra macro level so that index and c are macro-expanded before being stringified */ REAL_YSCALEYUV2RGB1(index, c) |
497d4f99 | 610 | |
6e1c66bc | 611 | #define REAL_YSCALEYUV2PACKED1b(index, c) /* Asm text fragment: single luma row with the two chroma rows summed, no colour conversion; index is stringified into the instruction text and c is not used. Zeroes index and opens loop label 1 (the branch back is not part of this macro). Per iteration: chroma = (row %2 + row %3) shifted right logically by 8, into mm3 (displacement 0) and mm4 (displacement VOF); luma = row %0 shifted right arithmetically by 7, into mm1 and mm7. */ \ |
2da0d70d DB |
612 | "xor "#index", "#index" \n\t"\ |
613 | ASMALIGN(4)\ | |
614 | "1: \n\t"\ | |
615 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
616 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
8b2fce0d MN |
617 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
618 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
2da0d70d DB |
619 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
620 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
621 | "psrlw $8, %%mm3 \n\t" \ | |
622 | "psrlw $8, %%mm4 \n\t" \ | |
623 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
624 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
625 | "psraw $7, %%mm1 \n\t" \ | |
626 | "psraw $7, %%mm7 \n\t" | |
6e1c66bc | 627 | #define YSCALEYUV2PACKED1b(index, c) /* extra macro level so that index and c are macro-expanded before being stringified */ REAL_YSCALEYUV2PACKED1b(index, c) |
6a4970ab | 628 | |
497d4f99 | 629 | // do vertical chrominance interpolation |
6e1c66bc | 630 | #define REAL_YSCALEYUV2RGB1b(index, c) \ |
2da0d70d DB |
631 | "xor "#index", "#index" \n\t"\ |
632 | ASMALIGN(4)\ | |
633 | "1: \n\t"\ | |
634 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
635 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
8b2fce0d MN |
636 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
637 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
2da0d70d DB |
638 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
639 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
640 | "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ | |
641 | "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
642 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
643 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
644 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
645 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
646 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | |
647 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | |
648 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
649 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
650 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
651 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
652 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
653 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | |
654 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | |
655 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
656 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
657 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
658 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
659 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
660 | "paddw %%mm3, %%mm4 \n\t"\ | |
661 | "movq %%mm2, %%mm0 \n\t"\ | |
662 | "movq %%mm5, %%mm6 \n\t"\ | |
663 | "movq %%mm4, %%mm3 \n\t"\ | |
664 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
665 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
666 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
667 | "paddw %%mm1, %%mm2 \n\t"\ | |
668 | "paddw %%mm1, %%mm5 \n\t"\ | |
669 | "paddw %%mm1, %%mm4 \n\t"\ | |
670 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
671 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
672 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
673 | "paddw %%mm7, %%mm0 \n\t"\ | |
674 | "paddw %%mm7, %%mm6 \n\t"\ | |
675 | "paddw %%mm7, %%mm3 \n\t"\ | |
676 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
677 | "packuswb %%mm0, %%mm2 \n\t"\ | |
678 | "packuswb %%mm6, %%mm5 \n\t"\ | |
679 | "packuswb %%mm3, %%mm4 \n\t"\ | |
680 | "pxor %%mm7, %%mm7 \n\t" | |
/* Forwarding wrapper around REAL_YSCALEYUV2RGB1b. The REAL_ macro stringifies
 * its arguments with '#', so going through this extra level lets macro
 * arguments be expanded by the preprocessor before they are stringified. */
6e1c66bc | 681 | #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) |
d604bab9 | 682 | |
/* Inline-asm fragment: interleave 8 pixels held as separate byte planes
 * (mm2 = B, mm4 = G, mm5 = R, mm7 = 0) into four quadwords of 4-byte pixels
 * and store them at dst + index*4 (offsets 0, 8, 16, 24) through MOVNTQ
 * (defined elsewhere in this file). Afterwards index is advanced by 8 and
 * control jumps back to local label 1 while index is below dstw (unsigned
 * compare). Clobbers mm0-mm3, mm5 and mm6.
 * WRITEBGR32 is the expanding wrapper: arguments are macro-expanded before
 * REAL_WRITEBGR32 stringifies them with '#'. */
6e1c66bc | 683 | #define REAL_WRITEBGR32(dst, dstw, index) \ |
2da0d70d DB |
684 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
685 | "movq %%mm2, %%mm1 \n\t" /* B */\ | |
686 | "movq %%mm5, %%mm6 \n\t" /* R */\ | |
687 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
688 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
689 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
690 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
691 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
692 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
693 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
694 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
695 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
696 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
d604bab9 | 697 | \ |
2da0d70d DB |
698 | MOVNTQ(%%mm0, (dst, index, 4))\ |
699 | MOVNTQ(%%mm2, 8(dst, index, 4))\ | |
700 | MOVNTQ(%%mm1, 16(dst, index, 4))\ | |
701 | MOVNTQ(%%mm3, 24(dst, index, 4))\ | |
d604bab9 | 702 | \ |
2da0d70d DB |
703 | "add $8, "#index" \n\t"\ |
704 | "cmp "#dstw", "#index" \n\t"\ | |
705 | " jb 1b \n\t" | |
6e1c66bc | 706 | #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) |
d604bab9 | 707 | |
/* Inline-asm fragment: pack 8 pixels held as byte planes (mm2 = B, mm4 = G,
 * mm5 = R; mm7 is used as the zero source for byte->word widening) into
 * 16-bit 5-6-5 pixels and store two quadwords at dst + index*2.
 * Per 16-bit word the result is: R[7:3] in bits 15..11, G[7:2] in bits 10..5,
 * B[7:3] in bits 4..0. The low bits are masked off first (bF8 / bFC), so the
 * quadword-wide shifts cannot leak bits into a neighbouring lane.
 * Afterwards index is advanced by 8 and control jumps back to local label 1
 * while index is below dstw. Clobbers mm1-mm5.
 * WRITERGB16 is the expanding wrapper around the stringifying REAL_ macro. */
27a90b04 | 708 | #define REAL_WRITERGB16(dst, dstw, index) \ |
2da0d70d DB |
709 | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
710 | "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ | |
711 | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
712 | "psrlq $3, %%mm2 \n\t"\ | |
d604bab9 | 713 | \ |
2da0d70d DB |
714 | "movq %%mm2, %%mm1 \n\t"\ |
715 | "movq %%mm4, %%mm3 \n\t"\ | |
d604bab9 | 716 | \ |
2da0d70d DB |
717 | "punpcklbw %%mm7, %%mm3 \n\t"\ |
718 | "punpcklbw %%mm5, %%mm2 \n\t"\ | |
719 | "punpckhbw %%mm7, %%mm4 \n\t"\ | |
720 | "punpckhbw %%mm5, %%mm1 \n\t"\ | |
d604bab9 | 721 | \ |
2da0d70d DB |
722 | "psllq $3, %%mm3 \n\t"\ |
723 | "psllq $3, %%mm4 \n\t"\ | |
d604bab9 | 724 | \ |
2da0d70d DB |
725 | "por %%mm3, %%mm2 \n\t"\ |
726 | "por %%mm4, %%mm1 \n\t"\ | |
d604bab9 | 727 | \ |
2da0d70d DB |
728 | MOVNTQ(%%mm2, (dst, index, 2))\ |
729 | MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
d604bab9 | 730 | \ |
2da0d70d DB |
731 | "add $8, "#index" \n\t"\ |
732 | "cmp "#dstw", "#index" \n\t"\ | |
733 | " jb 1b \n\t" | |
27a90b04 | 734 | #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) |
d604bab9 | 735 | |
/* Inline-asm fragment: pack 8 pixels held as byte planes (mm2 = B, mm4 = G,
 * mm5 = R; mm7 is used as the zero source for byte->word widening) into
 * 16-bit 5-5-5 pixels and store two quadwords at dst + index*2.
 * Per 16-bit word the result is: R[7:3] in bits 14..10, G[7:3] in bits 9..5,
 * B[7:3] in bits 4..0; bit 15 stays clear. All three channels are masked with
 * bF8 first, so the quadword-wide shifts cannot leak bits between lanes.
 * Afterwards index is advanced by 8 and control jumps back to local label 1
 * while index is below dstw. Clobbers mm1-mm5.
 * WRITERGB15 is the expanding wrapper around the stringifying REAL_ macro. */
27a90b04 | 736 | #define REAL_WRITERGB15(dst, dstw, index) \ |
2da0d70d DB |
737 | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
738 | "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ | |
739 | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
740 | "psrlq $3, %%mm2 \n\t"\ | |
741 | "psrlq $1, %%mm5 \n\t"\ | |
d604bab9 | 742 | \ |
2da0d70d DB |
743 | "movq %%mm2, %%mm1 \n\t"\ |
744 | "movq %%mm4, %%mm3 \n\t"\ | |
d604bab9 | 745 | \ |
2da0d70d DB |
746 | "punpcklbw %%mm7, %%mm3 \n\t"\ |
747 | "punpcklbw %%mm5, %%mm2 \n\t"\ | |
748 | "punpckhbw %%mm7, %%mm4 \n\t"\ | |
749 | "punpckhbw %%mm5, %%mm1 \n\t"\ | |
d604bab9 | 750 | \ |
2da0d70d DB |
751 | "psllq $2, %%mm3 \n\t"\ |
752 | "psllq $2, %%mm4 \n\t"\ | |
d604bab9 | 753 | \ |
2da0d70d DB |
754 | "por %%mm3, %%mm2 \n\t"\ |
755 | "por %%mm4, %%mm1 \n\t"\ | |
d604bab9 | 756 | \ |
2da0d70d DB |
757 | MOVNTQ(%%mm2, (dst, index, 2))\ |
758 | MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
d604bab9 | 759 | \ |
2da0d70d DB |
760 | "add $8, "#index" \n\t"\ |
761 | "cmp "#dstw", "#index" \n\t"\ | |
762 | " jb 1b \n\t" | |
27a90b04 | 763 | #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) |
f62255fb | 764 | |
/* Inline-asm fragment: older 24-bit writer. Starts like the 32-bit writer
 * (mm2 = B, mm4 = G, mm5 = R, mm7 = 0 -> four quadwords of 4-byte pixels),
 * then squeezes out the padding byte of every pixel using the bm00000111 /
 * bm11111000 / bm00001111 byte masks plus shifts, leaving 8 pixels in three
 * quadwords (24 bytes). These are stored at dst, dst+8 and dst+16.
 * Unlike the 16/32-bit writers this macro does not index dst: it advances the
 * dst register itself by 24, so dst must name a register the caller can
 * afford to have modified. index is advanced by 8 and control jumps back to
 * local label 1 while index is below dstw. Clobbers mm0-mm6.
 * This macro is not selected as WRITEBGR24 by the #ifdef that follows the
 * writer definitions. */
6542b44e | 765 | #define WRITEBGR24OLD(dst, dstw, index) \ |
2da0d70d DB |
766 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
767 | "movq %%mm2, %%mm1 \n\t" /* B */\ | |
768 | "movq %%mm5, %%mm6 \n\t" /* R */\ | |
769 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
770 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
771 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
772 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
773 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
774 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
775 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
776 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
777 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
778 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
d604bab9 | 779 | \ |
2da0d70d DB |
780 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
781 | "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ | |
782 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\ | |
783 | "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\ | |
784 | "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
785 | "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ | |
786 | "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ | |
787 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
d604bab9 | 788 | \ |
2da0d70d DB |
789 | "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
790 | "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ | |
791 | "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ | |
792 | "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ | |
793 | "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\ | |
794 | "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ | |
795 | "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ | |
796 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\ | |
797 | "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\ | |
798 | "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ | |
799 | "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ | |
800 | "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ | |
801 | "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ | |
d604bab9 | 802 | \ |
2da0d70d DB |
803 | "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ |
804 | "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ | |
805 | "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ | |
806 | "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\ | |
807 | "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\ | |
808 | "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ | |
809 | "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
810 | "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ | |
d604bab9 | 811 | \ |
2da0d70d DB |
812 | MOVNTQ(%%mm0, (dst))\ |
813 | MOVNTQ(%%mm2, 8(dst))\ | |
814 | MOVNTQ(%%mm3, 16(dst))\ | |
815 | "add $24, "#dst" \n\t"\ | |
d604bab9 | 816 | \ |
2da0d70d DB |
817 | "add $8, "#index" \n\t"\ |
818 | "cmp "#dstw", "#index" \n\t"\ | |
819 | " jb 1b \n\t" | |
d604bab9 | 820 | |
/* Inline-asm fragment: plain-MMX 24-bit writer (selected as WRITEBGR24 when
 * HAVE_MMX2 is not defined). Input is mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 * The planes are first interleaved into four quadwords of 4-byte pixels;
 * each quadword is then compacted to 6 payload bytes with a shift/punpckhdq
 * pair, and the four results are stitched into three quadwords (24 bytes)
 * that are stored at dst, dst+8 and dst+16.
 * The macro advances the dst register itself by 24, so dst must name a
 * register the caller can afford to have modified. index is advanced by 8 and
 * control jumps back to local label 1 while index is below dstw.
 * Clobbers mm0-mm7 (mm7 is overwritten and is no longer zero afterwards). */
6542b44e | 821 | #define WRITEBGR24MMX(dst, dstw, index) \ |
2da0d70d DB |
822 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
823 | "movq %%mm2, %%mm1 \n\t" /* B */\ | |
824 | "movq %%mm5, %%mm6 \n\t" /* R */\ | |
825 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
826 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
827 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
828 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
829 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
830 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
831 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
832 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
833 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
834 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
99d2cb72 | 835 | \ |
2da0d70d DB |
836 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
837 | "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
838 | "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
839 | "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
99d2cb72 | 840 | \ |
2da0d70d DB |
841 | "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ |
842 | "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
843 | "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
844 | "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
99d2cb72 | 845 | \ |
2da0d70d DB |
846 | "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ |
847 | "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
848 | "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
849 | "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
99d2cb72 | 850 | \ |
2da0d70d DB |
851 | "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ |
852 | "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
853 | "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
854 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
855 | MOVNTQ(%%mm0, (dst))\ | |
99d2cb72 | 856 | \ |
2da0d70d DB |
857 | "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ |
858 | "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
859 | "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
860 | "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
861 | MOVNTQ(%%mm6, 8(dst))\ | |
99d2cb72 | 862 | \ |
2da0d70d DB |
863 | "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ |
864 | "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
865 | "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
866 | MOVNTQ(%%mm5, 16(dst))\ | |
99d2cb72 | 867 | \ |
2da0d70d | 868 | "add $24, "#dst" \n\t"\ |
99d2cb72 | 869 | \ |
2da0d70d DB |
870 | "add $8, "#index" \n\t"\ |
871 | "cmp "#dstw", "#index" \n\t"\ | |
872 | " jb 1b \n\t" | |
99d2cb72 | 873 | |
/* Inline-asm fragment: 24-bit writer built on pshufw (selected as WRITEBGR24
 * when HAVE_MMX2 is defined). Input is mm2 = B, mm4 = G, mm5 = R.
 * Each of the three output quadwords is assembled the same way: pshufw
 * replicates the 16-bit lanes that contain the needed bytes of every plane,
 * the ff_M24A / ff_M24B / ff_M24C masks (defined elsewhere) keep only the
 * bytes that belong in that quadword, and the three planes are OR-ed
 * together. G is shifted by one byte (psllq for the first quadword, a
 * persistent psrlq on mm4 for the other two) so that its bytes line up
 * between B and R.
 * The quadwords are stored at dst, dst+8 and dst+16; the dst register itself
 * is then advanced by 24, so dst must name a register the caller can afford
 * to have modified. index is advanced by 8 and control jumps back to local
 * label 1 while index is below dstw.
 * Clobbers mm0, mm1, mm3, mm4, mm6 and mm7 (mm7 holds ff_M24C afterwards,
 * not zero). */
6542b44e | 874 | #define WRITEBGR24MMX2(dst, dstw, index) \ |
2da0d70d | 875 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
5802683a RD |
876 | "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ |
877 | "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ | |
2da0d70d DB |
878 | "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ |
879 | "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
880 | "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
99d2cb72 | 881 | \ |
2da0d70d DB |
882 | "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ |
883 | "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
884 | "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
99d2cb72 | 885 | \ |
2da0d70d DB |
886 | "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ |
887 | "por %%mm1, %%mm6 \n\t"\ | |
888 | "por %%mm3, %%mm6 \n\t"\ | |
889 | MOVNTQ(%%mm6, (dst))\ | |
99d2cb72 | 890 | \ |
2da0d70d DB |
891 | "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ |
892 | "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
893 | "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
894 | "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
99d2cb72 | 895 | \ |
5802683a | 896 | "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ |
2da0d70d DB |
897 | "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ |
898 | "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
99d2cb72 | 899 | \ |
2da0d70d DB |
900 | "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ |
901 | "por %%mm3, %%mm6 \n\t"\ | |
902 | MOVNTQ(%%mm6, 8(dst))\ | |
99d2cb72 | 903 | \ |
2da0d70d DB |
904 | "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\ |
905 | "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
906 | "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
99d2cb72 | 907 | \ |
2da0d70d DB |
908 | "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ |
909 | "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
5802683a | 910 | "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ |
99d2cb72 | 911 | \ |
2da0d70d DB |
912 | "por %%mm1, %%mm3 \n\t"\ |
913 | "por %%mm3, %%mm6 \n\t"\ | |
914 | MOVNTQ(%%mm6, 16(dst))\ | |
99d2cb72 | 915 | \ |
2da0d70d | 916 | "add $24, "#dst" \n\t"\ |
99d2cb72 | 917 | \ |
2da0d70d DB |
918 | "add $8, "#index" \n\t"\ |
919 | "cmp "#dstw", "#index" \n\t"\ | |
920 | " jb 1b \n\t" | |
99d2cb72 MN |
921 | |
/* Pick the 24-bit writer for this instantiation of the template: the pshufw
 * based WRITEBGR24MMX2 when HAVE_MMX2 is defined, otherwise the plain-MMX
 * WRITEBGR24MMX. The #undef in each branch drops any WRITEBGR24 definition
 * that is already in effect before it is redefined. */
922 | #ifdef HAVE_MMX2 | |
7630f2e0 | 923 | #undef WRITEBGR24 |
6e1c66bc | 924 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) |
99d2cb72 | 925 | #else |
7630f2e0 | 926 | #undef WRITEBGR24 |
6e1c66bc | 927 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) |
99d2cb72 MN |
928 | #endif |
929 | ||
/* Inline-asm fragment: pack 16-bit samples to bytes (unsigned saturation) and
 * interleave them into 2-bytes-per-pixel output stored at dst + index*2.
 * Register roles as consumed here: mm1 and mm7 each hold four words that
 * become the eight even output bytes (packed together into mm1); mm3 and mm4
 * each hold four words that are packed and byte-interleaved (mm3 first, then
 * mm4) to become the odd output bytes. For the YUYV use in this file that
 * makes mm1/mm7 luma and mm3/mm4 the two chroma planes.
 * Afterwards index is advanced by 8 and control jumps back to local label 1
 * while index is below dstw. Clobbers mm1, mm3, mm4 and mm7.
 * WRITEYUY2 is the expanding wrapper around the stringifying REAL_ macro. */
6e1c66bc | 930 | #define REAL_WRITEYUY2(dst, dstw, index) \ |
2da0d70d DB |
931 | "packuswb %%mm3, %%mm3 \n\t"\ |
932 | "packuswb %%mm4, %%mm4 \n\t"\ | |
933 | "packuswb %%mm7, %%mm1 \n\t"\ | |
934 | "punpcklbw %%mm4, %%mm3 \n\t"\ | |
935 | "movq %%mm1, %%mm7 \n\t"\ | |
936 | "punpcklbw %%mm3, %%mm1 \n\t"\ | |
937 | "punpckhbw %%mm3, %%mm7 \n\t"\ | |
25593e29 | 938 | \ |
2da0d70d DB |
939 | MOVNTQ(%%mm1, (dst, index, 2))\ |
940 | MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
25593e29 | 941 | \ |
2da0d70d DB |
942 | "add $8, "#index" \n\t"\ |
943 | "cmp "#dstw", "#index" \n\t"\ | |
944 | " jb 1b \n\t" | |
6e1c66bc | 945 | #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) |
25593e29 MN |
946 | |
947 | ||
/*
 * Vertical filter step producing planar output (dest = luma plane,
 * uDest / vDest = chroma planes).
 *
 * With HAVE_MMX the work is done by the YSCALEYUV2YV12X statement macros
 * (defined elsewhere in this file); the _ACCURATE variants are used when
 * SWS_ACCURATE_RND is set in c->flags. Chroma is written only when uDest is
 * non-NULL: first the U plane (offset "0"), then the V plane (offset VOF),
 * and finally luma. In this path the filter/source pointer arguments are not
 * referenced directly here; NOTE(review): presumably the macros read
 * pre-built filter tables from the context at CHR_MMX_FILTER_OFFSET /
 * LUM_MMX_FILTER_OFFSET - confirm against the macro definitions.
 *
 * Without MMX the call is forwarded unchanged to yuv2yuvX_altivec_real()
 * (HAVE_ALTIVEC) or to the portable yuv2yuvXinC().
 */
77a49659 | 948 | static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
2da0d70d DB |
949 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
950 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
38858470 | 951 | { |
c1b0bfb4 | 952 | #ifdef HAVE_MMX |
2da0d70d DB |
953 | if (c->flags & SWS_ACCURATE_RND){ |
954 | if (uDest){ | |
8b2fce0d MN |
955 | YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
956 | YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | |
2da0d70d | 957 | } |
bca11e75 | 958 | |
8b2fce0d | 959 | YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) |
2da0d70d DB |
960 | }else{ |
961 | if (uDest){ | |
8b2fce0d MN |
962 | YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
963 | YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | |
bca11e75 | 964 | } |
2da0d70d | 965 | |
8b2fce0d | 966 | YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW) |
2da0d70d | 967 | } |
c1b0bfb4 | 968 | #else |
a2faa401 RD |
969 | #ifdef HAVE_ALTIVEC |
970 | yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
2da0d70d DB |
971 | chrFilter, chrSrc, chrFilterSize, |
972 | dest, uDest, vDest, dstW, chrDstW); | |
a2faa401 | 973 | #else //HAVE_ALTIVEC |
5859233b | 974 | yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
2da0d70d DB |
975 | chrFilter, chrSrc, chrFilterSize, |
976 | dest, uDest, vDest, dstW, chrDstW); | |
a2faa401 | 977 | #endif //!HAVE_ALTIVEC |
bc279024 | 978 | #endif /* HAVE_MMX */ |
c1b0bfb4 | 979 | } |
2add307d | 980 | |
/*
 * Vertical filter step for the dstFormat handled by yuv2nv12XinC().
 * There is no per-CPU implementation here: every argument except the context
 * c (which is unused) is forwarded unchanged to yuv2nv12XinC().
 */
6118e52e | 981 | static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
2da0d70d DB |
982 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
983 | uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
6118e52e VS |
984 | { |
985 | yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
2da0d70d DB |
986 | chrFilter, chrSrc, chrFilterSize, |
987 | dest, uDest, dstW, chrDstW, dstFormat); | |
6118e52e VS |
988 | } |
989 | ||
bf2bdde6 | 990 | static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, |
2da0d70d | 991 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) |
c1b0bfb4 MN |
992 | { |
993 | #ifdef HAVE_MMX | |
7bae01c6 MN |
994 | long p= uDest ? 3 : 1; |
995 | uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; | |
996 | uint8_t *dst[3]= {dest, uDest, vDest}; | |
997 | long counter[3] = {dstW, chrDstW, chrDstW}; | |
2da0d70d | 998 | |
bf2bdde6 MN |
999 | if (c->flags & SWS_ACCURATE_RND){ |
1000 | while(p--){ | |
1001 | asm volatile( | |
1002 | YSCALEYUV2YV121_ACCURATE | |
1003 | :: "r" (src[p]), "r" (dst[p] + counter[p]), | |
1004 | "g" (-counter[p]) | |
1005 | : "%"REG_a | |
1006 | ); | |
1007 | } | |
1008 | }else{ | |
d78c1ea1 MN |
1009 | while(p--){ |
1010 | asm volatile( | |
1011 | YSCALEYUV2YV121 | |
1012 | :: "r" (src[p]), "r" (dst[p] + counter[p]), | |
1013 | "g" (-counter[p]) | |
1014 | : "%"REG_a | |
1015 | ); | |
1016 | } | |
bf2bdde6 | 1017 | } |
2da0d70d | 1018 | |
c1b0bfb4 | 1019 | #else |
2da0d70d DB |
1020 | int i; |
1021 | for (i=0; i<dstW; i++) | |
1022 | { | |
a1f3ffa3 | 1023 | int val= (lumSrc[i]+64)>>7; |
2da0d70d DB |
1024 | |
1025 | if (val&256){ | |
1026 | if (val<0) val=0; | |
1027 | else val=255; | |
1028 | } | |
1029 | ||
1030 | dest[i]= val; | |
1031 | } | |
1032 | ||
1b0a4572 | 1033 | if (uDest) |
2da0d70d DB |
1034 | for (i=0; i<chrDstW; i++) |
1035 | { | |
a1f3ffa3 MN |
1036 | int u=(chrSrc[i ]+64)>>7; |
1037 | int v=(chrSrc[i + VOFW]+64)>>7; | |
2da0d70d DB |
1038 | |
1039 | if ((u|v)&256){ | |
1040 | if (u<0) u=0; | |
1041 | else if (u>255) u=255; | |
1042 | if (v<0) v=0; | |
1043 | else if (v>255) v=255; | |
1044 | } | |
1045 | ||
1046 | uDest[i]= u; | |
1047 | vDest[i]= v; | |
1048 | } | |
c1b0bfb4 | 1049 | #endif |
38858470 MN |
1050 | } |
1051 | ||
c1b0bfb4 | 1052 | |
d604bab9 MN |
1053 | /** |
1054 | * vertical scale YV12 to RGB | |
1055 | */ | |
/*
 * Vertical filter step producing one line of packed output in c->dstFormat.
 *
 * With HAVE_MMX, the formats RGB32, BGR24, RGB555, RGB565 and YUYV422 are
 * handled by inline asm and return directly. The asm statement is opened by
 * YSCALEYUV2PACKEDX (or the _ACCURATE variant when SWS_ACCURATE_RND is set in
 * c->flags) and normally closed by YSCALEYUV2PACKEDX_END, both defined
 * elsewhere in this file. The BGR24 cases close the statement themselves
 * because they additionally clobber REG_c, which serves as the running
 * destination pointer that WRITEBGR24 advances.
 *
 * Any other format falls out of the switch (there is no default label) and is
 * handled by altivec_yuv2packedX() when HAVE_ALTIVEC is defined and the format
 * is in its supported list, otherwise by the portable yuv2packedXinC().
 * dstY is only used by those fallbacks.
 */
25593e29 | 1056 | static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
2da0d70d DB |
1057 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
1058 | uint8_t *dest, long dstW, long dstY) | |
c1b0bfb4 | 1059 | { |
bca11e75 | 1060 | #ifdef HAVE_MMX |
/* dummy only pads the asm operand list (operands 1-3) so that dest is %4 and dstW is %5 */
f8d61128 | 1061 | long dummy=0; |
2da0d70d DB |
1062 | if (c->flags & SWS_ACCURATE_RND){ |
1063 | switch(c->dstFormat){ | |
1064 | case PIX_FMT_RGB32: | |
1065 | YSCALEYUV2PACKEDX_ACCURATE | |
1066 | YSCALEYUV2RGBX | |
1067 | WRITEBGR32(%4, %5, %%REGa) | |
1068 | ||
1069 | YSCALEYUV2PACKEDX_END | |
1070 | return; | |
1071 | case PIX_FMT_BGR24: | |
1072 | YSCALEYUV2PACKEDX_ACCURATE | |
1073 | YSCALEYUV2RGBX | |
/* REG_c = dest + 3*REG_a: byte address of the first pixel to write */
1074 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize | |
1075 | "add %4, %%"REG_c" \n\t" | |
1076 | WRITEBGR24(%%REGc, %5, %%REGa) | |
1077 | ||
1078 | ||
1079 | :: "r" (&c->redDither), | |
1080 | "m" (dummy), "m" (dummy), "m" (dummy), | |
1081 | "r" (dest), "m" (dstW) | |
1082 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | |
1083 | ); | |
1084 | return; | |
27a90b04 | 1085 | case PIX_FMT_RGB555: |
2da0d70d DB |
1086 | YSCALEYUV2PACKEDX_ACCURATE |
1087 | YSCALEYUV2RGBX | |
1088 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
bca11e75 | 1089 | #ifdef DITHER1XBPP |
2da0d70d DB |
1090 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1091 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1092 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1093 | #endif | |
1094 | ||
27a90b04 | 1095 | WRITERGB15(%4, %5, %%REGa) |
2da0d70d DB |
1096 | YSCALEYUV2PACKEDX_END |
1097 | return; | |
27a90b04 | 1098 | case PIX_FMT_RGB565: |
2da0d70d DB |
1099 | YSCALEYUV2PACKEDX_ACCURATE |
1100 | YSCALEYUV2RGBX | |
1101 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
bca11e75 | 1102 | #ifdef DITHER1XBPP |
2da0d70d DB |
1103 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1104 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1105 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1106 | #endif | |
1107 | ||
27a90b04 | 1108 | WRITERGB16(%4, %5, %%REGa) |
2da0d70d DB |
1109 | YSCALEYUV2PACKEDX_END |
1110 | return; | |
1111 | case PIX_FMT_YUYV422: | |
1112 | YSCALEYUV2PACKEDX_ACCURATE | |
1113 | /* no RGB conversion here: mm1/mm7 = luma, mm3/mm4 = chroma (U, V presumably, given the byte order WRITEYUY2 produces) */ | |
1114 | ||
1115 | "psraw $3, %%mm3 \n\t" | |
1116 | "psraw $3, %%mm4 \n\t" | |
1117 | "psraw $3, %%mm1 \n\t" | |
1118 | "psraw $3, %%mm7 \n\t" | |
1119 | WRITEYUY2(%4, %5, %%REGa) | |
1120 | YSCALEYUV2PACKEDX_END | |
1121 | return; | |
1122 | } | |
bca11e75 | 1123 | }else{ |
2da0d70d DB |
1124 | switch(c->dstFormat) |
1125 | { | |
1126 | case PIX_FMT_RGB32: | |
1127 | YSCALEYUV2PACKEDX | |
1128 | YSCALEYUV2RGBX | |
1129 | WRITEBGR32(%4, %5, %%REGa) | |
1130 | YSCALEYUV2PACKEDX_END | |
1131 | return; | |
1132 | case PIX_FMT_BGR24: | |
1133 | YSCALEYUV2PACKEDX | |
1134 | YSCALEYUV2RGBX | |
/* REG_c = dest + 3*REG_a: byte address of the first pixel to write */
1135 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize | |
1136 | "add %4, %%"REG_c" \n\t" | |
1137 | WRITEBGR24(%%REGc, %5, %%REGa) | |
1138 | ||
1139 | :: "r" (&c->redDither), | |
1140 | "m" (dummy), "m" (dummy), "m" (dummy), | |
1141 | "r" (dest), "m" (dstW) | |
1142 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | |
1143 | ); | |
1144 | return; | |
27a90b04 | 1145 | case PIX_FMT_RGB555: |
2da0d70d DB |
1146 | YSCALEYUV2PACKEDX |
1147 | YSCALEYUV2RGBX | |
1148 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
c1b0bfb4 | 1149 | #ifdef DITHER1XBPP |
2da0d70d DB |
1150 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1151 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | |
1152 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1153 | #endif | |
1154 | ||
27a90b04 | 1155 | WRITERGB15(%4, %5, %%REGa) |
2da0d70d DB |
1156 | YSCALEYUV2PACKEDX_END |
1157 | return; | |
27a90b04 | 1158 | case PIX_FMT_RGB565: |
2da0d70d DB |
1159 | YSCALEYUV2PACKEDX |
1160 | YSCALEYUV2RGBX | |
1161 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
c1b0bfb4 | 1162 | #ifdef DITHER1XBPP |
2da0d70d DB |
1163 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1164 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | |
1165 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1166 | #endif | |
1167 | ||
27a90b04 | 1168 | WRITERGB16(%4, %5, %%REGa) |
2da0d70d DB |
1169 | YSCALEYUV2PACKEDX_END |
1170 | return; | |
1171 | case PIX_FMT_YUYV422: | |
1172 | YSCALEYUV2PACKEDX | |
1173 | /* no RGB conversion here: mm1/mm7 = luma, mm3/mm4 = chroma (U, V presumably, given the byte order WRITEYUY2 produces) */ | |
1174 | ||
1175 | "psraw $3, %%mm3 \n\t" | |
1176 | "psraw $3, %%mm4 \n\t" | |
1177 | "psraw $3, %%mm1 \n\t" | |
1178 | "psraw $3, %%mm7 \n\t" | |
1179 | WRITEYUY2(%4, %5, %%REGa) | |
1180 | YSCALEYUV2PACKEDX_END | |
1181 | return; | |
bca11e75 MN |
1182 | } |
1183 | } | |
bc279024 | 1184 | #endif /* HAVE_MMX */ |
a31de956 | 1185 | #ifdef HAVE_ALTIVEC |
2da0d70d DB |
1186 | /* The following list of supported dstFormat values should |
1187 | match what's found in the body of altivec_yuv2packedX() */ | |
1188 | if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || | |
1189 | c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || | |
1190 | c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB) | |
1191 | altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | |
1192 | chrFilter, chrSrc, chrFilterSize, | |
1193 | dest, dstW, dstY); | |
1194 | else | |
1195 | #endif | |
1196 | yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | |
1197 | chrFilter, chrSrc, chrFilterSize, | |
1198 | dest, dstW, dstY); | |
c1b0bfb4 MN |
1199 | } |
1200 | ||
c1b0bfb4 MN |
1201 | /** |
1202 | * vertical bilinear scale YV12 to RGB | |
1203 | */ | |
25593e29 | 1204 | static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, |
2da0d70d | 1205 | uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) |
d604bab9 | 1206 | { |
2da0d70d DB |
1207 | int yalpha1=yalpha^4095; |
1208 | int uvalpha1=uvalpha^4095; | |
1209 | int i; | |
d604bab9 | 1210 | |
77a416e8 | 1211 | #if 0 //isn't used |
2da0d70d DB |
1212 | if (flags&SWS_FULL_CHR_H_INT) |
1213 | { | |
1214 | switch(dstFormat) | |
1215 | { | |
cf7d1c1a | 1216 | #ifdef HAVE_MMX |
2da0d70d DB |
1217 | case PIX_FMT_RGB32: |
1218 | asm volatile( | |
d604bab9 MN |
1219 | |
1220 | ||
1221 | FULL_YSCALEYUV2RGB | |
2da0d70d DB |
1222 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
1223 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
d604bab9 | 1224 | |
2da0d70d DB |
1225 | "movq %%mm3, %%mm1 \n\t" |
1226 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1227 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
d604bab9 | 1228 | |
2da0d70d DB |
1229 | MOVNTQ(%%mm3, (%4, %%REGa, 4)) |
1230 | MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
d604bab9 | 1231 | |
2da0d70d DB |
1232 | "add $4, %%"REG_a" \n\t" |
1233 | "cmp %5, %%"REG_a" \n\t" | |
1234 | " jb 1b \n\t" | |
d604bab9 | 1235 | |
2da0d70d DB |
1236 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), |
1237 | "m" (yalpha1), "m" (uvalpha1) | |
1238 | : "%"REG_a | |
1239 | ); | |
1240 | break; | |
1241 | case PIX_FMT_BGR24: | |
1242 | asm volatile( | |
d604bab9 MN |
1243 | |
1244 | FULL_YSCALEYUV2RGB | |
1245 | ||
2da0d70d DB |
1246 | // lsb ... msb |
1247 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1248 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
d604bab9 | 1249 | |
2da0d70d DB |
1250 | "movq %%mm3, %%mm1 \n\t" |
1251 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1252 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
d604bab9 | 1253 | |
2da0d70d DB |
1254 | "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 |
1255 | "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
1256 | "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000 | |
1257 | "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00 | |
1258 | "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
1259 | "movq %%mm1, %%mm2 \n\t" | |
1260 | "psllq $48, %%mm1 \n\t" // 000000BG | |
1261 | "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
d604bab9 | 1262 | |
2da0d70d DB |
1263 | "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 |
1264 | "psrld $16, %%mm2 \n\t" // R000R000 | |
1265 | "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1266 | "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
d604bab9 | 1267 | |
2da0d70d DB |
1268 | "mov %4, %%"REG_b" \n\t" |
1269 | "add %%"REG_a", %%"REG_b" \n\t" | |
d604bab9 MN |
1270 | |
1271 | #ifdef HAVE_MMX2 | |
2da0d70d DB |
1272 | //FIXME Alignment |
1273 | "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | |
1274 | "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
d604bab9 | 1275 | #else |
2da0d70d DB |
1276 | "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" |
1277 | "psrlq $32, %%mm3 \n\t" | |
1278 | "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | |
1279 | "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
1280 | #endif | |
1281 | "add $4, %%"REG_a" \n\t" | |
1282 | "cmp %5, %%"REG_a" \n\t" | |
1283 | " jb 1b \n\t" | |
1284 | ||
1285 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | |
1286 | "m" (yalpha1), "m" (uvalpha1) | |
1287 | : "%"REG_a, "%"REG_b | |
1288 | ); | |
1289 | break; | |
1290 | case PIX_FMT_BGR555: | |
1291 | asm volatile( | |
d604bab9 MN |
1292 | |
1293 | FULL_YSCALEYUV2RGB | |
1294 | #ifdef DITHER1XBPP | |
2da0d70d DB |
1295 | "paddusb "MANGLE(g5Dither)", %%mm1 \n\t" |
1296 | "paddusb "MANGLE(r5Dither)", %%mm0 \n\t" | |
1297 | "paddusb "MANGLE(b5Dither)", %%mm3 \n\t" | |
d604bab9 | 1298 | #endif |
2da0d70d DB |
1299 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
1300 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1301 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
d604bab9 | 1302 | |
2da0d70d DB |
1303 | "psrlw $3, %%mm3 \n\t" |
1304 | "psllw $2, %%mm1 \n\t" | |
1305 | "psllw $7, %%mm0 \n\t" | |
1306 | "pand "MANGLE(g15Mask)", %%mm1 \n\t" | |
1307 | "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
d604bab9 | 1308 | |
2da0d70d DB |
1309 | "por %%mm3, %%mm1 \n\t" |
1310 | "por %%mm1, %%mm0 \n\t" | |
d604bab9 | 1311 | |
2da0d70d | 1312 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) |
d604bab9 | 1313 | |
2da0d70d DB |
1314 | "add $4, %%"REG_a" \n\t" |
1315 | "cmp %5, %%"REG_a" \n\t" | |
1316 | " jb 1b \n\t" | |
d604bab9 | 1317 | |
2da0d70d DB |
1318 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
1319 | "m" (yalpha1), "m" (uvalpha1) | |
1320 | : "%"REG_a | |
1321 | ); | |
1322 | break; | |
1323 | case PIX_FMT_BGR565: | |
1324 | asm volatile( | |
d604bab9 MN |
1325 | |
1326 | FULL_YSCALEYUV2RGB | |
1327 | #ifdef DITHER1XBPP | |
2da0d70d DB |
1328 | "paddusb "MANGLE(g6Dither)", %%mm1 \n\t" |
1329 | "paddusb "MANGLE(r5Dither)", %%mm0 \n\t" | |
1330 | "paddusb "MANGLE(b5Dither)", %%mm3 \n\t" | |
d604bab9 | 1331 | #endif |
2da0d70d DB |
1332 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
1333 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1334 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
d604bab9 | 1335 | |
2da0d70d DB |
1336 | "psrlw $3, %%mm3 \n\t" |
1337 | "psllw $3, %%mm1 \n\t" | |
1338 | "psllw $8, %%mm0 \n\t" | |
1339 | "pand "MANGLE(g16Mask)", %%mm1 \n\t" | |
1340 | "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
d604bab9 | 1341 | |
2da0d70d DB |
1342 | "por %%mm3, %%mm1 \n\t" |
1343 | "por %%mm1, %%mm0 \n\t" | |
d604bab9 | 1344 | |
2da0d70d | 1345 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) |
d604bab9 | 1346 | |
2da0d70d DB |
1347 | "add $4, %%"REG_a" \n\t" |
1348 | "cmp %5, %%"REG_a" \n\t" | |
1349 | " jb 1b \n\t" | |
d604bab9 | 1350 | |
2da0d70d DB |
1351 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
1352 | "m" (yalpha1), "m" (uvalpha1) | |
1353 | : "%"REG_a | |
1354 | ); | |
1355 | break; | |
bc279024 | 1356 | #endif /* HAVE_MMX */ |
2da0d70d | 1357 | case PIX_FMT_BGR32: |
cf7d1c1a | 1358 | #ifndef HAVE_MMX |
2da0d70d | 1359 | case PIX_FMT_RGB32: |
cf7d1c1a | 1360 | #endif |
2da0d70d DB |
1361 | if (dstFormat==PIX_FMT_RGB32) |
1362 | { | |
1363 | int i; | |
df3c183a | 1364 | #ifdef WORDS_BIGENDIAN |
2da0d70d DB |
1365 | dest++; |
1366 | #endif | |
1367 | for (i=0;i<dstW;i++){ | |
1368 | // vertical linear interpolation && yuv2rgb in a single step: | |
1369 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1370 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
8b2fce0d | 1371 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); |
2da0d70d DB |
1372 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1373 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1374 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1375 | dest+= 4; | |
1376 | } | |
1377 | } | |
1378 | else if (dstFormat==PIX_FMT_BGR24) | |
1379 | { | |
1380 | int i; | |
1381 | for (i=0;i<dstW;i++){ | |
1382 | // vertical linear interpolation && yuv2rgb in a single step: | |
1383 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1384 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
8b2fce0d | 1385 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); |
2da0d70d DB |
1386 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1387 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1388 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1389 | dest+= 3; | |
1390 | } | |
1391 | } | |
1392 | else if (dstFormat==PIX_FMT_BGR565) | |
1393 | { | |
1394 | int i; | |
1395 | for (i=0;i<dstW;i++){ | |
1396 | // vertical linear interpolation && yuv2rgb in a single step: | |
1397 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1398 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
8b2fce0d | 1399 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); |
2da0d70d DB |
1400 | |
1401 | ((uint16_t*)dest)[i] = | |
1402 | clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | |
1403 | clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1404 | clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
1405 | } | |
1406 | } | |
1407 | else if (dstFormat==PIX_FMT_BGR555) | |
1408 | { | |
1409 | int i; | |
1410 | for (i=0;i<dstW;i++){ | |
1411 | // vertical linear interpolation && yuv2rgb in a single step: | |
1412 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1413 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
8b2fce0d | 1414 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); |
2da0d70d DB |
1415 | |
1416 | ((uint16_t*)dest)[i] = | |
1417 | clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | |
1418 | clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1419 | clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1420 | } | |
1421 | } | |
1422 | }//FULL_UV_IPOL | |
1423 | else | |
1424 | { | |
cf7d1c1a | 1425 | #endif // if 0 |
d604bab9 | 1426 | #ifdef HAVE_MMX |
2da0d70d DB |
1427 | switch(c->dstFormat) |
1428 | { | |
1429 | //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | |
1430 | case PIX_FMT_RGB32: | |
1431 | asm volatile( | |
1432 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1433 | "mov %4, %%"REG_b" \n\t" | |
1434 | "push %%"REG_BP" \n\t" | |
1435 | YSCALEYUV2RGB(%%REGBP, %5) | |
1436 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1437 | "pop %%"REG_BP" \n\t" | |
1438 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1439 | ||
1440 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1441 | "a" (&c->redDither) | |
1442 | ); | |
1443 | return; | |
1444 | case PIX_FMT_BGR24: | |
1445 | asm volatile( | |
1446 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1447 | "mov %4, %%"REG_b" \n\t" | |
1448 | "push %%"REG_BP" \n\t" | |
1449 | YSCALEYUV2RGB(%%REGBP, %5) | |
1450 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1451 | "pop %%"REG_BP" \n\t" | |
1452 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1453 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1454 | "a" (&c->redDither) | |
1455 | ); | |
1456 | return; | |
27a90b04 | 1457 | case PIX_FMT_RGB555: |
2da0d70d DB |
1458 | asm volatile( |
1459 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1460 | "mov %4, %%"REG_b" \n\t" | |
1461 | "push %%"REG_BP" \n\t" | |
1462 | YSCALEYUV2RGB(%%REGBP, %5) | |
1463 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
d604bab9 | 1464 | #ifdef DITHER1XBPP |
2da0d70d DB |
1465 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1466 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | |
1467 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1468 | #endif | |
1469 | ||
27a90b04 | 1470 | WRITERGB15(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1471 | "pop %%"REG_BP" \n\t" |
1472 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1473 | ||
1474 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1475 | "a" (&c->redDither) | |
1476 | ); | |
1477 | return; | |
27a90b04 | 1478 | case PIX_FMT_RGB565: |
2da0d70d DB |
1479 | asm volatile( |
1480 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1481 | "mov %4, %%"REG_b" \n\t" | |
1482 | "push %%"REG_BP" \n\t" | |
1483 | YSCALEYUV2RGB(%%REGBP, %5) | |
1484 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
d604bab9 | 1485 | #ifdef DITHER1XBPP |
2da0d70d DB |
1486 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1487 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | |
1488 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1489 | #endif | |
1490 | ||
27a90b04 | 1491 | WRITERGB16(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1492 | "pop %%"REG_BP" \n\t" |
1493 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1494 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1495 | "a" (&c->redDither) | |
1496 | ); | |
1497 | return; | |
1498 | case PIX_FMT_YUYV422: | |
1499 | asm volatile( | |
1500 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1501 | "mov %4, %%"REG_b" \n\t" | |
1502 | "push %%"REG_BP" \n\t" | |
1503 | YSCALEYUV2PACKED(%%REGBP, %5) | |
1504 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1505 | "pop %%"REG_BP" \n\t" | |
1506 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1507 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1508 | "a" (&c->redDither) | |
1509 | ); | |
1510 | return; | |
1511 | default: break; | |
1512 | } | |
cf7d1c1a | 1513 | #endif //HAVE_MMX |
b0880d5d | 1514 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C) |
d604bab9 MN |
1515 | } |
1516 | ||
1517 | /** | |
1518 | * YV12 to RGB without scaling or interpolating | |
1519 | */ | |
25593e29 | 1520 | static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
2da0d70d | 1521 | uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) |
d604bab9 | 1522 | { |
2da0d70d DB |
1523 | const int yalpha1=0; |
1524 | int i; | |
6a4970ab | 1525 | |
8a322796 | 1526 | uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
2da0d70d | 1527 | const int yalpha= 4096; //FIXME ... |
96034638 | 1528 | |
2da0d70d DB |
1529 | if (flags&SWS_FULL_CHR_H_INT) |
1530 | { | |
1531 | RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1532 | return; | |
1533 | } | |
397c035e MN |
1534 | |
1535 | #ifdef HAVE_MMX | |
e5091488 | 1536 | if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
2da0d70d DB |
1537 | { |
1538 | switch(dstFormat) | |
1539 | { | |
1540 | case PIX_FMT_RGB32: | |
1541 | asm volatile( | |
1542 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1543 | "mov %4, %%"REG_b" \n\t" | |
1544 | "push %%"REG_BP" \n\t" | |
1545 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1546 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1547 | "pop %%"REG_BP" \n\t" | |
1548 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1549 | ||
1550 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1551 | "a" (&c->redDither) | |
1552 | ); | |
1553 | return; | |
1554 | case PIX_FMT_BGR24: | |
1555 | asm volatile( | |
1556 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1557 | "mov %4, %%"REG_b" \n\t" | |
1558 | "push %%"REG_BP" \n\t" | |
1559 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1560 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1561 | "pop %%"REG_BP" \n\t" | |
1562 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1563 | ||
1564 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1565 | "a" (&c->redDither) | |
1566 | ); | |
1567 | return; | |
27a90b04 | 1568 | case PIX_FMT_RGB555: |
2da0d70d DB |
1569 | asm volatile( |
1570 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1571 | "mov %4, %%"REG_b" \n\t" | |
1572 | "push %%"REG_BP" \n\t" | |
1573 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1574 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
d604bab9 | 1575 | #ifdef DITHER1XBPP |
2da0d70d DB |
1576 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1577 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | |
1578 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1579 | #endif | |
27a90b04 | 1580 | WRITERGB15(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1581 | "pop %%"REG_BP" \n\t" |
1582 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1583 | ||
1584 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1585 | "a" (&c->redDither) | |
1586 | ); | |
1587 | return; | |
27a90b04 | 1588 | case PIX_FMT_RGB565: |
2da0d70d DB |
1589 | asm volatile( |
1590 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1591 | "mov %4, %%"REG_b" \n\t" | |
1592 | "push %%"REG_BP" \n\t" | |
1593 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1594 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
d604bab9 | 1595 | #ifdef DITHER1XBPP |
2da0d70d DB |
1596 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1597 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | |
1598 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1599 | #endif | |
1600 | ||
27a90b04 | 1601 | WRITERGB16(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1602 | "pop %%"REG_BP" \n\t" |
1603 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1604 | ||
1605 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1606 | "a" (&c->redDither) | |
1607 | ); | |
1608 | return; | |
1609 | case PIX_FMT_YUYV422: | |
1610 | asm volatile( | |
1611 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1612 | "mov %4, %%"REG_b" \n\t" | |
1613 | "push %%"REG_BP" \n\t" | |
1614 | YSCALEYUV2PACKED1(%%REGBP, %5) | |
1615 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1616 | "pop %%"REG_BP" \n\t" | |
1617 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1618 | ||
1619 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1620 | "a" (&c->redDither) | |
1621 | ); | |
1622 | return; | |
1623 | } | |
1624 | } | |
1625 | else | |
1626 | { | |
1627 | switch(dstFormat) | |
1628 | { | |
1629 | case PIX_FMT_RGB32: | |
1630 | asm volatile( | |
1631 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1632 | "mov %4, %%"REG_b" \n\t" | |
1633 | "push %%"REG_BP" \n\t" | |
1634 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1635 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1636 | "pop %%"REG_BP" \n\t" | |
1637 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1638 | ||
1639 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1640 | "a" (&c->redDither) | |
1641 | ); | |
1642 | return; | |
1643 | case PIX_FMT_BGR24: | |
1644 | asm volatile( | |
1645 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1646 | "mov %4, %%"REG_b" \n\t" | |
1647 | "push %%"REG_BP" \n\t" | |
1648 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1649 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1650 | "pop %%"REG_BP" \n\t" | |
1651 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1652 | ||
1653 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1654 | "a" (&c->redDither) | |
1655 | ); | |
1656 | return; | |
27a90b04 | 1657 | case PIX_FMT_RGB555: |
2da0d70d DB |
1658 | asm volatile( |
1659 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1660 | "mov %4, %%"REG_b" \n\t" | |
1661 | "push %%"REG_BP" \n\t" | |
1662 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1663 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
497d4f99 | 1664 | #ifdef DITHER1XBPP |
2da0d70d DB |
1665 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1666 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | |
1667 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1668 | #endif | |
27a90b04 | 1669 | WRITERGB15(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1670 | "pop %%"REG_BP" \n\t" |
1671 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1672 | ||
1673 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1674 | "a" (&c->redDither) | |
1675 | ); | |
1676 | return; | |
27a90b04 | 1677 | case PIX_FMT_RGB565: |
2da0d70d DB |
1678 | asm volatile( |
1679 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1680 | "mov %4, %%"REG_b" \n\t" | |
1681 | "push %%"REG_BP" \n\t" | |
1682 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1683 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
497d4f99 | 1684 | #ifdef DITHER1XBPP |
2da0d70d DB |
1685 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" |
1686 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | |
1687 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | |
1688 | #endif | |
1689 | ||
27a90b04 | 1690 | WRITERGB16(%%REGb, 8280(%5), %%REGBP) |
2da0d70d DB |
1691 | "pop %%"REG_BP" \n\t" |
1692 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1693 | ||
1694 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1695 | "a" (&c->redDither) | |
1696 | ); | |
1697 | return; | |
1698 | case PIX_FMT_YUYV422: | |
1699 | asm volatile( | |
1700 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1701 | "mov %4, %%"REG_b" \n\t" | |
1702 | "push %%"REG_BP" \n\t" | |
1703 | YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1704 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1705 | "pop %%"REG_BP" \n\t" | |
1706 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1707 | ||
1708 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1709 | "a" (&c->redDither) | |
1710 | ); | |
1711 | return; | |
1712 | } | |
1713 | } | |
bc279024 | 1714 | #endif /* HAVE_MMX */ |
e5091488 | 1715 | if (uvalpha < 2048) |
2da0d70d | 1716 | { |
b0880d5d | 1717 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C) |
2da0d70d | 1718 | }else{ |
b0880d5d | 1719 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C) |
2da0d70d | 1720 | } |
d604bab9 MN |
1721 | } |
1722 | ||
8a322796 | 1723 | //FIXME yuy2* can read up to 7 samples too many |
6ff0ad6b | 1724 | |
7f526efd | 1725 | static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) |
1e621b18 | 1726 | { |
6ff0ad6b | 1727 | #ifdef HAVE_MMX |
2da0d70d DB |
1728 | asm volatile( |
1729 | "movq "MANGLE(bm01010101)", %%mm2 \n\t" | |
1730 | "mov %0, %%"REG_a" \n\t" | |
1731 | "1: \n\t" | |
1732 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1733 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1734 | "pand %%mm2, %%mm0 \n\t" | |
1735 | "pand %%mm2, %%mm1 \n\t" | |
1736 | "packuswb %%mm1, %%mm0 \n\t" | |
1737 | "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1738 | "add $8, %%"REG_a" \n\t" | |
1739 | " js 1b \n\t" | |
1740 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1741 | : "%"REG_a | |
1742 | ); | |
1e621b18 | 1743 | #else |
2da0d70d DB |
1744 | int i; |
1745 | for (i=0; i<width; i++) | |
1746 | dst[i]= src[2*i]; | |
1e621b18 MN |
1747 | #endif |
1748 | } | |
1749 | ||
7f526efd | 1750 | static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
1e621b18 | 1751 | { |
c2271987 | 1752 | #ifdef HAVE_MMX |
2da0d70d DB |
1753 | asm volatile( |
1754 | "movq "MANGLE(bm01010101)", %%mm4 \n\t" | |
1755 | "mov %0, %%"REG_a" \n\t" | |
1756 | "1: \n\t" | |
1757 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1758 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1759 | "psrlw $8, %%mm0 \n\t" | |
1760 | "psrlw $8, %%mm1 \n\t" | |
1761 | "packuswb %%mm1, %%mm0 \n\t" | |
1762 | "movq %%mm0, %%mm1 \n\t" | |
1763 | "psrlw $8, %%mm0 \n\t" | |
1764 | "pand %%mm4, %%mm1 \n\t" | |
1765 | "packuswb %%mm0, %%mm0 \n\t" | |
1766 | "packuswb %%mm1, %%mm1 \n\t" | |
1767 | "movd %%mm0, (%3, %%"REG_a") \n\t" | |
1768 | "movd %%mm1, (%2, %%"REG_a") \n\t" | |
1769 | "add $4, %%"REG_a" \n\t" | |
1770 | " js 1b \n\t" | |
1771 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) | |
1772 | : "%"REG_a | |
1773 | ); | |
1e621b18 | 1774 | #else |
2da0d70d DB |
1775 | int i; |
1776 | for (i=0; i<width; i++) | |
1777 | { | |
1778 | dstU[i]= src1[4*i + 1]; | |
1779 | dstV[i]= src1[4*i + 3]; | |
1780 | } | |
1781 | #endif | |
1782 | assert(src1 == src2); | |
1e621b18 MN |
1783 | } |
1784 | ||
4cf16bbe DB |
1785 | /* This is almost identical to the previous, end exists only because |
1786 | * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ | |
7f526efd | 1787 | static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) |
7322a67c MN |
1788 | { |
1789 | #ifdef HAVE_MMX | |
2da0d70d DB |
1790 | asm volatile( |
1791 | "mov %0, %%"REG_a" \n\t" | |
1792 | "1: \n\t" | |
1793 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1794 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1795 | "psrlw $8, %%mm0 \n\t" | |
1796 | "psrlw $8, %%mm1 \n\t" | |
1797 | "packuswb %%mm1, %%mm0 \n\t" | |
1798 | "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1799 | "add $8, %%"REG_a" \n\t" | |
1800 | " js 1b \n\t" | |
1801 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1802 | : "%"REG_a | |
1803 | ); | |
7322a67c | 1804 | #else |
2da0d70d DB |
1805 | int i; |
1806 | for (i=0; i<width; i++) | |
1807 | dst[i]= src[2*i+1]; | |
7322a67c MN |
1808 | #endif |
1809 | } | |
1810 | ||
7f526efd | 1811 | static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
7322a67c | 1812 | { |
c2271987 | 1813 | #ifdef HAVE_MMX |
2da0d70d DB |
1814 | asm volatile( |
1815 | "movq "MANGLE(bm01010101)", %%mm4 \n\t" | |
1816 | "mov %0, %%"REG_a" \n\t" | |
1817 | "1: \n\t" | |
1818 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1819 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1820 | "pand %%mm4, %%mm0 \n\t" | |
1821 | "pand %%mm4, %%mm1 \n\t" | |
1822 | "packuswb %%mm1, %%mm0 \n\t" | |
1823 | "movq %%mm0, %%mm1 \n\t" | |
1824 | "psrlw $8, %%mm0 \n\t" | |
1825 | "pand %%mm4, %%mm1 \n\t" | |
1826 | "packuswb %%mm0, %%mm0 \n\t" | |
1827 | "packuswb %%mm1, %%mm1 \n\t" | |
1828 | "movd %%mm0, (%3, %%"REG_a") \n\t" | |
1829 | "movd %%mm1, (%2, %%"REG_a") \n\t" | |
1830 | "add $4, %%"REG_a" \n\t" | |
1831 | " js 1b \n\t" | |
1832 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) | |
1833 | : "%"REG_a | |
1834 | ); | |
7322a67c | 1835 | #else |
2da0d70d DB |
1836 | int i; |
1837 | for (i=0; i<width; i++) | |
1838 | { | |
1839 | dstU[i]= src1[4*i + 0]; | |
1840 | dstV[i]= src1[4*i + 2]; | |
1841 | } | |
1842 | #endif | |
1843 | assert(src1 == src2); | |
7322a67c MN |
1844 | } |
1845 | ||
97b93389 | 1846 | static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, long width) |
1e621b18 | 1847 | { |
2da0d70d DB |
1848 | int i; |
1849 | for (i=0; i<width; i++) | |
1850 | { | |
1851 | int b= ((uint32_t*)src)[i]&0xFF; | |
1852 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
1853 | int r= (((uint32_t*)src)[i]>>16)&0xFF; | |
1854 | ||
e5091488 | 1855 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); |
2da0d70d | 1856 | } |
1e621b18 MN |
1857 | } |
1858 | ||
97b93389 | 1859 | static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
1e621b18 | 1860 | { |
2da0d70d DB |
1861 | int i; |
1862 | assert(src1 == src2); | |
1863 | for (i=0; i<width; i++) | |
1864 | { | |
1865 | const int a= ((uint32_t*)src1)[2*i+0]; | |
1866 | const int e= ((uint32_t*)src1)[2*i+1]; | |
1867 | const int l= (a&0xFF00FF) + (e&0xFF00FF); | |
1868 | const int h= (a&0x00FF00) + (e&0x00FF00); | |
1869 | const int b= l&0x3FF; | |
1870 | const int g= h>>8; | |
1871 | const int r= l>>16; | |
1872 | ||
35ab2b64 MN |
1873 | dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); |
1874 | dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
2da0d70d | 1875 | } |
1e621b18 MN |
1876 | } |
1877 | ||
7f526efd | 1878 | static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) |
1e621b18 | 1879 | { |
ac6a2e45 | 1880 | #ifdef HAVE_MMX |
2da0d70d DB |
1881 | asm volatile( |
1882 | "mov %2, %%"REG_a" \n\t" | |
5802683a RD |
1883 | "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" |
1884 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | |
2da0d70d DB |
1885 | "pxor %%mm7, %%mm7 \n\t" |
1886 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | |
1887 | ASMALIGN(4) | |
1888 | "1: \n\t" | |
1889 | PREFETCH" 64(%0, %%"REG_d") \n\t" | |
1890 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | |
1891 | "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | |
1892 | "punpcklbw %%mm7, %%mm0 \n\t" | |
1893 | "punpcklbw %%mm7, %%mm1 \n\t" | |
1894 | "movd 6(%0, %%"REG_d"), %%mm2 \n\t" | |
1895 | "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | |
1896 | "punpcklbw %%mm7, %%mm2 \n\t" | |
1897 | "punpcklbw %%mm7, %%mm3 \n\t" | |
1898 | "pmaddwd %%mm6, %%mm0 \n\t" | |
1899 | "pmaddwd %%mm6, %%mm1 \n\t" | |
1900 | "pmaddwd %%mm6, %%mm2 \n\t" | |
1901 | "pmaddwd %%mm6, %%mm3 \n\t" | |
ac6a2e45 | 1902 | #ifndef FAST_BGR2YV12 |
2da0d70d DB |
1903 | "psrad $8, %%mm0 \n\t" |
1904 | "psrad $8, %%mm1 \n\t" | |
1905 | "psrad $8, %%mm2 \n\t" | |
1906 | "psrad $8, %%mm3 \n\t" | |
1907 | #endif | |
1908 | "packssdw %%mm1, %%mm0 \n\t" | |
1909 | "packssdw %%mm3, %%mm2 \n\t" | |
1910 | "pmaddwd %%mm5, %%mm0 \n\t" | |
1911 | "pmaddwd %%mm5, %%mm2 \n\t" | |
1912 | "packssdw %%mm2, %%mm0 \n\t" | |
1913 | "psraw $7, %%mm0 \n\t" | |
1914 | ||
1915 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | |
1916 | "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | |
1917 | "punpcklbw %%mm7, %%mm4 \n\t" | |
1918 | "punpcklbw %%mm7, %%mm1 \n\t" | |
1919 | "movd 18(%0, %%"REG_d"), %%mm2 \n\t" | |
1920 | "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | |
1921 | "punpcklbw %%mm7, %%mm2 \n\t" | |
1922 | "punpcklbw %%mm7, %%mm3 \n\t" | |
1923 | "pmaddwd %%mm6, %%mm4 \n\t" | |
1924 | "pmaddwd %%mm6, %%mm1 \n\t" | |
1925 | "pmaddwd %%mm6, %%mm2 \n\t" | |
1926 | "pmaddwd %%mm6, %%mm3 \n\t" | |
ac6a2e45 | 1927 | #ifndef FAST_BGR2YV12 |
2da0d70d DB |
1928 | "psrad $8, %%mm4 \n\t" |
1929 | "psrad $8, %%mm1 \n\t" | |
1930 | "psrad $8, %%mm2 \n\t" | |
1931 | "psrad $8, %%mm3 \n\t" | |
1932 | #endif | |
1933 | "packssdw %%mm1, %%mm4 \n\t" | |
1934 | "packssdw %%mm3, %%mm2 \n\t" | |
1935 | "pmaddwd %%mm5, %%mm4 \n\t" | |
1936 | "pmaddwd %%mm5, %%mm2 \n\t" | |
1937 | "add $24, %%"REG_d" \n\t" | |
1938 | "packssdw %%mm2, %%mm4 \n\t" | |
1939 | "psraw $7, %%mm4 \n\t" | |
1940 | ||
1941 | "packuswb %%mm4, %%mm0 \n\t" | |
5802683a | 1942 | "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" |
2da0d70d DB |
1943 | |
1944 | "movq %%mm0, (%1, %%"REG_a") \n\t" | |
1945 | "add $8, %%"REG_a" \n\t" | |
1946 | " js 1b \n\t" | |
1947 | : : "r" (src+width*3), "r" (dst+width), "g" (-width) | |
1948 | : "%"REG_a, "%"REG_d | |
1949 | ); | |
1e621b18 | 1950 | #else |
2da0d70d DB |
1951 | int i; |
1952 | for (i=0; i<width; i++) | |
1953 | { | |
1954 | int b= src[i*3+0]; | |
1955 | int g= src[i*3+1]; | |
1956 | int r= src[i*3+2]; | |
1e621b18 | 1957 | |
e5091488 | 1958 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); |
2da0d70d | 1959 | } |
bc279024 | 1960 | #endif /* HAVE_MMX */ |
1e621b18 MN |
1961 | } |
1962 | ||
7f526efd | 1963 | static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
1e621b18 | 1964 | { |
4342fc14 | 1965 | #ifdef HAVE_MMX |
2da0d70d DB |
1966 | asm volatile( |
1967 | "mov %3, %%"REG_a" \n\t" | |
5802683a RD |
1968 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" |
1969 | "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" | |
2da0d70d DB |
1970 | "pxor %%mm7, %%mm7 \n\t" |
1971 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | |
1972 | "add %%"REG_d", %%"REG_d" \n\t" | |
1973 | ASMALIGN(4) | |
1974 | "1: \n\t" | |
1975 | PREFETCH" 64(%0, %%"REG_d") \n\t" | |
4342fc14 | 1976 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2da0d70d DB |
1977 | "movq (%0, %%"REG_d"), %%mm0 \n\t" |
1978 | "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | |
1979 | "movq %%mm0, %%mm1 \n\t" | |
1980 | "movq %%mm2, %%mm3 \n\t" | |
1981 | "psrlq $24, %%mm0 \n\t" | |
1982 | "psrlq $24, %%mm2 \n\t" | |
1983 | PAVGB(%%mm1, %%mm0) | |
1984 | PAVGB(%%mm3, %%mm2) | |
1985 | "punpcklbw %%mm7, %%mm0 \n\t" | |
1986 | "punpcklbw %%mm7, %%mm2 \n\t" | |
4342fc14 | 1987 | #else |
2da0d70d DB |
1988 | "movd (%0, %%"REG_d"), %%mm0 \n\t" |
1989 | "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | |
1990 | "punpcklbw %%mm7, %%mm0 \n\t" | |
1991 | "punpcklbw %%mm7, %%mm2 \n\t" | |
1992 | "paddw %%mm2, %%mm0 \n\t" | |
1993 | "movd 6(%0, %%"REG_d"), %%mm4 \n\t" | |
1994 | "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | |
1995 | "punpcklbw %%mm7, %%mm4 \n\t" | |
1996 | "punpcklbw %%mm7, %%mm2 \n\t" | |
1997 | "paddw %%mm4, %%mm2 \n\t" | |
1998 | "psrlw $1, %%mm0 \n\t" | |
1999 | "psrlw $1, %%mm2 \n\t" | |
2000 | #endif | |
5802683a RD |
2001 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" |
2002 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | |
2da0d70d DB |
2003 | |
2004 | "pmaddwd %%mm0, %%mm1 \n\t" | |
2005 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2006 | "pmaddwd %%mm6, %%mm0 \n\t" | |
2007 | "pmaddwd %%mm6, %%mm2 \n\t" | |
4342fc14 | 2008 | #ifndef FAST_BGR2YV12 |
2da0d70d DB |
2009 | "psrad $8, %%mm0 \n\t" |
2010 | "psrad $8, %%mm1 \n\t" | |
2011 | "psrad $8, %%mm2 \n\t" | |
2012 | "psrad $8, %%mm3 \n\t" | |
2013 | #endif | |
2014 | "packssdw %%mm2, %%mm0 \n\t" | |
2015 | "packssdw %%mm3, %%mm1 \n\t" | |
2016 | "pmaddwd %%mm5, %%mm0 \n\t" | |
2017 | "pmaddwd %%mm5, %%mm1 \n\t" | |
2018 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2019 | "psraw $7, %%mm0 \n\t" | |
4342fc14 MN |
2020 | |
2021 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
2da0d70d DB |
2022 | "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
2023 | "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | |
2024 | "movq %%mm4, %%mm1 \n\t" | |
2025 | "movq %%mm2, %%mm3 \n\t" | |
2026 | "psrlq $24, %%mm4 \n\t" | |
2027 | "psrlq $24, %%mm2 \n\t" | |
2028 | PAVGB(%%mm1, %%mm4) | |
2029 | PAVGB(%%mm3, %%mm2) | |
2030 | "punpcklbw %%mm7, %%mm4 \n\t" | |
2031 | "punpcklbw %%mm7, %%mm2 \n\t" | |
4342fc14 | 2032 | #else |
2da0d70d DB |
2033 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2034 | "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | |
2035 | "punpcklbw %%mm7, %%mm4 \n\t" | |
2036 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2037 | "paddw %%mm2, %%mm4 \n\t" | |
2038 | "movd 18(%0, %%"REG_d"), %%mm5 \n\t" | |
2039 | "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | |
2040 | "punpcklbw %%mm7, %%mm5 \n\t" | |
2041 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2042 | "paddw %%mm5, %%mm2 \n\t" | |
5802683a | 2043 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" |
2da0d70d DB |
2044 | "psrlw $2, %%mm4 \n\t" |
2045 | "psrlw $2, %%mm2 \n\t" | |
2046 | #endif | |
5802683a RD |
2047 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" |
2048 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | |
2da0d70d DB |
2049 | |
2050 | "pmaddwd %%mm4, %%mm1 \n\t" | |
2051 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2052 | "pmaddwd %%mm6, %%mm4 \n\t" | |
2053 | "pmaddwd %%mm6, %%mm2 \n\t" | |
4342fc14 | 2054 | #ifndef FAST_BGR2YV12 |
2da0d70d DB |
2055 | "psrad $8, %%mm4 \n\t" |
2056 | "psrad $8, %%mm1 \n\t" | |
2057 | "psrad $8, %%mm2 \n\t" | |
2058 | "psrad $8, %%mm3 \n\t" | |
2059 | #endif | |
2060 | "packssdw %%mm2, %%mm4 \n\t" | |
2061 | "packssdw %%mm3, %%mm1 \n\t" | |
2062 | "pmaddwd %%mm5, %%mm4 \n\t" | |
2063 | "pmaddwd %%mm5, %%mm1 \n\t" | |
2064 | "add $24, %%"REG_d" \n\t" | |
2065 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
2066 | "psraw $7, %%mm4 \n\t" | |
2067 | ||
2068 | "movq %%mm0, %%mm1 \n\t" | |
2069 | "punpckldq %%mm4, %%mm0 \n\t" | |
2070 | "punpckhdq %%mm4, %%mm1 \n\t" | |
2071 | "packsswb %%mm1, %%mm0 \n\t" | |
5802683a | 2072 | "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" |
2da0d70d DB |
2073 | |
2074 | "movd %%mm0, (%1, %%"REG_a") \n\t" | |
2075 | "punpckhdq %%mm0, %%mm0 \n\t" | |
2076 | "movd %%mm0, (%2, %%"REG_a") \n\t" | |
2077 | "add $4, %%"REG_a" \n\t" | |
2078 | " js 1b \n\t" | |
2079 | : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | |
2080 | : "%"REG_a, "%"REG_d | |
2081 | ); | |
1e621b18 | 2082 | #else |
2da0d70d DB |
2083 | int i; |
2084 | for (i=0; i<width; i++) | |
2085 | { | |
2086 | int b= src1[6*i + 0] + src1[6*i + 3]; | |
2087 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
2088 | int r= src1[6*i + 2] + src1[6*i + 5]; | |
2089 | ||
35ab2b64 MN |
2090 | dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); |
2091 | dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); | |
2da0d70d | 2092 | } |
bc279024 | 2093 | #endif /* HAVE_MMX */ |
2da0d70d | 2094 | assert(src1 == src2); |
1e621b18 MN |
2095 | } |
2096 | ||
97b93389 | 2097 | static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, long width) |
6af250ea | 2098 | { |
2da0d70d DB |
2099 | int i; |
2100 | for (i=0; i<width; i++) | |
2101 | { | |
2102 | int d= ((uint16_t*)src)[i]; | |
2103 | int b= d&0x1F; | |
2104 | int g= (d>>5)&0x3F; | |
2105 | int r= (d>>11)&0x1F; | |
2106 | ||
35ab2b64 | 2107 | dst[i]= (2*RY*r + GY*g + 2*BY*b + (33<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2); |
2da0d70d | 2108 | } |
6af250ea MN |
2109 | } |
2110 | ||
97b93389 | 2111 | static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
6af250ea | 2112 | { |
2da0d70d DB |
2113 | int i; |
2114 | assert(src1==src2); | |
2115 | for (i=0; i<width; i++) | |
2116 | { | |
2117 | int d0= ((uint32_t*)src1)[i]; | |
2118 | ||
2119 | int dl= (d0&0x07E0F81F); | |
2120 | int dh= ((d0>>5)&0x07C0F83F); | |
2121 | ||
2122 | int dh2= (dh>>11) + (dh<<21); | |
2123 | int d= dh2 + dl; | |
2124 | ||
2125 | int b= d&0x7F; | |
2126 | int r= (d>>11)&0x7F; | |
2127 | int g= d>>21; | |
35ab2b64 MN |
2128 | dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2); |
2129 | dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2); | |
2da0d70d | 2130 | } |
6af250ea MN |
2131 | } |
2132 | ||
97b93389 | 2133 | static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, long width) |
b72034dd | 2134 | { |
2da0d70d DB |
2135 | int i; |
2136 | for (i=0; i<width; i++) | |
2137 | { | |
2138 | int d= ((uint16_t*)src)[i]; | |
2139 | int b= d&0x1F; | |
2140 | int g= (d>>5)&0x1F; | |
2141 | int r= (d>>10)&0x1F; | |
2142 | ||
35ab2b64 | 2143 | dst[i]= (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3); |
2da0d70d | 2144 | } |
b72034dd MN |
2145 | } |
2146 | ||
97b93389 | 2147 | static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
b72034dd | 2148 | { |
2da0d70d DB |
2149 | int i; |
2150 | assert(src1==src2); | |
2151 | for (i=0; i<width; i++) | |
2152 | { | |
2153 | int d0= ((uint32_t*)src1)[i]; | |
2154 | ||
2155 | int dl= (d0&0x03E07C1F); | |
2156 | int dh= ((d0>>5)&0x03E0F81F); | |
2157 | ||
2158 | int dh2= (dh>>11) + (dh<<21); | |
2159 | int d= dh2 + dl; | |
2160 | ||
2161 | int b= d&0x7F; | |
2162 | int r= (d>>10)&0x7F; | |
2163 | int g= d>>21; | |
35ab2b64 MN |
2164 | dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3); |
2165 | dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3); | |
2da0d70d | 2166 | } |
b72034dd MN |
2167 | } |
2168 | ||
2169 | ||
97b93389 | 2170 | static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, long width) |
a861d4d7 | 2171 | { |
2da0d70d DB |
2172 | int i; |
2173 | for (i=0; i<width; i++) | |
2174 | { | |
2175 | int r= ((uint32_t*)src)[i]&0xFF; | |
2176 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
2177 | int b= (((uint32_t*)src)[i]>>16)&0xFF; | |
2178 | ||
e5091488 | 2179 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); |
2da0d70d | 2180 | } |
a861d4d7 MN |
2181 | } |
2182 | ||
97b93389 | 2183 | static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
a861d4d7 | 2184 | { |
2da0d70d DB |
2185 | int i; |
2186 | assert(src1==src2); | |
2187 | for (i=0; i<width; i++) | |
2188 | { | |
2189 | const int a= ((uint32_t*)src1)[2*i+0]; | |
2190 | const int e= ((uint32_t*)src1)[2*i+1]; | |
2191 | const int l= (a&0xFF00FF) + (e&0xFF00FF); | |
2192 | const int h= (a&0x00FF00) + (e&0x00FF00); | |
2193 | const int r= l&0x3FF; | |
2194 | const int g= h>>8; | |
2195 | const int b= l>>16; | |
2196 | ||
35ab2b64 MN |
2197 | dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); |
2198 | dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); | |
2da0d70d | 2199 | } |
a861d4d7 MN |
2200 | } |
2201 | ||
97b93389 | 2202 | static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width) |
a861d4d7 | 2203 | { |
2da0d70d DB |
2204 | int i; |
2205 | for (i=0; i<width; i++) | |
2206 | { | |
2207 | int r= src[i*3+0]; | |
2208 | int g= src[i*3+1]; | |
2209 | int b= src[i*3+2]; | |
2210 | ||
e5091488 | 2211 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); |
2da0d70d | 2212 | } |
a861d4d7 MN |
2213 | } |
2214 | ||
97b93389 | 2215 | static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
a861d4d7 | 2216 | { |
2da0d70d DB |
2217 | int i; |
2218 | assert(src1==src2); | |
2219 | for (i=0; i<width; i++) | |
2220 | { | |
2221 | int r= src1[6*i + 0] + src1[6*i + 3]; | |
2222 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
2223 | int b= src1[6*i + 2] + src1[6*i + 5]; | |
2224 | ||
35ab2b64 MN |
2225 | dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); |
2226 | dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT)))>>(RGB2YUV_SHIFT+1); | |
2da0d70d | 2227 | } |
a861d4d7 MN |
2228 | } |
2229 | ||
97b93389 | 2230 | static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, long width) |
a43fb6b3 | 2231 | { |
2da0d70d DB |
2232 | int i; |
2233 | for (i=0; i<width; i++) | |
2234 | { | |
2235 | int d= ((uint16_t*)src)[i]; | |
2236 | int r= d&0x1F; | |
2237 | int g= (d>>5)&0x3F; | |
2238 | int b= (d>>11)&0x1F; | |
2239 | ||
35ab2b64 | 2240 | dst[i]= (2*RY*r + GY*g + 2*BY*b + (33<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT-2); |
2da0d70d | 2241 | } |
a43fb6b3 LA |
2242 | } |
2243 | ||
97b93389 | 2244 | static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
a43fb6b3 | 2245 | { |
2da0d70d DB |
2246 | int i; |
2247 | assert(src1 == src2); | |
2248 | for (i=0; i<width; i++) | |
2249 | { | |
2250 | int d0= ((uint32_t*)src1)[i]; | |
2251 | ||
2252 | int dl= (d0&0x07E0F81F); | |
fa884294 | 2253 | int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F); |
2da0d70d | 2254 | |
fa884294 IP |
2255 | int r= d&0x3F; |
2256 | int b= (d>>11)&0x3F; | |
2da0d70d | 2257 | int g= d>>21; |
35ab2b64 MN |
2258 | dstU[i]= (2*RU*r + GU*g + 2*BU*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2); |
2259 | dstV[i]= (2*RV*r + GV*g + 2*BV*b + (257<<(RGB2YUV_SHIFT-2)))>>(RGB2YUV_SHIFT+1-2); | |
2da0d70d | 2260 | } |
a43fb6b3 LA |
2261 | } |
2262 | ||
97b93389 | 2263 | static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, long width) |
a43fb6b3 | 2264 | { |
2da0d70d DB |
2265 | int i; |
2266 | for (i=0; i<width; i++) | |
2267 | { | |
2268 | int d= ((uint16_t*)src)[i]; | |
2269 | int r= d&0x1F; | |
2270 | int g= (d>>5)&0x1F; | |
2271 | int b= (d>>10)&0x1F; | |
2272 | ||
35ab2b64 | 2273 | dst[i]= (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-4)))>>(RGB2YUV_SHIFT-3); |
2da0d70d | 2274 | } |
a43fb6b3 LA |
2275 | } |
2276 | ||
97b93389 | 2277 | static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
a43fb6b3 | 2278 | { |
2da0d70d DB |
2279 | int i; |
2280 | assert(src1 == src2); | |
2281 | for (i=0; i<width; i++) | |
2282 | { | |
2283 | int d0= ((uint32_t*)src1)[i]; | |
2284 | ||
2285 | int dl= (d0&0x03E07C1F); | |
f96829d2 | 2286 | int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F); |
2da0d70d | 2287 | |
f96829d2 IP |
2288 | int r= d&0x3F; |
2289 | int b= (d>>10)&0x3F; | |
2290 | int g= d>>21; | |
35ab2b64 MN |
2291 | dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3); |
2292 | dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-3)))>>(RGB2YUV_SHIFT+1-3); | |
2da0d70d | 2293 | } |
a43fb6b3 | 2294 | } |
1e621b18 | 2295 | |
97b93389 | 2296 | static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal) |
e28630fc | 2297 | { |
2da0d70d DB |
2298 | int i; |
2299 | for (i=0; i<width; i++) | |
2300 | { | |
2301 | int d= src[i]; | |
e28630fc | 2302 | |
2da0d70d DB |
2303 | dst[i]= pal[d] & 0xFF; |
2304 | } | |
e28630fc MN |
2305 | } |
2306 | ||
97b93389 | 2307 | static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal) |
e28630fc | 2308 | { |
2da0d70d DB |
2309 | int i; |
2310 | assert(src1 == src2); | |
2311 | for (i=0; i<width; i++) | |
2312 | { | |
2313 | int p= pal[src1[i]]; | |
2314 | ||
2315 | dstU[i]= p>>8; | |
2316 | dstV[i]= p>>16; | |
2317 | } | |
e28630fc MN |
2318 | } |
2319 | ||
8a322796 | 2320 | // bilinear / bicubic scaling |
/*
 * hScale: horizontal FIR filtering of one line of 8-bit samples into
 * 16-bit samples.
 *
 *   dst        - output, dstW int16_t samples
 *   dstW       - number of output samples
 *   src        - input line of 8-bit samples
 *   srcW, xInc - only forwarded to hScale_altivec_real(); the MMX and the
 *                plain C paths below never read them
 *   filter     - coefficients, filterSize per output sample (the C path
 *                indexes it as filter[filterSize*i + j])
 *   filterPos  - for each output sample, index of its first source sample
 *   filterSize - taps per output sample; the MMX path asserts that it is a
 *                positive multiple of 4
 *
 * The plain C path computes
 *     sum_j(src[filterPos[i] + j] * filter[filterSize*i + j]) >> 7
 * clipped to [0, 32767].
 * The MMX paths shift the accumulators right by 8, pack with signed
 * saturation and then pmaddwd with the constant w02, which is defined
 * outside this function.
 * NOTE(review): presumably w02 scales the result back by 2 so that it
 * matches the >>7 of the C path - confirm against the definition of w02.
 *
 * All three MMX variants produce two output samples per loop iteration and
 * store them with a single 4-byte movd.
 */
077ea8a7 | 2321 | static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, |
2da0d70d | 2322 | int16_t *filter, int16_t *filterPos, long filterSize) |
2ff198c1 | 2323 | { |
077ea8a7 | 2324 | #ifdef HAVE_MMX |
2da0d70d DB |
2325 | assert(filterSize % 4 == 0 && filterSize>0); |
2326 | if (filterSize==4) // Always true for upscaling, sometimes for down, too. | |
2327 | { | |
/* counter is a negative byte offset into dst that runs up to 0 in steps
 * of 4; the loop ends when the add carries (jnc falls through). The three
 * pointers are advanced to their end here so that indexing them with the
 * negative counter starts at element 0. */
2328 | long counter= -2*dstW; | |
2329 | filter-= counter*2; | |
2330 | filterPos-= counter/2; | |
2331 | dst-= counter/2; | |
/* In PIC builds the b register is saved and restored by hand with
 * push/pop instead of being listed as a clobber. The frame pointer is
 * borrowed as loop counter and restored before leaving the asm. */
2332 | asm volatile( | |
83c89c78 | 2333 | #if defined(PIC) |
2da0d70d DB |
2334 | "push %%"REG_b" \n\t" |
2335 | #endif | |
2336 | "pxor %%mm7, %%mm7 \n\t" | |
2337 | "movq "MANGLE(w02)", %%mm6 \n\t" | |
2338 | "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2339 | "mov %%"REG_a", %%"REG_BP" \n\t" | |
2340 | ASMALIGN(4) | |
2341 | "1: \n\t" | |
2342 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2343 | "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | |
2344 | "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" | |
2345 | "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" | |
2346 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2347 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2348 | "punpcklbw %%mm7, %%mm0 \n\t" | |
2349 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2350 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2351 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2352 | "psrad $8, %%mm0 \n\t" | |
2353 | "psrad $8, %%mm3 \n\t" | |
2354 | "packssdw %%mm3, %%mm0 \n\t" | |
2355 | "pmaddwd %%mm6, %%mm0 \n\t" | |
2356 | "packssdw %%mm0, %%mm0 \n\t" | |
2357 | "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2358 | "add $4, %%"REG_BP" \n\t" | |
2359 | " jnc 1b \n\t" | |
2360 | ||
2361 | "pop %%"REG_BP" \n\t" | |
83c89c78 | 2362 | #if defined(PIC) |
2da0d70d | 2363 | "pop %%"REG_b" \n\t" |
83c89c78 | 2364 | #endif |
2da0d70d DB |
2365 | : "+a" (counter) |
2366 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
83c89c78 | 2367 | #if !defined(PIC) |
2da0d70d DB |
2368 | : "%"REG_b |
2369 | #endif | |
2370 | ); | |
2371 | } | |
2372 | else if (filterSize==8) | |
2373 | { | |
/* Same scheme as the 4-tap case, but with 8 coefficients per output
 * sample: filter is advanced twice as far and indexed with scale 8, and
 * each output sample needs two 4-tap multiply-adds that are summed. */
2374 | long counter= -2*dstW; | |
2375 | filter-= counter*4; | |
2376 | filterPos-= counter/2; | |
2377 | dst-= counter/2; | |
2378 | asm volatile( | |
83c89c78 | 2379 | #if defined(PIC) |
2da0d70d DB |
2380 | "push %%"REG_b" \n\t" |
2381 | #endif | |
2382 | "pxor %%mm7, %%mm7 \n\t" | |
2383 | "movq "MANGLE(w02)", %%mm6 \n\t" | |
2384 | "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2385 | "mov %%"REG_a", %%"REG_BP" \n\t" | |
2386 | ASMALIGN(4) | |
2387 | "1: \n\t" | |
2388 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2389 | "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | |
2390 | "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" | |
2391 | "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" | |
2392 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2393 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2394 | "punpcklbw %%mm7, %%mm0 \n\t" | |
2395 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2396 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2397 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2398 | ||
2399 | "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" | |
2400 | "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" | |
2401 | "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2402 | "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
2403 | "punpcklbw %%mm7, %%mm4 \n\t" | |
2404 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2405 | "pmaddwd %%mm1, %%mm4 \n\t" | |
2406 | "pmaddwd %%mm2, %%mm5 \n\t" | |
2407 | "paddd %%mm4, %%mm0 \n\t" | |
2408 | "paddd %%mm5, %%mm3 \n\t" | |
2409 | ||
2410 | "psrad $8, %%mm0 \n\t" | |
2411 | "psrad $8, %%mm3 \n\t" | |
2412 | "packssdw %%mm3, %%mm0 \n\t" | |
2413 | "pmaddwd %%mm6, %%mm0 \n\t" | |
2414 | "packssdw %%mm0, %%mm0 \n\t" | |
2415 | "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2416 | "add $4, %%"REG_BP" \n\t" | |
2417 | " jnc 1b \n\t" | |
2418 | ||
2419 | "pop %%"REG_BP" \n\t" | |
83c89c78 | 2420 | #if defined(PIC) |
2da0d70d | 2421 | "pop %%"REG_b" \n\t" |
83c89c78 | 2422 | #endif |
2da0d70d DB |
2423 | : "+a" (counter) |
2424 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
83c89c78 | 2425 | #if !defined(PIC) |
2da0d70d DB |
2426 | : "%"REG_b |
2427 | #endif | |
2428 | ); | |
2429 | } | |
2430 | else | |
2431 | { | |
/* Generic filter size. The inner loop "2:" walks the taps four at a time
 * for two neighbouring output samples in parallel: (%1) holds the
 * coefficients of the first one and (%1, %6) those of the second one,
 * filterSize*2 bytes further. The source pointer in the c register starts
 * at src and the inner loop stops once it reaches offset (src+filterSize).
 * filter itself is advanced inside the asm, which is why it is an in/out
 * operand here and is not pre-biased like in the branches above. */
2432 | uint8_t *offset = src+filterSize; | |
2433 | long counter= -2*dstW; | |
2434 | //filter-= counter*filterSize/2; | |
2435 | filterPos-= counter/2; | |
2436 | dst-= counter/2; | |
2437 | asm volatile( | |
2438 | "pxor %%mm7, %%mm7 \n\t" | |
2439 | "movq "MANGLE(w02)", %%mm6 \n\t" | |
2440 | ASMALIGN(4) | |
2441 | "1: \n\t" | |
2442 | "mov %2, %%"REG_c" \n\t" | |
2443 | "movzwl (%%"REG_c", %0), %%eax \n\t" | |
2444 | "movzwl 2(%%"REG_c", %0), %%edx \n\t" | |
2445 | "mov %5, %%"REG_c" \n\t" | |
2446 | "pxor %%mm4, %%mm4 \n\t" | |
2447 | "pxor %%mm5, %%mm5 \n\t" | |
2448 | "2: \n\t" | |
2449 | "movq (%1), %%mm1 \n\t" | |
2450 | "movq (%1, %6), %%mm3 \n\t" | |
2451 | "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" | |
2452 | "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" | |
2453 | "punpcklbw %%mm7, %%mm0 \n\t" | |
2454 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2455 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2456 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2457 | "paddd %%mm3, %%mm5 \n\t" | |
2458 | "paddd %%mm0, %%mm4 \n\t" | |
2459 | "add $8, %1 \n\t" | |
2460 | "add $4, %%"REG_c" \n\t" | |
2461 | "cmp %4, %%"REG_c" \n\t" | |
2462 | " jb 2b \n\t" | |
2463 | "add %6, %1 \n\t" | |
2464 | "psrad $8, %%mm4 \n\t" | |
2465 | "psrad $8, %%mm5 \n\t" | |
2466 | "packssdw %%mm5, %%mm4 \n\t" | |
2467 | "pmaddwd %%mm6, %%mm4 \n\t" | |
2468 | "packssdw %%mm4, %%mm4 \n\t" | |
2469 | "mov %3, %%"REG_a" \n\t" | |
2470 | "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2471 | "add $4, %0 \n\t" | |
2472 | " jnc 1b \n\t" | |
2473 | ||
2474 | : "+r" (counter), "+r" (filter) | |
2475 | : "m" (filterPos), "m" (dst), "m"(offset), | |
2476 | "m" (src), "r" (filterSize*2) | |
2477 | : "%"REG_a, "%"REG_c, "%"REG_d | |
2478 | ); | |
2479 | } | |
077ea8a7 | 2480 | #else
8c266f0c | 2481 | #ifdef HAVE_ALTIVEC
2da0d70d | 2482 | hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
8c266f0c | 2483 | #else
2da0d70d DB |
/* Portable reference implementation: one multiply-accumulate per tap. */
2484 | int i; |
2485 | for (i=0; i<dstW; i++) | |
2486 | { | |
2487 | int j; | |
2488 | int srcPos= filterPos[i]; | |
2489 | int val=0; | |
2490 | //printf("filterPos: %d\n", filterPos[i]); | |
2491 | for (j=0; j<filterSize; j++) | |
2492 | { | |
2493 | //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2494 | val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2495 | } | |
2496 | //filter += hFilterSize; | |
2497 | dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ... | |
2498 | //dst[i] = val>>7; | |
2499 | } | |
bc279024 DB |
2500 | #endif /* HAVE_ALTIVEC */ |
2501 | #endif /* HAVE_MMX */ | |
077ea8a7 | 2502 | }
2ff198c1 | 2503 | // *** horizontal scale Y line to temp buffer
/*
 * hyscale: produce one horizontally scaled luma line in dst.
 *
 * 1. Input conversion: for packed YUV, 16-bit gray, RGB/BGR and paletted
 *    source formats the line is first converted to 8-bit luma in
 *    formatConvBuffer by the matching *ToY helper, and src is redirected
 *    to that buffer. Any other srcFormat is read in place.
 *    GRAY16BE shares yuy2ToY and GRAY16LE shares uyvyToY.
 *    NOTE(review): presumably those helpers pick every other byte, which
 *    would select the high byte of each 16-bit sample - confirm against
 *    their definitions.
 * 2. Scaling into dst (dstWidth samples): through hScale with
 *    hLumFilter/hLumFilterPos/hLumFilterSize, unless SWS_FAST_BILINEAR is
 *    set in flags (and, when built with HAVE_MMX, canMMX2BeUsed is
 *    non-zero), in which case the fast bilinear code below is used.
 * 3. Range conversion: if c->srcRange != c->dstRange and c->dstFormat is
 *    neither RGB nor BGR, dst is rescaled in place with the fixed-point
 *    formulas at the end of the function.
 *
 * funnyYCode is the address the MMX2 inline asm calls; mmx2Filter and
 * mmx2FilterPos are loaded into registers for that code. pal is only used
 * by the palette conversion, where it is cast to uint32_t*.
 * NOTE(review): dst is uint16_t* here while hScale takes int16_t*.
 */
6bc0c792 | 2504 | static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2da0d70d DB |
2505 | int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
2506 | int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | |
2507 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2508 | int32_t *mmx2FilterPos, uint8_t *pal) | |
077ea8a7 | 2509 | {
2da0d70d | 2510 | if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
1e621b18 | 2511 | {
2da0d70d DB |
2512 | RENAME(yuy2ToY)(formatConvBuffer, src, srcW); |
2513 | src= formatConvBuffer; | |
1e621b18 | 2514 | }
2da0d70d | 2515 | else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
7322a67c | 2516 | {
2da0d70d DB |
2517 | RENAME(uyvyToY)(formatConvBuffer, src, srcW); |
2518 | src= formatConvBuffer; | |
7322a67c | 2519 | }
2da0d70d | 2520 | else if (srcFormat==PIX_FMT_RGB32)
1e621b18 | 2521 | {
2da0d70d DB |
2522 | RENAME(bgr32ToY)(formatConvBuffer, src, srcW); |
2523 | src= formatConvBuffer; | |
1e621b18 | 2524 | }
9990e426 MN |
2525 | else if (srcFormat==PIX_FMT_RGB32_1) |
2526 | { | |
2527 | RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW); | |
2528 | src= formatConvBuffer; | |
2529 | } | |
2da0d70d | 2530 | else if (srcFormat==PIX_FMT_BGR24)
1e621b18 | 2531 | {
2da0d70d DB |
2532 | RENAME(bgr24ToY)(formatConvBuffer, src, srcW); |
2533 | src= formatConvBuffer; | |
1e621b18 | 2534 | }
2da0d70d | 2535 | else if (srcFormat==PIX_FMT_BGR565)
6af250ea | 2536 | {
2da0d70d DB |
2537 | RENAME(bgr16ToY)(formatConvBuffer, src, srcW); |
2538 | src= formatConvBuffer; | |
6af250ea | 2539 | }
2da0d70d | 2540 | else if (srcFormat==PIX_FMT_BGR555)
b72034dd | 2541 | {
2da0d70d DB |
2542 | RENAME(bgr15ToY)(formatConvBuffer, src, srcW); |
2543 | src= formatConvBuffer; | |
b72034dd | 2544 | }
2da0d70d | 2545 | else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 | 2546 | {
2da0d70d DB |
2547 | RENAME(rgb32ToY)(formatConvBuffer, src, srcW); |
2548 | src= formatConvBuffer; | |
a861d4d7 | 2549 | }
9990e426 MN |
2550 | else if (srcFormat==PIX_FMT_BGR32_1) |
2551 | { | |
2552 | RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW); | |
2553 | src= formatConvBuffer; | |
2554 | } | |
2da0d70d | 2555 | else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 | 2556 | {
2da0d70d DB |
2557 | RENAME(rgb24ToY)(formatConvBuffer, src, srcW); |
2558 | src= formatConvBuffer; | |
a861d4d7 | 2559 | }
2da0d70d | 2560 | else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 | 2561 | {
2da0d70d DB |
2562 | RENAME(rgb16ToY)(formatConvBuffer, src, srcW); |
2563 | src= formatConvBuffer; | |
a43fb6b3 | 2564 | }
2da0d70d | 2565 | else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 | 2566 | {
2da0d70d DB |
2567 | RENAME(rgb15ToY)(formatConvBuffer, src, srcW); |
2568 | src= formatConvBuffer; | |
a43fb6b3 | 2569 | }
2da0d70d | 2570 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc | 2571 | {
87cf861c | 2572 | RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2da0d70d | 2573 | src= formatConvBuffer;
e28630fc | 2574 | }
1e621b18 | 2575 |
e3d2500f | 2576 | #ifdef HAVE_MMX
8a322796 | 2577 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d | 2578 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f | 2579 | #else
2da0d70d | 2580 | if (!(flags&SWS_FAST_BILINEAR))
e3d2500f | 2581 | #endif
077ea8a7 | 2582 | {
2da0d70d | 2583 | RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
077ea8a7 | 2584 | }
8a322796 | 2585 | else // fast bilinear upscale / crap downscale
077ea8a7 | 2586 | {
3d6a30d9 | 2587 | #if defined(ARCH_X86)
2ff198c1 | 2588 | #ifdef HAVE_MMX2
2da0d70d | 2589 | int i;
83c89c78 | 2590 | #if defined(PIC)
2da0d70d | 2591 | uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 | 2592 | #endif
2da0d70d DB |
2593 | if (canMMX2BeUsed) |
2594 | { | |
/* MMX2 path: the scaling itself is done by the code at funnyYCode, which
 * is called eight times (FUNNY_Y_CODE). In PIC builds the b register is
 * saved to ebxsave and restored afterwards instead of being declared as
 * a clobber. */
2595 | asm volatile( | |
83c89c78 | 2596 | #if defined(PIC)
2da0d70d DB |
2597 | "mov %%"REG_b", %5 \n\t" |
2598 | #endif | |
2599 | "pxor %%mm7, %%mm7 \n\t" | |
2600 | "mov %0, %%"REG_c" \n\t" | |
2601 | "mov %1, %%"REG_D" \n\t" | |
2602 | "mov %2, %%"REG_d" \n\t" | |
2603 | "mov %3, %%"REG_b" \n\t" | |
2604 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2605 | PREFETCH" (%%"REG_c") \n\t" | |
2606 | PREFETCH" 32(%%"REG_c") \n\t" | |
2607 | PREFETCH" 64(%%"REG_c") \n\t" | |
99cefd0b | 2608 |
6d606c4f AJ |
2609 | #ifdef ARCH_X86_64 |
2610 | ||
2611 | #define FUNNY_Y_CODE \ | |
2da0d70d DB |
2612 | "movl (%%"REG_b"), %%esi \n\t"\ |
2613 | "call *%4 \n\t"\ | |
2614 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |
2615 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2616 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2617 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
6d606c4f AJ |
2618 | |
2619 | #else | |
2620 | ||
2ff198c1 | 2621 | #define FUNNY_Y_CODE \
2da0d70d DB |
2622 | "movl (%%"REG_b"), %%esi \n\t"\ |
2623 | "call *%4 \n\t"\ | |
2624 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |
2625 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2626 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
99cefd0b | 2627 |
bc279024 | 2628 | #endif /* ARCH_X86_64 */
6d606c4f | 2629 |
2ff198c1 MN |
2630 | FUNNY_Y_CODE |
2631 | FUNNY_Y_CODE | |
2632 | FUNNY_Y_CODE | |
2633 | FUNNY_Y_CODE | |
2634 | FUNNY_Y_CODE | |
2635 | FUNNY_Y_CODE | |
2636 | FUNNY_Y_CODE | |
2637 | FUNNY_Y_CODE | |
2638 | ||
83c89c78 | 2639 | #if defined(PIC)
2da0d70d | 2640 | "mov %5, %%"REG_b" \n\t"
83c89c78 | 2641 | #endif
2da0d70d DB |
2642 | :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2643 | "m" (funnyYCode) | |
83c89c78 | 2644 | #if defined(PIC)
2da0d70d | 2645 | ,"m" (ebxsave)
83c89c78 | 2646 | #endif
2da0d70d | 2647 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 | 2648 | #if !defined(PIC)
2da0d70d DB |
2649 | ,"%"REG_b |
2650 | #endif | |
2651 | ); | |
/* Walking back from the right edge, every output whose source position
 * (i*xInc)>>16 is at or beyond srcW-1 is overwritten with the last
 * source sample times 128. */
2652 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | |
2653 | } | |
2654 | else | |
2655 | { | |
bc279024 | 2656 | #endif /* HAVE_MMX2 */
2da0d70d DB |
/* Plain x86 path: xInc is split into its integer part (xInc_shr16) and
 * its 16-bit fraction (xInc_mask); the fraction is accumulated in cx and
 * its carry is added to the source index with adc. The loop body is
 * unrolled twice, so two output samples are written per iteration. */
2657 | long xInc_shr16 = xInc >> 16; |
2658 | uint16_t xInc_mask = xInc & 0xffff; | |
2659 | //NO MMX just normal asm ... | |
2660 | asm volatile( | |
2661 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2662 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | |
2663 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2664 | ASMALIGN(4) | |
2665 | "1: \n\t" | |
2666 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | |
2667 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2668 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2669 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2670 | "shll $16, %%edi \n\t" | |
2671 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2672 | "mov %1, %%"REG_D" \n\t" | |
2673 | "shrl $9, %%esi \n\t" | |
2674 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | |
2675 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2676 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2677 | ||
2678 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | |
2679 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2680 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2681 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2682 | "shll $16, %%edi \n\t" | |
2683 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2684 | "mov %1, %%"REG_D" \n\t" | |
2685 | "shrl $9, %%esi \n\t" | |
2686 | "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t" | |
2687 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2688 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2689 | ||
2690 | ||
2691 | "add $2, %%"REG_a" \n\t" | |
2692 | "cmp %2, %%"REG_a" \n\t" | |
2693 | " jb 1b \n\t" | |
2694 | ||
2695 | ||
2696 | :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | |
2697 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | |
2698 | ); | |
2ff198c1 | 2699 | #ifdef HAVE_MMX2
2da0d70d | 2700 | } //if MMX2 can't be used
2ff198c1 MN |
2701 | #endif |
2702 | #else | |
2da0d70d DB |
/* Portable path: xpos is a 16.16 fixed-point source position; xx is its
 * integer part and xalpha the top 7 bits of its fraction. */
2703 | int i; |
2704 | unsigned int xpos=0; | |
2705 | for (i=0;i<dstWidth;i++) | |
2706 | { | |
2707 | register unsigned int xx=xpos>>16; | |
2708 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2709 | dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2710 | xpos+=xInc; | |
2711 | } | |
bc279024 | 2712 | #endif /* defined(ARCH_X86) */
077ea8a7 | 2713 | }
6bc0c792 MN |
2714 | |
/* Luma range conversion, applied in place on the scaled line. The first
 * loop runs when c->srcRange is non-zero, the second one otherwise; the
 * second clamps the input with FFMIN before multiplying. */
2715 | if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ | |
2716 | int i; | |
2717 | //FIXME all pal and rgb srcFormats could do this convertion as well | |
2718 | //FIXME all scalers more complex than bilinear could do half of this transform | |
2719 | if(c->srcRange){ | |
2720 | for (i=0; i<dstWidth; i++) | |
2721 | dst[i]= (dst[i]*14071 + 33561947)>>14; | |
2722 | }else{ | |
2723 | for (i=0; i<dstWidth; i++) | |
aa13b0fc | 2724 | dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
6bc0c792 MN |
2725 | } |
2726 | } | |
2ff198c1 MN |
2727 | } |
2728 | ||
/*
 * hcscale: produce one horizontally scaled pair of chroma lines.
 *
 * The U line is written to dst[0 .. dstWidth-1] and the V line to
 * dst[VOFW .. VOFW+dstWidth-1].
 *
 * 1. Input conversion: for packed YUV, RGB/BGR and paletted source formats
 *    the chroma of the line is first extracted by the matching *ToUV helper
 *    into formatConvBuffer (U) and formatConvBuffer+VOFW (V), and src1/src2
 *    are redirected to those buffers. For gray source formats the function
 *    returns immediately without writing dst. Any other srcFormat is read
 *    in place from src1 (U) and src2 (V).
 * 2. Scaling: through two hScale calls with
 *    hChrFilter/hChrFilterPos/hChrFilterSize, unless SWS_FAST_BILINEAR is
 *    set in flags (and, when built with HAVE_MMX, canMMX2BeUsed is
 *    non-zero), in which case the fast bilinear code below is used.
 * 3. Range conversion: if c->srcRange != c->dstRange and c->dstFormat is
 *    neither RGB nor BGR, both lines are rescaled in place with the
 *    fixed-point formulas at the end of the function.
 *
 * funnyUVCode is the address the MMX2 inline asm calls; mmx2Filter and
 * mmx2FilterPos are loaded into registers for that code. pal is only used
 * by the palette conversion, where it is cast to uint32_t*.
 * NOTE(review): dst is uint16_t* here while hScale takes int16_t*.
 */
6bc0c792 | 2729 | inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2da0d70d DB |
2730 | int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
2731 | int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | |
2732 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2733 | int32_t *mmx2FilterPos, uint8_t *pal) | |
2ff198c1 | 2734 | {
2da0d70d | 2735 | if (srcFormat==PIX_FMT_YUYV422)
1e621b18 | 2736 | {
8b2fce0d | 2737 | RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2738 | src1= formatConvBuffer;
8b2fce0d | 2739 | src2= formatConvBuffer+VOFW;
1e621b18 | 2740 | }
2da0d70d | 2741 | else if (srcFormat==PIX_FMT_UYVY422)
7322a67c | 2742 | {
8b2fce0d | 2743 | RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2744 | src1= formatConvBuffer;
8b2fce0d | 2745 | src2= formatConvBuffer+VOFW;
7322a67c | 2746 | }
2da0d70d | 2747 | else if (srcFormat==PIX_FMT_RGB32)
1e621b18 | 2748 | {
8b2fce0d | 2749 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2750 | src1= formatConvBuffer;
8b2fce0d | 2751 | src2= formatConvBuffer+VOFW;
1e621b18 | 2752 | }
9990e426 MN |
2753 | else if (srcFormat==PIX_FMT_RGB32_1) |
2754 | { | |
2755 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW); | |
2756 | src1= formatConvBuffer; | |
2757 | src2= formatConvBuffer+VOFW; | |
2758 | } | |
2da0d70d | 2759 | else if (srcFormat==PIX_FMT_BGR24)
1e621b18 | 2760 | {
8b2fce0d | 2761 | RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2762 | src1= formatConvBuffer;
8b2fce0d | 2763 | src2= formatConvBuffer+VOFW;
1e621b18 | 2764 | }
2da0d70d | 2765 | else if (srcFormat==PIX_FMT_BGR565)
6af250ea | 2766 | {
8b2fce0d | 2767 | RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2768 | src1= formatConvBuffer;
8b2fce0d | 2769 | src2= formatConvBuffer+VOFW;
6af250ea | 2770 | }
2da0d70d | 2771 | else if (srcFormat==PIX_FMT_BGR555)
b72034dd | 2772 | {
8b2fce0d | 2773 | RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2774 | src1= formatConvBuffer;
8b2fce0d | 2775 | src2= formatConvBuffer+VOFW;
b72034dd | 2776 | }
2da0d70d | 2777 | else if (srcFormat==PIX_FMT_BGR32)
a861d4d7 | 2778 | {
8b2fce0d | 2779 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2780 | src1= formatConvBuffer;
8b2fce0d | 2781 | src2= formatConvBuffer+VOFW;
a861d4d7 | 2782 | }
9990e426 MN |
2783 | else if (srcFormat==PIX_FMT_BGR32_1) |
2784 | { | |
2785 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW); | |
2786 | src1= formatConvBuffer; | |
2787 | src2= formatConvBuffer+VOFW; | |
2788 | } | |
2da0d70d | 2789 | else if (srcFormat==PIX_FMT_RGB24)
a861d4d7 | 2790 | {
8b2fce0d | 2791 | RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2792 | src1= formatConvBuffer;
8b2fce0d | 2793 | src2= formatConvBuffer+VOFW;
a861d4d7 | 2794 | }
2da0d70d | 2795 | else if (srcFormat==PIX_FMT_RGB565)
a43fb6b3 | 2796 | {
8b2fce0d | 2797 | RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2798 | src1= formatConvBuffer;
8b2fce0d | 2799 | src2= formatConvBuffer+VOFW;
a43fb6b3 | 2800 | }
2da0d70d | 2801 | else if (srcFormat==PIX_FMT_RGB555)
a43fb6b3 | 2802 | {
8b2fce0d | 2803 | RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2da0d70d | 2804 | src1= formatConvBuffer;
8b2fce0d | 2805 | src2= formatConvBuffer+VOFW;
a43fb6b3 | 2806 | }
2da0d70d | 2807 | else if (isGray(srcFormat))
6ff0ad6b | 2808 | {
2da0d70d | 2809 | return;
6ff0ad6b | 2810 | }
2da0d70d | 2811 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
e28630fc | 2812 | {
87cf861c | 2813 | RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2da0d70d | 2814 | src1= formatConvBuffer;
8b2fce0d | 2815 | src2= formatConvBuffer+VOFW;
e28630fc | 2816 | }
1e621b18 | 2817 |
e3d2500f | 2818 | #ifdef HAVE_MMX
8a322796 | 2819 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2da0d70d | 2820 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
e3d2500f | 2821 | #else
2da0d70d | 2822 | if (!(flags&SWS_FAST_BILINEAR))
e3d2500f | 2823 | #endif
077ea8a7 | 2824 | {
2da0d70d | 2825 | RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
8b2fce0d | 2826 | RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
077ea8a7 | 2827 | }
8a322796 | 2828 | else // fast bilinear upscale / crap downscale
077ea8a7 | 2829 | {
3d6a30d9 | 2830 | #if defined(ARCH_X86)
2ff198c1 | 2831 | #ifdef HAVE_MMX2
2da0d70d | 2832 | int i;
83c89c78 | 2833 | #if defined(PIC)
2da0d70d | 2834 | uint64_t ebxsave __attribute__((aligned(8)));
83c89c78 | 2835 | #endif
2da0d70d DB |
2836 | if (canMMX2BeUsed) |
2837 | { | |
/* MMX2 path: the code at funnyUVCode is called four times for the first
 * plane (src1 -> dst) and, after the registers have been reloaded, four
 * more times for the second plane (src2 -> dst plus VOF bytes). In PIC
 * builds the b register is saved to ebxsave and restored afterwards
 * instead of being declared as a clobber. */
2838 | asm volatile( | |
83c89c78 | 2839 | #if defined(PIC)
2da0d70d DB |
2840 | "mov %%"REG_b", %6 \n\t" |
2841 | #endif | |
2842 | "pxor %%mm7, %%mm7 \n\t" | |
2843 | "mov %0, %%"REG_c" \n\t" | |
2844 | "mov %1, %%"REG_D" \n\t" | |
2845 | "mov %2, %%"REG_d" \n\t" | |
2846 | "mov %3, %%"REG_b" \n\t" | |
2847 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2848 | PREFETCH" (%%"REG_c") \n\t" | |
2849 | PREFETCH" 32(%%"REG_c") \n\t" | |
2850 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 | 2851 |
6d606c4f AJ |
2852 | #ifdef ARCH_X86_64 |
2853 | ||
2854 | #define FUNNY_UV_CODE \ | |
2da0d70d DB |
2855 | "movl (%%"REG_b"), %%esi \n\t"\ |
2856 | "call *%4 \n\t"\ | |
2857 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |
2858 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2859 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2860 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
6d606c4f AJ |
2861 | |
2862 | #else | |
2863 | ||
b7dc6f66 | 2864 | #define FUNNY_UV_CODE \
2da0d70d DB |
2865 | "movl (%%"REG_b"), %%esi \n\t"\ |
2866 | "call *%4 \n\t"\ | |
2867 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |
2868 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2869 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
b7dc6f66 | 2870 |
bc279024 | 2871 | #endif /* ARCH_X86_64 */
6d606c4f | 2872 |
b7dc6f66 MN |
2873 | FUNNY_UV_CODE |
2874 | FUNNY_UV_CODE | |
2875 | FUNNY_UV_CODE | |
2876 | FUNNY_UV_CODE | |
2da0d70d DB |
2877 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
2878 | "mov %5, %%"REG_c" \n\t" // src | |
2879 | "mov %1, %%"REG_D" \n\t" // buf1 | |
8b2fce0d | 2880 | "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2da0d70d DB |
2881 | PREFETCH" (%%"REG_c") \n\t" |
2882 | PREFETCH" 32(%%"REG_c") \n\t" | |
2883 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 MN |
2884 | |
2885 | FUNNY_UV_CODE | |
2886 | FUNNY_UV_CODE | |
2887 | FUNNY_UV_CODE | |
2888 | FUNNY_UV_CODE | |
2889 | ||
83c89c78 | 2890 | #if defined(PIC)
2da0d70d | 2891 | "mov %6, %%"REG_b" \n\t"
83c89c78 | 2892 | #endif
2da0d70d DB |
2893 | :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2894 | "m" (funnyUVCode), "m" (src2) | |
83c89c78 | 2895 | #if defined(PIC)
2da0d70d | 2896 | ,"m" (ebxsave)
83c89c78 | 2897 | #endif
2da0d70d | 2898 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
83c89c78 | 2899 | #if !defined(PIC)
2da0d70d DB |
2900 | ,"%"REG_b |
2901 | #endif | |
2902 | ); | |
/* Walking back from the right edge, every output whose source position
 * (i*xInc)>>16 is at or beyond srcW-1 is overwritten, in both planes,
 * with the last source sample times 128. */
2903 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
2904 | { | |
2905 | //printf("%d %d %d\n", dstWidth, i, srcW); | |
2906 | dst[i] = src1[srcW-1]*128; | |
8b2fce0d | 2907 | dst[i+VOFW] = src2[srcW-1]*128;
2da0d70d DB |
2908 | } |
2909 | } | |
2910 | else | |
2911 | { | |
bc279024 | 2912 | #endif /* HAVE_MMX2 */
2da0d70d DB |
/* Plain x86 path: xInc is split into its integer part (xInc_shr16) and
 * its 16-bit fraction (xInc_mask); the fraction is accumulated in cx and
 * its carry is added to the source index with adc. Each iteration
 * produces one sample of each plane from the same source position; the
 * second plane is stored VOF bytes after the first. */
2913 | long xInc_shr16 = (long) (xInc >> 16); |
2914 | uint16_t xInc_mask = xInc & 0xffff; | |
2915 | asm volatile( | |
2916 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2917 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | |
2918 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2919 | ASMALIGN(4) | |
2920 | "1: \n\t" | |
2921 | "mov %0, %%"REG_S" \n\t" | |
2922 | "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] | |
2923 | "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2924 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2925 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2926 | "shll $16, %%edi \n\t" | |
2927 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2928 | "mov %1, %%"REG_D" \n\t" | |
2929 | "shrl $9, %%esi \n\t" | |
2930 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | |
2931 | ||
2932 | "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] | |
2933 | "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2934 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2935 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2936 | "shll $16, %%edi \n\t" | |
2937 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2938 | "mov %1, %%"REG_D" \n\t" | |
2939 | "shrl $9, %%esi \n\t" | |
8b2fce0d | 2940 | "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2da0d70d DB |
2941 | |
2942 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2943 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | |
2944 | "add $1, %%"REG_a" \n\t" | |
2945 | "cmp %2, %%"REG_a" \n\t" | |
2946 | " jb 1b \n\t" | |
2ff198c1 | 2947 |
8a322796 DB |
2948 | /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, |
2949 | which is needed to support GCC 4.0. */ | |
e5091488 | 2950 | #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2da0d70d | 2951 | :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f | 2952 | #else
2da0d70d | 2953 | :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
dc77ef7f | 2954 | #endif
2da0d70d DB |
2955 | "r" (src2) |
2956 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | |
2957 | ); | |
2ff198c1 | 2958 | #ifdef HAVE_MMX2
2da0d70d | 2959 | } //if MMX2 can't be used
2ff198c1 MN |
2960 | #endif |
2961 | #else | |
2da0d70d DB |
/* Portable path: xpos is a 16.16 fixed-point source position; xx is its
 * integer part and xalpha the top 7 bits of its fraction. The left
 * neighbour is weighted with xalpha^127, i.e. 127-xalpha. */
2962 | int i; |
2963 | unsigned int xpos=0; | |
2964 | for (i=0;i<dstWidth;i++) | |
2965 | { | |
2966 | register unsigned int xx=xpos>>16; | |
2967 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2968 | dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
8b2fce0d | 2969 | dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2da0d70d DB |
2970 | /* slower |
2971 | dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
8b2fce0d | 2972 | dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2da0d70d DB |
2973 | */ |
2974 | xpos+=xInc; | |
2975 | } | |
bc279024 | 2976 | #endif /* defined(ARCH_X86) */
2da0d70d | 2977 | }
6bc0c792 MN |
/* Chroma range conversion, applied in place to both planes. The first
 * loop runs when c->srcRange is non-zero, the second one otherwise; the
 * second clamps the input with FFMIN before multiplying. */
2978 | if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ |
2979 | int i; | |
2980 | //FIXME all pal and rgb srcFormats could do this convertion as well | |
2981 | //FIXME all scalers more complex than bilinear could do half of this transform | |
2982 | if(c->srcRange){ | |
2983 | for (i=0; i<dstWidth; i++){ | |
2984 | dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469 | |
2985 | dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469 | |
2986 | } | |
2987 | }else{ | |
2988 | for (i=0; i<dstWidth; i++){ | |
aa13b0fc MN |
2989 | dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264 |
2990 | dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 | |
6bc0c792 MN |
2991 | } |
2992 | } | |
2993 | } | |
077ea8a7 MN |
2994 | } |
2995 | ||
3e499f53 | 2996 | static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2da0d70d DB |
2997 | int srcSliceH, uint8_t* dst[], int dstStride[]){ |
2998 | ||
2999 | /* load a few things into local vars to make the code more readable? and faster */ | |
3000 | const int srcW= c->srcW; | |
3001 | const int dstW= c->dstW; | |
3002 | const int dstH= c->dstH; | |
3003 | const int chrDstW= c->chrDstW; | |
3004 | const int chrSrcW= c->chrSrcW; | |
3005 | const int lumXInc= c->lumXInc; | |
3006 | const int chrXInc= c->chrXInc; | |
3007 | const int dstFormat= c->dstFormat; | |
3008 | const int srcFormat= c->srcFormat; | |
3009 | const int flags= c->flags; | |
3010 | const int canMMX2BeUsed= c->canMMX2BeUsed; | |
3011 | int16_t *vLumFilterPos= c->vLumFilterPos; | |
3012 | int16_t *vChrFilterPos= c->vChrFilterPos; | |
3013 | int16_t *hLumFilterPos= c->hLumFilterPos; | |
3014 | int16_t *hChrFilterPos= c->hChrFilterPos; | |
3015 | int16_t *vLumFilter= c->vLumFilter; | |
3016 | int16_t *vChrFilter= c->vChrFilter; | |
3017 | int16_t *hLumFilter= c->hLumFilter; | |
3018 | int16_t *hChrFilter= c->hChrFilter; | |
3019 | int32_t *lumMmxFilter= c->lumMmxFilter; | |
3020 | int32_t *chrMmxFilter= c->chrMmxFilter; | |
3021 | const int vLumFilterSize= c->vLumFilterSize; | |
3022 | const int vChrFilterSize= c->vChrFilterSize; | |
3023 | const int hLumFilterSize= c->hLumFilterSize; | |
3024 | const int hChrFilterSize= c->hChrFilterSize; | |
3025 | int16_t **lumPixBuf= c->lumPixBuf; | |
3026 | int16_t **chrPixBuf= c->chrPixBuf; | |
3027 | const int vLumBufSize= c->vLumBufSize; | |
3028 | const int vChrBufSize= c->vChrBufSize; | |
3029 | uint8_t *funnyYCode= c->funnyYCode; | |
3030 | uint8_t *funnyUVCode= c->funnyUVCode; | |
3031 | uint8_t *formatConvBuffer= c->formatConvBuffer; | |
3032 | const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | |
3033 | const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
3034 | int lastDstY; | |
3035 | uint8_t *pal=NULL; | |
3036 | ||
8a322796 | 3037 | /* vars which will change and which we need to store back in the context */ |
2da0d70d DB |
3038 | int dstY= c->dstY; |
3039 | int lumBufIndex= c->lumBufIndex; | |
3040 | int chrBufIndex= c->chrBufIndex; | |
3041 | int lastInLumBuf= c->lastInLumBuf; | |
3042 | int lastInChrBuf= c->lastInChrBuf; | |
3043 | ||
3044 | if (isPacked(c->srcFormat)){ | |
3045 | pal= src[1]; | |
3046 | src[0]= | |
3047 | src[1]= | |
3048 | src[2]= src[0]; | |
3049 | srcStride[0]= | |
3050 | srcStride[1]= | |
3051 | srcStride[2]= srcStride[0]; | |
3052 | } | |
3053 | srcStride[1]<<= c->vChrDrop; | |
3054 | srcStride[2]<<= c->vChrDrop; | |
3055 | ||
3056 | //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | |
3057 | // (int)dst[0], (int)dst[1], (int)dst[2]); | |
c7a810cc MN |
3058 | |
3059 | #if 0 //self test FIXME move to a vfilter or something | |
2da0d70d DB |
3060 | { |
3061 | static volatile int i=0; | |
3062 | i++; | |
3063 | if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) | |
3064 | selfTest(src, srcStride, c->srcW, c->srcH); | |
3065 | i--; | |
3066 | } | |
c7a810cc | 3067 | #endif |
37079906 | 3068 | |
2da0d70d DB |
3069 | //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], |
3070 | //dstStride[0],dstStride[1],dstStride[2]); | |
3071 | ||
3072 | if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
3073 | { | |
3074 | static int firstTime=1; //FIXME move this into the context perhaps | |
3075 | if (flags & SWS_PRINT_INFO && firstTime) | |
3076 | { | |
4b0c30b7 | 3077 | av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" |
8a322796 | 3078 | " ->cannot do aligned memory accesses anymore\n"); |
2da0d70d DB |
3079 | firstTime=0; |
3080 | } | |
3081 | } | |
3082 | ||
8a322796 DB |
3083 | /* Note the user might start scaling the picture in the middle so this |
3084 | will not get executed. This is not really intended but works | |
3085 | currently, so people might do it. */ | |
2da0d70d DB |
3086 | if (srcSliceY ==0){ |
3087 | lumBufIndex=0; | |
3088 | chrBufIndex=0; | |
3089 | dstY=0; | |
3090 | lastInLumBuf= -1; | |
3091 | lastInChrBuf= -1; | |
3092 | } | |
3093 | ||
3094 | lastDstY= dstY; | |
3095 | ||
3096 | for (;dstY < dstH; dstY++){ | |
3097 | unsigned char *dest =dst[0]+dstStride[0]*dstY; | |
3098 | const int chrDstY= dstY>>c->chrDstVSubSample; | |
3099 | unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
3100 | unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
3101 | ||
3102 | const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
3103 | const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
3104 | const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
3105 | const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
3106 | ||
3107 | //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | |
3108 | // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
3109 | //handle holes (FAST_BILINEAR & weird filters) | |
3110 | if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
3111 | if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
3112 | //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
fcc402b1 LB |
3113 | assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); |
3114 | assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); | |
2da0d70d DB |
3115 | |
3116 | // Do we have enough lines in this slice to output the dstY line | |
3117 | if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | |
3118 | { | |
3119 | //Do horizontal scaling | |
3120 | while(lastInLumBuf < lastLumSrcY) | |
3121 | { | |
3122 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3123 | lumBufIndex++; | |
3124 | //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | |
fcc402b1 LB |
3125 | assert(lumBufIndex < 2*vLumBufSize); |
3126 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | |
3127 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | |
2da0d70d | 3128 | //printf("%d %d\n", lumBufIndex, vLumBufSize); |
6bc0c792 | 3129 | RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2da0d70d DB |
3130 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
3131 | funnyYCode, c->srcFormat, formatConvBuffer, | |
3132 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | |
3133 | lastInLumBuf++; | |
3134 | } | |
3135 | while(lastInChrBuf < lastChrSrcY) | |
3136 | { | |
3137 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3138 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3139 | chrBufIndex++; | |
fcc402b1 LB |
3140 | assert(chrBufIndex < 2*vChrBufSize); |
3141 | assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); | |
3142 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | |
2da0d70d DB |
3143 | //FIXME replace parameters through context struct (some at least) |
3144 | ||
3145 | if (!(isGray(srcFormat) || isGray(dstFormat))) | |
6bc0c792 | 3146 | RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
2da0d70d DB |
3147 | flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
3148 | funnyUVCode, c->srcFormat, formatConvBuffer, | |
3149 | c->chrMmx2Filter, c->chrMmx2FilterPos, pal); | |
3150 | lastInChrBuf++; | |
3151 | } | |
3152 | //wrap buf index around to stay inside the ring buffer | |
e5091488 BF |
3153 | if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; |
3154 | if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; | |
2da0d70d DB |
3155 | } |
3156 | else // not enough lines left in this slice -> load the rest in the buffer | |
3157 | { | |
3158 | /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | |
3159 | firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
3160 | lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
3161 | vChrBufSize, vLumBufSize);*/ | |
3162 | ||
3163 | //Do horizontal scaling | |
3164 | while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
3165 | { | |
3166 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3167 | lumBufIndex++; | |
fcc402b1 LB |
3168 | assert(lumBufIndex < 2*vLumBufSize); |
3169 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | |
3170 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | |
6bc0c792 | 3171 | RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2da0d70d DB |
3172 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
3173 | funnyYCode, c->srcFormat, formatConvBuffer, | |
3174 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | |
3175 | lastInLumBuf++; | |
3176 | } | |
3177 | while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | |
3178 | { | |
3179 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3180 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3181 | chrBufIndex++; | |
fcc402b1 LB |
3182 | assert(chrBufIndex < 2*vChrBufSize); |
3183 | assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH); | |
3184 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | |
2da0d70d DB |
3185 | |
3186 | if (!(isGray(srcFormat) || isGray(dstFormat))) | |
6bc0c792 | 3187 | RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |