Commit | Line | Data |
---|---|---|
fe8054c0 | 1 | /* |
d026b45e DB |
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | * | |
807e0c66 LA |
20 | * the C code (not assembly, mmx, ...) of this file can be used |
21 | * under the LGPL license too | |
d026b45e | 22 | */ |
783e9cc9 | 23 | |
/* Drop any earlier definitions of the helper macros that are defined again below. */
6e1c66bc | 24 | #undef REAL_MOVNTQ |
541c4eb9 | 25 | #undef MOVNTQ |
7d7f78b5 | 26 | #undef PAVGB |
48a05cec MN |
27 | #undef PREFETCH |
28 | #undef PREFETCHW | |
29 | #undef EMMS | |
30 | #undef SFENCE | |
31 | ||
/* EMMS: mnemonic string used to leave MMX state; "femms" when HAVE_3DNOW is defined, "emms" otherwise. */
32 | #ifdef HAVE_3DNOW | |
33 | /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ | |
34 | #define EMMS "femms" | |
35 | #else | |
36 | #define EMMS "emms" | |
37 | #endif | |
38 | ||
/*
 * PREFETCH / PREFETCHW: prefetch mnemonic strings.
 *   HAVE_3DNOW : "prefetch"    / "prefetchw"
 *   HAVE_MMX2  : "prefetchnta" / "prefetcht0"
 *   otherwise  : " # nop" (presumably parsed by the assembler as a comment,
 *                so no instruction is emitted - confirm for the target assembler)
 * HAVE_3DNOW is tested first, so it wins when both are defined.
 */
39 | #ifdef HAVE_3DNOW | |
40 | #define PREFETCH "prefetch" | |
41 | #define PREFETCHW "prefetchw" | |
42 | #elif defined ( HAVE_MMX2 ) | |
43 | #define PREFETCH "prefetchnta" | |
44 | #define PREFETCHW "prefetcht0" | |
45 | #else | |
d904b5fc NP |
46 | #define PREFETCH " # nop" |
47 | #define PREFETCHW " # nop" | |
48a05cec MN |
48 | #endif |
49 | ||
/* SFENCE: "sfence" when HAVE_MMX2 is defined, otherwise the " # nop" placeholder string. */
50 | #ifdef HAVE_MMX2 | |
51 | #define SFENCE "sfence" | |
52 | #else | |
d904b5fc | 53 | #define SFENCE " # nop" |
48a05cec | 54 | #endif |
d3f41512 | 55 | |
/*
 * PAVGB(a,b): builds one asm line from the stringized operands a and b.
 * Uses "pavgb" with HAVE_MMX2, "pavgusb" with HAVE_3DNOW (MMX2 takes priority).
 * When neither is defined, PAVGB stays undefined.
 */
d604bab9 MN |
56 | #ifdef HAVE_MMX2 |
57 | #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
58 | #elif defined (HAVE_3DNOW) | |
59 | #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
60 | #endif | |
d3f41512 | 61 | |
/*
 * MOVNTQ(a,b): quadword store as one asm line; "movntq" with HAVE_MMX2, plain "movq" otherwise.
 * REAL_MOVNTQ stringizes its arguments; the MOVNTQ wrapper adds one expansion level so that
 * macro arguments are expanded before they reach the # operator.
 */
d604bab9 | 62 | #ifdef HAVE_MMX2 |
6e1c66bc | 63 | #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
d604bab9 | 64 | #else |
6e1c66bc | 65 | #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
d604bab9 | 66 | #endif |
6e1c66bc | 67 | #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) |
d604bab9 | 68 | |
a2faa401 RD |
69 | #ifdef HAVE_ALTIVEC |
70 | #include "swscale_altivec_template.c" | |
71 | #endif | |
72 | ||
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): a complete asm volatile statement.
 *
 * Operands: %0 = &c->redDither, %1 = dest, %2 = width.
 * x is stringized and used as a byte displacement on every source load;
 * offset must be a string literal, it is pasted in as the displacement of the
 * filter list relative to %0.
 *
 * REG_a is the output index: it starts at 0, is advanced by 8 after each stored
 * quadword, and the code loops while it is below width (unsigned compare, jb).
 * For every group of 8 outputs the filter list at offset(%0) is walked in
 * 16-byte steps: a source pointer is read at +0 and a coefficient quadword at
 * +8; the walk stops when the next pointer read is zero. Each step adds
 * pmulhw(source, coefficient) to mm3/mm4, which were seeded from
 * VROUNDER_OFFSET(%0). The sums are shifted right by 3, packed to unsigned
 * bytes and stored at (%1, REG_a) with MOVNTQ.
 *
 * Both the inner jnz and the outer jb branch to the same label 1, which is why
 * the accumulator and pointer setup is repeated right before the jb. The jnz
 * uses the flags of the preceding test and the jb those of the cmp; the
 * instructions placed in between (MMX ops, movq, lea, mov) leave the flags alone.
 *
 * NOTE(review): REGa inside MOVNTQ() is stringized as written; presumably it is
 * an unquoted register-name macro defined elsewhere - confirm.
 */
bca11e75 MN |
73 | #define YSCALEYUV2YV12X(x, offset, dest, width) \ |
74 | asm volatile(\ | |
6e1c66bc | 75 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
379a2036 MN |
76 | "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
77 | "movq %%mm3, %%mm4 \n\t"\ | |
6e1c66bc AJ |
78 | "lea " offset "(%0), %%"REG_d" \n\t"\ |
79 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
4bff9ef9 | 80 | ASMALIGN(4) /* FIXME Unroll? */\ |
c1b0bfb4 | 81 | "1: \n\t"\ |
6e1c66bc AJ |
82 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ |
83 | "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ | |
84 | "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\ | |
85 | "add $16, %%"REG_d" \n\t"\ | |
86 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
87 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
c1b0bfb4 MN |
88 | "pmulhw %%mm0, %%mm2 \n\t"\ |
89 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
90 | "paddw %%mm2, %%mm3 \n\t"\ | |
91 | "paddw %%mm5, %%mm4 \n\t"\ | |
c1b0bfb4 MN |
92 | " jnz 1b \n\t"\ |
93 | "psraw $3, %%mm3 \n\t"\ | |
94 | "psraw $3, %%mm4 \n\t"\ | |
95 | "packuswb %%mm4, %%mm3 \n\t"\ | |
6e1c66bc AJ |
96 | MOVNTQ(%%mm3, (%1, %%REGa))\ |
97 | "add $8, %%"REG_a" \n\t"\ | |
98 | "cmp %2, %%"REG_a" \n\t"\ | |
379a2036 MN |
99 | "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
100 | "movq %%mm3, %%mm4 \n\t"\ | |
6e1c66bc AJ |
101 | "lea " offset "(%0), %%"REG_d" \n\t"\ |
102 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
bca11e75 MN |
103 | "jb 1b \n\t"\ |
104 | :: "r" (&c->redDither),\ | |
e96da13b | 105 | "r" (dest), "g" (width)\ |
bca11e75 MN |
106 | : "%"REG_a, "%"REG_d, "%"REG_S\ |
107 | ); | |
108 | ||
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): a complete asm volatile
 * statement with the same operands as YSCALEYUV2YV12X
 * (%0 = &c->redDither, %1 = dest, %2 = width) and the same outer loop on REG_a
 * (8 bytes stored per pass, repeat while REG_a is below width).
 *
 * The inner loop consumes two source rows per pass: the first pointer is read
 * from (REG_d), the second from 4(REG_d). Words of the two rows are interleaved
 * with punpcklwd/punpckhwd and multiplied against the quadword at 8(REG_d) with
 * pmaddwd, so the sums are kept as 32-bit values in mm4/mm5 (samples at x) and
 * mm6/mm7 (samples at 8+x). REG_d then advances by 16 and the loop ends when the
 * pointer at the new position is zero.
 *
 * Afterwards the sums are shifted right by 16, packed to signed words, offset by
 * the quadword at VROUNDER_OFFSET(%0), shifted right by 3, packed to unsigned
 * bytes and stored at (%1, REG_a) with MOVNTQ. The accumulators and
 * REG_d/REG_S are reset before the jb because the jb re-enters at label 1.
 *
 * NOTE(review): the second row pointer is read at byte offset 4 of the entry,
 * which looks like it assumes 4-byte pointer slots - confirm the filter layout
 * for 64-bit builds.
 */
109 | #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ | |
110 | asm volatile(\ | |
111 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
112 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
113 | "pxor %%mm4, %%mm4 \n\t"\ | |
114 | "pxor %%mm5, %%mm5 \n\t"\ | |
115 | "pxor %%mm6, %%mm6 \n\t"\ | |
116 | "pxor %%mm7, %%mm7 \n\t"\ | |
117 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
4bff9ef9 | 118 | ASMALIGN(4) \ |
bca11e75 MN |
119 | "1: \n\t"\ |
120 | "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\ | |
121 | "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ | |
122 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | |
123 | "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\ | |
124 | "movq %%mm0, %%mm3 \n\t"\ | |
125 | "punpcklwd %%mm1, %%mm0 \n\t"\ | |
126 | "punpckhwd %%mm1, %%mm3 \n\t"\ | |
127 | "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ | |
128 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
129 | "pmaddwd %%mm1, %%mm3 \n\t"\ | |
130 | "paddd %%mm0, %%mm4 \n\t"\ | |
131 | "paddd %%mm3, %%mm5 \n\t"\ | |
132 | "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\ | |
133 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | |
134 | "add $16, %%"REG_d" \n\t"\ | |
135 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
136 | "movq %%mm2, %%mm0 \n\t"\ | |
137 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
138 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
139 | "pmaddwd %%mm1, %%mm2 \n\t"\ | |
140 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
141 | "paddd %%mm2, %%mm6 \n\t"\ | |
142 | "paddd %%mm0, %%mm7 \n\t"\ | |
143 | " jnz 1b \n\t"\ | |
144 | "psrad $16, %%mm4 \n\t"\ | |
145 | "psrad $16, %%mm5 \n\t"\ | |
146 | "psrad $16, %%mm6 \n\t"\ | |
147 | "psrad $16, %%mm7 \n\t"\ | |
148 | "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ | |
149 | "packssdw %%mm5, %%mm4 \n\t"\ | |
150 | "packssdw %%mm7, %%mm6 \n\t"\ | |
151 | "paddw %%mm0, %%mm4 \n\t"\ | |
152 | "paddw %%mm0, %%mm6 \n\t"\ | |
153 | "psraw $3, %%mm4 \n\t"\ | |
154 | "psraw $3, %%mm6 \n\t"\ | |
155 | "packuswb %%mm6, %%mm4 \n\t"\ | |
156 | MOVNTQ(%%mm4, (%1, %%REGa))\ | |
157 | "add $8, %%"REG_a" \n\t"\ | |
158 | "cmp %2, %%"REG_a" \n\t"\ | |
159 | "lea " offset "(%0), %%"REG_d" \n\t"\ | |
160 | "pxor %%mm4, %%mm4 \n\t"\ | |
161 | "pxor %%mm5, %%mm5 \n\t"\ | |
162 | "pxor %%mm6, %%mm6 \n\t"\ | |
163 | "pxor %%mm7, %%mm7 \n\t"\ | |
164 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
165 | "jb 1b \n\t"\ | |
166 | :: "r" (&c->redDither),\ | |
e96da13b | 167 | "r" (dest), "g" (width)\ |
bca11e75 MN |
168 | : "%"REG_a, "%"REG_d, "%"REG_S\ |
169 | ); | |
c1b0bfb4 MN |
170 | |
/*
 * YSCALEYUV2YV121: asm string fragment (no asm statement, operands or clobbers
 * of its own). REG_a is loaded from %2; each pass reads 16 bytes of 16-bit
 * samples at (%0, REG_a, 2), shifts them right by 7, packs them to unsigned
 * bytes and stores 8 bytes at (%1, REG_a) with MOVNTQ, then adds 8 to REG_a and
 * repeats while that add produces no carry (jnc).
 * NOTE(review): presumably %2 is a negative count and %0/%1 point at the end of
 * the buffers, so the carry marks the index reaching zero - confirm at the caller.
 */
171 | #define YSCALEYUV2YV121 \ | |
6e1c66bc | 172 | "mov %2, %%"REG_a" \n\t"\ |
4bff9ef9 | 173 | ASMALIGN(4) /* FIXME Unroll? */\ |
c1b0bfb4 | 174 | "1: \n\t"\ |
6e1c66bc AJ |
175 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ |
176 | "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\ | |
c1b0bfb4 MN |
177 | "psraw $7, %%mm0 \n\t"\ |
178 | "psraw $7, %%mm1 \n\t"\ | |
179 | "packuswb %%mm1, %%mm0 \n\t"\ | |
6e1c66bc AJ |
180 | MOVNTQ(%%mm0, (%1, %%REGa))\ |
181 | "add $8, %%"REG_a" \n\t"\ | |
c1b0bfb4 MN |
182 | "jnc 1b \n\t" |
183 | ||
184 | /* | |
185 | :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
186 | "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
187 | "r" (dest), "m" (dstW), | |
188 | "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
189 | : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
190 | */ | |
/*
 * YSCALEYUV2PACKEDX: opens an asm volatile( statement and leaves it open; the
 * operand list and the closing ");" are expected from YSCALEYUV2PACKEDX_END.
 *
 * REG_a is zeroed and label 1 starts the per-group loop (this macro contains no
 * branch back to 1; that has to come from code appended after it).
 * Two filter walks follow, both using local label 2 and both stepping through
 * 16-byte entries (pointer at +0, coefficient quadword at +8) until the next
 * pointer read is zero:
 *   - chroma, list at CHR_MMX_FILTER_OFFSET(%0): U is read at (REG_S, REG_a),
 *     V 4096 bytes further; sums land in mm3 (U) and mm4 (V).
 *   - luma, list at LUM_MMX_FILTER_OFFSET(%0): Y1 at (REG_S, REG_a, 2), Y2 eight
 *     bytes further; sums land in mm1 (Y1) and mm7 (Y2).
 * All four accumulators are seeded from VROUNDER_OFFSET(%0) and collect
 * pmulhw(source, coefficient) products.
 */
25593e29 | 191 | #define YSCALEYUV2PACKEDX \ |
8422aa88 | 192 | asm volatile(\ |
6e1c66bc | 193 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
4bff9ef9 | 194 | ASMALIGN(4)\ |
77a49659 | 195 | "nop \n\t"\ |
c1b0bfb4 | 196 | "1: \n\t"\ |
6e1c66bc AJ |
197 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
198 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
379a2036 MN |
199 | "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
200 | "movq %%mm3, %%mm4 \n\t"\ | |
4bff9ef9 | 201 | ASMALIGN(4)\ |
c1b0bfb4 | 202 | "2: \n\t"\ |
6e1c66bc AJ |
203 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ |
204 | "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | |
205 | "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ | |
206 | "add $16, %%"REG_d" \n\t"\ | |
207 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
c1b0bfb4 MN |
208 | "pmulhw %%mm0, %%mm2 \n\t"\ |
209 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
210 | "paddw %%mm2, %%mm3 \n\t"\ | |
211 | "paddw %%mm5, %%mm4 \n\t"\ | |
6e1c66bc | 212 | "test %%"REG_S", %%"REG_S" \n\t"\ |
c1b0bfb4 MN |
213 | " jnz 2b \n\t"\ |
214 | \ | |
6e1c66bc AJ |
215 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
216 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
379a2036 MN |
217 | "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\ |
218 | "movq %%mm1, %%mm7 \n\t"\ | |
4bff9ef9 | 219 | ASMALIGN(4)\ |
c1b0bfb4 | 220 | "2: \n\t"\ |
6e1c66bc AJ |
221 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ |
222 | "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ | |
223 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ | |
224 | "add $16, %%"REG_d" \n\t"\ | |
225 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
c1b0bfb4 MN |
226 | "pmulhw %%mm0, %%mm2 \n\t"\ |
227 | "pmulhw %%mm0, %%mm5 \n\t"\ | |
228 | "paddw %%mm2, %%mm1 \n\t"\ | |
229 | "paddw %%mm5, %%mm7 \n\t"\ | |
6e1c66bc | 230 | "test %%"REG_S", %%"REG_S" \n\t"\ |
c1b0bfb4 | 231 | " jnz 2b \n\t"\ |
25593e29 | 232 | |
/*
 * YSCALEYUV2PACKEDX_END: operand list, clobber list and closing ");" for an asm
 * statement opened earlier (YSCALEYUV2PACKEDX opens one).
 * Inputs: %0 = &c->redDither, %1..%3 = dummy (memory placeholders that keep the
 * numbering of the later operands), %4 = dest, %5 = dstW (memory operand).
 * Clobbers: REG_a, REG_d, REG_S. No output operands are declared.
 */
8422aa88 MN |
233 | #define YSCALEYUV2PACKEDX_END\ |
234 | :: "r" (&c->redDither), \ | |
235 | "m" (dummy), "m" (dummy), "m" (dummy),\ | |
236 | "r" (dest), "m" (dstW)\ | |
237 | : "%"REG_a, "%"REG_d, "%"REG_S\ | |
238 | ); | |
239 | ||
/*
 * YSCALEYUV2PACKEDX_ACCURATE: higher-precision counterpart of YSCALEYUV2PACKEDX.
 * It opens an asm volatile( statement and leaves it open; operands and the
 * closing ");" must be appended by the user of the macro.
 *
 * REG_a is zeroed and label 1 starts the per-group loop. Both filter walks
 * (local label 2) take two source rows per pass - pointers at (REG_d) and
 * 4(REG_d), coefficient quadword at 8(REG_d) - interleave their words and use
 * pmaddwd so the sums are accumulated as 32-bit values. REG_d advances by 16 and
 * a walk ends when the pointer at the new position is zero.
 *   - chroma (CHR_MMX_FILTER_OFFSET): U at (REG_S, REG_a), V 4096 bytes further.
 *     The sums are shifted right by 16, packed to words, offset by the quadword
 *     at VROUNDER_OFFSET(%0) and parked in memory at U_TEMP(%0) / V_TEMP(%0).
 *   - luma (LUM_MMX_FILTER_OFFSET): Y1 at (REG_S, REG_a, 2), Y2 eight bytes
 *     further; reduced the same way into mm1 (Y1) and mm7 (Y2).
 * Finally U and V are reloaded from U_TEMP/V_TEMP into mm3 and mm4.
 *
 * NOTE(review): the fragment stores to U_TEMP(%0)/V_TEMP(%0); if it is closed
 * with YSCALEYUV2PACKEDX_END, that operand list declares %0 as a plain input
 * and has no "memory" clobber - confirm this is acceptable.
 * NOTE(review): the second row pointer is read at byte offset 4, which looks
 * like it assumes 4-byte pointer slots - confirm for 64-bit builds.
 */
bca11e75 | 240 | #define YSCALEYUV2PACKEDX_ACCURATE \ |
8422aa88 | 241 | asm volatile(\ |
bca11e75 | 242 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
4bff9ef9 | 243 | ASMALIGN(4)\ |
bca11e75 MN |
244 | "nop \n\t"\ |
245 | "1: \n\t"\ | |
246 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
247 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
248 | "pxor %%mm4, %%mm4 \n\t"\ | |
249 | "pxor %%mm5, %%mm5 \n\t"\ | |
250 | "pxor %%mm6, %%mm6 \n\t"\ | |
251 | "pxor %%mm7, %%mm7 \n\t"\ | |
4bff9ef9 | 252 | ASMALIGN(4)\ |
bca11e75 MN |
253 | "2: \n\t"\ |
254 | "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ | |
255 | "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ | |
256 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | |
257 | "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ | |
258 | "movq %%mm0, %%mm3 \n\t"\ | |
259 | "punpcklwd %%mm1, %%mm0 \n\t"\ | |
260 | "punpckhwd %%mm1, %%mm3 \n\t"\ | |
261 | "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ | |
262 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
263 | "pmaddwd %%mm1, %%mm3 \n\t"\ | |
264 | "paddd %%mm0, %%mm4 \n\t"\ | |
265 | "paddd %%mm3, %%mm5 \n\t"\ | |
266 | "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ | |
267 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | |
268 | "add $16, %%"REG_d" \n\t"\ | |
269 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
270 | "movq %%mm2, %%mm0 \n\t"\ | |
271 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
272 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
273 | "pmaddwd %%mm1, %%mm2 \n\t"\ | |
274 | "pmaddwd %%mm1, %%mm0 \n\t"\ | |
275 | "paddd %%mm2, %%mm6 \n\t"\ | |
276 | "paddd %%mm0, %%mm7 \n\t"\ | |
277 | " jnz 2b \n\t"\ | |
278 | "psrad $16, %%mm4 \n\t"\ | |
279 | "psrad $16, %%mm5 \n\t"\ | |
280 | "psrad $16, %%mm6 \n\t"\ | |
281 | "psrad $16, %%mm7 \n\t"\ | |
282 | "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ | |
283 | "packssdw %%mm5, %%mm4 \n\t"\ | |
284 | "packssdw %%mm7, %%mm6 \n\t"\ | |
285 | "paddw %%mm0, %%mm4 \n\t"\ | |
286 | "paddw %%mm0, %%mm6 \n\t"\ | |
287 | "movq %%mm4, "U_TEMP"(%0) \n\t"\ | |
288 | "movq %%mm6, "V_TEMP"(%0) \n\t"\ | |
289 | \ | |
290 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
291 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
292 | "pxor %%mm1, %%mm1 \n\t"\ | |
293 | "pxor %%mm5, %%mm5 \n\t"\ | |
294 | "pxor %%mm7, %%mm7 \n\t"\ | |
295 | "pxor %%mm6, %%mm6 \n\t"\ | |
4bff9ef9 | 296 | ASMALIGN(4)\ |
bca11e75 MN |
297 | "2: \n\t"\ |
298 | "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ | |
299 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ | |
300 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | |
301 | "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ | |
302 | "movq %%mm0, %%mm3 \n\t"\ | |
303 | "punpcklwd %%mm4, %%mm0 \n\t"\ | |
304 | "punpckhwd %%mm4, %%mm3 \n\t"\ | |
305 | "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ | |
306 | "pmaddwd %%mm4, %%mm0 \n\t"\ | |
307 | "pmaddwd %%mm4, %%mm3 \n\t"\ | |
308 | "paddd %%mm0, %%mm1 \n\t"\ | |
309 | "paddd %%mm3, %%mm5 \n\t"\ | |
310 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ | |
311 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | |
312 | "add $16, %%"REG_d" \n\t"\ | |
313 | "test %%"REG_S", %%"REG_S" \n\t"\ | |
314 | "movq %%mm2, %%mm0 \n\t"\ | |
315 | "punpcklwd %%mm3, %%mm2 \n\t"\ | |
316 | "punpckhwd %%mm3, %%mm0 \n\t"\ | |
317 | "pmaddwd %%mm4, %%mm2 \n\t"\ | |
318 | "pmaddwd %%mm4, %%mm0 \n\t"\ | |
319 | "paddd %%mm2, %%mm7 \n\t"\ | |
320 | "paddd %%mm0, %%mm6 \n\t"\ | |
321 | " jnz 2b \n\t"\ | |
322 | "psrad $16, %%mm1 \n\t"\ | |
323 | "psrad $16, %%mm5 \n\t"\ | |
324 | "psrad $16, %%mm7 \n\t"\ | |
325 | "psrad $16, %%mm6 \n\t"\ | |
326 | "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ | |
327 | "packssdw %%mm5, %%mm1 \n\t"\ | |
328 | "packssdw %%mm6, %%mm7 \n\t"\ | |
329 | "paddw %%mm0, %%mm1 \n\t"\ | |
330 | "paddw %%mm0, %%mm7 \n\t"\ | |
331 | "movq "U_TEMP"(%0), %%mm3 \n\t"\ | |
332 | "movq "V_TEMP"(%0), %%mm4 \n\t"\ | |
333 | ||
/*
 * YSCALEYUV2RGBX: asm string fragment converting the register set left by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE to packed B, G and R bytes.
 * In : mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (16-bit words); %0 is the base for
 *      U_OFFSET, V_OFFSET, Y_OFFSET and the UG/VG/UB/VR/Y coefficient quadwords.
 * Out: mm2 = B, mm4 = G, mm5 = R (unsigned bytes after packuswb), mm7 = 0.
 * mm0, mm3 and mm6 are used as scratch.
 * Each chroma word is duplicated with punpcklwd/punpckhwd so the same chroma
 * term is added to a word of Y1 and a word of Y2.
 */
8422aa88 | 334 | #define YSCALEYUV2RGBX \ |
77a49659 MN |
335 | "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
336 | "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | |
c1b0bfb4 MN |
337 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
338 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
77a49659 MN |
339 | "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ |
340 | "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | |
c1b0bfb4 | 341 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
77a49659 MN |
342 | "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ |
343 | "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | |
344 | "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | |
345 | "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | |
346 | "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | |
347 | "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | |
c1b0bfb4 MN |
348 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
349 | "paddw %%mm3, %%mm4 \n\t"\ | |
350 | "movq %%mm2, %%mm0 \n\t"\ | |
351 | "movq %%mm5, %%mm6 \n\t"\ | |
352 | "movq %%mm4, %%mm3 \n\t"\ | |
353 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
354 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
355 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
356 | "paddw %%mm1, %%mm2 \n\t"\ | |
357 | "paddw %%mm1, %%mm5 \n\t"\ | |
358 | "paddw %%mm1, %%mm4 \n\t"\ | |
359 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
360 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
361 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
362 | "paddw %%mm7, %%mm0 \n\t"\ | |
363 | "paddw %%mm7, %%mm6 \n\t"\ | |
364 | "paddw %%mm7, %%mm3 \n\t"\ | |
365 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
366 | "packuswb %%mm0, %%mm2 \n\t"\ | |
367 | "packuswb %%mm6, %%mm5 \n\t"\ | |
368 | "packuswb %%mm3, %%mm4 \n\t"\ | |
369 | "pxor %%mm7, %%mm7 \n\t" | |
/*
 * FULL_YSCALEYUV2RGB is compiled out by the surrounding "#if 0".
 * As written it is an asm string fragment that broadcasts the values of %6
 * (yalpha1) and %7 (uvalpha1) into mm6 and mm5, zeroes REG_a, opens loop label 1,
 * blends buf0/buf1 (%0/%1) and uvbuf0/uvbuf1 (%2/%3) and converts the result
 * using the MANGLE()d constants w80, w400, yCoeff, ubCoeff, ugCoeff, vrCoeff and
 * vgCoeff. It leaves B in mm3, R in mm0 and G in mm1, each packed to bytes.
 * The fragment contains no branch back to label 1.
 */
77a49659 | 370 | #if 0 |
d604bab9 MN |
371 | #define FULL_YSCALEYUV2RGB \ |
372 | "pxor %%mm7, %%mm7 \n\t"\ | |
373 | "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
374 | "punpcklwd %%mm6, %%mm6 \n\t"\ | |
375 | "punpcklwd %%mm6, %%mm6 \n\t"\ | |
376 | "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
377 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
378 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
6e1c66bc | 379 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
4bff9ef9 | 380 | ASMALIGN(4)\ |
d604bab9 | 381 | "1: \n\t"\ |
6e1c66bc AJ |
382 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
383 | "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
384 | "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
385 | "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
d604bab9 MN |
386 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
387 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
388 | "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
389 | "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
390 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
6e1c66bc | 391 | "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
d604bab9 MN |
392 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
393 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
6e1c66bc | 394 | "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ |
d604bab9 MN |
395 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
396 | "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
9b464428 FB |
397 | "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ |
398 | "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
399 | "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
d604bab9 MN |
400 | \ |
401 | \ | |
402 | "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
403 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
9b464428 | 404 | "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\ |
d604bab9 | 405 | "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
9b464428 | 406 | "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\ |
d604bab9 | 407 | "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
9b464428 | 408 | "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ |
d604bab9 MN |
409 | \ |
410 | \ | |
411 | "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |
9b464428 FB |
412 | "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\ |
413 | "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
d604bab9 MN |
414 | "paddw %%mm1, %%mm3 \n\t" /* B*/\ |
415 | "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
416 | "packuswb %%mm3, %%mm3 \n\t"\ | |
417 | \ | |
418 | "packuswb %%mm0, %%mm0 \n\t"\ | |
419 | "paddw %%mm4, %%mm2 \n\t"\ | |
420 | "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
421 | \ | |
422 | "packuswb %%mm1, %%mm1 \n\t" | |
77a49659 | 423 | #endif |
d604bab9 | 424 | |
/*
 * REAL_YSCALEYUV2PACKED(index, c): asm string fragment that blends two source
 * lines. index is the register used as loop counter (zeroed here); c is the
 * base register for the CHR_/LUM_MMX_FILTER_OFFSET displacements. Operands, per
 * the inline comments: %0 = buf0, %1 = buf1, %2 = uvbuf0, %3 = uvbuf1.
 *
 * Before the loop the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back, so every
 * execution of this fragment modifies those values in memory.
 *
 * Inside loop label 1 each result is (line1 >> 7) + pmulhw(line0 - line1, weight):
 *   mm3 = U, mm4 = V (V is read 4096 bytes after U), mm1 = Y1, mm7 = Y2.
 * The fragment contains no branch back to label 1; it relies on code appended
 * after it (WRITEBGR32, for example, ends with "jb 1b").
 * YSCALEYUV2PACKED is the extra expansion level that lets macro arguments
 * expand before REAL_YSCALEYUV2PACKED stringizes them.
 */
6e1c66bc | 425 | #define REAL_YSCALEYUV2PACKED(index, c) \ |
6542b44e MN |
426 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ |
427 | "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\ | |
428 | "psraw $3, %%mm0 \n\t"\ | |
429 | "psraw $3, %%mm1 \n\t"\ | |
430 | "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
431 | "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
6e1c66bc | 432 | "xor "#index", "#index" \n\t"\ |
4bff9ef9 | 433 | ASMALIGN(4)\ |
25593e29 | 434 | "1: \n\t"\ |
6542b44e MN |
435 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
436 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
437 | "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
438 | "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
25593e29 MN |
439 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
440 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
6542b44e | 441 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ |
25593e29 MN |
442 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
443 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
444 | "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\ | |
445 | "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\ | |
446 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
447 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
6542b44e MN |
448 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
449 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
450 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
451 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
25593e29 MN |
452 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
453 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
6542b44e MN |
454 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
455 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
25593e29 MN |
456 | "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\ |
457 | "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\ | |
458 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
459 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
6a4970ab | 460 | |
6e1c66bc | 461 | #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) |
6a4970ab | 462 | |
/*
 * REAL_YSCALEYUV2RGB(index, c): asm string fragment that blends two source lines
 * and converts the result to packed B, G and R bytes. index is the loop-counter
 * register (zeroed here); c is the base register for the *_OFFSET / *_COEFF
 * displacements. Operands, per the inline comments: %0 = buf0, %1 = buf1,
 * %2 = uvbuf0, %3 = uvbuf1.
 *
 * Blend: (line1 >> 4) + pmulhw(line0 - line1, weight), with the chroma weight
 * read from CHR_MMX_FILTER_OFFSET+8(c) and the luma weight from
 * LUM_MMX_FILTER_OFFSET+8(c). The conversion tail is the same instruction
 * sequence as YSCALEYUV2RGBX with c in place of %0.
 * Out: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 * Loop label 1 is opened here but not closed; the branch back must come from
 * code appended after the fragment (WRITEBGR32, for example, ends with "jb 1b").
 * YSCALEYUV2RGB is the extra expansion level placed in front of the stringizing.
 */
6e1c66bc AJ |
463 | #define REAL_YSCALEYUV2RGB(index, c) \ |
464 | "xor "#index", "#index" \n\t"\ | |
4bff9ef9 | 465 | ASMALIGN(4)\ |
d604bab9 | 466 | "1: \n\t"\ |
6542b44e MN |
467 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
468 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
469 | "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\ | |
470 | "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\ | |
d604bab9 MN |
471 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
472 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
6542b44e | 473 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ |
d604bab9 MN |
474 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
475 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
476 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | |
477 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | |
478 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
479 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
6542b44e MN |
480 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
481 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
d604bab9 MN |
482 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
483 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
6542b44e MN |
484 | "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
485 | "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
d604bab9 | 486 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
6542b44e MN |
487 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
488 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
489 | "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\ | |
490 | "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\ | |
d604bab9 MN |
491 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
492 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
6542b44e MN |
493 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
494 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
d604bab9 MN |
495 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
496 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
497 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
498 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
6542b44e MN |
499 | "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
500 | "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
501 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
502 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
503 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
504 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
d604bab9 MN |
505 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
506 | "paddw %%mm3, %%mm4 \n\t"\ | |
507 | "movq %%mm2, %%mm0 \n\t"\ | |
508 | "movq %%mm5, %%mm6 \n\t"\ | |
509 | "movq %%mm4, %%mm3 \n\t"\ | |
510 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
511 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
512 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
513 | "paddw %%mm1, %%mm2 \n\t"\ | |
514 | "paddw %%mm1, %%mm5 \n\t"\ | |
515 | "paddw %%mm1, %%mm4 \n\t"\ | |
516 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
517 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
518 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
519 | "paddw %%mm7, %%mm0 \n\t"\ | |
520 | "paddw %%mm7, %%mm6 \n\t"\ | |
521 | "paddw %%mm7, %%mm3 \n\t"\ | |
522 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
523 | "packuswb %%mm0, %%mm2 \n\t"\ | |
524 | "packuswb %%mm6, %%mm5 \n\t"\ | |
525 | "packuswb %%mm3, %%mm4 \n\t"\ | |
526 | "pxor %%mm7, %%mm7 \n\t" | |
6e1c66bc | 527 | #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) |
6a4970ab | 528 | |
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant of YSCALEYUV2PACKED; no
 * blending, only %0 (buf0) and %2 (uvbuf0) are read. index is zeroed and loop
 * label 1 is opened (the branch back comes from code appended afterwards).
 * Every value is shifted right by 7:
 *   mm3 = U, mm4 = V (read 4096 bytes after U), mm1 = Y1, mm7 = Y2.
 * The c parameter is accepted but not used by this fragment.
 * YSCALEYUV2PACKED1 is the extra expansion level placed in front of the stringizing.
 */
6e1c66bc AJ |
529 | #define REAL_YSCALEYUV2PACKED1(index, c) \ |
530 | "xor "#index", "#index" \n\t"\ | |
4bff9ef9 | 531 | ASMALIGN(4)\ |
25593e29 | 532 | "1: \n\t"\ |
e54d94ba MN |
533 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
534 | "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
25593e29 MN |
535 | "psraw $7, %%mm3 \n\t" \ |
536 | "psraw $7, %%mm4 \n\t" \ | |
e54d94ba MN |
537 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
538 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
25593e29 MN |
539 | "psraw $7, %%mm1 \n\t" \ |
540 | "psraw $7, %%mm7 \n\t" \ | |
6a4970ab | 541 | |
6e1c66bc | 542 | #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) |
6a4970ab | 543 | |
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line variant of YSCALEYUV2RGB; only
 * %0 (buf0) and %2 (uvbuf0) are read and nothing is blended. index is the
 * loop-counter register (zeroed here); c is the base register for the
 * *_OFFSET / *_COEFF displacements.
 * Samples are shifted right by 4 and then run through the same conversion tail
 * as YSCALEYUV2RGB.
 * Out: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 * Loop label 1 is opened here; the branch back comes from code appended afterwards.
 * YSCALEYUV2RGB1 is the extra expansion level placed in front of the stringizing.
 */
6e1c66bc AJ |
544 | #define REAL_YSCALEYUV2RGB1(index, c) \ |
545 | "xor "#index", "#index" \n\t"\ | |
4bff9ef9 | 546 | ASMALIGN(4)\ |
d604bab9 | 547 | "1: \n\t"\ |
e54d94ba MN |
548 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
549 | "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
d604bab9 MN |
550 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\ |
551 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\ | |
e54d94ba MN |
552 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
553 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
d604bab9 MN |
554 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
555 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
e54d94ba MN |
556 | "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
557 | "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
d604bab9 | 558 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
e54d94ba MN |
559 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
560 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
497d4f99 MN |
561 | "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ |
562 | "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\ | |
e54d94ba MN |
563 | "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
564 | "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
565 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
566 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
567 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
568 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
497d4f99 MN |
569 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
570 | "paddw %%mm3, %%mm4 \n\t"\ | |
571 | "movq %%mm2, %%mm0 \n\t"\ | |
572 | "movq %%mm5, %%mm6 \n\t"\ | |
573 | "movq %%mm4, %%mm3 \n\t"\ | |
574 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
575 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
576 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
577 | "paddw %%mm1, %%mm2 \n\t"\ | |
578 | "paddw %%mm1, %%mm5 \n\t"\ | |
579 | "paddw %%mm1, %%mm4 \n\t"\ | |
580 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
581 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
582 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
583 | "paddw %%mm7, %%mm0 \n\t"\ | |
584 | "paddw %%mm7, %%mm6 \n\t"\ | |
585 | "paddw %%mm7, %%mm3 \n\t"\ | |
586 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
587 | "packuswb %%mm0, %%mm2 \n\t"\ | |
588 | "packuswb %%mm6, %%mm5 \n\t"\ | |
589 | "packuswb %%mm3, %%mm4 \n\t"\ | |
590 | "pxor %%mm7, %%mm7 \n\t" | |
6e1c66bc | 591 | #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) |
497d4f99 | 592 | |
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like YSCALEYUV2PACKED1 for luma (only buf0,
 * %0, shifted right by 7), but chroma is the sum of the two chroma lines
 * (%2 = uvbuf0, %3 = uvbuf1) shifted right by 8 with a logical shift (psrlw).
 * Results: mm3 = U, mm4 = V (read 4096 bytes after U), mm1 = Y1, mm7 = Y2.
 * index is zeroed and loop label 1 is opened; the branch back comes from code
 * appended afterwards. The c parameter is accepted but not used.
 * YSCALEYUV2PACKED1b is the extra expansion level placed in front of the stringizing.
 */
6e1c66bc AJ |
593 | #define REAL_YSCALEYUV2PACKED1b(index, c) \ |
594 | "xor "#index", "#index" \n\t"\ | |
4bff9ef9 | 595 | ASMALIGN(4)\ |
25593e29 | 596 | "1: \n\t"\ |
e54d94ba MN |
597 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
598 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
599 | "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
600 | "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
25593e29 MN |
601 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
602 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
603 | "psrlw $8, %%mm3 \n\t" \ | |
604 | "psrlw $8, %%mm4 \n\t" \ | |
e54d94ba MN |
605 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
606 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
25593e29 | 607 | "psraw $7, %%mm1 \n\t" \ |
6a4970ab | 608 | "psraw $7, %%mm7 \n\t" |
6e1c66bc | 609 | #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) |
6a4970ab | 610 | |
/*
 * REAL_YSCALEYUV2RGB1b(index, c): like YSCALEYUV2RGB1 for luma (only buf0, %0,
 * shifted right by 4), but chroma is the sum of the two chroma lines
 * (%2 = uvbuf0, %3 = uvbuf1) shifted right by 5 with a logical shift (psrlw);
 * the existing FIXME marks that sum as able to overflow.
 * index is the loop-counter register (zeroed here); c is the base register for
 * the *_OFFSET / *_COEFF displacements. The conversion tail matches YSCALEYUV2RGB.
 * Out: mm2 = B, mm4 = G, mm5 = R (unsigned bytes), mm7 = 0.
 * Loop label 1 is opened here; the branch back comes from code appended afterwards.
 * YSCALEYUV2RGB1b is the extra expansion level placed in front of the stringizing.
 */
497d4f99 | 611 | // do vertical chrominance interpolation |
6e1c66bc AJ |
612 | #define REAL_YSCALEYUV2RGB1b(index, c) \ |
613 | "xor "#index", "#index" \n\t"\ | |
4bff9ef9 | 614 | ASMALIGN(4)\ |
497d4f99 | 615 | "1: \n\t"\ |
e54d94ba MN |
616 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
617 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
618 | "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
619 | "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
397c035e MN |
620 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
621 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
c1b0bfb4 MN |
622 | "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ |
623 | "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
e54d94ba MN |
624 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
625 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
497d4f99 MN |
626 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
627 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
e54d94ba MN |
628 | "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
629 | "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
497d4f99 | 630 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
e54d94ba MN |
631 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
632 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
d604bab9 MN |
633 | "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ |
634 | "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\ | |
e54d94ba MN |
635 | "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
636 | "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
637 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
638 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
639 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
640 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
d604bab9 MN |
641 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
642 | "paddw %%mm3, %%mm4 \n\t"\ | |
643 | "movq %%mm2, %%mm0 \n\t"\ | |
644 | "movq %%mm5, %%mm6 \n\t"\ | |
645 | "movq %%mm4, %%mm3 \n\t"\ | |
646 | "punpcklwd %%mm2, %%mm2 \n\t"\ | |
647 | "punpcklwd %%mm5, %%mm5 \n\t"\ | |
648 | "punpcklwd %%mm4, %%mm4 \n\t"\ | |
649 | "paddw %%mm1, %%mm2 \n\t"\ | |
650 | "paddw %%mm1, %%mm5 \n\t"\ | |
651 | "paddw %%mm1, %%mm4 \n\t"\ | |
652 | "punpckhwd %%mm0, %%mm0 \n\t"\ | |
653 | "punpckhwd %%mm6, %%mm6 \n\t"\ | |
654 | "punpckhwd %%mm3, %%mm3 \n\t"\ | |
655 | "paddw %%mm7, %%mm0 \n\t"\ | |
656 | "paddw %%mm7, %%mm6 \n\t"\ | |
657 | "paddw %%mm7, %%mm3 \n\t"\ | |
658 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
659 | "packuswb %%mm0, %%mm2 \n\t"\ | |
660 | "packuswb %%mm6, %%mm5 \n\t"\ | |
661 | "packuswb %%mm3, %%mm4 \n\t"\ | |
662 | "pxor %%mm7, %%mm7 \n\t" | |
6e1c66bc | 663 | #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) |
d604bab9 | 664 | |
/*
 * REAL_WRITEBGR32(dst, dstw, index): asm string fragment that writes 8 pixels of
 * 4 bytes each and closes the loop opened at label 1 by a preceding fragment.
 * In : mm2 = B, mm4 = G, mm5 = R (8 unsigned bytes each), mm7 = 0.
 * The bytes are interleaved into four quadwords holding two pixels each (memory
 * byte order B, G, R, 0) and stored with MOVNTQ at dst + index*4 + {0, 8, 16, 24}.
 * index is then advanced by 8 and the code jumps back to label 1 while index is
 * below dstw (unsigned compare, jb). mm0, mm1, mm3 and mm6 are used as scratch.
 * dst, dstw and index are stringized, so they must be spelled as asm operands.
 * WRITEBGR32 is the extra expansion level placed in front of the stringizing.
 */
6e1c66bc | 665 | #define REAL_WRITEBGR32(dst, dstw, index) \ |
d604bab9 MN |
666 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
667 | "movq %%mm2, %%mm1 \n\t" /* B */\ | |
668 | "movq %%mm5, %%mm6 \n\t" /* R */\ | |
669 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
670 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
671 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
672 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
673 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
674 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
675 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
676 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
677 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
678 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
679 | \ | |
6542b44e MN |
680 | MOVNTQ(%%mm0, (dst, index, 4))\ |
681 | MOVNTQ(%%mm2, 8(dst, index, 4))\ | |
682 | MOVNTQ(%%mm1, 16(dst, index, 4))\ | |
683 | MOVNTQ(%%mm3, 24(dst, index, 4))\ | |
d604bab9 | 684 | \ |
6e1c66bc AJ |
685 | "add $8, "#index" \n\t"\ |
686 | "cmp "#dstw", "#index" \n\t"\ | |
d604bab9 | 687 | " jb 1b \n\t" |
6e1c66bc | 688 | #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) |
d604bab9 | 689 | |
/*
 * WRITEBGR16(dst, dstw, index): inline-asm fragment that packs 8 pixels into
 * 16-bit words laid out as R[7:3]<<11 | G[7:2]<<5 | B[7:3] and stores them
 * (16 bytes) at dst + index*2.
 *
 * On entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Modifies mm1-mm5; mm7 is only read.
 *
 * After the stores, index is advanced by 8 and the fragment branches back to
 * the local label "1" while index < dstw (unsigned compare).  The label,
 * MOVNTQ, MANGLE and the bF8/bFC mask constants must be provided by the
 * surrounding code.
 *
 * The REAL_ indirection makes the preprocessor expand the arguments before
 * they reach the '#' stringification operator.
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
    /* drop the low bits that do not fit (5 bits B/R, 6 bits G) */\
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    /* the low 3 bits are already 0, so nothing leaks between bytes */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    /* G widened to words; B (low byte) paired with R (high byte) */\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    /* move G up to bits 5..10 */\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
d604bab9 | 717 | |
/*
 * WRITEBGR15(dst, dstw, index): inline-asm fragment that packs 8 pixels into
 * 16-bit words laid out as R[7:3]<<10 | G[7:3]<<5 | B[7:3] (top bit 0) and
 * stores them (16 bytes) at dst + index*2.
 *
 * On entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Modifies mm1-mm5; mm7 is only read.
 *
 * After the stores, index is advanced by 8 and the fragment branches back to
 * the local label "1" while index < dstw (unsigned compare).  The label,
 * MOVNTQ, MANGLE and the bF8 mask constant must be provided by the
 * surrounding code.
 *
 * The REAL_ indirection makes the preprocessor expand the arguments before
 * they reach the '#' stringification operator.
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
    /* keep the top 5 bits of every component */\
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    /* the low 3 bits are already 0, so nothing leaks between bytes */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    /* G widened to words; B (low byte) paired with R (high byte) */\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    /* move G up to bits 5..9 */\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
f62255fb | 746 | |
/*
 * WRITEBGR24OLD(dst, dstw, index): inline-asm fragment that converts 8 pixels
 * to packed 24 bit (memory byte order B, G, R) using shifts and the
 * bm00000111 / bm11111000 / bm00001111 byte masks, and stores 24 bytes.
 *
 * On entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Modifies mm0-mm6; mm7 is only read.
 *
 * dst is written through (dst), 8(dst), 16(dst) and is then incremented by
 * 24 by the fragment itself, so it has to name something "add" can modify.
 * index is advanced by 8 and the fragment branches back to the local label
 * "1" while index < dstw (unsigned compare).  The label, MOVNTQ, MANGLE and
 * the mask constants must be provided by the surrounding code.
 *
 * Unlike the 16/32 bit writers there is no REAL_ indirection here, so the
 * arguments are stringified exactly as written at the call site.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    /* first output quadword: pixels 0,1 and the low 2 bytes of pixel 2 */\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    /* second output quadword */\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    /* third output quadword */\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
802 | ||
/*
 * WRITEBGR24MMX(dst, dstw, index): inline-asm fragment that converts 8 pixels
 * to packed 24 bit (memory byte order B, G, R) with plain MMX shifts and
 * unpacks (no mask constants) and stores 24 bytes.
 *
 * On entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 * Modifies mm0-mm7; in particular mm7 no longer holds zero afterwards.
 *
 * dst is written through (dst), 8(dst), 16(dst) and is then incremented by
 * 24 by the fragment itself, so it has to name something "add" can modify.
 * index is advanced by 8 and the fragment branches back to the local label
 * "1" while index < dstw (unsigned compare).  The label and MOVNTQ must be
 * provided by the surrounding code.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    /* squeeze the padding byte out of every pixel pair */\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
855 | ||
/*
 * WRITEBGR24MMX2(dst, dstw, index): inline-asm fragment that converts 8
 * pixels to packed 24 bit using pshufw and the M24A / M24B / M24C byte masks,
 * and stores 24 bytes.  Each of the three output quadwords is built by
 * shuffling B, G and R so the wanted bytes sit in their final position,
 * masking everything else away and OR-ing the three results.
 *
 * On entry: mm2 = B, mm4 = G, mm5 = R (8 bytes each).  mm7 is overwritten
 * with a mask before it is ever read, so its entry value does not matter.
 * Modifies mm0, mm1, mm3, mm4, mm6 and mm7; mm2 and mm5 are only read.
 *
 * dst is written through (dst), 8(dst), 16(dst) and is then incremented by
 * 24 by the fragment itself, so it has to name something "add" can modify.
 * index is advanced by 8 and the fragment branches back to the local label
 * "1" while index < dstw (unsigned compare).  The label, MOVNTQ, MANGLE and
 * the M24x constants must be provided by the surrounding code.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    /* G stays shifted down by one byte for the remaining two quadwords */\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
903 | ||
/*
 * WRITEBGR24 selects the 24-bit writer for the current build: the
 * pshufw-based variant when HAVE_MMX2 is defined, the plain MMX one
 * otherwise.  Any previous definition is dropped first.
 */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
911 | ||
/*
 * WRITEYUY2(dst, dstw, index): inline-asm fragment that packs 8 pixels of
 * 16-bit samples into interleaved bytes and stores them (16 bytes) at
 * dst + index*2.
 *
 * On entry (all registers hold 4 signed words):
 *   mm1, mm7 = samples written to the even output bytes (first / second
 *              group of four),
 *   mm3, mm4 = samples written alternately to the odd output bytes
 *              (mm3 first, then mm4).
 * For a Y U Y V byte layout that makes mm1/mm7 luma, mm3 U and mm4 V.
 * Words are narrowed with unsigned saturation (packuswb).
 * Modifies mm1, mm3, mm4 and mm7.
 *
 * After the stores, index is advanced by 8 and the fragment branches back to
 * the local label "1" while index < dstw (unsigned compare).  The label and
 * MOVNTQ must be provided by the surrounding code.
 *
 * The REAL_ indirection makes the preprocessor expand the arguments before
 * they reach the '#' stringification operator.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    /* mm3 = bytes alternating between old mm3 and old mm4 */\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
25593e29 MN |
928 | |
929 | ||
77a49659 | 930 | static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
c1b0bfb4 | 931 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
7f526efd | 932 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) |
38858470 | 933 | { |
c1b0bfb4 | 934 | #ifdef HAVE_MMX |
bca11e75 MN |
935 | if(c->flags & SWS_ACCURATE_RND){ |
936 | if(uDest){ | |
937 | YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) | |
938 | YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | |
939 | } | |
940 | ||
941 | YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW) | |
942 | }else{ | |
943 | if(uDest){ | |
944 | YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) | |
945 | YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | |
946 | } | |
947 | ||
948 | YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW) | |
949 | } | |
c1b0bfb4 | 950 | #else |
a2faa401 RD |
951 | #ifdef HAVE_ALTIVEC |
952 | yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
953 | chrFilter, chrSrc, chrFilterSize, | |
954 | dest, uDest, vDest, dstW, chrDstW); | |
955 | #else //HAVE_ALTIVEC | |
5859233b | 956 | yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
e3d2500f | 957 | chrFilter, chrSrc, chrFilterSize, |
5859233b | 958 | dest, uDest, vDest, dstW, chrDstW); |
a2faa401 | 959 | #endif //!HAVE_ALTIVEC |
7630f2e0 | 960 | #endif |
c1b0bfb4 | 961 | } |
2add307d | 962 | |
6118e52e VS |
963 | static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
964 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
965 | uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
966 | { | |
967 | yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
968 | chrFilter, chrSrc, chrFilterSize, | |
969 | dest, uDest, dstW, chrDstW, dstFormat); | |
970 | } | |
971 | ||
c1b0bfb4 | 972 | static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, |
7f526efd | 973 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) |
c1b0bfb4 MN |
974 | { |
975 | #ifdef HAVE_MMX | |
976 | if(uDest != NULL) | |
38858470 | 977 | { |
c1b0bfb4 MN |
978 | asm volatile( |
979 | YSCALEYUV2YV121 | |
e616aa93 | 980 | :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), |
7f526efd | 981 | "g" (-chrDstW) |
6e1c66bc | 982 | : "%"REG_a |
c1b0bfb4 MN |
983 | ); |
984 | ||
985 | asm volatile( | |
986 | YSCALEYUV2YV121 | |
e616aa93 | 987 | :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), |
7f526efd | 988 | "g" (-chrDstW) |
6e1c66bc | 989 | : "%"REG_a |
c1b0bfb4 | 990 | ); |
38858470 MN |
991 | } |
992 | ||
c1b0bfb4 MN |
993 | asm volatile( |
994 | YSCALEYUV2YV121 | |
995 | :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
7f526efd | 996 | "g" (-dstW) |
6e1c66bc | 997 | : "%"REG_a |
c1b0bfb4 MN |
998 | ); |
999 | #else | |
c1b0bfb4 MN |
1000 | int i; |
1001 | for(i=0; i<dstW; i++) | |
38858470 | 1002 | { |
c1b0bfb4 | 1003 | int val= lumSrc[i]>>7; |
6a4970ab | 1004 | |
44c1035c MN |
1005 | if(val&256){ |
1006 | if(val<0) val=0; | |
1007 | else val=255; | |
1008 | } | |
c1b0bfb4 | 1009 | |
44c1035c | 1010 | dest[i]= val; |
c1b0bfb4 MN |
1011 | } |
1012 | ||
1013 | if(uDest != NULL) | |
e616aa93 | 1014 | for(i=0; i<chrDstW; i++) |
38858470 | 1015 | { |
c1b0bfb4 MN |
1016 | int u=chrSrc[i]>>7; |
1017 | int v=chrSrc[i + 2048]>>7; | |
1018 | ||
44c1035c MN |
1019 | if((u|v)&256){ |
1020 | if(u<0) u=0; | |
1021 | else if (u>255) u=255; | |
1022 | if(v<0) v=0; | |
1023 | else if (v>255) v=255; | |
1024 | } | |
1025 | ||
1026 | uDest[i]= u; | |
1027 | vDest[i]= v; | |
38858470 | 1028 | } |
c1b0bfb4 | 1029 | #endif |
38858470 MN |
1030 | } |
1031 | ||
c1b0bfb4 | 1032 | |
d604bab9 MN |
1033 | /** |
1034 | * vertical scale YV12 to RGB | |
1035 | */ | |
25593e29 | 1036 | static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
c1b0bfb4 | 1037 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
065ee1ec | 1038 | uint8_t *dest, long dstW, long dstY) |
c1b0bfb4 | 1039 | { |
bca11e75 | 1040 | #ifdef HAVE_MMX |
f8d61128 | 1041 | long dummy=0; |
bca11e75 MN |
1042 | if(c->flags & SWS_ACCURATE_RND){ |
1043 | switch(c->dstFormat){ | |
e9e12f0e | 1044 | case PIX_FMT_RGB32: |
8422aa88 MN |
1045 | YSCALEYUV2PACKEDX_ACCURATE |
1046 | YSCALEYUV2RGBX | |
bca11e75 MN |
1047 | WRITEBGR32(%4, %5, %%REGa) |
1048 | ||
8422aa88 | 1049 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1050 | return; |
e9e12f0e | 1051 | case PIX_FMT_BGR24: |
8422aa88 MN |
1052 | YSCALEYUV2PACKEDX_ACCURATE |
1053 | YSCALEYUV2RGBX | |
83c89c78 JT |
1054 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1055 | "add %4, %%"REG_c" \n\t" | |
1056 | WRITEBGR24(%%REGc, %5, %%REGa) | |
bca11e75 | 1057 | |
8422aa88 | 1058 | |
6a4970ab | 1059 | :: "r" (&c->redDither), |
bca11e75 MN |
1060 | "m" (dummy), "m" (dummy), "m" (dummy), |
1061 | "r" (dest), "m" (dstW) | |
83c89c78 | 1062 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
bca11e75 MN |
1063 | ); |
1064 | return; | |
e9e12f0e | 1065 | case PIX_FMT_BGR555: |
8422aa88 MN |
1066 | YSCALEYUV2PACKEDX_ACCURATE |
1067 | YSCALEYUV2RGBX | |
bca11e75 MN |
1068 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1069 | #ifdef DITHER1XBPP | |
1070 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1071 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1072 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1073 | #endif | |
1074 | ||
1075 | WRITEBGR15(%4, %5, %%REGa) | |
8422aa88 | 1076 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1077 | return; |
e9e12f0e | 1078 | case PIX_FMT_BGR565: |
8422aa88 MN |
1079 | YSCALEYUV2PACKEDX_ACCURATE |
1080 | YSCALEYUV2RGBX | |
bca11e75 MN |
1081 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1082 | #ifdef DITHER1XBPP | |
1083 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1084 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1085 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1086 | #endif | |
1087 | ||
1088 | WRITEBGR16(%4, %5, %%REGa) | |
8422aa88 | 1089 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1090 | return; |
e9e12f0e | 1091 | case PIX_FMT_YUYV422: |
bca11e75 MN |
1092 | YSCALEYUV2PACKEDX_ACCURATE |
1093 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1094 | ||
1095 | "psraw $3, %%mm3 \n\t" | |
1096 | "psraw $3, %%mm4 \n\t" | |
1097 | "psraw $3, %%mm1 \n\t" | |
1098 | "psraw $3, %%mm7 \n\t" | |
1099 | WRITEYUY2(%4, %5, %%REGa) | |
8422aa88 | 1100 | YSCALEYUV2PACKEDX_END |
bca11e75 MN |
1101 | return; |
1102 | } | |
1103 | }else{ | |
cf7d1c1a | 1104 | switch(c->dstFormat) |
c1b0bfb4 | 1105 | { |
e9e12f0e | 1106 | case PIX_FMT_RGB32: |
8422aa88 MN |
1107 | YSCALEYUV2PACKEDX |
1108 | YSCALEYUV2RGBX | |
6e1c66bc | 1109 | WRITEBGR32(%4, %5, %%REGa) |
8422aa88 | 1110 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1111 | return; |
e9e12f0e | 1112 | case PIX_FMT_BGR24: |
8422aa88 MN |
1113 | YSCALEYUV2PACKEDX |
1114 | YSCALEYUV2RGBX | |
83c89c78 JT |
1115 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1116 | "add %4, %%"REG_c" \n\t" | |
1117 | WRITEBGR24(%%REGc, %5, %%REGa) | |
c1b0bfb4 | 1118 | |
6a4970ab | 1119 | :: "r" (&c->redDither), |
77a49659 MN |
1120 | "m" (dummy), "m" (dummy), "m" (dummy), |
1121 | "r" (dest), "m" (dstW) | |
83c89c78 | 1122 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
c1b0bfb4 | 1123 | ); |
bca11e75 | 1124 | return; |
e9e12f0e | 1125 | case PIX_FMT_BGR555: |
8422aa88 MN |
1126 | YSCALEYUV2PACKEDX |
1127 | YSCALEYUV2RGBX | |
c1b0bfb4 MN |
1128 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1129 | #ifdef DITHER1XBPP | |
9b464428 FB |
1130 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1131 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1132 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
c1b0bfb4 MN |
1133 | #endif |
1134 | ||
6e1c66bc | 1135 | WRITEBGR15(%4, %5, %%REGa) |
8422aa88 | 1136 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1137 | return; |
e9e12f0e | 1138 | case PIX_FMT_BGR565: |
8422aa88 MN |
1139 | YSCALEYUV2PACKEDX |
1140 | YSCALEYUV2RGBX | |
c1b0bfb4 MN |
1141 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1142 | #ifdef DITHER1XBPP | |
9b464428 FB |
1143 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1144 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1145 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
c1b0bfb4 MN |
1146 | #endif |
1147 | ||
6e1c66bc | 1148 | WRITEBGR16(%4, %5, %%REGa) |
8422aa88 | 1149 | YSCALEYUV2PACKEDX_END |
bca11e75 | 1150 | return; |
e9e12f0e | 1151 | case PIX_FMT_YUYV422: |
25593e29 MN |
1152 | YSCALEYUV2PACKEDX |
1153 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1154 | ||
1155 | "psraw $3, %%mm3 \n\t" | |
1156 | "psraw $3, %%mm4 \n\t" | |
1157 | "psraw $3, %%mm1 \n\t" | |
1158 | "psraw $3, %%mm7 \n\t" | |
6e1c66bc | 1159 | WRITEYUY2(%4, %5, %%REGa) |
8422aa88 | 1160 | YSCALEYUV2PACKEDX_END |
bca11e75 MN |
1161 | return; |
1162 | } | |
1163 | } | |
c1b0bfb4 | 1164 | #endif |
a31de956 | 1165 | #ifdef HAVE_ALTIVEC |
b9a6fae9 AC |
1166 | /* The following list of supported dstFormat values should |
1167 | match what's found in the body of altivec_yuv2packedX() */ | |
e9e12f0e LA |
1168 | if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || |
1169 | c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || | |
1170 | c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB) | |
b9a6fae9 AC |
1171 | altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, |
1172 | chrFilter, chrSrc, chrFilterSize, | |
1173 | dest, dstW, dstY); | |
1174 | else | |
a31de956 | 1175 | #endif |
b9a6fae9 AC |
1176 | yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, |
1177 | chrFilter, chrSrc, chrFilterSize, | |
1178 | dest, dstW, dstY); | |
c1b0bfb4 MN |
1179 | } |
1180 | ||
c1b0bfb4 MN |
1181 | /** |
1182 | * vertical bilinear scale YV12 to RGB | |
1183 | */ | |
25593e29 | 1184 | static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, |
cf7d1c1a | 1185 | uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) |
d604bab9 MN |
1186 | { |
1187 | int yalpha1=yalpha^4095; | |
1188 | int uvalpha1=uvalpha^4095; | |
cf7d1c1a | 1189 | int i; |
d604bab9 | 1190 | |
77a416e8 | 1191 | #if 0 //isn't used |
1e621b18 | 1192 | if(flags&SWS_FULL_CHR_H_INT) |
d604bab9 | 1193 | { |
cf7d1c1a | 1194 | switch(dstFormat) |
d604bab9 | 1195 | { |
cf7d1c1a | 1196 | #ifdef HAVE_MMX |
e9e12f0e | 1197 | case PIX_FMT_RGB32: |
d604bab9 MN |
1198 | asm volatile( |
1199 | ||
1200 | ||
1201 | FULL_YSCALEYUV2RGB | |
1202 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1203 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1204 | ||
1205 | "movq %%mm3, %%mm1 \n\t" | |
1206 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1207 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1208 | ||
6e1c66bc AJ |
1209 | MOVNTQ(%%mm3, (%4, %%REGa, 4)) |
1210 | MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
d604bab9 | 1211 | |
6e1c66bc AJ |
1212 | "add $4, %%"REG_a" \n\t" |
1213 | "cmp %5, %%"REG_a" \n\t" | |
d604bab9 MN |
1214 | " jb 1b \n\t" |
1215 | ||
1216 | ||
6e1c66bc | 1217 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), |
d604bab9 | 1218 | "m" (yalpha1), "m" (uvalpha1) |
6e1c66bc | 1219 | : "%"REG_a |
d604bab9 | 1220 | ); |
cf7d1c1a | 1221 | break; |
e9e12f0e | 1222 | case PIX_FMT_BGR24: |
d604bab9 MN |
1223 | asm volatile( |
1224 | ||
1225 | FULL_YSCALEYUV2RGB | |
1226 | ||
1227 | // lsb ... msb | |
1228 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1229 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1230 | ||
1231 | "movq %%mm3, %%mm1 \n\t" | |
1232 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1233 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1234 | ||
1235 | "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
1236 | "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
9b464428 FB |
1237 | "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 |
1238 | "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
d604bab9 MN |
1239 | "por %%mm2, %%mm3 \n\t" // BGRBGR00 |
1240 | "movq %%mm1, %%mm2 \n\t" | |
1241 | "psllq $48, %%mm1 \n\t" // 000000BG | |
1242 | "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
1243 | ||
1244 | "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
1245 | "psrld $16, %%mm2 \n\t" // R000R000 | |
1246 | "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1247 | "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
1248 | ||
6e1c66bc AJ |
1249 | "mov %4, %%"REG_b" \n\t" |
1250 | "add %%"REG_a", %%"REG_b" \n\t" | |
d604bab9 MN |
1251 | |
1252 | #ifdef HAVE_MMX2 | |
1253 | //FIXME Alignment | |
6e1c66bc AJ |
1254 | "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t" |
1255 | "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t" | |
d604bab9 | 1256 | #else |
6e1c66bc | 1257 | "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" |
d604bab9 | 1258 | "psrlq $32, %%mm3 \n\t" |
6e1c66bc AJ |
1259 | "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" |
1260 | "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
d604bab9 | 1261 | #endif |
6e1c66bc AJ |
1262 | "add $4, %%"REG_a" \n\t" |
1263 | "cmp %5, %%"REG_a" \n\t" | |
d604bab9 MN |
1264 | " jb 1b \n\t" |
1265 | ||
d1fac6cf | 1266 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
d604bab9 | 1267 | "m" (yalpha1), "m" (uvalpha1) |
6e1c66bc | 1268 | : "%"REG_a, "%"REG_b |
d604bab9 | 1269 | ); |
cf7d1c1a | 1270 | break; |
e9e12f0e | 1271 | case PIX_FMT_BGR555: |
d604bab9 MN |
1272 | asm volatile( |
1273 | ||
1274 | FULL_YSCALEYUV2RGB | |
1275 | #ifdef DITHER1XBPP | |
9b464428 FB |
1276 | "paddusb "MANGLE(g5Dither)", %%mm1\n\t" |
1277 | "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1278 | "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
d604bab9 MN |
1279 | #endif |
1280 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1281 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1282 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1283 | ||
1284 | "psrlw $3, %%mm3 \n\t" | |
1285 | "psllw $2, %%mm1 \n\t" | |
1286 | "psllw $7, %%mm0 \n\t" | |
9b464428 FB |
1287 | "pand "MANGLE(g15Mask)", %%mm1 \n\t" |
1288 | "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
d604bab9 MN |
1289 | |
1290 | "por %%mm3, %%mm1 \n\t" | |
1291 | "por %%mm1, %%mm0 \n\t" | |
1292 | ||
6e1c66bc | 1293 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) |
d604bab9 | 1294 | |
6e1c66bc AJ |
1295 | "add $4, %%"REG_a" \n\t" |
1296 | "cmp %5, %%"REG_a" \n\t" | |
d604bab9 MN |
1297 | " jb 1b \n\t" |
1298 | ||
d1fac6cf | 1299 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
d604bab9 | 1300 | "m" (yalpha1), "m" (uvalpha1) |
6e1c66bc | 1301 | : "%"REG_a |
d604bab9 | 1302 | ); |
cf7d1c1a | 1303 | break; |
e9e12f0e | 1304 | case PIX_FMT_BGR565: |
d604bab9 MN |
1305 | asm volatile( |
1306 | ||
1307 | FULL_YSCALEYUV2RGB | |
1308 | #ifdef DITHER1XBPP | |
9b464428 FB |
1309 | "paddusb "MANGLE(g6Dither)", %%mm1\n\t" |
1310 | "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1311 | "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
d604bab9 MN |
1312 | #endif |
1313 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1314 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1315 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1316 | ||
1317 | "psrlw $3, %%mm3 \n\t" | |
1318 | "psllw $3, %%mm1 \n\t" | |
1319 | "psllw $8, %%mm0 \n\t" | |
9b464428 FB |
1320 | "pand "MANGLE(g16Mask)", %%mm1 \n\t" |
1321 | "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
d604bab9 MN |
1322 | |
1323 | "por %%mm3, %%mm1 \n\t" | |
1324 | "por %%mm1, %%mm0 \n\t" | |
1325 | ||
6e1c66bc | 1326 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) |
d604bab9 | 1327 | |
6e1c66bc AJ |
1328 | "add $4, %%"REG_a" \n\t" |
1329 | "cmp %5, %%"REG_a" \n\t" | |
d604bab9 MN |
1330 | " jb 1b \n\t" |
1331 | ||
d1fac6cf | 1332 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
d604bab9 | 1333 | "m" (yalpha1), "m" (uvalpha1) |
6e1c66bc | 1334 | : "%"REG_a |
d604bab9 | 1335 | ); |
cf7d1c1a MN |
1336 | break; |
1337 | #endif | |
e9e12f0e | 1338 | case PIX_FMT_BGR32: |
cf7d1c1a | 1339 | #ifndef HAVE_MMX |
e9e12f0e | 1340 | case PIX_FMT_RGB32: |
cf7d1c1a | 1341 | #endif |
e9e12f0e | 1342 | if(dstFormat==PIX_FMT_RGB32) |
28bf81c9 | 1343 | { |
2ba1bff0 | 1344 | int i; |
df3c183a MN |
1345 | #ifdef WORDS_BIGENDIAN |
1346 | dest++; | |
1347 | #endif | |
28bf81c9 MN |
1348 | for(i=0;i<dstW;i++){ |
1349 | // vertical linear interpolation && yuv2rgb in a single step: | |
1350 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1351 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1352 | int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1353 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1354 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1355 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1356 | dest+= 4; | |
1357 | } | |
1358 | } | |
e9e12f0e | 1359 | else if(dstFormat==PIX_FMT_BGR24) |
d604bab9 | 1360 | { |
96034638 | 1361 | int i; |
d1fac6cf | 1362 | for(i=0;i<dstW;i++){ |
d604bab9 MN |
1363 | // vertical linear interpolation && yuv2rgb in a single step: |
1364 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1365 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1366 | int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
390b20a6 MN |
1367 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1368 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1369 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
28bf81c9 | 1370 | dest+= 3; |
d604bab9 MN |
1371 | } |
1372 | } | |
e9e12f0e | 1373 | else if(dstFormat==PIX_FMT_BGR565) |
d604bab9 | 1374 | { |
96034638 | 1375 | int i; |
d1fac6cf | 1376 | for(i=0;i<dstW;i++){ |
d604bab9 MN |
1377 | // vertical linear interpolation && yuv2rgb in a single step: |
1378 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1379 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1380 | int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1381 | ||
d022ce5c | 1382 | ((uint16_t*)dest)[i] = |
b18ea156 MN |
1383 | clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1384 | clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1385 | clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
d604bab9 MN |
1386 | } |
1387 | } | |
e9e12f0e | 1388 | else if(dstFormat==PIX_FMT_BGR555) |
d604bab9 | 1389 | { |
96034638 | 1390 | int i; |
d1fac6cf | 1391 | for(i=0;i<dstW;i++){ |
d604bab9 MN |
1392 | // vertical linear interpolation && yuv2rgb in a single step: |
1393 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1394 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1395 | int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1396 | ||
d022ce5c | 1397 | ((uint16_t*)dest)[i] = |
b18ea156 MN |
1398 | clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1399 | clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1400 | clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
d604bab9 MN |
1401 | } |
1402 | } | |
d604bab9 MN |
1403 | }//FULL_UV_IPOL |
1404 | else | |
1405 | { | |
cf7d1c1a | 1406 | #endif // if 0 |
d604bab9 | 1407 | #ifdef HAVE_MMX |
cf7d1c1a MN |
1408 | switch(c->dstFormat) |
1409 | { | |
77a416e8 | 1410 | //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( |
e9e12f0e | 1411 | case PIX_FMT_RGB32: |
d604bab9 | 1412 | asm volatile( |
46fe31a0 MN |
1413 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1414 | "mov %4, %%"REG_b" \n\t" | |
1415 | "push %%"REG_BP" \n\t" | |
1416 | YSCALEYUV2RGB(%%REGBP, %5) | |
1417 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1418 | "pop %%"REG_BP" \n\t" | |
1419 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1420 | ||
1421 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1422 | "a" (&c->redDither) | |
d604bab9 | 1423 | ); |
cf7d1c1a | 1424 | return; |
e9e12f0e | 1425 | case PIX_FMT_BGR24: |
d604bab9 | 1426 | asm volatile( |
46fe31a0 MN |
1427 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1428 | "mov %4, %%"REG_b" \n\t" | |
1429 | "push %%"REG_BP" \n\t" | |
1430 | YSCALEYUV2RGB(%%REGBP, %5) | |
1431 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1432 | "pop %%"REG_BP" \n\t" | |
1433 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1434 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1435 | "a" (&c->redDither) | |
d604bab9 | 1436 | ); |
cf7d1c1a | 1437 | return; |
e9e12f0e | 1438 | case PIX_FMT_BGR555: |
d604bab9 | 1439 | asm volatile( |
46fe31a0 MN |
1440 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1441 | "mov %4, %%"REG_b" \n\t" | |
1442 | "push %%"REG_BP" \n\t" | |
1443 | YSCALEYUV2RGB(%%REGBP, %5) | |
d604bab9 MN |
1444 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1445 | #ifdef DITHER1XBPP | |
9b464428 FB |
1446 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1447 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1448 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
d604bab9 MN |
1449 | #endif |
1450 | ||
46fe31a0 MN |
1451 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP) |
1452 | "pop %%"REG_BP" \n\t" | |
1453 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
d604bab9 | 1454 | |
46fe31a0 MN |
1455 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), |
1456 | "a" (&c->redDither) | |
d604bab9 | 1457 | ); |
cf7d1c1a | 1458 | return; |
e9e12f0e | 1459 | case PIX_FMT_BGR565: |
d604bab9 | 1460 | asm volatile( |
46fe31a0 MN |
1461 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1462 | "mov %4, %%"REG_b" \n\t" | |
1463 | "push %%"REG_BP" \n\t" | |
1464 | YSCALEYUV2RGB(%%REGBP, %5) | |
d604bab9 MN |
1465 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1466 | #ifdef DITHER1XBPP | |
9b464428 FB |
1467 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1468 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1469 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
d604bab9 MN |
1470 | #endif |
1471 | ||
46fe31a0 MN |
1472 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP) |
1473 | "pop %%"REG_BP" \n\t" | |
1474 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1475 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1476 | "a" (&c->redDither) | |
d604bab9 | 1477 | ); |
cf7d1c1a | 1478 | return; |
e9e12f0e | 1479 | case PIX_FMT_YUYV422: |
25593e29 | 1480 | asm volatile( |
46fe31a0 MN |
1481 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" |
1482 | "mov %4, %%"REG_b" \n\t" | |
1483 | "push %%"REG_BP" \n\t" | |
1484 | YSCALEYUV2PACKED(%%REGBP, %5) | |
1485 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1486 | "pop %%"REG_BP" \n\t" | |
1487 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1488 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1489 | "a" (&c->redDither) | |
25593e29 MN |
1490 | ); |
1491 | return; | |
cf7d1c1a MN |
1492 | default: break; |
1493 | } | |
1494 | #endif //HAVE_MMX | |
25593e29 | 1495 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) |
d604bab9 MN |
1496 | } |
1497 | ||
1498 | /** | |
1499 | * YV12 to RGB without scaling or interpolating | |
1500 | */ | |
/*
 * yuv2packed1: write one output line in a packed format (RGB32, BGR24,
 * BGR555, BGR565 or YUYV422 in the MMX paths) from a single luma line
 * `buf0` and two chroma lines `uvbuf0`/`uvbuf1`.
 *
 * c         scaler context; the asm addresses it through &c->redDither
 * buf0      luma input line
 * uvbuf0/1  chroma input lines
 * dest      output line
 * dstW      output width in pixels
 * uvalpha   chroma blend weight; only compared against 2048 here to pick
 *           between the "1" and "1b" conversion variants
 * dstFormat PIX_FMT_* of the destination
 * flags     SWS_* flags; SWS_FULL_CHR_H_INT diverts to yuv2packed2
 * y         output line number, forwarded to yuv2packed2
 *
 * NOTE(review): yalpha1, buf1, yalpha and i are not referenced by the
 * visible statements; presumably the YSCALE_YUV_2_* macros use them --
 * confirm against the macro definitions before removing.
 */
25593e29 | 1501 | static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
cf7d1c1a | 1502 | uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
d604bab9 | 1503 | {
c1b0bfb4 | 1504 | const int yalpha1=0;
cf7d1c1a | 1505 | int i;
6a4970ab | 1506 |
cf7d1c1a MN |
1507 | uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1508 | const int yalpha= 4096; //FIXME ... | |
96034638 | 1509 |
/* Full chroma interpolation is not implemented here: reuse yuv2packed2
   with buf0 passed as both luma lines and a luma weight of 0. */
1e621b18 | 1510 | if(flags&SWS_FULL_CHR_H_INT)
d604bab9 | 1511 | {
25593e29 | 1512 | RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
d604bab9 MN |
1513 | return;
1514 | } | |
397c035e MN |
1515 |
1516 | #ifdef HAVE_MMX | |
/* Every asm statement below follows the same frame:
     - save REG_b into the context at ESP_OFFSET(%5), then load dest (%4) into it
     - push REG_BP, run the conversion + writer macros, pop REG_BP
     - restore REG_b from ESP_OFFSET(%5)
   %5 is &c->redDither; 8280(%5) is handed to the writer macros (a comment
   earlier in this file notes 8280 == DSTW_OFFSET).
   The switches have no default case, so formats not listed fall through to
   the C implementation after the #endif. */
497d4f99 MN |
1517 | if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1518 | { | |
cf7d1c1a | 1519 | switch(dstFormat)
d604bab9 | 1520 | {
e9e12f0e | 1521 | case PIX_FMT_RGB32:
d604bab9 | 1522 | asm volatile(
46fe31a0 MN |
1523 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1524 | "mov %4, %%"REG_b" \n\t" | |
1525 | "push %%"REG_BP" \n\t" | |
1526 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1527 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1528 | "pop %%"REG_BP" \n\t" | |
1529 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1530 | ||
1531 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1532 | "a" (&c->redDither) | |
d604bab9 | 1533 | );
cf7d1c1a | 1534 | return;
e9e12f0e | 1535 | case PIX_FMT_BGR24:
d604bab9 | 1536 | asm volatile(
46fe31a0 MN |
1537 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1538 | "mov %4, %%"REG_b" \n\t" | |
1539 | "push %%"REG_BP" \n\t" | |
1540 | YSCALEYUV2RGB1(%%REGBP, %5) | |
1541 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1542 | "pop %%"REG_BP" \n\t" | |
1543 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1544 | ||
1545 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1546 | "a" (&c->redDither) | |
d604bab9 | 1547 | );
cf7d1c1a | 1548 | return;
e9e12f0e | 1549 | case PIX_FMT_BGR555:
d604bab9 | 1550 | asm volatile(
46fe31a0 MN |
1551 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1552 | "mov %4, %%"REG_b" \n\t" | |
1553 | "push %%"REG_BP" \n\t" | |
1554 | YSCALEYUV2RGB1(%%REGBP, %5) | |
d604bab9 MN |
1555 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556 | #ifdef DITHER1XBPP | |
9b464428 FB |
1557 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1558 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1559 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
d604bab9 | 1560 | #endif
46fe31a0 MN |
1561 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1562 | "pop %%"REG_BP" \n\t" | |
1563 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
e54d94ba | 1564 |
46fe31a0 MN |
1565 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1566 | "a" (&c->redDither) | |
d604bab9 | 1567 | );
cf7d1c1a | 1568 | return;
e9e12f0e | 1569 | case PIX_FMT_BGR565:
d604bab9 | 1570 | asm volatile(
46fe31a0 MN |
1571 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1572 | "mov %4, %%"REG_b" \n\t" | |
1573 | "push %%"REG_BP" \n\t" | |
1574 | YSCALEYUV2RGB1(%%REGBP, %5) | |
d604bab9 MN |
1575 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 | #ifdef DITHER1XBPP | |
9b464428 FB |
1577 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1578 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1579 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
d604bab9 MN |
1580 | #endif
1581 | ||
46fe31a0 MN |
1582 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1583 | "pop %%"REG_BP" \n\t" | |
1584 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
e54d94ba | 1585 |
46fe31a0 MN |
1586 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587 | "a" (&c->redDither) | |
d604bab9 | 1588 | );
cf7d1c1a | 1589 | return;
e9e12f0e | 1590 | case PIX_FMT_YUYV422:
25593e29 | 1591 | asm volatile(
46fe31a0 MN |
1592 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1593 | "mov %4, %%"REG_b" \n\t" | |
1594 | "push %%"REG_BP" \n\t" | |
1595 | YSCALEYUV2PACKED1(%%REGBP, %5) | |
1596 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1597 | "pop %%"REG_BP" \n\t" | |
1598 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1599 | ||
1600 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1601 | "a" (&c->redDither) | |
25593e29 MN |
1602 | );
1603 | return; | |
d604bab9 | 1604 | }
497d4f99 MN |
1605 | }
1606 | else | |
1607 | { | |
/* uvalpha >= 2048: same per-format dispatch, but using the "1b" conversion
   macros (YSCALEYUV2RGB1b / YSCALEYUV2PACKED1b). */
cf7d1c1a | 1608 | switch(dstFormat)
d604bab9 | 1609 | {
e9e12f0e | 1610 | case PIX_FMT_RGB32:
497d4f99 | 1611 | asm volatile(
46fe31a0 MN |
1612 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1613 | "mov %4, %%"REG_b" \n\t" | |
1614 | "push %%"REG_BP" \n\t" | |
1615 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1616 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1617 | "pop %%"REG_BP" \n\t" | |
1618 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1619 | ||
1620 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1621 | "a" (&c->redDither) | |
497d4f99 | 1622 | );
cf7d1c1a | 1623 | return;
e9e12f0e | 1624 | case PIX_FMT_BGR24:
497d4f99 | 1625 | asm volatile(
46fe31a0 MN |
1626 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1627 | "mov %4, %%"REG_b" \n\t" | |
1628 | "push %%"REG_BP" \n\t" | |
1629 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
1630 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1631 | "pop %%"REG_BP" \n\t" | |
1632 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1633 | ||
1634 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1635 | "a" (&c->redDither) | |
497d4f99 | 1636 | );
cf7d1c1a | 1637 | return;
e9e12f0e | 1638 | case PIX_FMT_BGR555:
497d4f99 | 1639 | asm volatile(
46fe31a0 MN |
1640 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1641 | "mov %4, %%"REG_b" \n\t" | |
1642 | "push %%"REG_BP" \n\t" | |
1643 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
497d4f99 MN |
1644 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645 | #ifdef DITHER1XBPP | |
9b464428 FB |
1646 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1647 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1648 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
497d4f99 | 1649 | #endif
46fe31a0 MN |
1650 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1651 | "pop %%"REG_BP" \n\t" | |
1652 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
e54d94ba | 1653 |
46fe31a0 MN |
1654 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1655 | "a" (&c->redDither) | |
497d4f99 | 1656 | );
cf7d1c1a | 1657 | return;
e9e12f0e | 1658 | case PIX_FMT_BGR565:
497d4f99 | 1659 | asm volatile(
46fe31a0 MN |
1660 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1661 | "mov %4, %%"REG_b" \n\t" | |
1662 | "push %%"REG_BP" \n\t" | |
1663 | YSCALEYUV2RGB1b(%%REGBP, %5) | |
497d4f99 MN |
1664 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 | #ifdef DITHER1XBPP | |
9b464428 FB |
1666 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1667 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1668 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
497d4f99 | 1669 | #endif
d604bab9 | 1670 |
46fe31a0 MN |
1671 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1672 | "pop %%"REG_BP" \n\t" | |
1673 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
e54d94ba | 1674 |
46fe31a0 MN |
1675 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1676 | "a" (&c->redDither) | |
497d4f99 | 1677 | );
cf7d1c1a | 1678 | return;
e9e12f0e | 1679 | case PIX_FMT_YUYV422:
25593e29 | 1680 | asm volatile(
46fe31a0 MN |
1681 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1682 | "mov %4, %%"REG_b" \n\t" | |
1683 | "push %%"REG_BP" \n\t" | |
1684 | YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1685 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1686 | "pop %%"REG_BP" \n\t" | |
1687 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1688 | ||
1689 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1690 | "a" (&c->redDither) | |
25593e29 MN |
1691 | );
1692 | return; | |
d604bab9 | 1693 | }
497d4f99 | 1694 | }
df3c183a | 1695 | #endif
/* Generic C path: reached without MMX, or for destination formats the MMX
   switches above do not handle. Same uvalpha threshold as the asm path. */
cf7d1c1a | 1696 | if( uvalpha < 2048 )
497d4f99 | 1697 | {
25593e29 | 1698 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
cf7d1c1a | 1699 | }else{
25593e29 | 1700 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
497d4f99 | 1701 | }
d604bab9 MN |
1702 | }
1703 | ||
6ff0ad6b MN |
1704 | //FIXME yuy2* can read up to 7 samples too many
1705 | ||
7f526efd | 1706 | static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) |
1e621b18 | 1707 | { |
6ff0ad6b MN |
1708 | #ifdef HAVE_MMX |
1709 | asm volatile( | |
1710 | "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
6e1c66bc | 1711 | "mov %0, %%"REG_a" \n\t" |
6ff0ad6b | 1712 | "1: \n\t" |
6e1c66bc AJ |
1713 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" |
1714 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
6ff0ad6b MN |
1715 | "pand %%mm2, %%mm0 \n\t" |
1716 | "pand %%mm2, %%mm1 \n\t" | |
1717 | "packuswb %%mm1, %%mm0 \n\t" | |
6e1c66bc AJ |
1718 | "movq %%mm0, (%2, %%"REG_a") \n\t" |
1719 | "add $8, %%"REG_a" \n\t" | |
6ff0ad6b | 1720 | " js 1b \n\t" |
7f526efd | 1721 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) |
6e1c66bc | 1722 | : "%"REG_a |
6ff0ad6b | 1723 | ); |
1e621b18 MN |
1724 | #else |
1725 | int i; | |
1726 | for(i=0; i<width; i++) | |
1727 | dst[i]= src[2*i]; | |
1728 | #endif | |
1729 | } | |
1730 | ||
7f526efd | 1731 | static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
1e621b18 | 1732 | { |
c2271987 | 1733 | #ifdef HAVE_MMX |
6ff0ad6b MN |
1734 | asm volatile( |
1735 | "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
6e1c66bc | 1736 | "mov %0, %%"REG_a" \n\t" |
6ff0ad6b | 1737 | "1: \n\t" |
6e1c66bc AJ |
1738 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" |
1739 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
6ff0ad6b MN |
1740 | "psrlw $8, %%mm0 \n\t" |
1741 | "psrlw $8, %%mm1 \n\t" | |
1742 | "packuswb %%mm1, %%mm0 \n\t" | |
1743 | "movq %%mm0, %%mm1 \n\t" | |
1744 | "psrlw $8, %%mm0 \n\t" | |
1745 | "pand %%mm4, %%mm1 \n\t" | |
1746 | "packuswb %%mm0, %%mm0 \n\t" | |
1747 | "packuswb %%mm1, %%mm1 \n\t" | |
c2271987 MN |
1748 | "movd %%mm0, (%3, %%"REG_a") \n\t" |
1749 | "movd %%mm1, (%2, %%"REG_a") \n\t" | |
6e1c66bc | 1750 | "add $4, %%"REG_a" \n\t" |
6ff0ad6b | 1751 | " js 1b \n\t" |
c2271987 | 1752 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) |
6e1c66bc | 1753 | : "%"REG_a |
6ff0ad6b | 1754 | ); |
1e621b18 MN |
1755 | #else |
1756 | int i; | |
1757 | for(i=0; i<width; i++) | |
1758 | { | |
c2271987 MN |
1759 | dstU[i]= src1[4*i + 1]; |
1760 | dstV[i]= src1[4*i + 3]; | |
1e621b18 MN |
1761 | } |
1762 | #endif | |
0683a5c5 | 1763 | assert(src1 == src2); |
1e621b18 MN |
1764 | } |
1765 | ||
7322a67c | 1766 | //this is almost identical to the previous, and exists only because yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
7f526efd | 1767 | static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) |
7322a67c MN |
1768 | { |
1769 | #ifdef HAVE_MMX | |
1770 | asm volatile( | |
6e1c66bc | 1771 | "mov %0, %%"REG_a" \n\t" |
7322a67c | 1772 | "1: \n\t" |
6e1c66bc AJ |
1773 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" |
1774 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
7322a67c MN |
1775 | "psrlw $8, %%mm0 \n\t" |
1776 | "psrlw $8, %%mm1 \n\t" | |
1777 | "packuswb %%mm1, %%mm0 \n\t" | |
6e1c66bc AJ |
1778 | "movq %%mm0, (%2, %%"REG_a") \n\t" |
1779 | "add $8, %%"REG_a" \n\t" | |
7322a67c | 1780 | " js 1b \n\t" |
7f526efd | 1781 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) |
6e1c66bc | 1782 | : "%"REG_a |
7322a67c MN |
1783 | ); |
1784 | #else | |
1785 | int i; | |
1786 | for(i=0; i<width; i++) | |
1787 | dst[i]= src[2*i+1]; | |
1788 | #endif | |
1789 | } | |
1790 | ||
7f526efd | 1791 | static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
7322a67c | 1792 | { |
c2271987 | 1793 | #ifdef HAVE_MMX |
7322a67c MN |
1794 | asm volatile( |
1795 | "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
6e1c66bc | 1796 | "mov %0, %%"REG_a" \n\t" |
7322a67c | 1797 | "1: \n\t" |
6e1c66bc AJ |
1798 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" |
1799 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
7322a67c MN |
1800 | "pand %%mm4, %%mm0 \n\t" |
1801 | "pand %%mm4, %%mm1 \n\t" | |
1802 | "packuswb %%mm1, %%mm0 \n\t" | |
1803 | "movq %%mm0, %%mm1 \n\t" | |
1804 | "psrlw $8, %%mm0 \n\t" | |
1805 | "pand %%mm4, %%mm1 \n\t" | |
1806 | "packuswb %%mm0, %%mm0 \n\t" | |
1807 | "packuswb %%mm1, %%mm1 \n\t" | |
c2271987 MN |
1808 | "movd %%mm0, (%3, %%"REG_a") \n\t" |
1809 | "movd %%mm1, (%2, %%"REG_a") \n\t" | |
6e1c66bc | 1810 | "add $4, %%"REG_a" \n\t" |
7322a67c | 1811 | " js 1b \n\t" |
c2271987 | 1812 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) |
6e1c66bc | 1813 | : "%"REG_a |
7322a67c MN |
1814 | ); |
1815 | #else | |
1816 | int i; | |
1817 | for(i=0; i<width; i++) | |
1818 | { | |
c2271987 MN |
1819 | dstU[i]= src1[4*i + 0]; |
1820 | dstV[i]= src1[4*i + 2]; | |
7322a67c MN |
1821 | } |
1822 | #endif | |
0683a5c5 | 1823 | assert(src1 == src2); |
7322a67c MN |
1824 | } |
1825 | ||
1e621b18 MN |
1826 | static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) |
1827 | { | |
1e621b18 MN |
1828 | int i; |
1829 | for(i=0; i<width; i++) | |
1830 | { | |
4e61e21c MN |
1831 | int b= ((uint32_t*)src)[i]&0xFF; |
1832 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
3e499f53 | 1833 | int r= (((uint32_t*)src)[i]>>16)&0xFF; |
1e621b18 | 1834 | |
4e61e21c | 1835 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
1e621b18 | 1836 | } |
1e621b18 MN |
1837 | } |
1838 | ||
1839 | static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1840 | { | |
1e621b18 | 1841 | int i; |
c2271987 | 1842 | assert(src1 == src2); |
1e621b18 MN |
1843 | for(i=0; i<width; i++) |
1844 | { | |
4e61e21c MN |
1845 | const int a= ((uint32_t*)src1)[2*i+0]; |
1846 | const int e= ((uint32_t*)src1)[2*i+1]; | |
c2271987 MN |
1847 | const int l= (a&0xFF00FF) + (e&0xFF00FF); |
1848 | const int h= (a&0x00FF00) + (e&0x00FF00); | |
4e61e21c MN |
1849 | const int b= l&0x3FF; |
1850 | const int g= h>>8; | |
1851 | const int r= l>>16; | |
1e621b18 | 1852 | |
c2271987 MN |
1853 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
1854 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
1e621b18 | 1855 | } |
1e621b18 MN |
1856 | } |
1857 | ||
7f526efd | 1858 | static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) |
1e621b18 | 1859 | { |
ac6a2e45 MN |
1860 | #ifdef HAVE_MMX |
1861 | asm volatile( | |
6e1c66bc | 1862 | "mov %2, %%"REG_a" \n\t" |
854288bb FB |
1863 | "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
1864 | "movq "MANGLE(w1111)", %%mm5 \n\t" | |
ac6a2e45 | 1865 | "pxor %%mm7, %%mm7 \n\t" |
83c89c78 | 1866 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" |
4bff9ef9 | 1867 | ASMALIGN(4) |
ac6a2e45 | 1868 | "1: \n\t" |
83c89c78 JT |
1869 | PREFETCH" 64(%0, %%"REG_d") \n\t" |
1870 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | |
1871 | "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | |
ac6a2e45 MN |
1872 | "punpcklbw %%mm7, %%mm0 \n\t" |
1873 | "punpcklbw %%mm7, %%mm1 \n\t" | |
83c89c78 JT |
1874 | "movd 6(%0, %%"REG_d"), %%mm2 \n\t" |
1875 | "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | |
ac6a2e45 MN |
1876 | "punpcklbw %%mm7, %%mm2 \n\t" |
1877 | "punpcklbw %%mm7, %%mm3 \n\t" | |
1878 | "pmaddwd %%mm6, %%mm0 \n\t" | |
1879 | "pmaddwd %%mm6, %%mm1 \n\t" | |
1880 | "pmaddwd %%mm6, %%mm2 \n\t" | |
1881 | "pmaddwd %%mm6, %%mm3 \n\t" | |
1882 | #ifndef FAST_BGR2YV12 | |
1883 | "psrad $8, %%mm0 \n\t" | |
1884 | "psrad $8, %%mm1 \n\t" | |
1885 | "psrad $8, %%mm2 \n\t" | |
1886 | "psrad $8, %%mm3 \n\t" | |
1887 | #endif | |
1888 | "packssdw %%mm1, %%mm0 \n\t" | |
1889 | "packssdw %%mm3, %%mm2 \n\t" | |
1890 | "pmaddwd %%mm5, %%mm0 \n\t" | |
1891 | "pmaddwd %%mm5, %%mm2 \n\t" | |
1892 | "packssdw %%mm2, %%mm0 \n\t" | |
1893 | "psraw $7, %%mm0 \n\t" | |
1894 | ||
83c89c78 JT |
1895 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
1896 | "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | |
ac6a2e45 MN |
1897 | "punpcklbw %%mm7, %%mm4 \n\t" |
1898 | "punpcklbw %%mm7, %%mm1 \n\t" | |
83c89c78 JT |
1899 | "movd 18(%0, %%"REG_d"), %%mm2 \n\t" |
1900 | "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | |
ac6a2e45 MN |
1901 | "punpcklbw %%mm7, %%mm2 \n\t" |
1902 | "punpcklbw %%mm7, %%mm3 \n\t" | |
1903 | "pmaddwd %%mm6, %%mm4 \n\t" | |
1904 | "pmaddwd %%mm6, %%mm1 \n\t" | |
1905 | "pmaddwd %%mm6, %%mm2 \n\t" | |
1906 | "pmaddwd %%mm6, %%mm3 \n\t" | |
1907 | #ifndef FAST_BGR2YV12 | |
1908 | "psrad $8, %%mm4 \n\t" | |
1909 | "psrad $8, %%mm1 \n\t" | |
1910 | "psrad $8, %%mm2 \n\t" | |
1911 | "psrad $8, %%mm3 \n\t" | |
1912 | #endif | |
1913 | "packssdw %%mm1, %%mm4 \n\t" | |
1914 | "packssdw %%mm3, %%mm2 \n\t" | |
1915 | "pmaddwd %%mm5, %%mm4 \n\t" | |
1916 | "pmaddwd %%mm5, %%mm2 \n\t" | |
83c89c78 | 1917 | "add $24, %%"REG_d" \n\t" |
ac6a2e45 MN |
1918 | "packssdw %%mm2, %%mm4 \n\t" |
1919 | "psraw $7, %%mm4 \n\t" | |
1920 | ||
1921 | "packuswb %%mm4, %%mm0 \n\t" | |
854288bb | 1922 | "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
ac6a2e45 | 1923 | |
6e1c66bc AJ |
1924 | "movq %%mm0, (%1, %%"REG_a") \n\t" |
1925 | "add $8, %%"REG_a" \n\t" | |
ac6a2e45 | 1926 | " js 1b \n\t" |
7f526efd | 1927 | : : "r" (src+width*3), "r" (dst+width), "g" (-width) |
83c89c78 | 1928 | : "%"REG_a, "%"REG_d |
ac6a2e45 | 1929 | ); |
1e621b18 MN |
1930 | #else |
1931 | int i; | |
1932 | for(i=0; i<width; i++) | |
1933 | { | |
1934 | int b= src[i*3+0]; | |
1935 | int g= src[i*3+1]; | |
1936 | int r= src[i*3+2]; | |
1937 | ||
9902f4e2 | 1938 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
1e621b18 MN |
1939 | } |
1940 | #endif | |
1941 | } | |
1942 | ||
7f526efd | 1943 | static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
1e621b18 | 1944 | { |
4342fc14 MN |
1945 | #ifdef HAVE_MMX |
1946 | asm volatile( | |
c2271987 | 1947 | "mov %3, %%"REG_a" \n\t" |
854288bb FB |
1948 | "movq "MANGLE(w1111)", %%mm5 \n\t" |
1949 | "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4342fc14 | 1950 | "pxor %%mm7, %%mm7 \n\t" |
83c89c78 JT |
1951 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
1952 | "add %%"REG_d", %%"REG_d" \n\t" | |
4bff9ef9 | 1953 | ASMALIGN(4) |
4342fc14 | 1954 | "1: \n\t" |
83c89c78 | 1955 | PREFETCH" 64(%0, %%"REG_d") \n\t" |
4342fc14 | 1956 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
83c89c78 | 1957 | "movq (%0, %%"REG_d"), %%mm0 \n\t" |
83c89c78 | 1958 | "movq 6(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 MN |
1959 | "movq %%mm0, %%mm1 \n\t" |
1960 | "movq %%mm2, %%mm3 \n\t" | |
1961 | "psrlq $24, %%mm0 \n\t" | |
1962 | "psrlq $24, %%mm2 \n\t" | |
1963 | PAVGB(%%mm1, %%mm0) | |
1964 | PAVGB(%%mm3, %%mm2) | |
1965 | "punpcklbw %%mm7, %%mm0 \n\t" | |
1966 | "punpcklbw %%mm7, %%mm2 \n\t" | |
1967 | #else | |
83c89c78 | 1968 | "movd (%0, %%"REG_d"), %%mm0 \n\t" |
83c89c78 | 1969 | "movd 3(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 | 1970 | "punpcklbw %%mm7, %%mm0 \n\t" |
4342fc14 | 1971 | "punpcklbw %%mm7, %%mm2 \n\t" |
4342fc14 | 1972 | "paddw %%mm2, %%mm0 \n\t" |
83c89c78 | 1973 | "movd 6(%0, %%"REG_d"), %%mm4 \n\t" |
83c89c78 | 1974 | "movd 9(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 | 1975 | "punpcklbw %%mm7, %%mm4 \n\t" |
4342fc14 | 1976 | "punpcklbw %%mm7, %%mm2 \n\t" |
4342fc14 | 1977 | "paddw %%mm4, %%mm2 \n\t" |
c2271987 MN |
1978 | "psrlw $1, %%mm0 \n\t" |
1979 | "psrlw $1, %%mm2 \n\t" | |
4342fc14 | 1980 | #endif |
854288bb FB |
1981 | "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1982 | "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
6a4970ab | 1983 | |
4342fc14 MN |
1984 | "pmaddwd %%mm0, %%mm1 \n\t" |
1985 | "pmaddwd %%mm2, %%mm3 \n\t" | |
1986 | "pmaddwd %%mm6, %%mm0 \n\t" | |
1987 | "pmaddwd %%mm6, %%mm2 \n\t" | |
1988 | #ifndef FAST_BGR2YV12 | |
1989 | "psrad $8, %%mm0 \n\t" | |
1990 | "psrad $8, %%mm1 \n\t" | |
1991 | "psrad $8, %%mm2 \n\t" | |
1992 | "psrad $8, %%mm3 \n\t" | |
1993 | #endif | |
1994 | "packssdw %%mm2, %%mm0 \n\t" | |
1995 | "packssdw %%mm3, %%mm1 \n\t" | |
1996 | "pmaddwd %%mm5, %%mm0 \n\t" | |
1997 | "pmaddwd %%mm5, %%mm1 \n\t" | |
1998 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1999 | "psraw $7, %%mm0 \n\t" | |
2000 | ||
2001 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
83c89c78 | 2002 | "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
83c89c78 | 2003 | "movq 18(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 MN |
2004 | "movq %%mm4, %%mm1 \n\t" |
2005 | "movq %%mm2, %%mm3 \n\t" | |
2006 | "psrlq $24, %%mm4 \n\t" | |
2007 | "psrlq $24, %%mm2 \n\t" | |
2008 | PAVGB(%%mm1, %%mm4) | |
2009 | PAVGB(%%mm3, %%mm2) | |
2010 | "punpcklbw %%mm7, %%mm4 \n\t" | |
2011 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2012 | #else | |
83c89c78 | 2013 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
83c89c78 | 2014 | "movd 15(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 | 2015 | "punpcklbw %%mm7, %%mm4 \n\t" |
4342fc14 | 2016 | "punpcklbw %%mm7, %%mm2 \n\t" |
4342fc14 | 2017 | "paddw %%mm2, %%mm4 \n\t" |
83c89c78 | 2018 | "movd 18(%0, %%"REG_d"), %%mm5 \n\t" |
83c89c78 | 2019 | "movd 21(%0, %%"REG_d"), %%mm2 \n\t" |
4342fc14 | 2020 | "punpcklbw %%mm7, %%mm5 \n\t" |
4342fc14 | 2021 | "punpcklbw %%mm7, %%mm2 \n\t" |
4342fc14 | 2022 | "paddw %%mm5, %%mm2 \n\t" |
854288bb | 2023 | "movq "MANGLE(w1111)", %%mm5 \n\t" |
4342fc14 MN |
2024 | "psrlw $2, %%mm4 \n\t" |
2025 | "psrlw $2, %%mm2 \n\t" | |
2026 | #endif | |
854288bb FB |
2027 | "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
2028 | "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
6a4970ab | 2029 | |
4342fc14 MN |
2030 | "pmaddwd %%mm4, %%mm1 \n\t" |
2031 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2032 | "pmaddwd %%mm6, %%mm4 \n\t" | |
2033 | "pmaddwd %%mm6, %%mm2 \n\t" | |
2034 | #ifndef FAST_BGR2YV12 | |
2035 | "psrad $8, %%mm4 \n\t" | |
2036 | "psrad $8, %%mm1 \n\t" | |
2037 | "psrad $8, %%mm2 \n\t" | |
2038 | "psrad $8, %%mm3 \n\t" | |
2039 | #endif | |
2040 | "packssdw %%mm2, %%mm4 \n\t" | |
2041 | "packssdw %%mm3, %%mm1 \n\t" | |
2042 | "pmaddwd %%mm5, %%mm4 \n\t" | |
2043 | "pmaddwd %%mm5, %%mm1 \n\t" | |
83c89c78 | 2044 | "add $24, %%"REG_d" \n\t" |
4342fc14 MN |
2045 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2046 | "psraw $7, %%mm4 \n\t" | |
6a4970ab | 2047 | |
4342fc14 MN |
2048 | "movq %%mm0, %%mm1 \n\t" |
2049 | "punpckldq %%mm4, %%mm0 \n\t" | |
2050 | "punpckhdq %%mm4, %%mm1 \n\t" | |
2051 | "packsswb %%mm1, %%mm0 \n\t" | |
854288bb | 2052 | "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4342fc14 | 2053 | |
c2271987 | 2054 | "movd %%mm0, (%1, %%"REG_a") \n\t" |
4342fc14 | 2055 | "punpckhdq %%mm0, %%mm0 \n\t" |
c2271987 | 2056 | "movd %%mm0, (%2, %%"REG_a") \n\t" |
6e1c66bc | 2057 | "add $4, %%"REG_a" \n\t" |
4342fc14 | 2058 | " js 1b \n\t" |
c2271987 | 2059 | : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) |
83c89c78 | 2060 | : "%"REG_a, "%"REG_d |
4342fc14 | 2061 | ); |
1e621b18 MN |
2062 | #else |
2063 | int i; | |
2064 | for(i=0; i<width; i++) | |
2065 | { | |
c2271987 MN |
2066 | int b= src1[6*i + 0] + src1[6*i + 3]; |
2067 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
2068 | int r= src1[6*i + 2] + src1[6*i + 5]; | |
1e621b18 | 2069 | |
c2271987 MN |
2070 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
2071 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
1e621b18 MN |
2072 | } |
2073 | #endif | |
0683a5c5 | 2074 | assert(src1 == src2); |
1e621b18 MN |
2075 | } |
2076 | ||
6af250ea MN |
2077 | static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) |
2078 | { | |
2079 | int i; | |
2080 | for(i=0; i<width; i++) | |
2081 | { | |
4e61e21c | 2082 | int d= ((uint16_t*)src)[i]; |
6af250ea MN |
2083 | int b= d&0x1F; |
2084 | int g= (d>>5)&0x3F; | |
2085 | int r= (d>>11)&0x1F; | |
2086 | ||
2087 | dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
2088 | } | |
2089 | } | |
2090 | ||
2091 | static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2092 | { | |
2093 | int i; | |
6264a515 | 2094 | assert(src1==src2); |
6af250ea MN |
2095 | for(i=0; i<width; i++) |
2096 | { | |
4e61e21c | 2097 | int d0= ((uint32_t*)src1)[i]; |
6a4970ab | 2098 | |
c2271987 MN |
2099 | int dl= (d0&0x07E0F81F); |
2100 | int dh= ((d0>>5)&0x07C0F83F); | |
5bb9d9d8 MN |
2101 | |
2102 | int dh2= (dh>>11) + (dh<<21); | |
2103 | int d= dh2 + dl; | |
2104 | ||
2105 | int b= d&0x7F; | |
2106 | int r= (d>>11)&0x7F; | |
2107 | int g= d>>21; | |
c2271987 MN |
2108 | dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
2109 | dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | |
6af250ea MN |
2110 | } |
2111 | } | |
2112 | ||
b72034dd MN |
2113 | static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) |
2114 | { | |
2115 | int i; | |
2116 | for(i=0; i<width; i++) | |
2117 | { | |
4e61e21c | 2118 | int d= ((uint16_t*)src)[i]; |
b72034dd MN |
2119 | int b= d&0x1F; |
2120 | int g= (d>>5)&0x1F; | |
2121 | int r= (d>>10)&0x1F; | |
2122 | ||
2123 | dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
2124 | } | |
2125 | } | |
2126 | ||
2127 | static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2128 | { | |
2129 | int i; | |
c2271987 | 2130 | assert(src1==src2); |
b72034dd MN |
2131 | for(i=0; i<width; i++) |
2132 | { | |
4e61e21c | 2133 | int d0= ((uint32_t*)src1)[i]; |
6a4970ab | 2134 | |
c2271987 MN |
2135 | int dl= (d0&0x03E07C1F); |
2136 | int dh= ((d0>>5)&0x03E0F81F); | |
b72034dd MN |
2137 | |
2138 | int dh2= (dh>>11) + (dh<<21); | |
2139 | int d= dh2 + dl; | |
2140 | ||
2141 | int b= d&0x7F; | |
2142 | int r= (d>>10)&0x7F; | |
2143 | int g= d>>21; | |
c2271987 MN |
2144 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
2145 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | |
b72034dd MN |
2146 | } |
2147 | } | |
2148 | ||
2149 | ||
a861d4d7 MN |
2150 | static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) |
2151 | { | |
2152 | int i; | |
2153 | for(i=0; i<width; i++) | |
2154 | { | |
4e61e21c MN |
2155 | int r= ((uint32_t*)src)[i]&0xFF; |
2156 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
3e499f53 | 2157 | int b= (((uint32_t*)src)[i]>>16)&0xFF; |
a861d4d7 | 2158 | |
4e61e21c | 2159 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
a861d4d7 MN |
2160 | } |
2161 | } | |
2162 | ||
2163 | static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2164 | { | |
2165 | int i; | |
c2271987 | 2166 | assert(src1==src2); |
a861d4d7 MN |
2167 | for(i=0; i<width; i++) |
2168 | { | |
4e61e21c MN |
2169 | const int a= ((uint32_t*)src1)[2*i+0]; |
2170 | const int e= ((uint32_t*)src1)[2*i+1]; | |
c2271987 MN |
2171 | const int l= (a&0xFF00FF) + (e&0xFF00FF); |
2172 | const int h= (a&0x00FF00) + (e&0x00FF00); | |
4e61e21c MN |
2173 | const int r= l&0x3FF; |
2174 | const int g= h>>8; | |
2175 | const int b= l>>16; | |
a861d4d7 | 2176 | |
c2271987 MN |
2177 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
2178 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
a861d4d7 MN |
2179 | } |
2180 | } | |
2181 | ||
2182 | static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2183 | { | |
2184 | int i; | |
2185 | for(i=0; i<width; i++) | |
2186 | { | |
2187 | int r= src[i*3+0]; | |
2188 | int g= src[i*3+1]; | |
2189 | int b= src[i*3+2]; | |
2190 | ||
4e61e21c | 2191 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
a861d4d7 MN |
2192 | } |
2193 | } | |
2194 | ||
2195 | static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2196 | { | |
2197 | int i; | |
c2271987 | 2198 | assert(src1==src2); |
a861d4d7 MN |
2199 | for(i=0; i<width; i++) |
2200 | { | |
c2271987 MN |
2201 | int r= src1[6*i + 0] + src1[6*i + 3]; |
2202 | int g= src1[6*i + 1] + src1[6*i + 4]; | |
2203 | int b= src1[6*i + 2] + src1[6*i + 5]; | |
a861d4d7 | 2204 | |
c2271987 MN |
2205 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
2206 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
a861d4d7 MN |
2207 | } |
2208 | } | |
2209 | ||
a43fb6b3 LA |
2210 | static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width) |
2211 | { | |
2212 | int i; | |
2213 | for(i=0; i<width; i++) | |
2214 | { | |
2215 | int d= ((uint16_t*)src)[i]; | |
2216 | int r= d&0x1F; | |
2217 | int g= (d>>5)&0x3F; | |
2218 | int b= (d>>11)&0x1F; | |
2219 | ||
2220 | dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
2221 | } | |
2222 | } | |
2223 | ||
2224 | static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2225 | { | |
2226 | int i; | |
6264a515 | 2227 | assert(src1 == src2); |
a43fb6b3 LA |
2228 | for(i=0; i<width; i++) |
2229 | { | |
2230 | int d0= ((uint32_t*)src1)[i]; | |
6a4970ab | 2231 | |
6264a515 MN |
2232 | int dl= (d0&0x07E0F81F); |
2233 | int dh= ((d0>>5)&0x07C0F83F); | |
a43fb6b3 LA |
2234 | |
2235 | int dh2= (dh>>11) + (dh<<21); | |
2236 | int d= dh2 + dl; | |
2237 | ||
2238 | int r= d&0x7F; | |
2239 | int b= (d>>11)&0x7F; | |
2240 | int g= d>>21; | |
6264a515 MN |
2241 | dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
2242 | dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | |
a43fb6b3 LA |
2243 | } |
2244 | } | |
2245 | ||
2246 | static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width) | |
2247 | { | |
2248 | int i; | |
2249 | for(i=0; i<width; i++) | |
2250 | { | |
2251 | int d= ((uint16_t*)src)[i]; | |
2252 | int r= d&0x1F; | |
2253 | int g= (d>>5)&0x1F; | |
2254 | int b= (d>>10)&0x1F; | |
2255 | ||
2256 | dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
2257 | } | |
2258 | } | |
2259 | ||
2260 | static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2261 | { | |
2262 | int i; | |
6264a515 | 2263 | assert(src1 == src2); |
a43fb6b3 LA |
2264 | for(i=0; i<width; i++) |
2265 | { | |
2266 | int d0= ((uint32_t*)src1)[i]; | |
6a4970ab | 2267 | |
6264a515 MN |
2268 | int dl= (d0&0x03E07C1F); |
2269 | int dh= ((d0>>5)&0x03E0F81F); | |
a43fb6b3 LA |
2270 | |
2271 | int dh2= (dh>>11) + (dh<<21); | |
2272 | int d= dh2 + dl; | |
2273 | ||
2274 | int g= d&0x7F; | |
2275 | int r= (d>>10)&0x7F; | |
2276 | int b= d>>21; | |
6264a515 MN |
2277 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
2278 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | |
a43fb6b3 LA |
2279 | } |
2280 | } | |
1e621b18 | 2281 | |
e28630fc MN |
2282 | static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal) |
2283 | { | |
2284 | int i; | |
2285 | for(i=0; i<width; i++) | |
2286 | { | |
2287 | int d= src[i]; | |
e28630fc | 2288 | |
21c08a3f | 2289 | dst[i]= pal[d] & 0xFF; |
e28630fc MN |
2290 | } |
2291 | } | |
2292 | ||
2293 | static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal) | |
2294 | { | |
2295 | int i; | |
2296 | assert(src1 == src2); | |
2297 | for(i=0; i<width; i++) | |
2298 | { | |
fa65e2f6 | 2299 | int p= pal[src1[i]]; |
e28630fc | 2300 | |
fa65e2f6 MN |
2301 | dstU[i]= p>>8; |
2302 | dstV[i]= p>>16; | |
e28630fc MN |
2303 | } |
2304 | } | |
2305 | ||
077ea8a7 MN |
2306 | // Bilinear / Bicubic scaling |
/*
 * Horizontal filter-based scaling of one line of 8-bit samples into 16-bit
 * intermediate samples.
 *
 * For every output sample i (0 <= i < dstW) the plain C fallback at the end
 * of this function computes
 *     sum over j in [0, filterSize) of src[filterPos[i] + j] * filter[filterSize*i + j]
 * and stores the sum >>7, clipped to [0, 32767], in dst[i].
 *
 * With HAVE_MMX the same work is done by inline asm, two output samples per
 * loop iteration, with dedicated code for filterSize 4 and 8 and a generic
 * loop for any other multiple of 4 (asserted below).
 * Without MMX but with HAVE_ALTIVEC the call is forwarded unchanged to
 * hScale_altivec_real().
 *
 * srcW and xInc are not used by the MMX or C code; they are only passed on
 * to the AltiVec implementation.
 *
 * NOTE(review): the MMX loops always store two samples (4 bytes) per
 * iteration, so for an odd dstW the last store reaches one sample past
 * dst[dstW-1] - confirm the callers' buffers allow for that.
 * NOTE(review): the MMX paths shift by 8 and then multiply by the constant
 * w02, which is defined outside this chunk - presumably a vector of 2s so
 * that the net scale matches the >>7 of the C path; verify.
 */
2307 | static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
7f526efd | 2308 | int16_t *filter, int16_t *filterPos, long filterSize) |
2ff198c1 | 2309 | { |
077ea8a7 | 2310 | #ifdef HAVE_MMX |
c9b99ea6 | 2311 | assert(filterSize % 4 == 0 && filterSize>0); |
911406f2 | 2312 | if(filterSize==4) // Always true for upscaling, sometimes for down, too. |
077ea8a7 | 2313 | { |
/* counter is a negative byte offset into dst (2 bytes per sample) that the asm
   loop raises by 4 per iteration until the addition carries. filter, filterPos
   and dst are moved to the ends of their arrays here so that indexing them
   with the negative counter walks forward from their beginnings. */
6e1c66bc | 2314 | long counter= -2*dstW; |
077ea8a7 MN |
2315 | filter-= counter*2; |
2316 | filterPos-= counter/2; | |
2317 | dst-= counter/2; | |
2318 | asm volatile( | |
83c89c78 JT |
/* With PIC the b register cannot be named as a clobber (see the clobber list
   below), so it is saved and restored by hand around the loop. */
2319 | #if defined(PIC) |
2320 | "push %%"REG_b" \n\t" | |
2321 | #endif | |
077ea8a7 | 2322 | "pxor %%mm7, %%mm7 \n\t" |
9b464428 | 2323 | "movq "MANGLE(w02)", %%mm6 \n\t" |
6e1c66bc AJ |
2324 | "push %%"REG_BP" \n\t" // we use 7 regs here ... |
2325 | "mov %%"REG_a", %%"REG_BP" \n\t" | |
4bff9ef9 | 2326 | ASMALIGN(4) |
077ea8a7 | 2327 | "1: \n\t" |
a7b42d28 AJ |
/* eax / ebx = source positions of the two output samples handled in this iteration */
2328 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" |
2329 | "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
6e1c66bc AJ |
2330 | "movq (%1, %%"REG_BP", 4), %%mm1\n\t" |
2331 | "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" | |
2332 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2333 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
077ea8a7 MN |
2334 | "punpcklbw %%mm7, %%mm0 \n\t" |
2335 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2336 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2337 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2338 | "psrad $8, %%mm0 \n\t" | |
2339 | "psrad $8, %%mm3 \n\t" | |
2340 | "packssdw %%mm3, %%mm0 \n\t" | |
2341 | "pmaddwd %%mm6, %%mm0 \n\t" | |
2342 | "packssdw %%mm0, %%mm0 \n\t" | |
6e1c66bc AJ |
2343 | "movd %%mm0, (%4, %%"REG_BP") \n\t" |
2344 | "add $4, %%"REG_BP" \n\t" | |
077ea8a7 | 2345 | " jnc 1b \n\t" |
e3d2500f | 2346 | |
6e1c66bc | 2347 | "pop %%"REG_BP" \n\t" |
83c89c78 JT |
2348 | #if defined(PIC) |
2349 | "pop %%"REG_b" \n\t" | |
2350 | #endif | |
077ea8a7 MN |
2351 | : "+a" (counter) |
2352 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
83c89c78 | 2353 | #if !defined(PIC) |
6e1c66bc | 2354 | : "%"REG_b |
83c89c78 | 2355 | #endif |
077ea8a7 MN |
2356 | ); |
2357 | } | |
2358 | else if(filterSize==8) | |
2359 | { | |
/* Same scheme as the filterSize==4 case; each output sample now has 8
   coefficients (16 bytes), hence counter*4 and the scale factor 8 in the
   filter addressing below. */
6e1c66bc | 2360 | long counter= -2*dstW; |
077ea8a7 MN |
2361 | filter-= counter*4; |
2362 | filterPos-= counter/2; | |
2363 | dst-= counter/2; | |
2364 | asm volatile( | |
83c89c78 JT |
2365 | #if defined(PIC) |
2366 | "push %%"REG_b" \n\t" | |
2367 | #endif | |
077ea8a7 | 2368 | "pxor %%mm7, %%mm7 \n\t" |
9b464428 | 2369 | "movq "MANGLE(w02)", %%mm6 \n\t" |
6e1c66bc AJ |
2370 | "push %%"REG_BP" \n\t" // we use 7 regs here ... |
2371 | "mov %%"REG_a", %%"REG_BP" \n\t" | |
4bff9ef9 | 2372 | ASMALIGN(4) |
077ea8a7 | 2373 | "1: \n\t" |
a7b42d28 AJ |
2374 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" |
2375 | "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
6e1c66bc AJ |
2376 | "movq (%1, %%"REG_BP", 8), %%mm1\n\t" |
2377 | "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" | |
2378 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2379 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
077ea8a7 MN |
2380 | "punpcklbw %%mm7, %%mm0 \n\t" |
2381 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2382 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2383 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2384 | ||
6e1c66bc AJ |
/* second group of 4 taps for both output samples */
2385 | "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" |
2386 | "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" | |
2387 | "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2388 | "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
077ea8a7 MN |
2389 | "punpcklbw %%mm7, %%mm4 \n\t" |
2390 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2391 | "pmaddwd %%mm1, %%mm4 \n\t" | |
2392 | "pmaddwd %%mm2, %%mm5 \n\t" | |
2393 | "paddd %%mm4, %%mm0 \n\t" | |
2394 | "paddd %%mm5, %%mm3 \n\t" | |
6a4970ab | 2395 | |
077ea8a7 MN |
2396 | "psrad $8, %%mm0 \n\t" |
2397 | "psrad $8, %%mm3 \n\t" | |
2398 | "packssdw %%mm3, %%mm0 \n\t" | |
2399 | "pmaddwd %%mm6, %%mm0 \n\t" | |
2400 | "packssdw %%mm0, %%mm0 \n\t" | |
6e1c66bc AJ |
2401 | "movd %%mm0, (%4, %%"REG_BP") \n\t" |
2402 | "add $4, %%"REG_BP" \n\t" | |
077ea8a7 | 2403 | " jnc 1b \n\t" |
c1b0bfb4 | 2404 | |
6e1c66bc | 2405 | "pop %%"REG_BP" \n\t" |
83c89c78 JT |
2406 | #if defined(PIC) |
2407 | "pop %%"REG_b" \n\t" | |
2408 | #endif | |
077ea8a7 MN |
2409 | : "+a" (counter) |
2410 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
83c89c78 | 2411 | #if !defined(PIC) |
6e1c66bc | 2412 | : "%"REG_b |
83c89c78 | 2413 | #endif |
077ea8a7 MN |
2414 | ); |
2415 | } | |
2416 | else | |
2417 | { | |
/* Generic filter size. The inner loop (label 2) walks a pointer in REG_c from
   src (%5) in steps of 4 until it reaches offset (%4 = src+filterSize), i.e.
   filterSize/4 rounds of 4 taps. filter itself (%1) is advanced inside the asm
   instead of being pre-offset: by 8 bytes per round, plus filterSize*2 bytes
   (%6) after the inner loop to skip the second sample's coefficients. */
20ffdcf9 | 2418 | uint8_t *offset = src+filterSize; |
6e1c66bc | 2419 | long counter= -2*dstW; |
077ea8a7 MN |
2420 | // filter-= counter*filterSize/2; |
2421 | filterPos-= counter/2; | |
2422 | dst-= counter/2; | |
2423 | asm volatile( | |
2424 | "pxor %%mm7, %%mm7 \n\t" | |
9b464428 | 2425 | "movq "MANGLE(w02)", %%mm6 \n\t" |
4bff9ef9 | 2426 | ASMALIGN(4) |
077ea8a7 | 2427 | "1: \n\t" |
6e1c66bc | 2428 | "mov %2, %%"REG_c" \n\t" |
a7b42d28 | 2429 | "movzwl (%%"REG_c", %0), %%eax \n\t" |
83c89c78 | 2430 | "movzwl 2(%%"REG_c", %0), %%edx \n\t" |
6e1c66bc | 2431 | "mov %5, %%"REG_c" \n\t" |
077ea8a7 MN |
2432 | "pxor %%mm4, %%mm4 \n\t" |
2433 | "pxor %%mm5, %%mm5 \n\t" | |
2434 | "2: \n\t" | |
2435 | "movq (%1), %%mm1 \n\t" | |
2436 | "movq (%1, %6), %%mm3 \n\t" | |
6e1c66bc | 2437 | "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" |
83c89c78 | 2438 | "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t" |
077ea8a7 MN |
2439 | "punpcklbw %%mm7, %%mm0 \n\t" |
2440 | "punpcklbw %%mm7, %%mm2 \n\t" | |
2441 | "pmaddwd %%mm1, %%mm0 \n\t" | |
2442 | "pmaddwd %%mm2, %%mm3 \n\t" | |
2443 | "paddd %%mm3, %%mm5 \n\t" | |
2444 | "paddd %%mm0, %%mm4 \n\t" | |
6e1c66bc AJ |
2445 | "add $8, %1 \n\t" |
2446 | "add $4, %%"REG_c" \n\t" | |
2447 | "cmp %4, %%"REG_c" \n\t" | |
077ea8a7 | 2448 | " jb 2b \n\t" |
6e1c66bc | 2449 | "add %6, %1 \n\t" |
077ea8a7 MN |
2450 | "psrad $8, %%mm4 \n\t" |
2451 | "psrad $8, %%mm5 \n\t" | |
2452 | "packssdw %%mm5, %%mm4 \n\t" | |
2453 | "pmaddwd %%mm6, %%mm4 \n\t" | |
2454 | "packssdw %%mm4, %%mm4 \n\t" | |
6e1c66bc AJ |
2455 | "mov %3, %%"REG_a" \n\t" |
2456 | "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2457 | "add $4, %0 \n\t" | |
077ea8a7 | 2458 | " jnc 1b \n\t" |
c1b0bfb4 | 2459 | |
627690b5 | 2460 | : "+r" (counter), "+r" (filter) |
20ffdcf9 | 2461 | : "m" (filterPos), "m" (dst), "m"(offset), |
7f526efd | 2462 | "m" (src), "r" (filterSize*2) |
83c89c78 | 2463 | : "%"REG_a, "%"REG_c, "%"REG_d |
077ea8a7 MN |
2464 | ); |
2465 | } | |
2466 | #else | |
8c266f0c RD |
2467 | #ifdef HAVE_ALTIVEC |
2468 | hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | |
2469 | #else | |
077ea8a7 MN |
/* Portable reference implementation: one dot product per output sample. */
2470 | int i; |
2471 | for(i=0; i<dstW; i++) | |
2472 | { | |
2473 | int j; | |
2474 | int srcPos= filterPos[i]; | |
2475 | int val=0; | |
c1b0bfb4 | 2476 | // printf("filterPos: %d\n", filterPos[i]); |
077ea8a7 MN |
2477 | for(j=0; j<filterSize; j++) |
2478 | { | |
2479 | // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2480 | val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2481 | } | |
2482 | // filter += hFilterSize; | |
adcec46a | 2483 | dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ... |
077ea8a7 MN |
2484 | // dst[i] = val>>7; |
2485 | } | |
2486 | #endif | |
8c266f0c | 2487 | #endif |
077ea8a7 | 2488 | } |
2ff198c1 | 2489 | // *** horizontal scale Y line to temp buffer |
/*
 * Horizontally scales one luma line from src into dst.
 *
 * Step 1: for the source formats tested below, the line is first converted to
 * 8-bit luma in formatConvBuffer by the matching *ToY helper and src is
 * redirected to that buffer; any other format is scaled as it is.
 *
 * Step 2: unless SWS_FAST_BILINEAR is set in flags (and, when built with
 * HAVE_MMX, canMMX2BeUsed is nonzero) the line goes through hScale() with
 * hLumFilter / hLumFilterPos / hLumFilterSize. Otherwise the fast bilinear
 * path runs:
 *   - ARCH_X86 + HAVE_MMX2 and canMMX2BeUsed: the code that funnyYCode points
 *     to is called 8 times, driven by mmx2Filter and mmx2FilterPos;
 *   - ARCH_X86 otherwise: a scalar asm loop stepping through src in 16.16
 *     fixed point, two output samples per iteration;
 *   - other architectures: the C loop at the end.
 * The fast bilinear output has 7 fractional bits, as spelled out by the C
 * loop: (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha with a 7-bit xalpha.
 *
 * pal is only used for the palette-indexed formats, where it is handed to
 * palToY(). NOTE(review): palToY() takes a uint32_t* while pal is declared
 * uint8_t* here - confirm this is intended.
 */
065ee1ec | 2490 | static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, |
28bf81c9 | 2491 | int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
6a4970ab | 2492 | int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
b7dc6f66 | 2493 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
e28630fc | 2494 | int32_t *mmx2FilterPos, uint8_t *pal) |
077ea8a7 | 2495 | { |
// 16-bit gray is handled by the helpers for packed YUV, which are shared with YUYV422 / UYVY422 here
4884b9e5 | 2496 | if(srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE) |
1e621b18 MN |
2497 | { |
2498 | RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2499 | src= formatConvBuffer; | |
2500 | } | |
4884b9e5 | 2501 | else if(srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE) |
7322a67c MN |
2502 | { |
2503 | RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2504 | src= formatConvBuffer; | |
2505 | } | |
e9e12f0e | 2506 | else if(srcFormat==PIX_FMT_RGB32) |
1e621b18 MN |
2507 | { |
2508 | RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2509 | src= formatConvBuffer; | |
2510 | } | |
e9e12f0e | 2511 | else if(srcFormat==PIX_FMT_BGR24) |
1e621b18 MN |
2512 | { |
2513 | RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2514 | src= formatConvBuffer; | |
2515 | } | |
e9e12f0e | 2516 | else if(srcFormat==PIX_FMT_BGR565) |
6af250ea MN |
2517 | { |
2518 | RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2519 | src= formatConvBuffer; | |
2520 | } | |
e9e12f0e | 2521 | else if(srcFormat==PIX_FMT_BGR555) |
b72034dd MN |
2522 | { |
2523 | RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2524 | src= formatConvBuffer; | |
2525 | } | |
e9e12f0e | 2526 | else if(srcFormat==PIX_FMT_BGR32) |
a861d4d7 MN |
2527 | { |
2528 | RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2529 | src= formatConvBuffer; | |
2530 | } | |
e9e12f0e | 2531 | else if(srcFormat==PIX_FMT_RGB24) |
a861d4d7 MN |
2532 | { |
2533 | RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2534 | src= formatConvBuffer; | |
2535 | } | |
a43fb6b3 LA |
2536 | else if(srcFormat==PIX_FMT_RGB565) |
2537 | { | |
2538 | RENAME(rgb16ToY)(formatConvBuffer, src, srcW); | |
2539 | src= formatConvBuffer; | |
2540 | } | |
2541 | else if(srcFormat==PIX_FMT_RGB555) | |
2542 | { | |
2543 | RENAME(rgb15ToY)(formatConvBuffer, src, srcW); | |
2544 | src= formatConvBuffer; | |
2545 | } | |
18064f5c | 2546 | else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
e28630fc MN |
2547 | { |
2548 | RENAME(palToY)(formatConvBuffer, src, srcW, pal); | |
2549 | src= formatConvBuffer; | |
2550 | } | |
1e621b18 | 2551 | |
e3d2500f | 2552 | #ifdef HAVE_MMX |
77a416e8 | 2553 | // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one) |
28bf81c9 | 2554 | if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
e3d2500f | 2555 | #else |
28bf81c9 | 2556 | if(!(flags&SWS_FAST_BILINEAR)) |
e3d2500f | 2557 | #endif |
077ea8a7 MN |
2558 | { |
2559 | RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2560 | } | |
2561 | else // Fast Bilinear upscale / crap downscale | |
2562 | { | |
3d6a30d9 | 2563 | #if defined(ARCH_X86) |
2ff198c1 | 2564 | #ifdef HAVE_MMX2 |
96034638 | 2565 | int i; |
83c89c78 JT |
// With PIC the b register may not be listed as clobbered, so the asm below saves it in ebxsave (%5) and restores it at the end
2566 | #if defined(PIC) |
2567 | uint64_t ebxsave __attribute__((aligned(8))); | |
2568 | #endif | |
2ff198c1 MN |
2569 | if(canMMX2BeUsed) |
2570 | { | |
2571 | asm volatile( | |
83c89c78 JT |
2572 | #if defined(PIC) |
2573 | "mov %%"REG_b", %5 \n\t" | |
2574 | #endif | |
2ff198c1 | 2575 | "pxor %%mm7, %%mm7 \n\t" |
6e1c66bc AJ |
/* register setup for the called code: c = src, D = dst, d = mmx2Filter, b = mmx2FilterPos, a = 0 */
2576 | "mov %0, %%"REG_c" \n\t" |
2577 | "mov %1, %%"REG_D" \n\t" | |
2578 | "mov %2, %%"REG_d" \n\t" | |
2579 | "mov %3, %%"REG_b" \n\t" | |
2580 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2581 | PREFETCH" (%%"REG_c") \n\t" | |
2582 | PREFETCH" 32(%%"REG_c") \n\t" | |
2583 | PREFETCH" 64(%%"REG_c") \n\t" | |
99cefd0b | 2584 | |
6d606c4f AJ |
2585 | #ifdef ARCH_X86_64 |
2586 | ||
2587 | #define FUNNY_Y_CODE \ | |
2588 | "movl (%%"REG_b"), %%esi \n\t"\ | |
2589 | "call *%4 \n\t"\ | |
2590 | "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2591 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2592 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2593 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2594 | ||
2595 | #else | |
2596 | ||
2ff198c1 | 2597 | #define FUNNY_Y_CODE \ |
6d606c4f | 2598 | "movl (%%"REG_b"), %%esi \n\t"\ |
b7dc6f66 | 2599 | "call *%4 \n\t"\ |
6d606c4f | 2600 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ |
b6663a55 | 2601 | "add %%"REG_a", %%"REG_D" \n\t"\ |
6e1c66bc | 2602 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
99cefd0b | 2603 | |
6d606c4f AJ |
2604 | #endif |
2605 | ||
2ff198c1 MN |
2606 | FUNNY_Y_CODE |
2607 | FUNNY_Y_CODE | |
2608 | FUNNY_Y_CODE | |
2609 | FUNNY_Y_CODE | |
2610 | FUNNY_Y_CODE | |
2611 | FUNNY_Y_CODE | |
2612 | FUNNY_Y_CODE | |
2613 | FUNNY_Y_CODE | |
2614 | ||
83c89c78 JT |
2615 | #if defined(PIC) |
2616 | "mov %5, %%"REG_b" \n\t" | |
2617 | #endif | |
b7dc6f66 MN |
2618 | :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2619 | "m" (funnyYCode) | |
83c89c78 JT |
2620 | #if defined(PIC) |
2621 | ,"m" (ebxsave) | |
2622 | #endif | |
2623 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
2624 | #if !defined(PIC) | |
2625 | ,"%"REG_b | |
2626 | #endif | |
2ff198c1 | 2627 | ); |
// right edge fixup: outputs whose source position is at or past the last source sample are overwritten with that sample (scaled by 128)
af91b8b3 | 2628 | for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2ff198c1 MN |
2629 | } |
2630 | else | |
2631 | { | |
2632 | #endif | |
065ee1ec RD |
// xInc is 16.16 fixed point: integer step and 16-bit fraction are kept apart for the add / adc pair in the loop
2633 | long xInc_shr16 = xInc >> 16; |
2634 | uint16_t xInc_mask = xInc & 0xffff; | |
2ff198c1 MN |
2635 | //NO MMX just normal asm ... |
2636 | asm volatile( | |
6e1c66bc | 2637 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
83c89c78 | 2638 | "xor %%"REG_d", %%"REG_d" \n\t" // xx |
2ff198c1 | 2639 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
4bff9ef9 | 2640 | ASMALIGN(4) |
2ff198c1 | 2641 | "1: \n\t" |
83c89c78 JT |
2642 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2643 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2ff198c1 MN |
2644 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2645 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2646 | "shll $16, %%edi \n\t" | |
2647 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
6e1c66bc | 2648 | "mov %1, %%"REG_D" \n\t" |
2ff198c1 | 2649 | "shrl $9, %%esi \n\t" |
6e1c66bc | 2650 | "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" |
2ff198c1 | 2651 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF |
83c89c78 | 2652 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
2ff198c1 | 2653 | |
83c89c78 JT |
2654 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2655 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2ff198c1 MN |
2656 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2657 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2658 | "shll $16, %%edi \n\t" | |
2659 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
6e1c66bc | 2660 | "mov %1, %%"REG_D" \n\t" |
2ff198c1 | 2661 | "shrl $9, %%esi \n\t" |
6e1c66bc | 2662 | "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t" |
2ff198c1 | 2663 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF |
83c89c78 | 2664 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
2ff198c1 MN |
2665 | |
2666 | ||
6e1c66bc AJ |
2667 | "add $2, %%"REG_a" \n\t" |
2668 | "cmp %2, %%"REG_a" \n\t" | |
2ff198c1 MN |
2669 | " jb 1b \n\t" |
2670 | ||
2671 | ||
20ffdcf9 | 2672 | :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) |
83c89c78 | 2673 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
2ff198c1 MN |
2674 | ); |
2675 | #ifdef HAVE_MMX2 | |
77a416e8 | 2676 | } //if MMX2 can't be used |
2ff198c1 MN |
2677 | #endif |
2678 | #else | |
96034638 MN |
// portable fast bilinear: xpos is the 16.16 source position, xx its integer part, xalpha the top 7 bits of its fraction
2679 | int i; |
2680 | unsigned int xpos=0; | |
2681 | for(i=0;i<dstWidth;i++) | |
2682 | { | |
2683 | register unsigned int xx=xpos>>16; | |
2684 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2685 | dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2686 | xpos+=xInc; | |
2687 | } | |
2ff198c1 | 2688 | #endif |
077ea8a7 | 2689 | } |
2ff198c1 MN |
2690 | } |
2691 | ||
/*
 * Horizontally scales one line of both chroma planes.
 *
 * src1 / src2 are the two chroma inputs; the scaled results go to dst (first
 * plane) and dst+2048 (second plane; the asm addresses it as +4096 bytes).
 *
 * Step 1: for the source formats tested below, chroma is first extracted into
 * formatConvBuffer and formatConvBuffer+2048 by the matching *ToUV helper and
 * src1 / src2 are redirected there. Gray formats return immediately without
 * writing dst. Any other format is scaled as it is.
 * NOTE(review): the fixed 2048 offset looks like it limits the converted line
 * length to 2048 samples - confirm against the buffer allocation.
 *
 * Step 2: unless SWS_FAST_BILINEAR is set in flags (and, when built with
 * HAVE_MMX, canMMX2BeUsed is nonzero) each plane goes through hScale() with
 * hChrFilter / hChrFilterPos / hChrFilterSize. Otherwise the fast bilinear
 * path runs:
 *   - ARCH_X86 + HAVE_MMX2 and canMMX2BeUsed: the code that funnyUVCode points
 *     to is called 4 times per plane, driven by mmx2Filter and mmx2FilterPos;
 *   - ARCH_X86 otherwise: a scalar asm loop stepping through both planes in
 *     16.16 fixed point, one output sample per plane and iteration;
 *   - other architectures: the C loop at the end.
 *
 * pal is only used for the palette-indexed formats, where it is handed to
 * palToUV(). NOTE(review): palToUV() takes a uint32_t* while pal is declared
 * uint8_t* here - confirm this is intended.
 */
7f526efd | 2692 | inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, |
28bf81c9 | 2693 | int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
1e621b18 | 2694 | int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
b7dc6f66 | 2695 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
e28630fc | 2696 | int32_t *mmx2FilterPos, uint8_t *pal) |
2ff198c1 | 2697 | { |
e9e12f0e | 2698 | if(srcFormat==PIX_FMT_YUYV422) |
1e621b18 MN |
2699 | { |
2700 | RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2701 | src1= formatConvBuffer; | |
2702 | src2= formatConvBuffer+2048; | |
2703 | } | |
e9e12f0e | 2704 | else if(srcFormat==PIX_FMT_UYVY422) |
7322a67c MN |
2705 | { |
2706 | RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2707 | src1= formatConvBuffer; | |
2708 | src2= formatConvBuffer+2048; | |
2709 | } | |
e9e12f0e | 2710 | else if(srcFormat==PIX_FMT_RGB32) |
1e621b18 MN |
2711 | { |
2712 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2713 | src1= formatConvBuffer; | |
2714 | src2= formatConvBuffer+2048; | |
2715 | } | |
e9e12f0e | 2716 | else if(srcFormat==PIX_FMT_BGR24) |
1e621b18 MN |
2717 | { |
2718 | RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2719 | src1= formatConvBuffer; | |
2720 | src2= formatConvBuffer+2048; | |
2721 | } | |
e9e12f0e | 2722 | else if(srcFormat==PIX_FMT_BGR565) |
6af250ea MN |
2723 | { |
2724 | RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2725 | src1= formatConvBuffer; | |
2726 | src2= formatConvBuffer+2048; | |
2727 | } | |
e9e12f0e | 2728 | else if(srcFormat==PIX_FMT_BGR555) |
b72034dd MN |
2729 | { |
2730 | RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2731 | src1= formatConvBuffer; | |
2732 | src2= formatConvBuffer+2048; | |
2733 | } | |
e9e12f0e | 2734 | else if(srcFormat==PIX_FMT_BGR32) |
a861d4d7 MN |
2735 | { |
2736 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2737 | src1= formatConvBuffer; | |
2738 | src2= formatConvBuffer+2048; | |
2739 | } | |
e9e12f0e | 2740 | else if(srcFormat==PIX_FMT_RGB24) |
a861d4d7 MN |
2741 | { |
2742 | RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2743 | src1= formatConvBuffer; | |
2744 | src2= formatConvBuffer+2048; | |
2745 | } | |
a43fb6b3 LA |
2746 | else if(srcFormat==PIX_FMT_RGB565) |
2747 | { | |
2748 | RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2749 | src1= formatConvBuffer; | |
2750 | src2= formatConvBuffer+2048; | |
2751 | } | |
2752 | else if(srcFormat==PIX_FMT_RGB555) | |
2753 | { | |
2754 | RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2755 | src1= formatConvBuffer; | |
2756 | src2= formatConvBuffer+2048; | |
2757 | } | |
6ff0ad6b MN |
// gray input: nothing is scaled and dst is left untouched
2758 | else if(isGray(srcFormat)) |
2759 | { | |
2760 | return; | |
2761 | } | |
18064f5c | 2762 | else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
e28630fc MN |
2763 | { |
2764 | RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal); | |
2765 | src1= formatConvBuffer; | |
2766 | src2= formatConvBuffer+2048; | |
2767 | } | |
1e621b18 | 2768 | |
e3d2500f | 2769 | #ifdef HAVE_MMX |
77a416e8 | 2770 | // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one) |
28bf81c9 | 2771 | if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
e3d2500f | 2772 | #else |
28bf81c9 | 2773 | if(!(flags&SWS_FAST_BILINEAR)) |
e3d2500f | 2774 | #endif |
077ea8a7 MN |
2775 | { |
2776 | RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2777 | RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2778 | } | |
2779 | else // Fast Bilinear upscale / crap downscale | |
2780 | { | |
3d6a30d9 | 2781 | #if defined(ARCH_X86) |
2ff198c1 | 2782 | #ifdef HAVE_MMX2 |
96034638 | 2783 | int i; |
83c89c78 JT |
// With PIC the b register may not be listed as clobbered, so the asm below saves it in ebxsave (%6) and restores it at the end
2784 | #if defined(PIC) |
2785 | uint64_t ebxsave __attribute__((aligned(8))); | |
2786 | #endif | |
2ff198c1 MN |
2787 | if(canMMX2BeUsed) |
2788 | { | |
2789 | asm volatile( | |
83c89c78 JT |
2790 | #if defined(PIC) |
2791 | "mov %%"REG_b", %6 \n\t" | |
2792 | #endif | |
b7dc6f66 | 2793 | "pxor %%mm7, %%mm7 \n\t" |
6e1c66bc AJ |
/* first plane: c = src1, D = dst, d = mmx2Filter, b = mmx2FilterPos, a = 0 */
2794 | "mov %0, %%"REG_c" \n\t" |
2795 | "mov %1, %%"REG_D" \n\t" | |
2796 | "mov %2, %%"REG_d" \n\t" | |
2797 | "mov %3, %%"REG_b" \n\t" | |
2798 | "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2799 | PREFETCH" (%%"REG_c") \n\t" | |
2800 | PREFETCH" 32(%%"REG_c") \n\t" | |
2801 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 | 2802 | |
6d606c4f AJ |
2803 | #ifdef ARCH_X86_64 |
2804 | ||
2805 | #define FUNNY_UV_CODE \ | |
2806 | "movl (%%"REG_b"), %%esi \n\t"\ | |
2807 | "call *%4 \n\t"\ | |
2808 | "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2809 | "add %%"REG_S", %%"REG_c" \n\t"\ | |
2810 | "add %%"REG_a", %%"REG_D" \n\t"\ | |
2811 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2812 | ||
2813 | #else | |
2814 | ||
b7dc6f66 | 2815 | #define FUNNY_UV_CODE \ |
6e1c66bc | 2816 | "movl (%%"REG_b"), %%esi \n\t"\ |
b7dc6f66 | 2817 | "call *%4 \n\t"\ |
6d606c4f | 2818 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ |
6e1c66bc AJ |
2819 | "add %%"REG_a", %%"REG_D" \n\t"\ |
2820 | "xor %%"REG_a", %%"REG_a" \n\t"\ | |
b7dc6f66 | 2821 | |
6d606c4f AJ |
2822 | #endif |
2823 | ||
b7dc6f66 MN |
2824 | FUNNY_UV_CODE |
2825 | FUNNY_UV_CODE | |
2826 | FUNNY_UV_CODE | |
2827 | FUNNY_UV_CODE | |
6e1c66bc AJ |
/* second plane: restart with c = src2 and D = dst + 4096 bytes (= dst+2048 samples) */
2828 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
2829 | "mov %5, %%"REG_c" \n\t" // src | |
2830 | "mov %1, %%"REG_D" \n\t" // buf1 | |
2831 | "add $4096, %%"REG_D" \n\t" | |
2832 | PREFETCH" (%%"REG_c") \n\t" | |
2833 | PREFETCH" 32(%%"REG_c") \n\t" | |
2834 | PREFETCH" 64(%%"REG_c") \n\t" | |
b7dc6f66 MN |
2835 | |
2836 | FUNNY_UV_CODE | |
2837 | FUNNY_UV_CODE | |
2838 | FUNNY_UV_CODE | |
2839 | FUNNY_UV_CODE | |
2840 | ||
83c89c78 JT |
2841 | #if defined(PIC) |
2842 | "mov %6, %%"REG_b" \n\t" | |
2843 | #endif | |
b7dc6f66 MN |
2844 | :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2845 | "m" (funnyUVCode), "m" (src2) | |
83c89c78 JT |
2846 | #if defined(PIC) |
2847 | ,"m" (ebxsave) | |
2848 | #endif | |
91d0bda2 | 2849 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
83c89c78 JT |
2850 | #if !defined(PIC) |
2851 | ,"%"REG_b | |
2852 | #endif | |
b7dc6f66 | 2853 | ); |
// right edge fixup: outputs whose source position is at or past the last source sample are overwritten with that sample (scaled by 128), in both planes
c1b0bfb4 | 2854 | for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2ff198c1 | 2855 | { |
c1b0bfb4 MN |
2856 | // printf("%d %d %d\n", dstWidth, i, srcW); |
2857 | dst[i] = src1[srcW-1]*128; | |
2858 | dst[i+2048] = src2[srcW-1]*128; | |
2ff198c1 MN |
2859 | } |
2860 | } | |
2861 | else | |
2862 | { | |
2863 | #endif | |
// xInc is 16.16 fixed point: integer step and 16-bit fraction are kept apart for the add / adc pair in the loop
20ffdcf9 | 2864 | long xInc_shr16 = (long) (xInc >> 16); |
6a4970ab | 2865 | uint16_t xInc_mask = xInc & 0xffff; |
2ff198c1 | 2866 | asm volatile( |
6e1c66bc | 2867 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
83c89c78 | 2868 | "xor %%"REG_d", %%"REG_d" \n\t" // xx |
2ff198c1 | 2869 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
4bff9ef9 | 2870 | ASMALIGN(4) |
2ff198c1 | 2871 | "1: \n\t" |
6e1c66bc | 2872 | "mov %0, %%"REG_S" \n\t" |
83c89c78 JT |
2873 | "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] |
2874 | "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2ff198c1 MN |
2875 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2876 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2877 | "shll $16, %%edi \n\t" | |
2878 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
6e1c66bc | 2879 | "mov %1, %%"REG_D" \n\t" |
2ff198c1 | 2880 | "shrl $9, %%esi \n\t" |
c35afa2f | 2881 | "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" |
2ff198c1 | 2882 | |
83c89c78 JT |
/* same interpolation for the second plane, read through %5 (src2) and stored 4096 bytes further into dst */
2883 | "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] |
2884 | "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
2ff198c1 MN |
2885 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2886 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2887 | "shll $16, %%edi \n\t" | |
2888 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
6e1c66bc | 2889 | "mov %1, %%"REG_D" \n\t" |
2ff198c1 | 2890 | "shrl $9, %%esi \n\t" |
6e1c66bc | 2891 | "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" |
2ff198c1 MN |
2892 | |
2893 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF | |
83c89c78 | 2894 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
6e1c66bc AJ |
2895 | "add $1, %%"REG_a" \n\t" |
2896 | "cmp %2, %%"REG_a" \n\t" | |
2ff198c1 MN |
2897 | " jb 1b \n\t" |
2898 | ||
dc77ef7f GP |
2899 | /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, |
2900 | which is needed to support GCC-4.0 */ | |
2901 | #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | |
2902 | :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2903 | #else | |
9cc768f6 | 2904 | :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
dc77ef7f | 2905 | #endif |
2ff198c1 | 2906 | "r" (src2) |
83c89c78 | 2907 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
2ff198c1 MN |
2908 | ); |
2909 | #ifdef HAVE_MMX2 | |
77a416e8 | 2910 | } //if MMX2 can't be used |
2ff198c1 MN |
2911 | #endif |
2912 | #else | |
96034638 MN |
// portable fast bilinear: xpos is the 16.16 source position, xx its integer part, xalpha the top 7 bits of its fraction;
// xalpha^127 equals 127-xalpha for a 7-bit xalpha, so the two weights add up to 127
2913 | int i; |
2914 | unsigned int xpos=0; | |
2915 | for(i=0;i<dstWidth;i++) | |
2916 | { | |
2917 | register unsigned int xx=xpos>>16; | |
2918 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2919 | dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2920 | dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
1faf0867 MN |
2921 | /* slower |
2922 | dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2923 | dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2924 | */ | |
96034638 MN |
2925 | xpos+=xInc; |
2926 | } | |
2ff198c1 | 2927 | #endif |
077ea8a7 MN |
2928 | } |
2929 | } | |
2930 | ||
3e499f53 MN |
2931 | static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2932 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
28bf81c9 MN |
2933 | |
2934 | /* load a few things into local vars to make the code more readable? and faster */ | |
2935 | const int srcW= c->srcW; | |
2936 | const int dstW= c->dstW; | |
2937 | const int dstH= c->dstH; | |
2938 | const int chrDstW= c->chrDstW; | |
e616aa93 | 2939 | const int chrSrcW= c->chrSrcW; |
28bf81c9 MN |
2940 | const int lumXInc= c->lumXInc; |
2941 | const int chrXInc= c->chrXInc; | |
fe8054c0 | 2942 | const int dstFormat= c->dstFormat; |
44c1035c | 2943 | const int srcFormat= c->srcFormat; |
28bf81c9 MN |
2944 | const int flags= c->flags; |
2945 | const int canMMX2BeUsed= c->canMMX2BeUsed; | |
2946 | int16_t *vLumFilterPos= c->vLumFilterPos; | |
2947 | int16_t *vChrFilterPos= c->vChrFilterPos; | |
2948 | int16_t *hLumFilterPos= c->hLumFilterPos; | |
2949 | int16_t *hChrFilterPos= c->hChrFilterPos; | |
2950 | int16_t *vLumFilter= c->vLumFilter; | |
2951 | int16_t *vChrFilter= c->vChrFilter; | |
2952 | int16_t *hLumFilter= c->hLumFilter; | |
2953 | int16_t *hChrFilter= c->hChrFilter; | |
77a49659 MN |
2954 | int32_t *lumMmxFilter= c->lumMmxFilter; |
2955 | int32_t *chrMmxFilter= c->chrMmxFilter; | |
28bf81c9 MN |
2956 | const int vLumFilterSize= c->vLumFilterSize; |
2957 | const int vChrFilterSize= c->vChrFilterSize; | |
2958 | const int hLumFilterSize= c->hLumFilterSize; | |
2959 | const int hChrFilterSize= c->hChrFilterSize; | |
2960 | int16_t **lumPixBuf= c->lumPixBuf; | |
2961 | int16_t **chrPixBuf= c->chrPixBuf; | |
2962 | const int vLumBufSize= c->vLumBufSize; | |
2963 | const int vChrBufSize= c->vChrBufSize; | |
2964 | uint8_t *funnyYCode= c->funnyYCode; | |
2965 | uint8_t *funnyUVCode= c->funnyUVCode; | |
1e621b18 | 2966 | uint8_t *formatConvBuffer= c->formatConvBuffer; |
e616aa93 MN |
2967 | const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; |
2968 | const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
d4e24275 | 2969 | int lastDstY; |
e28630fc | 2970 | uint8_t *pal=NULL; |
28bf81c9 MN |
2971 | |
2972 | /* vars whch will change and which we need to storw back in the context */ | |
2973 | int dstY= c->dstY; | |
2974 | int lumBufIndex= c->lumBufIndex; | |
2975 | int chrBufIndex= c->chrBufIndex; | |
2976 | int lastInLumBuf= c->lastInLumBuf; | |
2977 | int lastInChrBuf= c->lastInChrBuf; | |
6a4970ab | 2978 | |
5859233b | 2979 | if(isPacked(c->srcFormat)){ |
e28630fc | 2980 | pal= src[1]; |
1e621b18 MN |
2981 | src[0]= |
2982 | src[1]= | |
3e499f53 | 2983 | src[2]= src[0]; |
5859233b | 2984 | srcStride[0]= |
1e621b18 | 2985 | srcStride[1]= |
3e499f53 | 2986 | srcStride[2]= srcStride[0]; |
6c7506de | 2987 | } |
5859233b MN |
2988 | srcStride[1]<<= c->vChrDrop; |
2989 | srcStride[2]<<= c->vChrDrop; | |
6c7506de | 2990 | |
c7a810cc MN |
2991 | // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], |
2992 | // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2993 | ||
2994 | #if 0 //self test FIXME move to a vfilter or something | |
2995 | { | |
2996 | static volatile int i=0; | |
2997 | i++; | |
e9e12f0e | 2998 | if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) |
c7a810cc MN |
2999 | selfTest(src, srcStride, c->srcW, c->srcH); |
3000 | i--; | |
3001 | } | |
3002 | #endif | |
37079906 MN |
3003 | |
3004 | //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
3005 | //dstStride[0],dstStride[1],dstStride[2]); | |
6c7506de MN |
3006 | |
3007 | if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
3008 | { | |
3009 | static int firstTime=1; //FIXME move this into the context perhaps | |
3010 | if(flags & SWS_PRINT_INFO && firstTime) | |
3011 | { | |
2d529db5 | 3012 | av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n" |
6c7506de MN |
3013 | "SwScaler: ->cannot do aligned memory acesses anymore\n"); |
3014 | firstTime=0; | |
3015 | } | |
3016 | } | |
28bf81c9 | 3017 | |
1e621b18 MN |
3018 | /* Note: the user might start scaling the picture in the middle, so this will not get executed; |
3019 | this is not really intended but works currently, so people might do it */ | |
28bf81c9 MN |
3020 | if(srcSliceY ==0){ |
3021 | lumBufIndex=0; | |
3022 | chrBufIndex=0; | |
6a4970ab | 3023 | dstY=0; |
28bf81c9 MN |
3024 | lastInLumBuf= -1; |
3025 | lastInChrBuf= -1; | |
077ea8a7 | 3026 | } |
d3f41512 | 3027 | |
d4e24275 MN |
3028 | lastDstY= dstY; |
3029 | ||
c1b0bfb4 | 3030 | for(;dstY < dstH; dstY++){ |
28bf81c9 | 3031 | unsigned char *dest =dst[0]+dstStride[0]*dstY; |
3f7bb50c MN |
3032 | const int chrDstY= dstY>>c->chrDstVSubSample; |
3033 | unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
3034 | unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
d3f41512 | 3035 | |
c1b0bfb4 MN |
3036 | const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input |
3037 | const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
3038 | const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
3039 | const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
d604bab9 | 3040 | |
379a2036 MN |
3041 | //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", |
3042 | // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
c7f822d9 MN |
3043 | //handle holes (FAST_BILINEAR & weird filters) |
3044 | if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
3045 | if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
3046 | //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
c1b0bfb4 MN |
3047 | ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
3048 | ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
d3f41512 | 3049 | |
c1b0bfb4 | 3050 | // Do we have enough lines in this slice to output the dstY line |
e616aa93 | 3051 | if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) |
c1b0bfb4 MN |
3052 | { |
3053 | //Do horizontal scaling | |
3054 | while(lastInLumBuf < lastLumSrcY) | |
d3f41512 | 3055 | { |
28bf81c9 | 3056 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
c1b0bfb4 | 3057 | lumBufIndex++; |
c7f822d9 | 3058 | // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); |
c1b0bfb4 MN |
3059 | ASSERT(lumBufIndex < 2*vLumBufSize) |
3060 | ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
3061 | ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
3062 | // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
28bf81c9 MN |
3063 | RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
3064 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
6a4970ab | 3065 | funnyYCode, c->srcFormat, formatConvBuffer, |
e28630fc | 3066 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); |
c1b0bfb4 MN |
3067 | lastInLumBuf++; |
3068 | } | |
3069 | while(lastInChrBuf < lastChrSrcY) | |
3070 | { | |
e616aa93 MN |
3071 |