// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL
#include <inttypes.h>
#include <string.h>
#include "../config.h"
#include "swscale.h"
#include "../mmx_defs.h"
#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
/*
NOTES

known BUGS with known cause (no bug reports please, but patches are welcome :) )
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
bicubic scaler
dither in C
change the distance of the u & v buffer
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
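
/* For reference: pavgb (MMX2) and pavgusb (3DNow!) both compute a byte-wise
   average rounded upwards. A scalar sketch of the same operation
   (illustrative only, not used by the asm paths): */
static inline uint8_t pavgb_scalar(uint8_t a, uint8_t b)
{
    return (a + b + 1)>>1;  // average of two bytes, rounding up
}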

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif


#ifdef HAVE_MMX
// yuv->rgb multipliers in 3.13 fixed point (roughly the BT.601 constants
// 1.164, 1.596, 2.018, -0.813 and -0.391)
static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};
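
/* The two quadwords in each table above are two rows of a small ordered-
   dither pattern; one row is copied into b5Dither/g5Dither/g6Dither/r5Dither
   per output line elsewhere (not shown in this excerpt). Adding such a bias
   with paddusb before truncating a channel to 5 (or 6) bits trades banding
   for noise. A scalar sketch of the idea (illustrative only): */
static inline uint8_t dither_to_5bit(uint8_t v, int bias /* from the tables */)
{
    int t= v + bias;
    if(t > 255) t= 255; // saturate like paddusb
    return t>>3;        // keep the top 5 bits
}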

static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// clipping tables with the color components already shifted into their
// 15/16bit positions (they are simply ORed together below):
static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

#ifdef HAVE_MMX2
// buffers for the dynamically generated horizontal scalers
// (filled elsewhere; invoked below via "call funnyYCode"/"call funnyUVCode")
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif

static int canMMX2BeUsed=0;

#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %6, %%mm6 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "movd %7, %%mm5 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "xorl %%eax, %%eax \n\t"\
    ".balign 16 \n\t"\
    "1: \n\t"\
    "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
    "pmulhw yCoeff, %%mm1 \n\t"\
\
\
    "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "pmulhw ubCoeff, %%mm3 \n\t"\
    "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "pmulhw ugCoeff, %%mm2 \n\t"\
    "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
\
\
    "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
    "pmulhw vrCoeff, %%mm0 \n\t"\
    "pmulhw vgCoeff, %%mm4 \n\t"\
    "paddw %%mm1, %%mm3 \n\t" /* B*/\
    "paddw %%mm1, %%mm0 \n\t" /* R*/\
    "packuswb %%mm3, %%mm3 \n\t"\
\
    "packuswb %%mm0, %%mm0 \n\t"\
    "paddw %%mm4, %%mm2 \n\t"\
    "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
    "packuswb %%mm1, %%mm1 \n\t"

#define YSCALEYUV2RGB \
    "movd %6, %%mm6 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "movq %%mm6, asm_yalpha1 \n\t"\
    "movd %7, %%mm5 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "movq %%mm5, asm_uvalpha1 \n\t"\
    "xorl %%eax, %%eax \n\t"\
    ".balign 16 \n\t"\
    "1: \n\t"\
    "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq asm_uvalpha1, %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3 \n\t"\
    "pmulhw vgCoeff, %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax+4]*/\
    "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax+4]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw ubCoeff, %%mm2 \n\t"\
    "pmulhw vrCoeff, %%mm5 \n\t"\
    "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1 \n\t"\
    "pmulhw yCoeff, %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"

// variant using only buf0 and uvbuf0 (no vertical interpolation at all)
#define YSCALEYUV2RGB1 \
    "xorl %%eax, %%eax \n\t"\
    ".balign 16 \n\t"\
    "1: \n\t"\
    "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3 \n\t"\
    "pmulhw vgCoeff, %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw ubCoeff, %%mm2 \n\t"\
    "pmulhw vrCoeff, %%mm5 \n\t"\
    "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1 \n\t"\
    "pmulhw yCoeff, %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"

// do vertical chrominance interpolation, but use only buf0 for luma
#define YSCALEYUV2RGB1b \
    "xorl %%eax, %%eax \n\t"\
    ".balign 16 \n\t"\
    "1: \n\t"\
    "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t"\
    "psrlw $5, %%mm4 \n\t"\
    "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
    "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw ugCoeff, %%mm3 \n\t"\
    "pmulhw vgCoeff, %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw ubCoeff, %%mm2 \n\t"\
    "pmulhw vrCoeff, %%mm5 \n\t"\
    "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw yCoeff, %%mm1 \n\t"\
    "pmulhw yCoeff, %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"

#define WRITEBGR32 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (%4, %%eax, 4))\
    MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
    MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
    MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

#define WRITEBGR16 \
    "pand bF8, %%mm2 \n\t" /* B */\
    "pand bFC, %%mm4 \n\t" /* G */\
    "pand bF8, %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

#define WRITEBGR15 \
    "pand bF8, %%mm2 \n\t" /* B */\
    "pand bF8, %%mm4 \n\t" /* G */\
    "pand bF8, %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (%4, %%eax, 2))\
    MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

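/* Scalar equivalents of the packing done by WRITEBGR16/WRITEBGR15
   (illustrative only): mask each channel to its top bits, then shift the
   fields into place exactly as the pand/psrlq/psllq sequences above do. */
static inline uint16_t pack565_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r&0xF8)<<8) | ((g&0xFC)<<3) | (b>>3); // RRRRRGGG GGGBBBBB
}
static inline uint16_t pack555_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r&0xF8)<<7) | ((g&0xF8)<<2) | (b>>3); // 0RRRRRGG GGGBBBBB
}
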
#define WRITEBGR24OLD \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
    "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
    "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
    "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (%%ebx))\
    MOVNTQ(%%mm2, 8(%%ebx))\
    MOVNTQ(%%mm3, 16(%%ebx))\
    "addl $24, %%ebx \n\t"\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (%%ebx))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(%%ebx))\
\
    "addl $24, %%ebx \n\t"\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq M24A, %%mm0 \n\t"\
    "movq M24C, %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (%%ebx))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(%%ebx))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(%%ebx))\
\
    "addl $24, %%ebx \n\t"\
\
    "addl $8, %%eax \n\t"\
    "cmpl %5, %%eax \n\t"\
    " jb 1b \n\t"

#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif

#ifdef HAVE_MMX
// references all the statics that are only used inside asm blocks, so gcc
// neither warns about them nor discards them as unused
void in_asm_used_var_warning_killer()
{
    int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
    bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
    M24A+M24B+M24C;
    if(i) i=0;
}
#endif

/**
 * vertical blend of two source lines, yuv->yuv (16bit line buffers -> 8bit output)
 */
static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			   uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;
    int i;

    asm volatile ("\n\t"::: "memory"); // compiler memory barrier

    for(i=0;i<dstw;i++)
    {
        ((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
    }

    if(uvalpha != -1)
    {
        for(i=0; i<(dstw>>1); i++)
        {
            ((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
            ((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
        }
    }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;

    if(fullUVIpol)
    {

#ifdef HAVE_MMX
        if(dstbpp == 32)
        {
            asm volatile(


FULL_YSCALEYUV2RGB
            "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
            "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

            "movq %%mm3, %%mm1 \n\t"
            "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
            "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

            MOVNTQ(%%mm3, (%4, %%eax, 4))
            MOVNTQ(%%mm1, 8(%4, %%eax, 4))

            "addl $4, %%eax \n\t"
            "cmpl %5, %%eax \n\t"
            " jb 1b \n\t"


            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(

FULL_YSCALEYUV2RGB

            // lsb ... msb
            "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
            "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

            "movq %%mm3, %%mm1 \n\t"
            "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
            "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

            "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
            "psrlq $8, %%mm3 \n\t" // GR0BGR00
            "pand bm00000111, %%mm2 \n\t" // BGR00000
            "pand bm11111000, %%mm3 \n\t" // 000BGR00
            "por %%mm2, %%mm3 \n\t" // BGRBGR00
            "movq %%mm1, %%mm2 \n\t"
            "psllq $48, %%mm1 \n\t" // 000000BG
            "por %%mm1, %%mm3 \n\t" // BGRBGRBG

            "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
            "psrld $16, %%mm2 \n\t" // R000R000
            "psrlq $24, %%mm1 \n\t" // 0BGR0000
            "por %%mm2, %%mm1 \n\t" // RBGRR000

            "movl %4, %%ebx \n\t"
            "addl %%eax, %%ebx \n\t" // ebx = dest+i, so (ebx,eax,2) = dest+3*i

#ifdef HAVE_MMX2
            //FIXME Alignment
            "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
            "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
            "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
            "psrlq $32, %%mm3 \n\t"
            "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
            "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
            "addl $4, %%eax \n\t"
            "cmpl %5, %%eax \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
            "paddusb g5Dither, %%mm1 \n\t"
            "paddusb r5Dither, %%mm0 \n\t"
            "paddusb b5Dither, %%mm3 \n\t"
#endif
            "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
            "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
            "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

            "psrlw $3, %%mm3 \n\t"
            "psllw $2, %%mm1 \n\t"
            "psllw $7, %%mm0 \n\t"
            "pand g15Mask, %%mm1 \n\t"
            "pand r15Mask, %%mm0 \n\t"

            "por %%mm3, %%mm1 \n\t"
            "por %%mm1, %%mm0 \n\t"

            MOVNTQ(%%mm0, (%4, %%eax, 2))

            "addl $4, %%eax \n\t"
            "cmpl %5, %%eax \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
            "paddusb g6Dither, %%mm1 \n\t"
            "paddusb r5Dither, %%mm0 \n\t"
            "paddusb b5Dither, %%mm3 \n\t"
#endif
            "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
            "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
            "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

            "psrlw $3, %%mm3 \n\t"
            "psllw $3, %%mm1 \n\t"
            "psllw $8, %%mm0 \n\t"
            "pand g16Mask, %%mm1 \n\t"
            "pand r16Mask, %%mm0 \n\t"

            "por %%mm3, %%mm1 \n\t"
            "por %%mm1, %%mm0 \n\t"

            MOVNTQ(%%mm0, (%4, %%eax, 2))

            "addl $4, %%eax \n\t"
            "cmpl %5, %%eax \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
#else
        asm volatile ("\n\t"::: "memory"); // compiler memory barrier

        if(dstbpp==32 || dstbpp==24)
        {
            int i;
            for(i=0;i<dstw;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+=dstbpp>>3;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0;i<dstw;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table16r[(Y + yuvtab_3343[V]) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0;i<dstw;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table15r[(Y + yuvtab_3343[V]) >>13];
            }
        }
#endif
    }//FULL_UV_IPOL
    else
    {
#ifdef HAVE_MMX
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB
                WRITEBGR32

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB
                WRITEBGR24

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g5Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif

                WRITEBGR15

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g6Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif

                WRITEBGR16

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
#else
        asm volatile ("\n\t"::: "memory"); // compiler memory barrier

        if(dstbpp==32)
        {
            int i;
            for(i=0; i<dstw-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
            }
        }
        if(dstbpp==24)
        {
            int i;
            for(i=0; i<dstw-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                dest[0]=clip_table[((Y1 + Cb) >>13)];
                dest[1]=clip_table[((Y1 + Cg) >>13)];
                dest[2]=clip_table[((Y1 + Cr) >>13)];

                dest[3]=clip_table[((Y2 + Cb) >>13)];
                dest[4]=clip_table[((Y2 + Cg) >>13)];
                dest[5]=clip_table[((Y2 + Cr) >>13)];
                dest+=6;
            }
        }
        else if(dstbpp==16)
        {
            int i;
            for(i=0; i<dstw-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y1 + Cb) >>13] |
                    clip_table16g[(Y1 + Cg) >>13] |
                    clip_table16r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table16b[(Y2 + Cb) >>13] |
                    clip_table16g[(Y2 + Cg) >>13] |
                    clip_table16r[(Y2 + Cr) >>13];
            }
        }
        else if(dstbpp==15)
        {
            int i;
            for(i=0; i<dstw-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y1 + Cb) >>13] |
                    clip_table15g[(Y1 + Cg) >>13] |
                    clip_table15r[(Y1 + Cr) >>13];

                ((uint16_t*)dest)[i+1] =
                    clip_table15b[(Y2 + Cb) >>13] |
                    clip_table15g[(Y2 + Cg) >>13] |
                    clip_table15r[(Y2 + Cr) >>13];
            }
        }
#endif
    } //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
    int uvalpha1=uvalpha^4095;
#ifdef HAVE_MMX
    int yalpha1=yalpha^4095;
#endif

    if(fullUVIpol || allwaysIpol)
    {
        yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
        return;
    }
    if( yalpha > 2048 ) buf0 = buf1; // use the closer source line

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1
                WRITEBGR32
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB1
                WRITEBGR24
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g5Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif
                WRITEBGR15
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g6Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif

                WRITEBGR16
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
    }
    else
    {
        if(dstbpp == 32)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                WRITEBGR32
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB1b
                WRITEBGR24
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax", "%ebx"
            );
        }
        else if(dstbpp==15)
        {
            asm volatile(
                YSCALEYUV2RGB1b
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g5Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif
                WRITEBGR15
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
        else if(dstbpp==16)
        {
            asm volatile(
                YSCALEYUV2RGB1b
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb b5Dither, %%mm2 \n\t"
                "paddusb g6Dither, %%mm4 \n\t"
                "paddusb r5Dither, %%mm5 \n\t"
#endif

                WRITEBGR16
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
            "m" (yalpha1), "m" (uvalpha1)
            : "%eax"
            );
        }
    }
#else
//FIXME write 2 versions (for even & odd lines)
    asm volatile ("\n\t"::: "memory"); // compiler memory barrier

    if(dstbpp==32)
    {
        int i;
        for(i=0; i<dstw-1; i+=2){
            // vertical linear interpolation (chrominance only) && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

            dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    if(dstbpp==24)
    {
        int i;
        for(i=0; i<dstw-1; i+=2){
            // vertical linear interpolation (chrominance only) && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];

            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstbpp==16)
    {
        int i;
        for(i=0; i<dstw-1; i+=2){
            // vertical linear interpolation (chrominance only) && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table16b[(Y1 + Cb) >>13] |
                clip_table16g[(Y1 + Cg) >>13] |
                clip_table16r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table16b[(Y2 + Cb) >>13] |
                clip_table16g[(Y2 + Cg) >>13] |
                clip_table16r[(Y2 + Cr) >>13];
        }
    }
    else if(dstbpp==15)
    {
        int i;
        for(i=0; i<dstw-1; i+=2){
            // vertical linear interpolation (chrominance only) && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];

            ((uint16_t*)dest)[i] =
                clip_table15b[(Y1 + Cb) >>13] |
                clip_table15g[(Y1 + Cg) >>13] |
                clip_table15r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[i+1] =
                clip_table15b[(Y2 + Cb) >>13] |
                clip_table15g[(Y2 + Cg) >>13] |
                clip_table15r[(Y2 + Cr) >>13];
        }
    }
#endif
}


static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
{
    // *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86
#ifdef HAVE_MMX2
    int i;
    if(canMMX2BeUsed)
    {
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
            "movd %5, %%mm6 \n\t" // xInc&0xFFFF
            "punpcklwd %%mm6, %%mm6 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            "movq %%mm6, %%mm2 \n\t"
            "psllq $16, %%mm2 \n\t"
            "paddw %%mm6, %%mm2 \n\t"
            "psllq $16, %%mm2 \n\t"
            "paddw %%mm6, %%mm2 \n\t"
            "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
            "movq %%mm2, temp0 \n\t"
            "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
            "punpcklwd %%mm6, %%mm6 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            "xorl %%eax, %%eax \n\t" // i
            "movl %0, %%esi \n\t" // src
            "movl %1, %%edi \n\t" // buf1
            "movl %3, %%edx \n\t" // (xInc*4)>>16
            "xorl %%ecx, %%ecx \n\t"
            "xorl %%ebx, %%ebx \n\t"
            "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
            PREFETCH" 1024(%%esi) \n\t"\
            PREFETCH" 1056(%%esi) \n\t"\
            PREFETCH" 1088(%%esi) \n\t"\
            "call funnyYCode \n\t"\
            "movq temp0, %%mm2 \n\t"\
            "xorl %%ecx, %%ecx \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

            :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
            "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
            : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
        for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
    }
    else
    {
#endif
    //NO MMX2, just plain asm ...
    asm volatile(
        "xorl %%eax, %%eax \n\t" // i
        "xorl %%ebx, %%ebx \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ".balign 16 \n\t"
        "1: \n\t"
        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%edi, %%eax, 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 2(%%edi, %%eax, 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry


        "addl $2, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
        : "%eax", "%ebx", "%ecx", "%edi", "%esi"
        );
#ifdef HAVE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for(i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif
}

1364 | inline static void hcscale(uint16_t *dst, int dstWidth, | |
1365 | uint8_t *src1, uint8_t *src2, int srcWidth, int xInc) | |
1366 | { | |
2ff198c1 MN |
1367 | #ifdef ARCH_X86 |
1368 | #ifdef HAVE_MMX2 | |
96034638 | 1369 | int i; |
2ff198c1 MN |
1370 | if(canMMX2BeUsed) |
1371 | { | |
1372 | asm volatile( | |
1373 | "pxor %%mm7, %%mm7 \n\t" | |
1374 | "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
1375 | "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
1376 | "punpcklwd %%mm6, %%mm6 \n\t" | |
1377 | "punpcklwd %%mm6, %%mm6 \n\t" | |
1378 | "movq %%mm6, %%mm2 \n\t" | |
1379 | "psllq $16, %%mm2 \n\t" | |
1380 | "paddw %%mm6, %%mm2 \n\t" | |
1381 | "psllq $16, %%mm2 \n\t" | |
1382 | "paddw %%mm6, %%mm2 \n\t" | |
1383 | "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | |
1384 | "movq %%mm2, temp0 \n\t" | |
1385 | "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
1386 | "punpcklwd %%mm6, %%mm6 \n\t" | |
1387 | "punpcklwd %%mm6, %%mm6 \n\t" | |
1388 | "xorl %%eax, %%eax \n\t" // i | |
1389 | "movl %0, %%esi \n\t" // src | |
1390 | "movl %1, %%edi \n\t" // buf1 | |
1391 | "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
1392 | "xorl %%ecx, %%ecx \n\t" | |
1393 | "xorl %%ebx, %%ebx \n\t" | |
1394 | "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
1395 | ||
2ff198c1 | 1396 | #define FUNNYUVCODE \ |
99cefd0b MN |
1397 | PREFETCH" 1024(%%esi) \n\t"\ |
1398 | PREFETCH" 1056(%%esi) \n\t"\ | |
1399 | PREFETCH" 1088(%%esi) \n\t"\ | |
2ff198c1 MN |
1400 | "call funnyUVCode \n\t"\ |
1401 | "movq temp0, %%mm2 \n\t"\ | |
1402 | "xorl %%ecx, %%ecx \n\t" | |
2ff198c1 MN |
1403 | |
1404 | FUNNYUVCODE | |
1405 | FUNNYUVCODE | |
1406 | FUNNYUVCODE | |
1407 | FUNNYUVCODE | |
1408 | ||
1409 | FUNNYUVCODE | |
1410 | FUNNYUVCODE | |
1411 | FUNNYUVCODE | |
1412 | FUNNYUVCODE | |
2ff198c1 MN |
1413 | "xorl %%eax, %%eax \n\t" // i |
1414 | "movl %6, %%esi \n\t" // src | |
1415 | "movl %1, %%edi \n\t" // buf1 | |
1416 | "addl $4096, %%edi \n\t" | |
1417 | ||
1418 | FUNNYUVCODE | |
1419 | FUNNYUVCODE | |
1420 | FUNNYUVCODE | |
1421 | FUNNYUVCODE | |
1422 | ||
1423 | FUNNYUVCODE | |
1424 | FUNNYUVCODE | |
1425 | FUNNYUVCODE | |
1426 | FUNNYUVCODE | |
1427 | ||
1428 | :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
1429 | "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2) | |
1430 | : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
1431 | ); | |
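/* funnyUVCode (generated below in SwScale_YV12slice) is a straight-line
   chain of 4-pixel fragments ending in a near RET, built for one eighth
   of the output row; %eax (output offset) and %esi (source index) keep
   advancing across calls, and s_xinc is fudged so each eighth starts at
   the same subpixel phase (reloaded from temp0).  Eight calls therefore
   cover the U row; %esi is then reset to src2 and %edi moved up 4096
   bytes for eight more calls over the V row.  The loop below clamps the
   last few output pixels, which could otherwise read past the last
   chroma sample, to that sample (*128 matches the 7-bit-fraction format
   of the buffer). */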
1432 | for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--) | |
1433 | { | |
1434 | dst[i] = src1[srcWidth/2-1]*128; | |
1435 | dst[i+2048] = src2[srcWidth/2-1]*128; | |
1436 | } | |
1437 | } | |
1438 | else | |
1439 | { | |
1440 | #endif | |
1441 | asm volatile( | |
1442 | "xorl %%eax, %%eax \n\t" // i | |
1443 | "xorl %%ebx, %%ebx \n\t" // xx | |
1444 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
cff6ecd7 | 1445 | ".balign 16 \n\t" |
2ff198c1 MN |
1446 | "1: \n\t" |
1447 | "movl %0, %%esi \n\t" | |
1448 | "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | |
1449 | "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |
1450 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1451 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1452 | "shll $16, %%edi \n\t" | |
1453 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1454 | "movl %1, %%edi \n\t" | |
1455 | "shrl $9, %%esi \n\t" | |
1456 | "movw %%si, (%%edi, %%eax, 2) \n\t" | |
1457 | ||
1458 | "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |
1459 | "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |
1460 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1461 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1462 | "shll $16, %%edi \n\t" | |
1463 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1464 | "movl %1, %%edi \n\t" | |
1465 | "shrl $9, %%esi \n\t" | |
1466 | "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |
1467 | ||
1468 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1469 | "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1470 | "addl $1, %%eax \n\t" | |
1471 | "cmpl %2, %%eax \n\t" | |
1472 | " jb 1b \n\t" | |
1473 | ||
1474 | :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF), | |
1475 | "r" (src2) | |
1476 | : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
1477 | ); | |
1478 | #ifdef HAVE_MMX2 | |
1479 | } //if MMX2 can't be used | |
1480 | #endif | |
1481 | #else | |
96034638 MN |
1482 | int i; |
1483 | unsigned int xpos=0; | |
1484 | for(i=0;i<dstWidth;i++) | |
1485 | { | |
1486 | register unsigned int xx=xpos>>16; | |
1487 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
1488 | dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
1489 | dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
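/* xalpha is only 7 bits wide here, so (xalpha^127) == 127-xalpha: the XOR
   is a cheap way to form the complementary blend weight */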
1faf0867 MN |
1490 | /* slower |
1491 | dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
1492 | dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
1493 | */ | |
96034638 MN |
1494 | xpos+=xInc; |
1495 | } | |
2ff198c1 MN |
1496 | #endif |
1497 | } | |
d604bab9 | 1498 | |
d3f41512 | 1499 | |
38858470 | 1500 | // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices: |
31190492 A |
1501 | // *** Note: it's called multiple times while decoding a frame, first time y==0 |
1502 | // *** Designed to upscale, but may work for downscale too. | |
44f9179b | 1503 | // s_xinc = (src_width << 16) / dst_width |
31190492 | 1504 | // s_yinc = (src_height << 16) / dst_height |
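// e.g. scaling 720 source pixels to 1024 destination pixels gives
// s_xinc = (720<<16)/1024 = 0xB400, i.e. 0.703125 source pixels per
// destination pixel in 16.16 fixed point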
38858470 MN |
1505 | void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h, |
1506 | uint8_t* dstptr[], int dststride, int dstw, int dstbpp, | |
31190492 A |
1507 | unsigned int s_xinc,unsigned int s_yinc){ |
1508 | ||
1509 | // scaling factors: | |
1510 | //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | |
1511 | //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | |
1512 | ||
783e9cc9 | 1513 | unsigned int s_xinc2; |
31190492 | 1514 | |
783e9cc9 | 1515 | static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
31190492 | 1516 | static int s_ypos; |
783e9cc9 MN |
1517 | |
1518 | // last horizontally interpolated lines, used to avoid unnecessary calculations | |
31190492 | 1519 | static int s_last_ypos; |
783e9cc9 MN |
1520 | static int s_last_y1pos; |
1521 | ||
d3f41512 | 1522 | #ifdef HAVE_MMX2 |
783e9cc9 | 1523 | // used to detect a horizontal size change |
d3f41512 MN |
1524 | static int old_dstw= -1; |
1525 | static int old_s_xinc= -1; | |
1526 | #endif | |
d604bab9 | 1527 | |
7d7f78b5 MN |
1528 | int srcWidth; |
1529 | int dstUVw; | |
162caf68 | 1530 | int i; |
31190492 | 1531 | |
7d7f78b5 MN |
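	// if rounding dstw up to a multiple of 8 would overrun dststride,
	// truncate it down to a multiple of 8 instead: the SIMD scalers work
	// in 8-pixel chunks and may touch pixels up to that boundary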
1532 | if(((dstw + 7)&(~7)) >= dststride) dstw&= ~7; |
1533 | ||
1534 | srcWidth= (dstw*s_xinc + 0x8000)>>16; | |
1535 | dstUVw= fullUVIpol ? dstw : dstw/2; | |
1536 | ||
d3fda508 | 1537 | #ifdef HAVE_MMX2 |
0f25d72b | 1538 | canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0; |
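	// the MMX2 path can only upscale (s_xinc <= 1.0 in 16.16) and needs
	// dstw to be a multiple of 32 and srcWidth a multiple of 16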
d3fda508 MN |
1539 | #endif |
1540 | ||
0f25d72b MN |
1541 | // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst |
1542 | // n-2 is the last chrominance sample available | |
1543 | // FIXME this is not perfect, but no one should notice the difference; the more correct variant | |
1544 | // would be like the vertical one, but that would require some special code for the | |
1545 | // first and last pixel | |
1546 | if(canMMX2BeUsed) s_xinc+= 20; | |
1547 | else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20; | |
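	/* with xx = (i*s_xinc)>>16, s_xinc = ((srcWidth-2)<<16)/(dstw-2) puts
	   dst pixel dstw-2 exactly on src pixel srcWidth-2; the +/-20
	   (roughly 0.0003 pixel per step) appears to bias rounding so the
	   scaler stays clear of the unusable edge samples */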
0f25d72b | 1548 | |
1faf0867 MN |
1549 | if(fullUVIpol && !(dstbpp==12)) s_xinc2= s_xinc>>1; |
1550 | else s_xinc2= s_xinc; | |
b3a134b6 | 1551 | // force calculation of the horizontal interpolation of the first line |
b3a134b6 | 1552 | |
31190492 | 1553 | if(y==0){ |
7d7f78b5 | 1554 | // printf("dstw %d, srcw %d, mmx2 %d\n", dstw, srcWidth, canMMX2BeUsed); |
2ff198c1 MN |
1555 | s_last_ypos=-99; |
1556 | s_last_y1pos=-99; | |
1557 | s_srcypos= s_yinc/2 - 0x8000; | |
1558 | s_ypos=0; | |
162caf68 MN |
1559 | |
1560 | // clean the buffers so that no green stuff is drawn if the width is not sane (not a multiple of 8) | |
1561 | for(i=dstw-2; i<dstw+20; i++) | |
1562 | { | |
1563 | pix_buf_uv[0][i] = pix_buf_uv[1][i] | |
7d7f78b5 | 1564 | = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128*128; |
162caf68 | 1565 | pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2] |
7d7f78b5 | 1566 | = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128; |
162caf68 MN |
1567 | pix_buf_y[0][i]= pix_buf_y[1][i]= 0; |
1568 | } | |
1569 | ||
d3f41512 MN |
1570 | #ifdef HAVE_MMX2 |
1571 | // can't downscale !!! | |
783e9cc9 | 1572 | if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) |
d3f41512 MN |
1573 | { |
1574 | uint8_t *fragment; | |
1575 | int imm8OfPShufW1; | |
1576 | int imm8OfPShufW2; | |
1577 | int fragmentLength; | |
1578 | ||
96034638 | 1579 | int xpos, i; |
d3f41512 MN |
1580 | |
1581 | old_s_xinc= s_xinc; | |
1582 | old_dstw= dstw; | |
1583 | ||
d3f41512 MN |
1584 | // create an optimized horizontal scaling routine |
1585 | ||
1586 | //code fragment | |
1587 | ||
d3f41512 MN |
1588 | asm volatile( |
1589 | "jmp 9f \n\t" | |
1590 | // Begin | |
1591 | "0: \n\t" | |
783e9cc9 | 1592 | "movq (%%esi), %%mm0 \n\t" //FIXME Alignment |
d3f41512 MN |
1593 | "movq %%mm0, %%mm1 \n\t" |
1594 | "psrlq $8, %%mm0 \n\t" | |
1595 | "punpcklbw %%mm7, %%mm1 \n\t" | |
783e9cc9 | 1596 | "movq %%mm2, %%mm3 \n\t" |
d3f41512 | 1597 | "punpcklbw %%mm7, %%mm0 \n\t" |
783e9cc9 | 1598 | "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF |
d3f41512 MN |
1599 | "pshufw $0xFF, %%mm1, %%mm1 \n\t" |
1600 | "1: \n\t" | |
783e9cc9 | 1601 | "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry |
d3f41512 MN |
1602 | "pshufw $0xFF, %%mm0, %%mm0 \n\t" |
1603 | "2: \n\t" | |
783e9cc9 | 1604 | "psrlw $9, %%mm3 \n\t" |
d3f41512 | 1605 | "psubw %%mm1, %%mm0 \n\t" |
783e9cc9 MN |
1606 | "pmullw %%mm3, %%mm0 \n\t" |
1607 | "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF | |
d3f41512 MN |
1608 | "psllw $7, %%mm1 \n\t" |
1609 | "paddw %%mm1, %%mm0 \n\t" | |
d3f41512 | 1610 | |
783e9cc9 | 1611 | "movq %%mm0, (%%edi, %%eax) \n\t" |
d3f41512 MN |
1612 | |
1613 | "addl $8, %%eax \n\t" | |
1614 | // End | |
1615 | "9: \n\t" | |
1616 | // "int $3\n\t" | |
1617 | "leal 0b, %0 \n\t" | |
1618 | "leal 1b, %1 \n\t" | |
1619 | "leal 2b, %2 \n\t" | |
1620 | "decl %1 \n\t" | |
1621 | "decl %2 \n\t" | |
1622 | "subl %0, %1 \n\t" | |
1623 | "subl %0, %2 \n\t" | |
1624 | "leal 9b, %3 \n\t" | |
1625 | "subl %0, %3 \n\t" | |
1626 | :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), | |
1627 | "=r" (fragmentLength) | |
1628 | ); | |
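	/* the leal/subl block converts label addresses into offsets: fragment
	   points at the template between labels 0 and 9, fragmentLength is its
	   size, and imm8OfPShufW1/2 are the offsets of the two pshufw immediate
	   bytes (the decl steps back from the label that sits just after each
	   imm8) */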
1629 | ||
0f25d72b | 1630 | xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers |
783e9cc9 MN |
1631 | |
1632 | /* choose xinc so that all 8 parts fit exactly | |
1633 | Note: we cannot use just 1 part because it would not fit in the code cache */ | |
0f25d72b | 1634 | // s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10; |
783e9cc9 MN |
1635 | // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); |
1636 | #ifdef ALT_ERROR | |
0f25d72b | 1637 | // s_xinc2_diff+= ((0x10000/(dstw/8))); |
783e9cc9 | 1638 | #endif |
0f25d72b | 1639 | // s_xinc_diff= s_xinc2_diff*2; |
783e9cc9 | 1640 | |
0f25d72b MN |
1641 | // s_xinc2+= s_xinc2_diff; |
1642 | // s_xinc+= s_xinc_diff; | |
d3fda508 | 1643 | |
0f25d72b | 1644 | // old_s_xinc= s_xinc; |
d3fda508 | 1645 | |
d3f41512 MN |
1646 | for(i=0; i<dstw/8; i++) |
1647 | { | |
783e9cc9 | 1648 | int xx=xpos>>16; |
d3f41512 MN |
1649 | |
1650 | if((i&3) == 0) | |
1651 | { | |
1652 | int a=0; | |
783e9cc9 MN |
1653 | int b=((xpos+s_xinc)>>16) - xx; |
1654 | int c=((xpos+s_xinc*2)>>16) - xx; | |
1655 | int d=((xpos+s_xinc*3)>>16) - xx; | |
d3f41512 MN |
1656 | |
1657 | memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | |
1658 | ||
1659 | funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= | |
1660 | funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= | |
1661 | a | (b<<2) | (c<<4) | (d<<6); | |
1662 | ||
d604bab9 MN |
1663 | // if we dont need to read 8 bytes than dont :), reduces the chance of |
1664 | // crossing a cache line | |
1665 | if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E; | |
1666 | ||
d3f41512 MN |
1667 | funnyYCode[fragmentLength*(i+4)/4]= RET; |
1668 | } | |
1669 | xpos+=s_xinc; | |
1670 | } | |
1671 | ||
0f25d72b | 1672 | xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chroma samples | |
d604bab9 | 1673 | for(i=0; i<dstUVw/8; i++) |
d3f41512 | 1674 | { |
783e9cc9 | 1675 | int xx=xpos>>16; |
d3f41512 MN |
1676 | |
1677 | if((i&3) == 0) | |
1678 | { | |
1679 | int a=0; | |
783e9cc9 MN |
1680 | int b=((xpos+s_xinc2)>>16) - xx; |
1681 | int c=((xpos+s_xinc2*2)>>16) - xx; | |
1682 | int d=((xpos+s_xinc2*3)>>16) - xx; | |
d3f41512 MN |
1683 | |
1684 | memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | |
1685 | ||
1686 | funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= | |
1687 | funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= | |
1688 | a | (b<<2) | (c<<4) | (d<<6); | |
1689 | ||
d604bab9 MN |
1690 | // if we dont need to read 8 bytes than dont :), reduces the chance of |
1691 | // crossing a cache line | |
1692 | if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E; | |
1693 | ||
d3f41512 MN |
1694 | funnyUVCode[fragmentLength*(i+4)/4]= RET; |
1695 | } | |
1696 | xpos+=s_xinc2; | |
1697 | } | |
1698 | // funnyCode[0]= RET; | |
d3f41512 | 1699 | } |
783e9cc9 | 1700 | |
783e9cc9 | 1701 | #endif // HAVE_MMX2 |
31190492 | 1702 | } // reset counters |
d3f41512 | 1703 | |
31190492 | 1704 | while(1){ |
38858470 MN |
1705 | unsigned char *dest =dstptr[0]+dststride*s_ypos; |
1706 | unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1); | |
1707 | unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1); | |
1708 | ||
783e9cc9 MN |
1709 | int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line |
1710 | // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src) | |
38858470 MN |
1711 | int srcuvpos= dstbpp==12 ? s_srcypos + s_yinc/2 - 0x8000 : |
1712 | s_srcypos - 0x8000; | |
783e9cc9 | 1713 | int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line |
d604bab9 MN |
1714 | int yalpha=((s_srcypos-1)&0xFFFF)>>4; |
1715 | int uvalpha=((srcuvpos-1)&0x1FFFF)>>5; | |
783e9cc9 MN |
1716 | uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice |
1717 | uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice | |
1718 | uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice | |
1719 | uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice | |
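	// the pix_buf_* arrays hold the two most recently scaled lines, indexed
	// by source-line parity: when y0/y1 advance by one, the old bottom line
	// becomes the new top line and only one fresh line has to be scaled;
	// yalpha/uvalpha are the 12-bit vertical blend weights between them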
31190492 | 1720 | |
783e9cc9 MN |
1721 | if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway | |
1722 | ||
38858470 | 1723 | if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line |
31190492 A |
1724 | |
1725 | s_ypos++; s_srcypos+=s_yinc; | |
1726 | ||
783e9cc9 | 1727 | //only interpolate the src line horizontally if we didnt do it allready |
2ff198c1 | 1728 | if(s_last_ypos!=y0) |
783e9cc9 | 1729 | { |
2ff198c1 MN |
1730 | unsigned char *src; |
1731 | // skip if first line has been horiz scaled already | |
1732 | if(s_last_ypos != y0-1) | |
1733 | { | |
1734 | // check if first line is before any available src lines | |
1735 | if(y0-1 < y) src=srcptr[0]+(0 )*stride[0]; | |
1736 | else src=srcptr[0]+(y0-y-1)*stride[0]; | |
d3f41512 | 1737 | |
2ff198c1 MN |
1738 | hyscale(buf0, dstw, src, srcWidth, s_xinc); |
1739 | } | |
1740 | // check if second line is after any available src lines | |
1741 | if(y0-y >= h) src=srcptr[0]+(h-1)*stride[0]; | |
1742 | else src=srcptr[0]+(y0-y)*stride[0]; | |
d3f41512 | 1743 | |
2ff198c1 MN |
1744 | // the MIN() is required to avoid reusing lines which were not available | |
1745 | s_last_ypos= MIN(y0, y+h-1); | |
1746 | hyscale(buf1, dstw, src, srcWidth, s_xinc); | |
1747 | } | |
1748 | // printf("%d %d %d %d\n", y, y1, s_last_y1pos, h); | |
31190492 | 1749 | // *** horizontal scale U and V lines to temp buffer |
2ff198c1 | 1750 | if(s_last_y1pos!=y1) |
783e9cc9 | 1751 | { |
2ff198c1 MN |
1752 | uint8_t *src1, *src2; |
1753 | // skip if first line has been horiz scaled already | |
1754 | if(s_last_y1pos != y1-1) | |
0f25d72b | 1755 | { |
2ff198c1 MN |
1756 | // check if first line is before any available src lines |
1757 | if(y1-y/2-1 < 0) | |
1758 | { | |
1759 | src1= srcptr[1]+(0)*stride[1]; | |
1760 | src2= srcptr[2]+(0)*stride[2]; | |
1761 | }else{ | |
1762 | src1= srcptr[1]+(y1-y/2-1)*stride[1]; | |
1763 | src2= srcptr[2]+(y1-y/2-1)*stride[2]; | |
1764 | } | |
1765 | hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2); | |
0f25d72b | 1766 | } |
d3f41512 | 1767 | |
2ff198c1 MN |
1768 | // check if second line is after any available src lines |
1769 | if(y1 - y/2 >= h/2) | |
1770 | { | |
1771 | src1= srcptr[1]+(h/2-1)*stride[1]; | |
1772 | src2= srcptr[2]+(h/2-1)*stride[2]; | |
1773 | }else{ | |
1774 | src1= srcptr[1]+(y1-y/2)*stride[1]; | |
1775 | src2= srcptr[2]+(y1-y/2)*stride[2]; | |
1776 | } | |
1777 | hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2); | |
d3f41512 | 1778 | |
2ff198c1 MN |
1779 | // the MIN() is required to avoid reusing lines which were not available | |
1780 | s_last_y1pos= MIN(y1, y/2+h/2-1); | |
84adc106 | 1781 | } |
d8fa3c54 MN |
1782 | #ifdef HAVE_MMX |
1783 | b5Dither= dither8[s_ypos&1]; | |
1784 | g6Dither= dither4[s_ypos&1]; | |
1785 | g5Dither= dither8[s_ypos&1]; | |
1786 | r5Dither= dither8[(s_ypos+1)&1]; | |
1787 | #endif | |
2ff198c1 | 1788 | |
38858470 MN |
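	// pick the output path: YV12 (dstbpp==12) needs no colorspace
	// conversion; if the vertical scale is (almost) exactly 1:1 the
	// cheaper yuv2rgb1 variant without vertical interpolation is used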
1789 | if(dstbpp==12) //YV12 |
1790 | yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha); | |
1791 | else if(ABS(s_yinc - 0x10000) < 10) | |
d604bab9 MN |
1792 | yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); |
1793 | else | |
1794 | yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); | |
31190492 | 1795 | } |
17f715fa MN |
1796 | |
1797 | #ifdef HAVE_MMX | |
1798 | __asm __volatile(SFENCE:::"memory"); | |
1faf0867 | 1799 | __asm __volatile(EMMS:::"memory"); |
17f715fa | 1800 | #endif |
31190492 A |
1801 | } |
1802 | ||
1803 | ||
1804 | void SwScale_Init(){ | |
1805 | // generating tables: | |
1806 | int i; | |
1807 | for(i=0;i<256;i++){ | |
1808 | clip_table[i]=0; | |
1809 | clip_table[i+256]=i; | |
1810 | clip_table[i+512]=255; | |
1811 | yuvtab_2568[i]=(0x2568*(i-16))+(256<<13); | |
1812 | yuvtab_3343[i]=0x3343*(i-128); | |
1813 | yuvtab_0c92[i]=-0x0c92*(i-128); | |
1814 | yuvtab_1a1e[i]=-0x1a1e*(i-128); | |
1815 | yuvtab_40cf[i]=0x40cf*(i-128); | |
1816 | } | |
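	// clip_table[256+v] clamps v in [-256,511] to [0,255]; the (256<<13)
	// added to yuvtab_2568 folds that +256 bias into the luma term, so the
	// (luma+chroma)>>13 sums can index clip_table directly.  The yuvtab_*
	// constants are the usual YCbCr->RGB coefficients (~1.16, 1.60, 2.02,
	// 0.39, 0.81) scaled by 2^13.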
1817 | ||
b18ea156 MN |
1818 | for(i=0; i<768; i++) |
1819 | { | |
1820 | int v= clip_table[i]; | |
1821 | clip_table16b[i]= v>>3; | |
1822 | clip_table16g[i]= (v<<3)&0x07E0; | |
1823 | clip_table16r[i]= (v<<8)&0xF800; | |
1824 | clip_table15b[i]= v>>3; | |
1825 | clip_table15g[i]= (v<<2)&0x03E0; | |
1826 | clip_table15r[i]= (v<<7)&0x7C00; | |
1827 | } | |
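	/* each clipped channel is pre-shifted into its RGB565/RGB555 bit field,
	   so a 16bpp pixel can be assembled with three lookups and ORs,
	   clipping included -- sketch, with r/g/b being the biased 0..767 sums:

		uint16_t pix= clip_table16r[r] | clip_table16g[g] | clip_table16b[b];
	*/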
1828 | ||
31190492 | 1829 | } |