
// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

#include <inttypes.h>
#include <string.h>
#include "../config.h"
#include "swscale.h"
#include "../mmx_defs.h"
#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32
the BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
bicubic scaler
dither in C
change the distance of the u & v buffer
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif


#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};
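
// Descriptive note (not from the original comments): dither4/dither8 hold two
// rows of ordered-dither biases that are added (saturating, via paddusb) to
// the 8-bit R/G/B values before truncation to 5 or 6 bits; the row alternates
// per output line via s_ypos&1 further below, spreading the truncation error
// between neighbouring lines. Illustrative arithmetic for a 5-bit channel:
// (0x7B + 0x06)>>3 = 16 on one line, (0x7B + 0x00)>>3 = 15 on the next,
// which averages close to the ideal 0x7B/8 = 15.375.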

static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

#ifdef HAVE_MMX2
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif
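
// Descriptive note: funnyYCode/funnyUVCode are filled at runtime. A template
// fragment of MMX2 code (emitted once in SwScale_YV12slice below) is memcpy'd
// into these buffers, the immediates of its pshufw instructions are patched
// per output position, a RET opcode is appended, and the result is invoked
// with "call funnyYCode" -- a small run-time-generated horizontal scaler.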

static int canMMX2BeUsed=0;

#define FULL_YSCALEYUV2RGB \
	"pxor %%mm7, %%mm7 \n\t"\
	"movd %6, %%mm6 \n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6 \n\t"\
	"punpcklwd %%mm6, %%mm6 \n\t"\
	"movd %7, %%mm5 \n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"xorl %%eax, %%eax \n\t"\
	".balign 16 \n\t"\
	"1: \n\t"\
	"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
	"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
	"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
	"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
	"psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
	"pmulhw yCoeff, %%mm1 \n\t"\
\
\
	"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
	"pmulhw ubCoeff, %%mm3 \n\t"\
	"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"pmulhw ugCoeff, %%mm2 \n\t"\
	"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw w400, %%mm0 \n\t" /* (V-128)8*/\
\
\
	"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
	"pmulhw vrCoeff, %%mm0 \n\t"\
	"pmulhw vgCoeff, %%mm4 \n\t"\
	"paddw %%mm1, %%mm3 \n\t" /* B*/\
	"paddw %%mm1, %%mm0 \n\t" /* R*/\
	"packuswb %%mm3, %%mm3 \n\t"\
\
	"packuswb %%mm0, %%mm0 \n\t"\
	"paddw %%mm4, %%mm2 \n\t"\
	"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
	"packuswb %%mm1, %%mm1 \n\t"

#define YSCALEYUV2RGB \
	"movd %6, %%mm6 \n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6 \n\t"\
	"punpcklwd %%mm6, %%mm6 \n\t"\
	"movq %%mm6, asm_yalpha1 \n\t"\
	"movd %7, %%mm5 \n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"movq %%mm5, asm_uvalpha1 \n\t"\
	"xorl %%eax, %%eax \n\t"\
	".balign 16 \n\t"\
	"1: \n\t"\
	"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq asm_uvalpha1, %%mm0 \n\t"\
	"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
	"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
	"pmulhw ugCoeff, %%mm3 \n\t"\
	"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
	"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
	"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"pmulhw ubCoeff, %%mm2 \n\t"\
	"pmulhw vrCoeff, %%mm5 \n\t"\
	"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
	"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
	"pmulhw yCoeff, %%mm1 \n\t"\
	"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4 \n\t"\
	"movq %%mm2, %%mm0 \n\t"\
	"movq %%mm5, %%mm6 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
	"punpcklwd %%mm2, %%mm2 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm4, %%mm4 \n\t"\
	"paddw %%mm1, %%mm2 \n\t"\
	"paddw %%mm1, %%mm5 \n\t"\
	"paddw %%mm1, %%mm4 \n\t"\
	"punpckhwd %%mm0, %%mm0 \n\t"\
	"punpckhwd %%mm6, %%mm6 \n\t"\
	"punpckhwd %%mm3, %%mm3 \n\t"\
	"paddw %%mm7, %%mm0 \n\t"\
	"paddw %%mm7, %%mm6 \n\t"\
	"paddw %%mm7, %%mm3 \n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2 \n\t"\
	"packuswb %%mm6, %%mm5 \n\t"\
	"packuswb %%mm3, %%mm4 \n\t"\
	"pxor %%mm7, %%mm7 \n\t"

#define YSCALEYUV2RGB1 \
	"xorl %%eax, %%eax \n\t"\
	".balign 16 \n\t"\
	"1: \n\t"\
	"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
	"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
	"pmulhw ugCoeff, %%mm3 \n\t"\
	"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw ubCoeff, %%mm2 \n\t"\
	"pmulhw vrCoeff, %%mm5 \n\t"\
	"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
	"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
	"pmulhw yCoeff, %%mm1 \n\t"\
	"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4 \n\t"\
	"movq %%mm2, %%mm0 \n\t"\
	"movq %%mm5, %%mm6 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
	"punpcklwd %%mm2, %%mm2 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm4, %%mm4 \n\t"\
	"paddw %%mm1, %%mm2 \n\t"\
	"paddw %%mm1, %%mm5 \n\t"\
	"paddw %%mm1, %%mm4 \n\t"\
	"punpckhwd %%mm0, %%mm0 \n\t"\
	"punpckhwd %%mm6, %%mm6 \n\t"\
	"punpckhwd %%mm3, %%mm3 \n\t"\
	"paddw %%mm7, %%mm0 \n\t"\
	"paddw %%mm7, %%mm6 \n\t"\
	"paddw %%mm7, %%mm3 \n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2 \n\t"\
	"packuswb %%mm6, %%mm5 \n\t"\
	"packuswb %%mm3, %%mm4 \n\t"\
	"pxor %%mm7, %%mm7 \n\t"

// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
	"xorl %%eax, %%eax \n\t"\
	".balign 16 \n\t"\
	"1: \n\t"\
	"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $5, %%mm3 \n\t"\
	"psrlw $5, %%mm4 \n\t"\
	"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
	"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
	"pmulhw ugCoeff, %%mm3 \n\t"\
	"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
	"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw ubCoeff, %%mm2 \n\t"\
	"pmulhw vrCoeff, %%mm5 \n\t"\
	"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
	"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
	"pmulhw yCoeff, %%mm1 \n\t"\
	"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4 \n\t"\
	"movq %%mm2, %%mm0 \n\t"\
	"movq %%mm5, %%mm6 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
	"punpcklwd %%mm2, %%mm2 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm4, %%mm4 \n\t"\
	"paddw %%mm1, %%mm2 \n\t"\
	"paddw %%mm1, %%mm5 \n\t"\
	"paddw %%mm1, %%mm4 \n\t"\
	"punpckhwd %%mm0, %%mm0 \n\t"\
	"punpckhwd %%mm6, %%mm6 \n\t"\
	"punpckhwd %%mm3, %%mm3 \n\t"\
	"paddw %%mm7, %%mm0 \n\t"\
	"paddw %%mm7, %%mm6 \n\t"\
	"paddw %%mm7, %%mm3 \n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2 \n\t"\
	"packuswb %%mm6, %%mm5 \n\t"\
	"packuswb %%mm3, %%mm4 \n\t"\
	"pxor %%mm7, %%mm7 \n\t"

#define WRITEBGR32 \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	MOVNTQ(%%mm0, (%4, %%eax, 4))\
	MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
	MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
	MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#define WRITEBGR16 \
	"pand bF8, %%mm2 \n\t" /* B */\
	"pand bFC, %%mm4 \n\t" /* G */\
	"pand bF8, %%mm5 \n\t" /* R */\
	"psrlq $3, %%mm2 \n\t"\
\
	"movq %%mm2, %%mm1 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
\
	"punpcklbw %%mm7, %%mm3 \n\t"\
	"punpcklbw %%mm5, %%mm2 \n\t"\
	"punpckhbw %%mm7, %%mm4 \n\t"\
	"punpckhbw %%mm5, %%mm1 \n\t"\
\
	"psllq $3, %%mm3 \n\t"\
	"psllq $3, %%mm4 \n\t"\
\
	"por %%mm3, %%mm2 \n\t"\
	"por %%mm4, %%mm1 \n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#define WRITEBGR15 \
	"pand bF8, %%mm2 \n\t" /* B */\
	"pand bF8, %%mm4 \n\t" /* G */\
	"pand bF8, %%mm5 \n\t" /* R */\
	"psrlq $3, %%mm2 \n\t"\
	"psrlq $1, %%mm5 \n\t"\
\
	"movq %%mm2, %%mm1 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
\
	"punpcklbw %%mm7, %%mm3 \n\t"\
	"punpcklbw %%mm5, %%mm2 \n\t"\
	"punpckhbw %%mm7, %%mm4 \n\t"\
	"punpckhbw %%mm5, %%mm1 \n\t"\
\
	"psllq $2, %%mm3 \n\t"\
	"psllq $2, %%mm4 \n\t"\
\
	"por %%mm3, %%mm2 \n\t"\
	"por %%mm4, %%mm1 \n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#define WRITEBGR24OLD \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
	"pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
	"pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
	"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
	"pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
	"pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
	"pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
	"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
	"pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
	"pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
	MOVNTQ(%%mm0, (%%ebx))\
	MOVNTQ(%%mm2, 8(%%ebx))\
	MOVNTQ(%%mm3, 16(%%ebx))\
	"addl $24, %%ebx \n\t"\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#define WRITEBGR24MMX \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
	"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (%%ebx))\
\
	"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
	MOVNTQ(%%mm5, 16(%%ebx))\
\
	"addl $24, %%ebx \n\t"\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#define WRITEBGR24MMX2 \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq M24A, %%mm0 \n\t"\
	"movq M24C, %%mm7 \n\t"\
	"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
	"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
	"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
	"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
	"por %%mm1, %%mm6 \n\t"\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, (%%ebx))\
\
	"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
	"pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\
	"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
	"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
	"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
	"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
	"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
	"pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\
\
	"por %%mm1, %%mm3 \n\t"\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, 16(%%ebx))\
\
	"addl $24, %%ebx \n\t"\
\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"

#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif
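
// Descriptive note: with MMX2 the MOVNTQ macro expands to movntq, a
// non-temporal store that bypasses the cache; this is why SwScale_YV12slice()
// issues an SFENCE before EMMS at the end of each slice, so the weakly
// ordered stores are globally visible before the routine returns.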

#ifdef HAVE_MMX
// references the asm-only constants once so the compiler neither warns about
// nor discards these seemingly unused statics
void in_asm_used_var_warning_killer()
{
	int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
	bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
	M24A+M24B+M24C;
	if(i) i=0;
}
#endif

static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			   uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

	asm volatile ("\n\t"::: "memory");

	for(i=0;i<dstw;i++)
	{
		((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
	}

	if(uvalpha != -1)
	{
		for(i=0; i<(dstw>>1); i++)
		{
			((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
			((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
		}
	}
}
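
/* A plain-C sketch of the blend above (illustrative only, not used by the
   scaler): yalpha is a 12-bit weight and the line buffers hold luma scaled
   by 128 (<<7), so the >>19 drops the 12+7 fractional bits back to 8-bit
   range.

	static inline uint8_t blend19(uint16_t a, uint16_t b, int alpha)
	{
		return (a*(alpha^4095) + b*alpha)>>19; // alpha in 0..4095
	}

   e.g. a=100<<7, b=200<<7, alpha=2048: (12800*2047 + 25600*2048)>>19 = 149,
   i.e. roughly halfway between 100 and 200. */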

/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;

	if(fullUVIpol)
	{
#ifdef HAVE_MMX
		if(dstbpp == 32)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

			"movq %%mm3, %%mm1 \n\t"
			"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%eax, 4))
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(

FULL_YSCALEYUV2RGB

			// lsb ... msb
			"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

			"movq %%mm3, %%mm1 \n\t"
			"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

			"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
			"psrlq $8, %%mm3 \n\t" // GR0BGR00
			"pand bm00000111, %%mm2 \n\t" // BGR00000
			"pand bm11111000, %%mm3 \n\t" // 000BGR00
			"por %%mm2, %%mm3 \n\t" // BGRBGR00
			"movq %%mm1, %%mm2 \n\t"
			"psllq $48, %%mm1 \n\t" // 000000BG
			"por %%mm1, %%mm3 \n\t" // BGRBGRBG

			"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
			"psrld $16, %%mm2 \n\t" // R000R000
			"psrlq $24, %%mm1 \n\t" // 0BGR0000
			"por %%mm2, %%mm1 \n\t" // RBGRR000

			"movl %4, %%ebx \n\t"
			"addl %%eax, %%ebx \n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
			"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
			"psrlq $32, %%mm3 \n\t"
			"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
			"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb g5Dither, %%mm1 \n\t"
			"paddusb r5Dither, %%mm0 \n\t"
			"paddusb b5Dither, %%mm3 \n\t"
#endif
			"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

			"psrlw $3, %%mm3 \n\t"
			"psllw $2, %%mm1 \n\t"
			"psllw $7, %%mm0 \n\t"
			"pand g15Mask, %%mm1 \n\t"
			"pand r15Mask, %%mm0 \n\t"

			"por %%mm3, %%mm1 \n\t"
			"por %%mm1, %%mm0 \n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb g6Dither, %%mm1 \n\t"
			"paddusb r5Dither, %%mm0 \n\t"
			"paddusb b5Dither, %%mm3 \n\t"
#endif
			"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

			"psrlw $3, %%mm3 \n\t"
			"psllw $3, %%mm1 \n\t"
			"psllw $8, %%mm0 \n\t"
			"pand g16Mask, %%mm1 \n\t"
			"pand r16Mask, %%mm0 \n\t"

			"por %%mm3, %%mm1 \n\t"
			"por %%mm1, %%mm0 \n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		asm volatile ("\n\t"::: "memory");

		if(dstbpp==32 || dstbpp==24)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+=dstbpp>>3;
			}
		}
		else if(dstbpp==16)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstbpp==15)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
#endif
	}//FULL_UV_IPOL
	else
	{
#ifdef HAVE_MMX
		if(dstbpp == 32)
		{
			asm volatile(
				YSCALEYUV2RGB
				WRITEBGR32

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(
				"movl %4, %%ebx \n\t"
				YSCALEYUV2RGB
				WRITEBGR24

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(
				YSCALEYUV2RGB
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g5Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif

				WRITEBGR15

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(
				YSCALEYUV2RGB
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g6Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif

				WRITEBGR16

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		asm volatile ("\n\t"::: "memory");

		if(dstbpp==32)
		{
			int i;
			for(i=0; i<dstw-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
				dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
				dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

				dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
				dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
				dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
			}
		}
		if(dstbpp==24)
		{
			int i;
			for(i=0; i<dstw-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[0]=clip_table[((Y1 + Cb) >>13)];
				dest[1]=clip_table[((Y1 + Cg) >>13)];
				dest[2]=clip_table[((Y1 + Cr) >>13)];

				dest[3]=clip_table[((Y2 + Cb) >>13)];
				dest[4]=clip_table[((Y2 + Cg) >>13)];
				dest[5]=clip_table[((Y2 + Cr) >>13)];
				dest+=6;
			}
		}
		else if(dstbpp==16)
		{
			int i;
			for(i=0; i<dstw-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table16b[(Y1 + Cb) >>13] |
					clip_table16g[(Y1 + Cg) >>13] |
					clip_table16r[(Y1 + Cr) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table16b[(Y2 + Cb) >>13] |
					clip_table16g[(Y2 + Cg) >>13] |
					clip_table16r[(Y2 + Cr) >>13];
			}
		}
		else if(dstbpp==15)
		{
			int i;
			for(i=0; i<dstw-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table15b[(Y1 + Cb) >>13] |
					clip_table15g[(Y1 + Cg) >>13] |
					clip_table15r[(Y1 + Cr) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table15b[(Y2 + Cb) >>13] |
					clip_table15g[(Y2 + Cg) >>13] |
					clip_table15r[(Y2 + Cr) >>13];
			}
		}
#endif
	} //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
	int uvalpha1=uvalpha^4095;
#ifdef HAVE_MMX
	int yalpha1=yalpha^4095;
#endif

	if(fullUVIpol || allwaysIpol)
	{
		yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
		return;
	}
	if( yalpha > 2048 ) buf0 = buf1;

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
	{
		if(dstbpp == 32)
		{
			asm volatile(
				YSCALEYUV2RGB1
				WRITEBGR32
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(
				"movl %4, %%ebx \n\t"
				YSCALEYUV2RGB1
				WRITEBGR24
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(
				YSCALEYUV2RGB1
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g5Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(
				YSCALEYUV2RGB1
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g6Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
	}
	else
	{
		if(dstbpp == 32)
		{
			asm volatile(
				YSCALEYUV2RGB1b
				WRITEBGR32
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(
				"movl %4, %%ebx \n\t"
				YSCALEYUV2RGB1b
				WRITEBGR24
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(
				YSCALEYUV2RGB1b
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g5Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(
				YSCALEYUV2RGB1b
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb b5Dither, %%mm2 \n\t"
				"paddusb g6Dither, %%mm4 \n\t"
				"paddusb r5Dither, %%mm5 \n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
	}
#else
//FIXME write 2 versions (for even & odd lines)
	asm volatile ("\n\t"::: "memory");

	if(dstbpp==32)
	{
		int i;
		for(i=0; i<dstw-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
			dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
			dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

			dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
			dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
			dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
		}
	}
	if(dstbpp==24)
	{
		int i;
		for(i=0; i<dstw-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			dest[0]=clip_table[((Y1 + Cb) >>13)];
			dest[1]=clip_table[((Y1 + Cg) >>13)];
			dest[2]=clip_table[((Y1 + Cr) >>13)];

			dest[3]=clip_table[((Y2 + Cb) >>13)];
			dest[4]=clip_table[((Y2 + Cg) >>13)];
			dest[5]=clip_table[((Y2 + Cr) >>13)];
			dest+=6;
		}
	}
	else if(dstbpp==16)
	{
		int i;
		for(i=0; i<dstw-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			((uint16_t*)dest)[i] =
				clip_table16b[(Y1 + Cb) >>13] |
				clip_table16g[(Y1 + Cg) >>13] |
				clip_table16r[(Y1 + Cr) >>13];

			((uint16_t*)dest)[i+1] =
				clip_table16b[(Y2 + Cb) >>13] |
				clip_table16g[(Y2 + Cg) >>13] |
				clip_table16r[(Y2 + Cr) >>13];
		}
	}
	else if(dstbpp==15)
	{
		int i;
		for(i=0; i<dstw-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			((uint16_t*)dest)[i] =
				clip_table15b[(Y1 + Cb) >>13] |
				clip_table15g[(Y1 + Cg) >>13] |
				clip_table15r[(Y1 + Cr) >>13];

			((uint16_t*)dest)[i+1] =
				clip_table15b[(Y2 + Cb) >>13] |
				clip_table15g[(Y2 + Cg) >>13] |
				clip_table15r[(Y2 + Cr) >>13];
		}
	}
#endif
}

static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
{
	// *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
			"movd %5, %%mm6 \n\t" // xInc&0xFFFF
			"punpcklwd %%mm6, %%mm6 \n\t"
			"punpcklwd %%mm6, %%mm6 \n\t"
			"movq %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t"
			"paddw %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t"
			"paddw %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
			"movq %%mm2, temp0 \n\t"
			"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
			"punpcklwd %%mm6, %%mm6 \n\t"
			"punpcklwd %%mm6, %%mm6 \n\t"
			"xorl %%eax, %%eax \n\t" // i
			"movl %0, %%esi \n\t" // src
			"movl %1, %%edi \n\t" // buf1
			"movl %3, %%edx \n\t" // (xInc*4)>>16
			"xorl %%ecx, %%ecx \n\t"
			"xorl %%ebx, %%ebx \n\t"
			"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
			PREFETCH" 1024(%%esi) \n\t"\
			PREFETCH" 1056(%%esi) \n\t"\
			PREFETCH" 1088(%%esi) \n\t"\
			"call funnyYCode \n\t"\
			"movq temp0, %%mm2 \n\t"\
			"xorl %%ecx, %%ecx \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
			"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
	}
	else
	{
#endif
	//NO MMX just normal asm ...
	asm volatile(
		"xorl %%eax, %%eax \n\t" // i
		"xorl %%ebx, %%ebx \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		".balign 16 \n\t"
		"1: \n\t"
		"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%edi, %%eax, 2) \n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 2(%%edi, %%eax, 2) \n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry


		"addl $2, %%eax \n\t"
		"cmpl %2, %%eax \n\t"
		" jb 1b \n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
}

inline static void hcscale(uint16_t *dst, int dstWidth,
			   uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
			"movd %5, %%mm6 \n\t" // xInc&0xFFFF
			"punpcklwd %%mm6, %%mm6 \n\t"
			"punpcklwd %%mm6, %%mm6 \n\t"
			"movq %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t"
			"paddw %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t"
			"paddw %%mm6, %%mm2 \n\t"
			"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
			"movq %%mm2, temp0 \n\t"
			"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
			"punpcklwd %%mm6, %%mm6 \n\t"
			"punpcklwd %%mm6, %%mm6 \n\t"
			"xorl %%eax, %%eax \n\t" // i
			"movl %0, %%esi \n\t" // src
			"movl %1, %%edi \n\t" // buf1
			"movl %3, %%edx \n\t" // (xInc*4)>>16
			"xorl %%ecx, %%ecx \n\t"
			"xorl %%ebx, %%ebx \n\t"
			"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
			PREFETCH" 1024(%%esi) \n\t"\
			PREFETCH" 1056(%%esi) \n\t"\
			PREFETCH" 1088(%%esi) \n\t"\
			"call funnyUVCode \n\t"\
			"movq temp0, %%mm2 \n\t"\
			"xorl %%ecx, %%ecx \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

			"xorl %%eax, %%eax \n\t" // i
			"movl %6, %%esi \n\t" // src
			"movl %1, %%edi \n\t" // buf1
			"addl $4096, %%edi \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

			:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
			"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
		{
			dst[i] = src1[srcWidth/2-1]*128;
			dst[i+2048] = src2[srcWidth/2-1]*128;
		}
	}
	else
	{
#endif
	asm volatile(
		"xorl %%eax, %%eax \n\t" // i
		"xorl %%ebx, %%ebx \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		".balign 16 \n\t"
		"1: \n\t"
		"movl %0, %%esi \n\t"
		"movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%edi, %%eax, 2) \n\t"

		"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 4096(%%edi, %%eax, 2)\n\t"

		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
		"addl $1, %%eax \n\t"
		"cmpl %2, %%eax \n\t"
		" jb 1b \n\t"

		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
		"r" (src2)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
	dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
}
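
// Descriptive note on the C fallback above: xalpha is a 7-bit fraction, so
// xalpha^127 equals 127-xalpha and src1[xx]*(127-xalpha) + src1[xx+1]*xalpha
// is a linear blend scaled by roughly 128, matching the <<7 scale used
// elsewhere. Illustrative numbers: src1[xx]=10, src1[xx+1]=20, xalpha=64
// gives 10*63 + 20*64 = 1910, close to 15<<7 = 1920.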

// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
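// e.g. (illustrative numbers) scaling 720 source pixels to 1024 destination
// pixels gives s_xinc = (720<<16)/1024 = 0xB400, i.e. 0.703125 source pixels
// per destination pixel in 16.16 fixed point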
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
		       uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
		       unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int srcWidth;
int dstUVw;
int i;

if(((dstw + 7)&(~7)) >= dststride) dstw&= ~7;

srcWidth= (dstw*s_xinc + 0x8000)>>16;
dstUVw= fullUVIpol ? dstw : dstw/2;

#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
#endif

// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// FIXME this is not perfect, but no one should notice the difference, the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(canMMX2BeUsed)	s_xinc+= 20;
else			s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol && !(dstbpp==12))	s_xinc2= s_xinc>>1;
else				s_xinc2= s_xinc;
  // force calculation of the horizontal interpolation of the first line

  if(y==0){
//	printf("dstw %d, srcw %d, mmx2 %d\n", dstw, srcWidth, canMMX2BeUsed);
	s_last_ypos=-99;
	s_last_y1pos=-99;
	s_srcypos= s_yinc/2 - 0x8000;
	s_ypos=0;

	// clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
	for(i=dstw-2; i<dstw+20; i++)
	{
		pix_buf_uv[0][i] = pix_buf_uv[1][i]
		= pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128*128;
		pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
		= pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128;
		pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
	}

#ifdef HAVE_MMX2
// can't downscale !!!
	if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
	{
		uint8_t *fragment;
		int imm8OfPShufW1;
		int imm8OfPShufW2;
		int fragmentLength;

		int xpos, i;

		old_s_xinc= s_xinc;
		old_dstw= dstw;

		// create an optimized horizontal scaling routine

		//code fragment

		asm volatile(
			"jmp 9f \n\t"
		// Begin
			"0: \n\t"
			"movq (%%esi), %%mm0 \n\t" //FIXME Alignment
			"movq %%mm0, %%mm1 \n\t"
			"psrlq $8, %%mm0 \n\t"
			"punpcklbw %%mm7, %%mm1 \n\t"
			"movq %%mm2, %%mm3 \n\t"
			"punpcklbw %%mm7, %%mm0 \n\t"
			"addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
			"pshufw $0xFF, %%mm1, %%mm1 \n\t"
			"1: \n\t"
			"adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry
			"pshufw $0xFF, %%mm0, %%mm0 \n\t"
			"2: \n\t"
			"psrlw $9, %%mm3 \n\t"
			"psubw %%mm1, %%mm0 \n\t"
			"pmullw %%mm3, %%mm0 \n\t"
			"paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
			"psllw $7, %%mm1 \n\t"
			"paddw %%mm1, %%mm0 \n\t"

			"movq %%mm0, (%%edi, %%eax) \n\t"

			"addl $8, %%eax \n\t"
		// End
			"9: \n\t"
//			"int $3\n\t"
			"leal 0b, %0 \n\t"
			"leal 1b, %1 \n\t"
			"leal 2b, %2 \n\t"
			"decl %1 \n\t"
			"decl %2 \n\t"
			"subl %0, %1 \n\t"
			"subl %0, %2 \n\t"
			"leal 9b, %3 \n\t"
			"subl %0, %3 \n\t"
			:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
			"=r" (fragmentLength)
		);

		xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers

		/* choose xinc so that all 8 parts fit exactly
		   Note: we cannot use just 1 part because it would not fit in the code cache */
//		s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
//		s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
#ifdef ALT_ERROR
//		s_xinc2_diff+= ((0x10000/(dstw/8)));
#endif
//		s_xinc_diff= s_xinc2_diff*2;

//		s_xinc2+= s_xinc2_diff;
//		s_xinc+= s_xinc_diff;

//		old_s_xinc= s_xinc;

		for(i=0; i<dstw/8; i++)
		{
			int xx=xpos>>16;

			if((i&3) == 0)
			{
				int a=0;
				int b=((xpos+s_xinc)>>16) - xx;
				int c=((xpos+s_xinc*2)>>16) - xx;
				int d=((xpos+s_xinc*3)>>16) - xx;

				memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

				funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
				funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
					a | (b<<2) | (c<<4) | (d<<6);

				// if we don't need to read 8 bytes then don't :), reduces the chance of
				// crossing a cache line
				if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;

				funnyYCode[fragmentLength*(i+4)/4]= RET;
			}
			xpos+=s_xinc;
		}

		xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
		for(i=0; i<dstUVw/8; i++)
		{
			int xx=xpos>>16;

			if((i&3) == 0)
			{
				int a=0;
				int b=((xpos+s_xinc2)>>16) - xx;
				int c=((xpos+s_xinc2*2)>>16) - xx;
				int d=((xpos+s_xinc2*3)>>16) - xx;

				memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

				funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
				funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
					a | (b<<2) | (c<<4) | (d<<6);

				// if we don't need to read 8 bytes then don't :), reduces the chance of
				// crossing a cache line
				if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

				funnyUVCode[fragmentLength*(i+4)/4]= RET;
			}
			xpos+=s_xinc2;
		}
//		funnyCode[0]= RET;
	}

#endif // HAVE_MMX2
  } // reset counters

  while(1){
	unsigned char *dest =dstptr[0]+dststride*s_ypos;
	unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
	unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);

	int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
		// points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
	int srcuvpos= dstbpp==12 ?	s_srcypos + s_yinc/2 - 0x8000 :
					s_srcypos - 0x8000;
	int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
	int yalpha=((s_srcypos-1)&0xFFFF)>>4;
	int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
	uint16_t *buf0=pix_buf_y[y0&1];		// top line of the interpolated slice
	uint16_t *buf1=pix_buf_y[((y0+1)&1)];	// bottom line of the interpolated slice
	uint16_t *uvbuf0=pix_buf_uv[y1&1];	// top line of the interpolated slice
	uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];	// bottom line of the interpolated slice

	if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway

	if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line

	s_ypos++; s_srcypos+=s_yinc;

	//only interpolate the src line horizontally if we didn't do it already
	if(s_last_ypos!=y0)
	{
		unsigned char *src;
		// skip if first line has been horiz scaled already
		if(s_last_ypos != y0-1)
		{
			// check if first line is before any available src lines
			if(y0-1 < y)	src=srcptr[0]+(0     )*stride[0];
			else		src=srcptr[0]+(y0-y-1)*stride[0];

			hyscale(buf0, dstw, src, srcWidth, s_xinc);
		}
		// check if second line is after any available src lines
		if(y0-y >= h)	src=srcptr[0]+(h-1)*stride[0];
		else		src=srcptr[0]+(y0-y)*stride[0];

		// the min() is required to avoid reusing lines which were not available
		s_last_ypos= MIN(y0, y+h-1);
		hyscale(buf1, dstw, src, srcWidth, s_xinc);
	}
//	printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
	// *** horizontal scale U and V lines to temp buffer
	if(s_last_y1pos!=y1)
	{
		uint8_t *src1, *src2;
		// skip if first line has been horiz scaled already
		if(s_last_y1pos != y1-1)
		{
			// check if first line is before any available src lines
			if(y1-y/2-1 < 0)
			{
				src1= srcptr[1]+(0)*stride[1];
				src2= srcptr[2]+(0)*stride[2];
			}else{
				src1= srcptr[1]+(y1-y/2-1)*stride[1];
				src2= srcptr[2]+(y1-y/2-1)*stride[2];
			}
			hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
		}

		// check if second line is after any available src lines
		if(y1 - y/2 >= h/2)
		{
			src1= srcptr[1]+(h/2-1)*stride[1];
			src2= srcptr[2]+(h/2-1)*stride[2];
		}else{
			src1= srcptr[1]+(y1-y/2)*stride[1];
			src2= srcptr[2]+(y1-y/2)*stride[2];
		}
		hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);

		// the min() is required to avoid reusing lines which were not available
		s_last_y1pos= MIN(y1, y/2+h/2-1);
	}
#ifdef HAVE_MMX
	b5Dither= dither8[s_ypos&1];
	g6Dither= dither4[s_ypos&1];
	g5Dither= dither8[s_ypos&1];
	r5Dither= dither8[(s_ypos+1)&1];
#endif

	if(dstbpp==12) //YV12
		yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
	else if(ABS(s_yinc - 0x10000) < 10)
		yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
	else
		yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
  }

#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
}


void SwScale_Init(){
	// generating tables:
	int i;
	for(i=0;i<256;i++){
		clip_table[i]=0;
		clip_table[i+256]=i;
		clip_table[i+512]=255;
		yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
		yuvtab_3343[i]=0x3343*(i-128);
		yuvtab_0c92[i]=-0x0c92*(i-128);
		yuvtab_1a1e[i]=-0x1a1e*(i-128);
		yuvtab_40cf[i]=0x40cf*(i-128);
	}

	for(i=0; i<768; i++)
	{
		int v= clip_table[i];
		clip_table16b[i]= v>>3;
		clip_table16g[i]= (v<<3)&0x07E0;
		clip_table16r[i]= (v<<8)&0xF800;
		clip_table15b[i]= v>>3;
		clip_table15g[i]= (v<<2)&0x03E0;
		clip_table15r[i]= (v<<7)&0x7C00;
	}
}
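
// Descriptive note on the tables built above: clip_table is 256 zeros, an
// identity ramp, then 256 times 255, so it is indexed with the value biased
// by +256; yuvtab_2568[] folds that bias in via its (256<<13) term, which is
// why the conversion code can clip with clip_table[(Y + Cx)>>13] and needs no
// range checks. Illustrative values: 300 maps to clip_table[556]=255 and -20
// maps to clip_table[236]=0.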