fixing green line at right side bug (reported by Nick Kurshev <nickols_k@mail.ru>)
[libav.git] / postproc / swscale.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
/*
  supported input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, RGB32, RGB24, Y8, Y800
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
  BGR15/16 support dithering

  unscaled special converters
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
  YV12/I420/IYUV -> YV12/I420/IYUV
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
*/
29
d3f41512 30#include <inttypes.h>
dda87e9f 31#include <string.h>
077ea8a7 32#include <math.h>
c1b0bfb4 33#include <stdio.h>
d3f41512 34#include "../config.h"
9b464428 35#include "../mangle.h"
c1b0bfb4
MN
36#ifdef HAVE_MALLOC_H
37#include <malloc.h>
38#endif
d604bab9 39#include "swscale.h"
7630f2e0 40#include "../cpudetect.h"
a861d4d7 41#include "../bswap.h"
28bf81c9 42#include "../libvo/img_format.h"
37079906 43#include "rgb2rgb.h"
541c4eb9 44#undef MOVNTQ
7d7f78b5 45#undef PAVGB
d3f41512 46
/* Developer toggles for forcing a particular code path (normally left commented out). */
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86

/* Enables the ordered dither used by the 15/16 bpp C output paths (see yuv2rgbXinC). */
#define DITHER1XBPP

#define RET 0xC3 //near return opcode for X86

/*
 * ASSERT(x): in MP_DEBUG builds prints the failed expression and then writes
 * through a null pointer to force a crash; otherwise it expands to an empty
 * statement.
 * NOTE: the debug variant expands to a bare `if`, so it must not be used as
 * the body of an if/else without braces. Callers in this file invoke it
 * without a trailing semicolon.
 */
#ifdef MP_DEBUG
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#else
#define ASSERT(x) ;
#endif

/* Use the libm constant when available, otherwise a literal fallback. */
#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif
c1b0bfb4 66
/*
 * Pixel-format classification helpers. Each macro takes an IMGFMT_* value
 * (from libvo/img_format.h) and evaluates to non-zero when the format belongs
 * to the named class. The argument may be evaluated several times, so pass
 * only side-effect-free expressions.
 */
//FIXME replace this with something faster
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
/* chroma is vertically / horizontally subsampled by 2 */
#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define isGray(x)      ((x)==IMGFMT_Y800)
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24\
			|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
			|| (x)==IMGFMT_Y800)
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
6ff0ad6b
MN
81
/*
 * Fixed-point RGB -> YUV matrix coefficients, scaled by 2^RGB2YUV_SHIFT.
 * Naming: first letter is the source channel (B/G/R), second the destination
 * plane (Y/U/V).
 * NOTE: the "+0.5" bias is followed by a truncating cast, so positive
 * coefficients are rounded to nearest while negative ones end up rounded
 * toward zero. The values are kept as they are to preserve existing output.
 */
#define RGB2YUV_SHIFT 16
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

extern int verbose; // defined in mplayer.c
783e9cc9
MN
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too many (might cause a sig11)

Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
deglobalize yuv2rgb*.c
*/
31190492 113
/*
 * Small arithmetic helpers. All three evaluate their arguments more than
 * once, so never pass expressions with side effects (e.g. i++).
 */
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
d604bab9 117
7630f2e0
MN
#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif

#ifdef CAN_COMPILE_X86_ASM
/*
 * 8-byte aligned 64-bit constants and scratch variables. They are not used by
 * the C code in this file apart from in_asm_used_var_warning_killer();
 * presumably they are referenced by name from the inline assembly in
 * swscale_template.c (TODO confirm).
 */

/* YUV -> RGB coefficients, one 16-bit value replicated four times */
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;

/* byte / word patterns */
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;

/* current dither values for the 15/16 bpp outputs (written at run time) */
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};

/* per-channel bit masks for packed 16 bpp (5-6-5) and 15 bpp (5-5-5) pixels */
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

/* every-third-byte masks (presumably for 24 bpp packing) */
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

// FIXME remove
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
783e9cc9
MN
167
/*
 * Lookup tables used by the C converters. They are zero-initialised here and
 * are expected to be filled in by initialisation code outside this section.
 * The 768-entry tables are indexed with a +256 bias by yuv2rgbXinC.
 */

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// clip + pack tables for the 16 bpp and 15 bpp outputs, one per channel
static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];
// Needed for cubic scaler to catch overflows
static int clip_yuvtab_2568[768];
static int clip_yuvtab_3343[768];
static int clip_yuvtab_0c92[768];
static int clip_yuvtab_1a1e[768];
static int clip_yuvtab_40cf[768];
190
28bf81c9 191//global sws_flags from the command line
1f347f22 192int sws_flags=2;
077ea8a7 193
5cebb24b
MN
194//global srcFilter
195SwsFilter src_filter= {NULL, NULL, NULL, NULL};
196
197float sws_lum_gblur= 0.0;
198float sws_chr_gblur= 0.0;
199int sws_chr_vshift= 0;
200int sws_chr_hshift= 0;
5521b193
MN
201float sws_chr_sharpen= 0.0;
202float sws_lum_sharpen= 0.0;
5cebb24b 203
28bf81c9
MN
204/* cpuCaps combined from cpudetect and whats actually compiled in
205 (if there is no support for something compiled in it wont appear here) */
206static CpuCaps cpuCaps;
d3f41512 207
28bf81c9
MN
208void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
209 int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
2ff198c1 210
5cebb24b
MN
211static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
212
#ifdef CAN_COMPILE_X86_ASM
/**
 * Dummy function that reads every asm-only static constant once so the
 * compiler does not warn about (or discard) variables it cannot see being
 * used. It has no useful effect at run time.
 */
void in_asm_used_var_warning_killer(void)
{
	/* volatile keeps the sum from being optimised away */
	volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
	 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
	 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
	if(i) i=0;
}
#endif
d604bab9 222
e3d2500f
MN
/**
 * Vertical N-tap filter producing planar 8-bit YUV output (plain C version).
 *
 * Each output sample is the sum over j of src[j][i] * filter[j], shifted
 * right by 19 bits and clamped to 0..255.
 *
 * @param lumFilter      lumFilterSize vertical luma coefficients
 * @param lumSrc         lumFilterSize pointers to intermediate luma lines
 * @param lumFilterSize  number of luma taps
 * @param chrFilter      chrFilterSize vertical chroma coefficients
 * @param chrSrc         chrFilterSize pointers to intermediate chroma lines;
 *                       U samples start at index 0 and V samples at index 2048
 *                       of each line
 * @param chrFilterSize  number of chroma taps
 * @param dest           output luma line, dstW bytes
 * @param uDest          output U line, dstW/2 bytes, or NULL to skip chroma
 * @param vDest          output V line, dstW/2 bytes (only used if uDest != NULL)
 * @param dstW           output width in luma samples
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
	//FIXME Optimize (just quickly written, not optimized)
	int i;

	for(i=0; i<dstW; i++)
	{
		int val=0;
		int j;
		for(j=0; j<lumFilterSize; j++)
			val += lumSrc[j][i] * lumFilter[j];

		/* drop the fixed-point fraction, then clamp to the 8-bit range */
		val >>= 19;
		if(val<0)        val=0;
		else if(val>255) val=255;
		dest[i]= val;
	}

	if(uDest == NULL) return;

	for(i=0; i<(dstW>>1); i++)
	{
		int u=0;
		int v=0;
		int j;
		for(j=0; j<chrFilterSize; j++)
		{
			u += chrSrc[j][i] * chrFilter[j];
			v += chrSrc[j][i + 2048] * chrFilter[j];
		}

		u >>= 19;
		if(u<0)        u=0;
		else if(u>255) u=255;
		v >>= 19;
		if(v<0)        v=0;
		else if(v>255) v=255;

		uDest[i]= u;
		vDest[i]= v;
	}
}
255
256static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
257 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
28bf81c9 258 uint8_t *dest, int dstW, int dstFormat)
e3d2500f 259{
28bf81c9 260 if(dstFormat==IMGFMT_BGR32)
e3d2500f
MN
261 {
262 int i;
263 for(i=0; i<(dstW>>1); i++){
264 int j;
265 int Y1=0;
266 int Y2=0;
267 int U=0;
268 int V=0;
269 int Cb, Cr, Cg;
270 for(j=0; j<lumFilterSize; j++)
271 {
272 Y1 += lumSrc[j][2*i] * lumFilter[j];
273 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
274 }
275 for(j=0; j<chrFilterSize; j++)
276 {
277 U += chrSrc[j][i] * chrFilter[j];
278 V += chrSrc[j][i+2048] * chrFilter[j];
279 }
280 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
281 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
282 U >>= 19;
283 V >>= 19;
284
285 Cb= clip_yuvtab_40cf[U+ 256];
286 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
287 Cr= clip_yuvtab_3343[V+ 256];
288
289 dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
290 dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
291 dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
292
293 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
294 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
295 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
296 }
297 }
28bf81c9 298 else if(dstFormat==IMGFMT_BGR24)
e3d2500f
MN
299 {
300 int i;
301 for(i=0; i<(dstW>>1); i++){
302 int j;
303 int Y1=0;
304 int Y2=0;
305 int U=0;
306 int V=0;
307 int Cb, Cr, Cg;
308 for(j=0; j<lumFilterSize; j++)
309 {
310 Y1 += lumSrc[j][2*i] * lumFilter[j];
311 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
312 }
313 for(j=0; j<chrFilterSize; j++)
314 {
315 U += chrSrc[j][i] * chrFilter[j];
316 V += chrSrc[j][i+2048] * chrFilter[j];
317 }
318 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
319 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
320 U >>= 19;
321 V >>= 19;
322
323 Cb= clip_yuvtab_40cf[U+ 256];
324 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
325 Cr= clip_yuvtab_3343[V+ 256];
326
327 dest[0]=clip_table[((Y1 + Cb) >>13)];
328 dest[1]=clip_table[((Y1 + Cg) >>13)];
329 dest[2]=clip_table[((Y1 + Cr) >>13)];
330
331 dest[3]=clip_table[((Y2 + Cb) >>13)];
332 dest[4]=clip_table[((Y2 + Cg) >>13)];
333 dest[5]=clip_table[((Y2 + Cr) >>13)];
334 dest+=6;
335 }
336 }
28bf81c9 337 else if(dstFormat==IMGFMT_BGR16)
e3d2500f
MN
338 {
339 int i;
5521b193
MN
340#ifdef DITHER1XBPP
341 static int ditherb1=1<<14;
342 static int ditherg1=1<<13;
343 static int ditherr1=2<<14;
344 static int ditherb2=3<<14;
345 static int ditherg2=3<<13;
346 static int ditherr2=0<<14;
347
348 ditherb1 ^= (1^2)<<14;
349 ditherg1 ^= (1^2)<<13;
350 ditherr1 ^= (1^2)<<14;
351 ditherb2 ^= (3^0)<<14;
352 ditherg2 ^= (3^0)<<13;
353 ditherr2 ^= (3^0)<<14;
354#else
355 const int ditherb1=0;
356 const int ditherg1=0;
357 const int ditherr1=0;
358 const int ditherb2=0;
359 const int ditherg2=0;
360 const int ditherr2=0;
361#endif
e3d2500f
MN
362 for(i=0; i<(dstW>>1); i++){
363 int j;
364 int Y1=0;
365 int Y2=0;
366 int U=0;
367 int V=0;
368 int Cb, Cr, Cg;
369 for(j=0; j<lumFilterSize; j++)
370 {
371 Y1 += lumSrc[j][2*i] * lumFilter[j];
372 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
373 }
374 for(j=0; j<chrFilterSize; j++)
375 {
376 U += chrSrc[j][i] * chrFilter[j];
377 V += chrSrc[j][i+2048] * chrFilter[j];
378 }
379 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
380 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
381 U >>= 19;
382 V >>= 19;
383
384 Cb= clip_yuvtab_40cf[U+ 256];
385 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
386 Cr= clip_yuvtab_3343[V+ 256];
387
388 ((uint16_t*)dest)[2*i] =
5521b193
MN
389 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
390 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
391 clip_table16r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
392
393 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
394 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
395 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
396 clip_table16r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
397 }
398 }
28bf81c9 399 else if(dstFormat==IMGFMT_BGR15)
e3d2500f
MN
400 {
401 int i;
5521b193
MN
402#ifdef DITHER1XBPP
403 static int ditherb1=1<<14;
404 static int ditherg1=1<<14;
405 static int ditherr1=2<<14;
406 static int ditherb2=3<<14;
407 static int ditherg2=3<<14;
408 static int ditherr2=0<<14;
409
410 ditherb1 ^= (1^2)<<14;
411 ditherg1 ^= (1^2)<<14;
412 ditherr1 ^= (1^2)<<14;
413 ditherb2 ^= (3^0)<<14;
414 ditherg2 ^= (3^0)<<14;
415 ditherr2 ^= (3^0)<<14;
416#else
417 const int ditherb1=0;
418 const int ditherg1=0;
419 const int ditherr1=0;
420 const int ditherb2=0;
421 const int ditherg2=0;
422 const int ditherr2=0;
423#endif
e3d2500f
MN
424 for(i=0; i<(dstW>>1); i++){
425 int j;
426 int Y1=0;
427 int Y2=0;
428 int U=0;
429 int V=0;
430 int Cb, Cr, Cg;
431 for(j=0; j<lumFilterSize; j++)
432 {
433 Y1 += lumSrc[j][2*i] * lumFilter[j];
434 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
435 }
436 for(j=0; j<chrFilterSize; j++)
437 {
438 U += chrSrc[j][i] * chrFilter[j];
439 V += chrSrc[j][i+2048] * chrFilter[j];
440 }
441 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
442 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
443 U >>= 19;
444 V >>= 19;
445
446 Cb= clip_yuvtab_40cf[U+ 256];
447 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
448 Cr= clip_yuvtab_3343[V+ 256];
449
450 ((uint16_t*)dest)[2*i] =
5521b193
MN
451 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
452 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
453 clip_table15r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
454
455 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
456 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
457 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
458 clip_table15r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
459 }
460 }
461}
462
463
7630f2e0
MN
464//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
465//Plain C versions
726a959a
MN
466#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
467#define COMPILE_C
468#endif
469
470#ifdef CAN_COMPILE_X86_ASM
471
472#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
473#define COMPILE_MMX
474#endif
475
476#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
477#define COMPILE_MMX2
478#endif
479
480#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
481#define COMPILE_3DNOW
482#endif
483#endif //CAN_COMPILE_X86_ASM
484
485#undef HAVE_MMX
486#undef HAVE_MMX2
487#undef HAVE_3DNOW
726a959a
MN
488
489#ifdef COMPILE_C
7630f2e0
MN
490#undef HAVE_MMX
491#undef HAVE_MMX2
492#undef HAVE_3DNOW
7630f2e0
MN
493#define RENAME(a) a ## _C
494#include "swscale_template.c"
726a959a 495#endif
397c035e 496
7630f2e0 497#ifdef CAN_COMPILE_X86_ASM
397c035e 498
7630f2e0
MN
499//X86 versions
500/*
501#undef RENAME
502#undef HAVE_MMX
503#undef HAVE_MMX2
504#undef HAVE_3DNOW
505#define ARCH_X86
506#define RENAME(a) a ## _X86
507#include "swscale_template.c"
1faf0867 508*/
7630f2e0 509//MMX versions
726a959a 510#ifdef COMPILE_MMX
7630f2e0
MN
511#undef RENAME
512#define HAVE_MMX
513#undef HAVE_MMX2
514#undef HAVE_3DNOW
7630f2e0
MN
515#define RENAME(a) a ## _MMX
516#include "swscale_template.c"
726a959a 517#endif
7630f2e0
MN
518
519//MMX2 versions
726a959a 520#ifdef COMPILE_MMX2
7630f2e0
MN
521#undef RENAME
522#define HAVE_MMX
523#define HAVE_MMX2
524#undef HAVE_3DNOW
7630f2e0
MN
525#define RENAME(a) a ## _MMX2
526#include "swscale_template.c"
726a959a 527#endif
7630f2e0
MN
528
529//3DNOW versions
726a959a 530#ifdef COMPILE_3DNOW
7630f2e0
MN
531#undef RENAME
532#define HAVE_MMX
533#undef HAVE_MMX2
534#define HAVE_3DNOW
7630f2e0
MN
535#define RENAME(a) a ## _3DNow
536#include "swscale_template.c"
726a959a 537#endif
7630f2e0
MN
538
539#endif //CAN_COMPILE_X86_ASM
540
541// minor note: the HAVE_xyz is messed up after that line so dont use it
d604bab9 542
d3f41512 543
6c7506de 544// old global scaler, dont use for new code
28bf81c9
MN
545// will use sws_flags from the command line
546void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
547 int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
d1fac6cf 548 int srcW, int srcH, int dstW, int dstH){
31190492 549
28bf81c9
MN
550 static SwsContext *context=NULL;
551 int dstFormat;
28bf81c9
MN
552 int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
553
6c7506de 554 switch(dstbpp)
28bf81c9 555 {
6c7506de
MN
556 case 8 : dstFormat= IMGFMT_Y8; break;
557 case 12: dstFormat= IMGFMT_YV12; break;
558 case 15: dstFormat= IMGFMT_BGR15; break;
559 case 16: dstFormat= IMGFMT_BGR16; break;
560 case 24: dstFormat= IMGFMT_BGR24; break;
561 case 32: dstFormat= IMGFMT_BGR32; break;
562 default: return;
563 }
564
565 if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
566
b6654a54 567 context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
6c7506de
MN
568}
569
570// will use sws_flags & src_filter (from cmd line)
571SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
572{
573 int flags=0;
574 static int firstTime=1;
575
5521b193 576#ifdef ARCH_X86
6c7506de
MN
577 if(gCpuCaps.hasMMX)
578 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
5521b193 579#endif
6c7506de
MN
580 if(firstTime)
581 {
28bf81c9 582 firstTime=0;
6c7506de
MN
583 flags= SWS_PRINT_INFO;
584 }
585 else if(verbose>1) flags= SWS_PRINT_INFO;
586
587 if(src_filter.lumH) freeVec(src_filter.lumH);
588 if(src_filter.lumV) freeVec(src_filter.lumV);
589 if(src_filter.chrH) freeVec(src_filter.chrH);
590 if(src_filter.chrV) freeVec(src_filter.chrV);
591
592 if(sws_lum_gblur!=0.0){
593 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
594 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
595 }else{
596 src_filter.lumH= getIdentityVec();
597 src_filter.lumV= getIdentityVec();
598 }
c7f822d9 599
6c7506de
MN
600 if(sws_chr_gblur!=0.0){
601 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
602 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
603 }else{
604 src_filter.chrH= getIdentityVec();
605 src_filter.chrV= getIdentityVec();
606 }
5521b193 607
6c7506de
MN
608 if(sws_chr_sharpen!=0.0){
609 SwsVector *g= getConstVec(-1.0, 3);
610 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
611 g->coeff[1]=2.0;
612 addVec(id, g);
613 convVec(src_filter.chrH, id);
614 convVec(src_filter.chrV, id);
615 freeVec(g);
616 freeVec(id);
617 }
5521b193 618
6c7506de
MN
619 if(sws_lum_sharpen!=0.0){
620 SwsVector *g= getConstVec(-1.0, 3);
621 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
622 g->coeff[1]=2.0;
623 addVec(id, g);
624 convVec(src_filter.lumH, id);
625 convVec(src_filter.lumV, id);
626 freeVec(g);
627 freeVec(id);
628 }
c7f822d9 629
6c7506de
MN
630 if(sws_chr_hshift)
631 shiftVec(src_filter.chrH, sws_chr_hshift);
c7f822d9 632
6c7506de
MN
633 if(sws_chr_vshift)
634 shiftVec(src_filter.chrV, sws_chr_vshift);
5521b193 635
6c7506de
MN
636 normalizeVec(src_filter.chrH, 1.0);
637 normalizeVec(src_filter.chrV, 1.0);
638 normalizeVec(src_filter.lumH, 1.0);
639 normalizeVec(src_filter.lumV, 1.0);
28bf81c9 640
6c7506de
MN
641 if(verbose > 1) printVec(src_filter.chrH);
642 if(verbose > 1) printVec(src_filter.lumH);
28bf81c9
MN
643
644 switch(sws_flags)
645 {
646 case 0: flags|= SWS_FAST_BILINEAR; break;
647 case 1: flags|= SWS_BILINEAR; break;
648 case 2: flags|= SWS_BICUBIC; break;
649 case 3: flags|= SWS_X; break;
ff7ba856 650 case 4: flags|= SWS_POINT; break;
d8863d37 651 case 5: flags|= SWS_AREA; break;
28bf81c9
MN
652 default:flags|= SWS_BILINEAR; break;
653 }
654
6c7506de 655 return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
28bf81c9
MN
656}
657
6c7506de 658
c7f822d9
MN
659static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
660 int srcW, int dstW, int filterAlign, int one, int flags,
661 SwsVector *srcFilter, SwsVector *dstFilter)
28bf81c9
MN
662{
663 int i;
c7f822d9
MN
664 int filterSize;
665 int filter2Size;
666 int minFilterSize;
667 double *filter=NULL;
668 double *filter2=NULL;
28bf81c9
MN
669#ifdef ARCH_X86
670 if(gCpuCaps.hasMMX)
671 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
726a959a 672#endif
31190492 673
adeaecb9 674 // Note the +1 is for the MMXscaler which reads over the end
6c7506de 675 *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
6c7506de 676
28bf81c9
MN
677 if(ABS(xInc - 0x10000) <10) // unscaled
678 {
679 int i;
c7f822d9
MN
680 filterSize= 1;
681 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
682 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
28bf81c9
MN
683
684 for(i=0; i<dstW; i++)
685 {
c7f822d9
MN
686 filter[i*filterSize]=1;
687 (*filterPos)[i]=i;
28bf81c9
MN
688 }
689
690 }
ff7ba856
MN
691 else if(flags&SWS_POINT) // lame looking point sampling mode
692 {
693 int i;
694 int xDstInSrc;
695 filterSize= 1;
696 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
697
698 xDstInSrc= xInc/2 - 0x8000;
699 for(i=0; i<dstW; i++)
700 {
8a01d20c 701 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
ff7ba856
MN
702
703 (*filterPos)[i]= xx;
704 filter[i]= 1.0;
705 xDstInSrc+= xInc;
706 }
707 }
28bf81c9
MN
708 else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
709 {
710 int i;
711 int xDstInSrc;
c7f822d9
MN
712 if (flags&SWS_BICUBIC) filterSize= 4;
713 else if(flags&SWS_X ) filterSize= 4;
d8863d37 714 else filterSize= 2; // SWS_BILINEAR / SWS_AREA
28bf81c9 715// printf("%d %d %d\n", filterSize, srcW, dstW);
c7f822d9 716 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
717
718 xDstInSrc= xInc/2 - 0x8000;
719 for(i=0; i<dstW; i++)
720 {
8a01d20c 721 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
28bf81c9
MN
722 int j;
723
c7f822d9 724 (*filterPos)[i]= xx;
28bf81c9
MN
725 if((flags & SWS_BICUBIC) || (flags & SWS_X))
726 {
727 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
728 double y1,y2,y3,y4;
729 double A= -0.6;
730 if(flags & SWS_BICUBIC){
731 // Equation is from VirtualDub
732 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
733 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
734 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
735 y4 = ( + A*d*d - A*d*d*d);
736 }else{
737 // cubic interpolation (derived it myself)
738 y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
739 y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
740 y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
741 y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
742 }
743
744// printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
c7f822d9
MN
745 filter[i*filterSize + 0]= y1;
746 filter[i*filterSize + 1]= y2;
747 filter[i*filterSize + 2]= y3;
748 filter[i*filterSize + 3]= y4;
28bf81c9
MN
749// printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
750 }
751 else
752 {
d8863d37 753 //Bilinear upscale / linear interpolate / Area averaging
c7f822d9 754 for(j=0; j<filterSize; j++)
28bf81c9
MN
755 {
756 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
757 double coeff= 1.0 - d;
758 if(coeff<0) coeff=0;
759 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
c7f822d9 760 filter[i*filterSize + j]= coeff;
28bf81c9
MN
761 xx++;
762 }
763 }
764 xDstInSrc+= xInc;
765 }
766 }
767 else // downscale
768 {
769 int xDstInSrc;
d8863d37
MN
770 if(flags&SWS_BICUBIC) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
771 else if(flags&SWS_X) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
772 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
773 else /* BILINEAR */ filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
28bf81c9 774// printf("%d %d %d\n", *filterSize, srcW, dstW);
c7f822d9 775 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
776
777 xDstInSrc= xInc/2 - 0x8000;
778 for(i=0; i<dstW; i++)
779 {
c7f822d9 780 int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
28bf81c9 781 int j;
c7f822d9
MN
782 (*filterPos)[i]= xx;
783 for(j=0; j<filterSize; j++)
28bf81c9
MN
784 {
785 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
786 double coeff;
787 if((flags & SWS_BICUBIC) || (flags & SWS_X))
788 {
789 double A= -0.75;
790// d*=2;
791 // Equation is from VirtualDub
792 if(d<1.0)
793 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
794 else if(d<2.0)
795 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
796 else
797 coeff=0.0;
798 }
d8863d37 799 else if(flags & SWS_AREA)
28bf81c9 800 {
d8863d37
MN
801 double srcPixelSize= (1<<16)/(double)xInc;
802 if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
803 else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
804 else coeff=0.0;
805 }
28bf81c9
MN
806 else
807 {
808 coeff= 1.0 - d;
809 if(coeff<0) coeff=0;
810 }
d8863d37 811// printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
c7f822d9 812 filter[i*filterSize + j]= coeff;
28bf81c9
MN
813 xx++;
814 }
815 xDstInSrc+= xInc;
816 }
817 }
818
c7f822d9
MN
819 /* apply src & dst Filter to filter -> filter2
820 free(filter);
821 */
822 filter2Size= filterSize;
823 if(srcFilter) filter2Size+= srcFilter->length - 1;
824 if(dstFilter) filter2Size+= dstFilter->length - 1;
825 filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
826
827 for(i=0; i<dstW; i++)
828 {
829 int j;
830 SwsVector scaleFilter;
831 SwsVector *outVec;
832
833 scaleFilter.coeff= filter + i*filterSize;
834 scaleFilter.length= filterSize;
835
5cebb24b 836 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
c7f822d9
MN
837 else outVec= &scaleFilter;
838
839 ASSERT(outVec->length == filter2Size)
840 //FIXME dstFilter
841
842 for(j=0; j<outVec->length; j++)
843 {
844 filter2[i*filter2Size + j]= outVec->coeff[j];
845 }
846
847 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
848
849 if(outVec != &scaleFilter) freeVec(outVec);
850 }
851 free(filter); filter=NULL;
852
853 /* try to reduce the filter-size (step1 find size and shift left) */
854 // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
855 minFilterSize= 0;
856 for(i=dstW-1; i>=0; i--)
857 {
858 int min= filter2Size;
859 int j;
860 double cutOff=0.0;
861
862 /* get rid off near zero elements on the left by shifting left */
863 for(j=0; j<filter2Size; j++)
864 {
865 int k;
866 cutOff += ABS(filter2[i*filter2Size]);
867
868 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
869
870 /* preserve Monotonicity because the core cant handle the filter otherwise */
871 if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
872
873 // Move filter coeffs left
874 for(k=1; k<filter2Size; k++)
875 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
876 filter2[i*filter2Size + k - 1]= 0.0;
877 (*filterPos)[i]++;
878 }
879
880 cutOff=0.0;
881 /* count near zeros on the right */
882 for(j=filter2Size-1; j>0; j--)
883 {
884 cutOff += ABS(filter2[i*filter2Size + j]);
885
886 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
887 min--;
888 }
889
890 if(min>minFilterSize) minFilterSize= min;
891 }
892
6c7506de
MN
893 filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
894 filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
895 *outFilterSize= filterSize;
896
897 if((flags&SWS_PRINT_INFO) && verbose)
898 printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
c7f822d9
MN
899 /* try to reduce the filter-size (step2 reduce it) */
900 for(i=0; i<dstW; i++)
901 {
902 int j;
903
6c7506de
MN
904 for(j=0; j<filterSize; j++)
905 {
906 if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
907 else filter[i*filterSize + j]= filter2[i*filter2Size + j];
908 }
c7f822d9 909 }
6c7506de
MN
910 free(filter2); filter2=NULL;
911
912 ASSERT(filterSize > 0)
c7f822d9
MN
913
914 //FIXME try to align filterpos if possible
915
28bf81c9
MN
916 //fix borders
917 for(i=0; i<dstW; i++)
918 {
919 int j;
c7f822d9 920 if((*filterPos)[i] < 0)
28bf81c9
MN
921 {
922 // Move filter coeffs left to compensate for filterPos
6c7506de 923 for(j=1; j<filterSize; j++)
28bf81c9 924 {
c7f822d9 925 int left= MAX(j + (*filterPos)[i], 0);
6c7506de
MN
926 filter[i*filterSize + left] += filter[i*filterSize + j];
927 filter[i*filterSize + j]=0;
28bf81c9 928 }
c7f822d9 929 (*filterPos)[i]= 0;
28bf81c9
MN
930 }
931
6c7506de 932 if((*filterPos)[i] + filterSize > srcW)
28bf81c9 933 {
6c7506de 934 int shift= (*filterPos)[i] + filterSize - srcW;
28bf81c9 935 // Move filter coeffs right to compensate for filterPos
6c7506de 936 for(j=filterSize-2; j>=0; j--)
28bf81c9 937 {
6c7506de
MN
938 int right= MIN(j + shift, filterSize-1);
939 filter[i*filterSize +right] += filter[i*filterSize +j];
940 filter[i*filterSize +j]=0;
28bf81c9 941 }
6c7506de 942 (*filterPos)[i]= srcW - filterSize;
28bf81c9
MN
943 }
944 }
945
6c7506de
MN
946 // Note the +1 is for the MMXscaler which reads over the end
947 *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
948 memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
c7f822d9
MN
949
950 /* Normalize & Store in outFilter */
28bf81c9
MN
951 for(i=0; i<dstW; i++)
952 {
953 int j;
954 double sum=0;
955 double scale= one;
6c7506de 956 for(j=0; j<filterSize; j++)
28bf81c9 957 {
6c7506de 958 sum+= filter[i*filterSize + j];
28bf81c9
MN
959 }
960 scale/= sum;
6c7506de 961 for(j=0; j<filterSize; j++)
28bf81c9 962 {
6c7506de 963 (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
28bf81c9
MN
964 }
965 }
adeaecb9
MN
966
967 (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
968 for(i=0; i<*outFilterSize; i++)
969 {
970 int j= dstW*(*outFilterSize);
971 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
972 }
c7f822d9 973
6c7506de 974 free(filter);
7630f2e0 975}
31190492 976
28bf81c9
MN
977#ifdef ARCH_X86
978static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
979{
980 uint8_t *fragment;
981 int imm8OfPShufW1;
982 int imm8OfPShufW2;
983 int fragmentLength;
984
985 int xpos, i;
986
987 // create an optimized horizontal scaling routine
988
989 //code fragment
990
991 asm volatile(
992 "jmp 9f \n\t"
993 // Begin
994 "0: \n\t"
995 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
996 "movq %%mm0, %%mm1 \n\t"
997 "psrlq $8, %%mm0 \n\t"
998 "punpcklbw %%mm7, %%mm1 \n\t"
999 "movq %%mm2, %%mm3 \n\t"
1000 "punpcklbw %%mm7, %%mm0 \n\t"
1001 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
1002 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1003 "1: \n\t"
1004 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
1005 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1006 "2: \n\t"
1007 "psrlw $9, %%mm3 \n\t"
1008 "psubw %%mm1, %%mm0 \n\t"
1009 "pmullw %%mm3, %%mm0 \n\t"
1010 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
1011 "psllw $7, %%mm1 \n\t"
1012 "paddw %%mm1, %%mm0 \n\t"
1013
1014 "movq %%mm0, (%%edi, %%eax) \n\t"
1015
1016 "addl $8, %%eax \n\t"
1017 // End
1018 "9: \n\t"
1019// "int $3\n\t"
1020 "leal 0b, %0 \n\t"
1021 "leal 1b, %1 \n\t"
1022 "leal 2b, %2 \n\t"
1023 "decl %1 \n\t"
1024 "decl %2 \n\t"
1025 "subl %0, %1 \n\t"
1026 "subl %0, %2 \n\t"
1027 "leal 9b, %3 \n\t"
1028 "subl %0, %3 \n\t"
1029 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1030 "=r" (fragmentLength)
1031 );
1032
1033 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1034
1035 for(i=0; i<dstW/8; i++)
1036 {
1037 int xx=xpos>>16;
1038
1039 if((i&3) == 0)
1040 {
1041 int a=0;
1042 int b=((xpos+xInc)>>16) - xx;
1043 int c=((xpos+xInc*2)>>16) - xx;
1044 int d=((xpos+xInc*3)>>16) - xx;
1045
1046 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1047
1048 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1049 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1050 a | (b<<2) | (c<<4) | (d<<6);
1051
1052 // if we dont need to read 8 bytes than dont :), reduces the chance of
1053 // crossing a cache line
1054 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1055
1056 funnyCode[fragmentLength*(i+4)/4]= RET;
1057 }
1058 xpos+=xInc;
1059 }
1060}
1061#endif // ARCH_X86
1062
/* Legacy initialization entry point; intentionally does nothing.
   FIXME: remove once no caller references it any more. */
void SwScale_Init()
{
}
1066
1067static void globalInit(){
31190492
A
1068 // generating tables:
1069 int i;
c1b0bfb4
MN
1070 for(i=0; i<768; i++){
1071 int c= MIN(MAX(i-256, 0), 255);
1072 clip_table[i]=c;
1073 yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1074 yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1075 yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1076 yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1077 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
31190492
A
1078 }
1079
b18ea156
MN
1080 for(i=0; i<768; i++)
1081 {
28bf81c9 1082 int v= clip_table[i];
a861d4d7
MN
1083 clip_table16b[i]= le2me_16( v>>3);
1084 clip_table16g[i]= le2me_16((v<<3)&0x07E0);
1085 clip_table16r[i]= le2me_16((v<<8)&0xF800);
1086 clip_table15b[i]= le2me_16( v>>3);
1087 clip_table15g[i]= le2me_16((v<<2)&0x03E0);
1088 clip_table15r[i]= le2me_16((v<<7)&0x7C00);
b18ea156 1089 }
c1b0bfb4 1090
28bf81c9
MN
1091cpuCaps= gCpuCaps;
1092
1093#ifdef RUNTIME_CPUDETECT
1094#ifdef CAN_COMPILE_X86_ASM
1095 // ordered per speed fasterst first
1096 if(gCpuCaps.hasMMX2)
1097 swScale= swScale_MMX2;
1098 else if(gCpuCaps.has3DNow)
7f56a527 1099 swScale= swScale_3DNow;
28bf81c9
MN
1100 else if(gCpuCaps.hasMMX)
1101 swScale= swScale_MMX;
1102 else
1103 swScale= swScale_C;
1104
1105#else
1106 swScale= swScale_C;
1107 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1108#endif
1109#else //RUNTIME_CPUDETECT
1110#ifdef HAVE_MMX2
1111 swScale= swScale_MMX2;
1112 cpuCaps.has3DNow = 0;
1113#elif defined (HAVE_3DNOW)
7f56a527 1114 swScale= swScale_3DNow;
28bf81c9
MN
1115 cpuCaps.hasMMX2 = 0;
1116#elif defined (HAVE_MMX)
1117 swScale= swScale_MMX;
1118 cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1119#else
1120 swScale= swScale_C;
1121 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1122#endif
1123#endif //!RUNTIME_CPUDETECT
31190492 1124}
7630f2e0 1125
37079906
MN
1126/* Warper functions for yuv2bgr */
1127static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
b6654a54
MN
1128 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1129 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
37079906
MN
1130
1131 if(c->srcFormat==IMGFMT_YV12)
b6654a54 1132 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
37079906 1133 else /* I420 & IYUV */
b6654a54
MN
1134 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1135}
1136
1137/* unscaled copy like stuff (assumes nearly identical formats) */
1138static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1139 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1140
1141 int srcStride[3];
1142 uint8_t *src[3];
1143 uint8_t *dst[3];
1144
1145 if(c->srcFormat == IMGFMT_I420){
1146 src[0]= srcParam[0];
1147 src[1]= srcParam[2];
1148 src[2]= srcParam[1];
1149 srcStride[0]= srcStrideParam[0];
1150 srcStride[1]= srcStrideParam[2];
1151 srcStride[2]= srcStrideParam[1];
1152 }
1153 else if(c->srcFormat==IMGFMT_YV12){
1154 src[0]= srcParam[0];
1155 src[1]= srcParam[1];
1156 src[2]= srcParam[2];
1157 srcStride[0]= srcStrideParam[0];
1158 srcStride[1]= srcStrideParam[1];
1159 srcStride[2]= srcStrideParam[2];
1160 }
1161 else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
1162 src[0]= srcParam[0];
1163 src[1]=
1164 src[2]= NULL;
1165 srcStride[0]= srcStrideParam[0];
1166 srcStride[1]=
1167 srcStride[2]= 0;
1168 }
1169
1170 if(c->dstFormat == IMGFMT_I420){
1171 dst[0]= dstParam[0];
1172 dst[1]= dstParam[2];
1173 dst[2]= dstParam[1];
1174
1175 }else{
1176 dst[0]= dstParam[0];
1177 dst[1]= dstParam[1];
1178 dst[2]= dstParam[2];
1179 }
1180
1181 if(isPacked(c->srcFormat))
1182 {
1183 if(dstStride[0]==srcStride[0])
1184 memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1185 else
1186 {
1187 int i;
1188 uint8_t *srcPtr= src[0];
1189 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
a861d4d7
MN
1190 int length=0;
1191
1192 /* universal length finder */
1193 while(length+c->srcW <= dstStride[0]
1194 && length+c->srcW <= srcStride[0]) length+= c->srcW;
1195 ASSERT(length!=0);
b6654a54
MN
1196
1197 for(i=0; i<srcSliceH; i++)
1198 {
1199 memcpy(dstPtr, srcPtr, length);
1200 srcPtr+= srcStride[0];
1201 dstPtr+= dstStride[0];
1202 }
1203 }
1204 }
1205 else
1206 { /* Planar YUV */
1207 int plane;
1208 for(plane=0; plane<3; plane++)
1209 {
1210 int length= plane==0 ? c->srcW : ((c->srcW+1)>>1);
1211 int y= plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1212 int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
a861d4d7 1213
b6654a54
MN
1214 if(dstStride[plane]==srcStride[plane])
1215 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1216 else
1217 {
1218 int i;
1219 uint8_t *srcPtr= src[plane];
1220 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1221 for(i=0; i<height; i++)
1222 {
1223 memcpy(dstPtr, srcPtr, length);
1224 srcPtr+= srcStride[plane];
1225 dstPtr+= dstStride[plane];
1226 }
1227 }
1228 }
1229 }
37079906 1230}
28bf81c9
MN
1231
1232SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1233 SwsFilter *srcFilter, SwsFilter *dstFilter){
1234
28bf81c9
MN
1235 SwsContext *c;
1236 int i;
37079906 1237 int usesFilter;
c7f822d9
MN
1238 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1239
5cebb24b
MN
1240#ifdef ARCH_X86
1241 if(gCpuCaps.hasMMX)
1242 asm volatile("emms\n\t"::: "memory");
1243#endif
1244
28bf81c9
MN
1245 if(swScale==NULL) globalInit();
1246
6ff0ad6b
MN
1247 /* avoid dupplicate Formats, so we dont need to check to much */
1248 if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1249 if(srcFormat==IMGFMT_Y8) srcFormat=IMGFMT_Y800;
8a01d20c
MN
1250 if(dstFormat==IMGFMT_Y8) dstFormat=IMGFMT_Y800;
1251
b81cf274
MN
1252 if(!isSupportedIn(srcFormat))
1253 {
1254 fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1255 return NULL;
1256 }
1257 if(!isSupportedOut(dstFormat))
1258 {
1259 fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1260 return NULL;
1261 }
6ff0ad6b 1262
28bf81c9 1263 /* sanity check */
b81cf274
MN
1264 if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1265 {
1266 fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
1267 srcW, srcH, dstW, dstH);
1268 return NULL;
1269 }
28bf81c9 1270
c7f822d9
MN
1271 if(!dstFilter) dstFilter= &dummyFilter;
1272 if(!srcFilter) srcFilter= &dummyFilter;
1273
28bf81c9 1274 c= memalign(64, sizeof(SwsContext));
c7f822d9 1275 memset(c, 0, sizeof(SwsContext));
28bf81c9
MN
1276
1277 c->srcW= srcW;
1278 c->srcH= srcH;
1279 c->dstW= dstW;
1280 c->dstH= dstH;
5521b193
MN
1281 c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1282 c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
28bf81c9
MN
1283 c->flags= flags;
1284 c->dstFormat= dstFormat;
1285 c->srcFormat= srcFormat;
1286
37079906
MN
1287 usesFilter=0;
1288 if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1289 if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1290 if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1291 if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1292 if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1293 if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1294 if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1295 if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1296
1297 /* special Cases */
1298 if(srcW==dstW && srcH==dstH && !usesFilter)
1299 {
1300 /* yuv2bgr */
1301 if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1302 {
1303 // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
b6654a54 1304 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
37079906 1305 c->swScale= planarYuvToBgr;
b6654a54
MN
1306
1307 if(flags&SWS_PRINT_INFO)
1308 printf("SwScaler: using unscaled %s -> %s special converter\n",
1309 vo_format_name(srcFormat), vo_format_name(dstFormat));
1310 return c;
1311 }
1312
1313 /* simple copy */
1314 if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1315 {
1316 c->swScale= simpleCopy;
1317
37079906
MN
1318 if(flags&SWS_PRINT_INFO)
1319 printf("SwScaler: using unscaled %s -> %s special converter\n",
1320 vo_format_name(srcFormat), vo_format_name(dstFormat));
1321 return c;
1322 }
1323 }
1324
28bf81c9
MN
1325 if(cpuCaps.hasMMX2)
1326 {
1327 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1328 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1329 {
1330 if(flags&SWS_PRINT_INFO)
1331 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1332 }
1333 }
1334 else
1335 c->canMMX2BeUsed=0;
1336
1e621b18
MN
1337
1338 /* dont use full vertical UV input/internaly if the source doesnt even have it */
1339 if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1340 /* dont use full horizontal UV input if the source doesnt even have it */
1341 if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1342 /* dont use full horizontal UV internally if the destination doesnt even have it */
1343 if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1344
1345 if(flags&SWS_FULL_CHR_H_INP) c->chrSrcW= srcW;
1346 else c->chrSrcW= (srcW+1)>>1;
1347
1348 if(flags&SWS_FULL_CHR_H_INT) c->chrDstW= dstW;
1349 else c->chrDstW= (dstW+1)>>1;
1350
1351 if(flags&SWS_FULL_CHR_V) c->chrSrcH= srcH;
1352 else c->chrSrcH= (srcH+1)>>1;
1353
1354 if(isHalfChrV(dstFormat)) c->chrDstH= (dstH+1)>>1;
1355 else c->chrDstH= dstH;
1356
1357 c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1358 c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1359
1360
28bf81c9
MN
1361 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1362 // but only for the FAST_BILINEAR mode otherwise do correct scaling
1363 // n-2 is the last chrominance sample available
1364 // this is not perfect, but noone shuld notice the difference, the more correct variant
1365 // would be like the vertical one, but that would require some special code for the
1366 // first and last pixel
1367 if(flags&SWS_FAST_BILINEAR)
1368 {
1e621b18
MN
1369 if(c->canMMX2BeUsed)
1370 {
1371 c->lumXInc+= 20;
1372 c->chrXInc+= 20;
1373 }
28bf81c9 1374 //we dont use the x86asm scaler if mmx is available
1e621b18
MN
1375 else if(cpuCaps.hasMMX)
1376 {
1377 c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1378 c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1379 }
28bf81c9
MN
1380 }
1381
28bf81c9
MN
1382 /* precalculate horizontal scaler filter coefficients */
1383 {
1384 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1385
c7f822d9
MN
1386 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1387 srcW , dstW, filterAlign, 1<<14, flags,
1388 srcFilter->lumH, dstFilter->lumH);
1389 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1390 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1391 srcFilter->chrH, dstFilter->chrH);
28bf81c9
MN
1392
1393#ifdef ARCH_X86
1394// cant downscale !!!
1395 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1396 {
1397 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
1398 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1399 }
1400#endif
1401 } // Init Horizontal stuff
1402
1403
1404
1405 /* precalculate vertical scaler filter coefficients */
c7f822d9
MN
1406 initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1407 srcH , dstH, 1, (1<<12)-4, flags,
1408 srcFilter->lumV, dstFilter->lumV);
1409 initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1410 (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1411 srcFilter->chrV, dstFilter->chrV);
28bf81c9
MN
1412
1413 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1414 c->vLumBufSize= c->vLumFilterSize;
1415 c->vChrBufSize= c->vChrFilterSize;
1416 for(i=0; i<dstH; i++)
1417 {
1418 int chrI= i*c->chrDstH / dstH;
1419 int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
1420 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1421 nextSlice&= ~1; // Slices start at even boundaries
1422 if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
1423 c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
1424 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1425 c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1426 }
1427
1428 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
c7f822d9
MN
1429 c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1430 c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
6c7506de 1431 //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
28bf81c9
MN
1432 for(i=0; i<c->vLumBufSize; i++)
1433 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1434 for(i=0; i<c->vChrBufSize; i++)
1435 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1436
1437 //try to avoid drawing green stuff between the right end and the stride end
1438 for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1439 for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1440
1441 ASSERT(c->chrDstH <= dstH)
28bf81c9
MN
1442
1443 // pack filter data for mmx code
1444 if(cpuCaps.hasMMX)
1445 {
c7f822d9
MN
1446 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize* dstH*4*sizeof(int16_t));
1447 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
28bf81c9
MN
1448 for(i=0; i<c->vLumFilterSize*dstH; i++)
1449 c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1450 c->vLumFilter[i];
1451 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1452 c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1453 c->vChrFilter[i];
1454 }
1455
1456 if(flags&SWS_PRINT_INFO)
1457 {
1458#ifdef DITHER1XBPP
5521b193
MN
1459 char *dither= " dithered";
1460#else
1461 char *dither= "";
28bf81c9
MN
1462#endif
1463 if(flags&SWS_FAST_BILINEAR)
17470314 1464 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
28bf81c9 1465 else if(flags&SWS_BILINEAR)
17470314 1466 fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
28bf81c9 1467 else if(flags&SWS_BICUBIC)
17470314 1468 fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1e621b18 1469 else if(flags&SWS_X)
17470314 1470 fprintf(stderr, "\nSwScaler: Experimental scaler, ");
ff7ba856 1471 else if(flags&SWS_POINT)
17470314 1472 fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
d8863d37 1473 else if(flags&SWS_AREA)
17470314 1474 fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
28bf81c9
MN
1475 else
1476 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1477
17470314
MN
1478 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1479 fprintf(stderr, "from %s to%s %s ",
1480 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
28bf81c9 1481 else
17470314
MN
1482 fprintf(stderr, "from %s to %s ",
1483 vo_format_name(srcFormat), vo_format_name(dstFormat));
28bf81c9
MN
1484
1485 if(cpuCaps.hasMMX2)
1486 fprintf(stderr, "using MMX2\n");
1487 else if(cpuCaps.has3DNow)
1488 fprintf(stderr, "using 3DNOW\n");
1489 else if(cpuCaps.hasMMX)
1490 fprintf(stderr, "using MMX\n");
1491 else
1492 fprintf(stderr, "using C\n");
1493 }
1494
1495 if((flags & SWS_PRINT_INFO) && verbose)
1496 {
1497 if(cpuCaps.hasMMX)
1498 {
1499 if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1500 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1501 else
1502 {
1503 if(c->hLumFilterSize==4)
1504 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1505 else if(c->hLumFilterSize==8)
1506 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1507 else
1508 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1509
1510 if(c->hChrFilterSize==4)
1511 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1512 else if(c->hChrFilterSize==8)
1513 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1514 else
1515 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1516 }
1517 }
1518 else
1519 {
1520#ifdef ARCH_X86
1521 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1522#else
1523 if(flags & SWS_FAST_BILINEAR)
1524 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1525 else
1526 printf("SwScaler: using C scaler for horizontal scaling\n");
1527#endif
1528 }
6c7506de 1529 if(isPlanarYUV(dstFormat))
28bf81c9
MN
1530 {
1531 if(c->vLumFilterSize==1)
6c7506de 1532 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 1533 else
6c7506de 1534 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
1535 }
1536 else
1537 {
1538 if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1539 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1540 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1541 else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1542 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1543 else
1544 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1545 }
1546
1547 if(dstFormat==IMGFMT_BGR24)
1548 printf("SwScaler: using %s YV12->BGR24 Converter\n",
1549 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
fd284805
MN
1550 else if(dstFormat==IMGFMT_BGR32)
1551 printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1552 else if(dstFormat==IMGFMT_BGR16)
1553 printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1554 else if(dstFormat==IMGFMT_BGR15)
1555 printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
1556
1557 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1558 }
1e621b18
MN
1559 if((flags & SWS_PRINT_INFO) && verbose>1)
1560 {
1561 printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1562 c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1563 printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1564 c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1565 }
37079906
MN
1566
1567 c->swScale= swScale;
28bf81c9
MN
1568 return c;
1569}
1570
1571/**
1572 * returns a normalized gaussian curve used to filter stuff
1573 * quality=3 is high quality, lowwer is lowwer quality
1574 */
c7f822d9
MN
1575
1576SwsVector *getGaussianVec(double variance, double quality){
28bf81c9
MN
1577 const int length= (int)(variance*quality + 0.5) | 1;
1578 int i;
1579 double *coeff= memalign(sizeof(double), length*sizeof(double));
1580 double middle= (length-1)*0.5;
c7f822d9
MN
1581 SwsVector *vec= malloc(sizeof(SwsVector));
1582
1583 vec->coeff= coeff;
1584 vec->length= length;
28bf81c9
MN
1585
1586 for(i=0; i<length; i++)
1587 {
1588 double dist= i-middle;
1589 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1590 }
1591
c7f822d9
MN
1592 normalizeVec(vec, 1.0);
1593
1594 return vec;
28bf81c9
MN
1595}
1596
5521b193
MN
1597SwsVector *getConstVec(double c, int length){
1598 int i;
1599 double *coeff= memalign(sizeof(double), length*sizeof(double));
1600 SwsVector *vec= malloc(sizeof(SwsVector));
1601
1602 vec->coeff= coeff;
1603 vec->length= length;
1604
1605 for(i=0; i<length; i++)
1606 coeff[i]= c;
1607
1608 return vec;
1609}
1610
1611
c7f822d9
MN
1612SwsVector *getIdentityVec(void){
1613 double *coeff= memalign(sizeof(double), sizeof(double));
1614 SwsVector *vec= malloc(sizeof(SwsVector));
1615 coeff[0]= 1.0;
1616
1617 vec->coeff= coeff;
1618 vec->length= 1;
1619
1620 return vec;
1621}
1622
1623void normalizeVec(SwsVector *a, double height){
28bf81c9
MN
1624 int i;
1625 double sum=0;
1626 double inv;
1627
c7f822d9
MN
1628 for(i=0; i<a->length; i++)
1629 sum+= a->coeff[i];
28bf81c9
MN
1630
1631 inv= height/sum;
1632
c7f822d9
MN
1633 for(i=0; i<a->length; i++)
1634 a->coeff[i]*= height;
28bf81c9
MN
1635}
1636
c7f822d9
MN
1637void scaleVec(SwsVector *a, double scalar){
1638 int i;
1639
1640 for(i=0; i<a->length; i++)
1641 a->coeff[i]*= scalar;
1642}
1643
5cebb24b 1644static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
c7f822d9 1645 int length= a->length + b->length - 1;
28bf81c9
MN
1646 double *coeff= memalign(sizeof(double), length*sizeof(double));
1647 int i, j;
c7f822d9
MN
1648 SwsVector *vec= malloc(sizeof(SwsVector));
1649
1650 vec->coeff= coeff;
1651 vec->length= length;
28bf81c9
MN
1652
1653 for(i=0; i<length; i++) coeff[i]= 0.0;
1654
c7f822d9 1655 for(i=0; i<a->length; i++)
28bf81c9 1656 {
c7f822d9 1657 for(j=0; j<b->length; j++)
28bf81c9 1658 {
c7f822d9 1659 coeff[i+j]+= a->coeff[i]*b->coeff[j];
28bf81c9
MN
1660 }
1661 }
1662
c7f822d9 1663 return vec;
28bf81c9
MN
1664}
1665
5cebb24b 1666static SwsVector *sumVec(SwsVector *a, SwsVector *b){
c7f822d9 1667 int length= MAX(a->length, b->length);
28bf81c9
MN
1668 double *coeff= memalign(sizeof(double), length*sizeof(double));
1669 int i;
c7f822d9
MN
1670 SwsVector *vec= malloc(sizeof(SwsVector));
1671
1672 vec->coeff= coeff;
1673 vec->length= length;
28bf81c9
MN
1674
1675 for(i=0; i<length; i++) coeff[i]= 0.0;
1676
c7f822d9
MN
1677 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1678 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1679
1680 return vec;
28bf81c9 1681}
c7f822d9 1682
5cebb24b 1683static SwsVector *diffVec(SwsVector *a, SwsVector *b){
c7f822d9
MN
1684 int length= MAX(a->length, b->length);
1685 double *coeff= memalign(sizeof(double), length*sizeof(double));
1686 int i;
1687 SwsVector *vec= malloc(sizeof(SwsVector));
1688
1689 vec->coeff= coeff;
1690 vec->length= length;
1691
1692 for(i=0; i<length; i++) coeff[i]= 0.0;
1693
1694 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1695 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1696
1697 return vec;
1698}
1699
1700/* shift left / or right if "shift" is negative */
5cebb24b 1701static SwsVector *getShiftedVec(SwsVector *a, int shift){
c7f822d9
MN
1702 int length= a->length + ABS(shift)*2;
1703 double *coeff= memalign(sizeof(double), length*sizeof(double));
ff7ba856 1704 int i;
c7f822d9
MN
1705 SwsVector *vec= malloc(sizeof(SwsVector));
1706
1707 vec->coeff= coeff;
1708 vec->length= length;
1709
1710 for(i=0; i<length; i++) coeff[i]= 0.0;
1711
1712 for(i=0; i<a->length; i++)
1713 {
1714 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1715 }
1716
1717 return vec;
1718}
1719
5cebb24b
MN
1720void shiftVec(SwsVector *a, int shift){
1721 SwsVector *shifted= getShiftedVec(a, shift);
1722 free(a->coeff);
1723 a->coeff= shifted->coeff;
1724 a->length= shifted->length;
1725 free(shifted);
1726}
1727
1728void addVec(SwsVector *a, SwsVector *b){
1729 SwsVector *sum= sumVec(a, b);
1730 free(a->coeff);
1731 a->coeff= sum->coeff;
1732 a->length= sum->length;
1733 free(sum);
1734}
1735
1736void subVec(SwsVector *a, SwsVector *b){
1737 SwsVector *diff= diffVec(a, b);
1738 free(a->coeff);
1739 a->coeff= diff->coeff;
1740 a->length= diff->length;
1741 free(diff);
1742}
1743
1744void convVec(SwsVector *a, SwsVector *b){
1745 SwsVector *conv= getConvVec(a, b);
1746 free(a->coeff);
1747 a->coeff= conv->coeff;
1748 a->length= conv->length;
1749 free(conv);
1750}
1751
1752SwsVector *cloneVec(SwsVector *a){
1753 double *coeff= memalign(sizeof(double), a->length*sizeof(double));
1754 int i;
1755 SwsVector *vec= malloc(sizeof(SwsVector));
1756
1757 vec->coeff= coeff;
1758 vec->length= a->length;
1759
1760 for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
1761
1762 return vec;
1763}
1764
c7f822d9
MN
1765void printVec(SwsVector *a){
1766 int i;
1767 double max=0;
1768 double min=0;
1769 double range;
1770
1771 for(i=0; i<a->length; i++)
1772 if(a->coeff[i]>max) max= a->coeff[i];
1773
1774 for(i=0; i<a->length; i++)
1775 if(a->coeff[i]<min) min= a->coeff[i];
1776
1777 range= max - min;
1778
1779 for(i=0; i<a->length; i++)
1780 {
1781 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
1782 printf("%1.3f ", a->coeff[i]);
1783 for(;x>0; x--) printf(" ");
1784 printf("|\n");
1785 }
1786}
1787
1788void freeVec(SwsVector *a){
1789 if(!a) return;
1790 if(a->coeff) free(a->coeff);
1791 a->coeff=NULL;
1792 a->length=0;
1793 free(a);
1794}
1795
1796void freeSwsContext(SwsContext *c){
1797 int i;
1798
1799 if(!c) return;
1800
1801 if(c->lumPixBuf)
1802 {
6c7506de 1803 for(i=0; i<c->vLumBufSize; i++)
c7f822d9
MN
1804 {
1805 if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
1806 c->lumPixBuf[i]=NULL;
1807 }
1808 free(c->lumPixBuf);
1809 c->lumPixBuf=NULL;
1810 }
1811
1812 if(c->chrPixBuf)
1813 {
6c7506de 1814 for(i=0; i<c->vChrBufSize; i++)
c7f822d9
MN
1815 {
1816 if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
1817 c->chrPixBuf[i]=NULL;
1818 }
1819 free(c->chrPixBuf);
1820 c->chrPixBuf=NULL;
1821 }
1822
1823 if(c->vLumFilter) free(c->vLumFilter);
1824 c->vLumFilter = NULL;
1825 if(c->vChrFilter) free(c->vChrFilter);
1826 c->vChrFilter = NULL;
1827 if(c->hLumFilter) free(c->hLumFilter);
1828 c->hLumFilter = NULL;
1829 if(c->hChrFilter) free(c->hChrFilter);
1830 c->hChrFilter = NULL;
1831
1832 if(c->vLumFilterPos) free(c->vLumFilterPos);
1833 c->vLumFilterPos = NULL;
1834 if(c->vChrFilterPos) free(c->vChrFilterPos);
1835 c->vChrFilterPos = NULL;
1836 if(c->hLumFilterPos) free(c->hLumFilterPos);
1837 c->hLumFilterPos = NULL;
1838 if(c->hChrFilterPos) free(c->hChrFilterPos);
1839 c->hChrFilterPos = NULL;
1840
1841 if(c->lumMmxFilter) free(c->lumMmxFilter);
1842 c->lumMmxFilter = NULL;
1843 if(c->chrMmxFilter) free(c->chrMmxFilter);
1844 c->chrMmxFilter = NULL;
1845
1846 free(c);
1847}
1848
7f56a527 1849