multiple init bugfix (patch by Alex Beregszaszi <alex@naxine.org>)
[libav.git] / postproc / swscale.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
28bf81c9 19/*
b72034dd 20 supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
6c7506de
MN
21 supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22 BGR15/16 support dithering
a861d4d7
MN
23
24 unscaled special converters
25 YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26 YV12/I420/IYUV -> YV12/I420/IYUV
27 YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
b935781b
MN
28 BGR24 -> BGR32 & RGB24 -> RGB32
29 BGR32 -> BGR24 & RGB32 -> RGB24
4bb3fa5e 30 BGR15 -> BGR16
b935781b
MN
31*/
32
33/*
34tested special converters
35 YV12/I420 -> BGR16
36 YV12 -> YV12
4bb3fa5e 37 BGR15 -> BGR16
1e1c4fe9 38 BGR16 -> BGR16
b935781b
MN
39
40untested special converters
1e1c4fe9
MN
41 YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
42 YV12/I420 -> YV12/I420
43 YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
b935781b
MN
44 BGR24 -> BGR32 & RGB24 -> RGB32
45 BGR32 -> BGR24 & RGB32 -> RGB24
ec22603f 46 BGR24 -> YV12
28bf81c9
MN
47*/
48
d3f41512 49#include <inttypes.h>
dda87e9f 50#include <string.h>
077ea8a7 51#include <math.h>
c1b0bfb4 52#include <stdio.h>
d3f41512 53#include "../config.h"
9b464428 54#include "../mangle.h"
81b7c056 55#include <assert.h>
c1b0bfb4
MN
56#ifdef HAVE_MALLOC_H
57#include <malloc.h>
58#endif
d604bab9 59#include "swscale.h"
7630f2e0 60#include "../cpudetect.h"
a861d4d7 61#include "../bswap.h"
28bf81c9 62#include "../libvo/img_format.h"
37079906 63#include "rgb2rgb.h"
b0db4198 64#include "../libvo/fastmemcpy.h"
541c4eb9 65#undef MOVNTQ
7d7f78b5 66#undef PAVGB
d3f41512 67
783e9cc9 68//#undef HAVE_MMX2
7f56a527 69//#define HAVE_3DNOW
d3f41512 70//#undef HAVE_MMX
783e9cc9 71//#undef ARCH_X86
2ba1bff0 72//#define WORDS_BIGENDIAN
d604bab9 73#define DITHER1XBPP
d3f41512 74
ac6a2e45
MN
75#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
76
1e621b18 77#define RET 0xC3 //near return opcode for X86
c1b0bfb4 78
28bf81c9 79#ifdef MP_DEBUG
81b7c056 80#define ASSERT(x) assert(x);
28bf81c9 81#else
c1b0bfb4 82#define ASSERT(x) ;
28bf81c9
MN
83#endif
84
85#ifdef M_PI
86#define PI M_PI
87#else
88#define PI 3.14159265358979323846
89#endif
c1b0bfb4 90
6c7506de 91//FIXME replace this with something faster
6ff0ad6b 92#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
1e621b18 93#define isYUV(x) ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
6ff0ad6b
MN
94#define isHalfChrV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
95#define isHalfChrH(x) ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
a861d4d7 96#define isPacked(x) ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
6ff0ad6b
MN
97#define isGray(x) ((x)==IMGFMT_Y800)
98#define isSupportedIn(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
b72034dd 99 || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
a861d4d7 100 || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
6ff0ad6b
MN
101 || (x)==IMGFMT_Y800)
102#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
103 || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
37079906 104#define isBGR(x) ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
6ff0ad6b
MN
105
106#define RGB2YUV_SHIFT 16
1e621b18
MN
107#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
108#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
109#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
110#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
111#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
112#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
113#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
114#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
115#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
6c7506de 116
e3d2500f 117extern int verbose; // defined in mplayer.c
783e9cc9
MN
118/*
119NOTES
d3f41512 120
d604bab9 121known BUGS with known cause (no bugreports please!, but patches are welcome :) )
e3d2500f 122horizontal fast_bilinear MMX2 scaler reads 1-7 samples too many (might cause a sig11)
d604bab9 123
d604bab9 124Special versions: fast Y 1:1 scaling (no interpolation in y direction)
31190492 125
783e9cc9 126TODO
d604bab9 127more intelligent misalignment avoidance for the horizontal scaler
c1b0bfb4
MN
128write special vertical cubic upscale version
129Optimize C code (yv12 / minmax)
ff7ba856 130add support for packed pixel yuv input & output
6ff0ad6b
MN
131add support for Y8 output
132optimize bgr24 & bgr32
ff7ba856 133add BGR4 output support
1e621b18 134write special BGR->BGR scaler
37079906 135deglobalize yuv2rgb*.c
783e9cc9 136*/
31190492 137
d604bab9 138#define ABS(a) ((a) > 0 ? (a) : (-(a)))
2ff198c1
MN
139#define MIN(a,b) ((a) > (b) ? (b) : (a))
140#define MAX(a,b) ((a) < (b) ? (b) : (a))
d604bab9 141
7630f2e0
MN
142#ifdef ARCH_X86
143#define CAN_COMPILE_X86_ASM
d604bab9
MN
144#endif
145
7630f2e0 146#ifdef CAN_COMPILE_X86_ASM
d604bab9 147static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
390b20a6
MN
148static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
149static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
150static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
151static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
f62255fb
MN
152static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
153static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
d604bab9
MN
154static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
155static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
156static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
077ea8a7 157static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
d604bab9
MN
158static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
159static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
160static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
6ff0ad6b 161static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
d604bab9 162
3fe8e8f0
MN
163static volatile uint64_t __attribute__((aligned(8))) b5Dither;
164static volatile uint64_t __attribute__((aligned(8))) g5Dither;
165static volatile uint64_t __attribute__((aligned(8))) g6Dither;
166static volatile uint64_t __attribute__((aligned(8))) r5Dither;
d8fa3c54
MN
167
168static uint64_t __attribute__((aligned(8))) dither4[2]={
169 0x0103010301030103LL,
170 0x0200020002000200LL,};
171
172static uint64_t __attribute__((aligned(8))) dither8[2]={
173 0x0602060206020602LL,
174 0x0004000400040004LL,};
d604bab9
MN
175
176static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
177static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
178static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
179static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
180static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
181static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
182
99d2cb72
MN
183static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
184static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
185static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
186
ac6a2e45
MN
187#ifdef FAST_BGR2YV12
188static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL;
4342fc14
MN
189static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
190static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
ac6a2e45
MN
191#else
192static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL;
4342fc14
MN
193static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
194static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
ac6a2e45
MN
195#endif
196static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
4342fc14 197static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
ac6a2e45
MN
198static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL;
199
28bf81c9 200// FIXME remove
d604bab9
MN
201static uint64_t __attribute__((aligned(8))) asm_yalpha1;
202static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
d604bab9 203#endif
783e9cc9
MN
204
205// clipping helper table for C implementations:
206static unsigned char clip_table[768];
207
b18ea156
MN
208static unsigned short clip_table16b[768];
209static unsigned short clip_table16g[768];
210static unsigned short clip_table16r[768];
211static unsigned short clip_table15b[768];
212static unsigned short clip_table15g[768];
213static unsigned short clip_table15r[768];
214
783e9cc9
MN
215// yuv->rgb conversion tables:
216static int yuvtab_2568[256];
217static int yuvtab_3343[256];
218static int yuvtab_0c92[256];
219static int yuvtab_1a1e[256];
220static int yuvtab_40cf[256];
c1b0bfb4
MN
221// Needed for cubic scaler to catch overflows
222static int clip_yuvtab_2568[768];
223static int clip_yuvtab_3343[768];
224static int clip_yuvtab_0c92[768];
225static int clip_yuvtab_1a1e[768];
226static int clip_yuvtab_40cf[768];
227
28bf81c9 228//global sws_flags from the command line
1f347f22 229int sws_flags=2;
077ea8a7 230
5cebb24b
MN
231//global srcFilter
232SwsFilter src_filter= {NULL, NULL, NULL, NULL};
233
234float sws_lum_gblur= 0.0;
235float sws_chr_gblur= 0.0;
236int sws_chr_vshift= 0;
237int sws_chr_hshift= 0;
5521b193
MN
238float sws_chr_sharpen= 0.0;
239float sws_lum_sharpen= 0.0;
5cebb24b 240
28bf81c9
MN
241/* cpuCaps combined from cpudetect and whats actually compiled in
242 (if there is no support for something compiled in it wont appear here) */
243static CpuCaps cpuCaps;
d3f41512 244
28bf81c9
MN
245void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
246 int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
2ff198c1 247
5cebb24b
MN
248static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
249
7630f2e0 250#ifdef CAN_COMPILE_X86_ASM
96034638
MN
251void in_asm_used_var_warning_killer()
252{
077ea8a7 253 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
28bf81c9 254 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
6ff0ad6b 255 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
96034638
MN
256 if(i) i=0;
257}
258#endif
d604bab9 259
e3d2500f
MN
/**
 * Vertical scaler, plain C: filters one output line of planar YUV.
 *
 * Every output sample is the sum over the filter taps of source sample
 * times coefficient, shifted right by 19 bits and clamped to 0..255.
 *
 * @param lumFilter     luma coefficients, lumFilterSize entries
 * @param lumSrc        lumFilterSize source line pointers for luma
 * @param chrFilter     chroma coefficients, chrFilterSize entries
 * @param chrSrc        chrFilterSize source line pointers for chroma; the U
 *                      sample for column x is read at [x], the V sample at
 *                      [x + 2048] of the same line
 * @param dest          luma output, dstW bytes are written
 * @param uDest         U output, dstW>>1 bytes; if NULL no chroma is written
 * @param vDest         V output, dstW>>1 bytes (only used if uDest != NULL)
 * @param dstW          width of the output line in luma samples
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
	//FIXME Optimize (just quickly written, not optimized)
	const int chrW= dstW>>1;
	int x;

	for(x=0; x<dstW; x++)
	{
		int acc= 0;
		int tap;

		for(tap=0; tap<lumFilterSize; tap++)
			acc+= lumSrc[tap][x] * lumFilter[tap];

		acc>>= 19;
		if(acc < 0)        acc= 0;
		else if(acc > 255) acc= 255;
		dest[x]= acc;
	}

	/* luma only output requested */
	if(uDest == NULL) return;

	for(x=0; x<chrW; x++)
	{
		int accU= 0;
		int accV= 0;
		int tap;

		for(tap=0; tap<chrFilterSize; tap++)
		{
			accU+= chrSrc[tap][x       ] * chrFilter[tap];
			accV+= chrSrc[tap][x + 2048] * chrFilter[tap];
		}

		accU>>= 19;
		if(accU < 0)        accU= 0;
		else if(accU > 255) accU= 255;

		accV>>= 19;
		if(accV < 0)        accV= 0;
		else if(accV > 255) accV= 255;

		uDest[x]= accU;
		vDest[x]= accV;
	}
}
292
293static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
294 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
28bf81c9 295 uint8_t *dest, int dstW, int dstFormat)
e3d2500f 296{
28bf81c9 297 if(dstFormat==IMGFMT_BGR32)
e3d2500f 298 {
2ba1bff0 299 int i;
df3c183a
MN
300#ifdef WORDS_BIGENDIAN
301 dest++;
302#endif
e3d2500f
MN
303 for(i=0; i<(dstW>>1); i++){
304 int j;
305 int Y1=0;
306 int Y2=0;
307 int U=0;
308 int V=0;
309 int Cb, Cr, Cg;
310 for(j=0; j<lumFilterSize; j++)
311 {
312 Y1 += lumSrc[j][2*i] * lumFilter[j];
313 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
314 }
315 for(j=0; j<chrFilterSize; j++)
316 {
317 U += chrSrc[j][i] * chrFilter[j];
318 V += chrSrc[j][i+2048] * chrFilter[j];
319 }
320 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
321 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
322 U >>= 19;
323 V >>= 19;
324
325 Cb= clip_yuvtab_40cf[U+ 256];
326 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
327 Cr= clip_yuvtab_3343[V+ 256];
328
329 dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
330 dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
331 dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
332
333 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
334 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
335 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
336 }
337 }
28bf81c9 338 else if(dstFormat==IMGFMT_BGR24)
e3d2500f
MN
339 {
340 int i;
341 for(i=0; i<(dstW>>1); i++){
342 int j;
343 int Y1=0;
344 int Y2=0;
345 int U=0;
346 int V=0;
347 int Cb, Cr, Cg;
348 for(j=0; j<lumFilterSize; j++)
349 {
350 Y1 += lumSrc[j][2*i] * lumFilter[j];
351 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
352 }
353 for(j=0; j<chrFilterSize; j++)
354 {
355 U += chrSrc[j][i] * chrFilter[j];
356 V += chrSrc[j][i+2048] * chrFilter[j];
357 }
358 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
359 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
360 U >>= 19;
361 V >>= 19;
362
363 Cb= clip_yuvtab_40cf[U+ 256];
364 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
365 Cr= clip_yuvtab_3343[V+ 256];
366
367 dest[0]=clip_table[((Y1 + Cb) >>13)];
368 dest[1]=clip_table[((Y1 + Cg) >>13)];
369 dest[2]=clip_table[((Y1 + Cr) >>13)];
370
371 dest[3]=clip_table[((Y2 + Cb) >>13)];
372 dest[4]=clip_table[((Y2 + Cg) >>13)];
373 dest[5]=clip_table[((Y2 + Cr) >>13)];
374 dest+=6;
375 }
376 }
28bf81c9 377 else if(dstFormat==IMGFMT_BGR16)
e3d2500f
MN
378 {
379 int i;
5521b193
MN
380#ifdef DITHER1XBPP
381 static int ditherb1=1<<14;
382 static int ditherg1=1<<13;
383 static int ditherr1=2<<14;
384 static int ditherb2=3<<14;
385 static int ditherg2=3<<13;
386 static int ditherr2=0<<14;
387
388 ditherb1 ^= (1^2)<<14;
389 ditherg1 ^= (1^2)<<13;
390 ditherr1 ^= (1^2)<<14;
391 ditherb2 ^= (3^0)<<14;
392 ditherg2 ^= (3^0)<<13;
393 ditherr2 ^= (3^0)<<14;
394#else
395 const int ditherb1=0;
396 const int ditherg1=0;
397 const int ditherr1=0;
398 const int ditherb2=0;
399 const int ditherg2=0;
400 const int ditherr2=0;
401#endif
e3d2500f
MN
402 for(i=0; i<(dstW>>1); i++){
403 int j;
404 int Y1=0;
405 int Y2=0;
406 int U=0;
407 int V=0;
408 int Cb, Cr, Cg;
409 for(j=0; j<lumFilterSize; j++)
410 {
411 Y1 += lumSrc[j][2*i] * lumFilter[j];
412 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
413 }
414 for(j=0; j<chrFilterSize; j++)
415 {
416 U += chrSrc[j][i] * chrFilter[j];
417 V += chrSrc[j][i+2048] * chrFilter[j];
418 }
419 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
420 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
421 U >>= 19;
422 V >>= 19;
423
424 Cb= clip_yuvtab_40cf[U+ 256];
425 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
426 Cr= clip_yuvtab_3343[V+ 256];
427
428 ((uint16_t*)dest)[2*i] =
5521b193
MN
429 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
430 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
431 clip_table16r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
432
433 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
434 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
435 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
436 clip_table16r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
437 }
438 }
28bf81c9 439 else if(dstFormat==IMGFMT_BGR15)
e3d2500f
MN
440 {
441 int i;
5521b193
MN
442#ifdef DITHER1XBPP
443 static int ditherb1=1<<14;
444 static int ditherg1=1<<14;
445 static int ditherr1=2<<14;
446 static int ditherb2=3<<14;
447 static int ditherg2=3<<14;
448 static int ditherr2=0<<14;
449
450 ditherb1 ^= (1^2)<<14;
451 ditherg1 ^= (1^2)<<14;
452 ditherr1 ^= (1^2)<<14;
453 ditherb2 ^= (3^0)<<14;
454 ditherg2 ^= (3^0)<<14;
455 ditherr2 ^= (3^0)<<14;
456#else
457 const int ditherb1=0;
458 const int ditherg1=0;
459 const int ditherr1=0;
460 const int ditherb2=0;
461 const int ditherg2=0;
462 const int ditherr2=0;
463#endif
e3d2500f
MN
464 for(i=0; i<(dstW>>1); i++){
465 int j;
466 int Y1=0;
467 int Y2=0;
468 int U=0;
469 int V=0;
470 int Cb, Cr, Cg;
471 for(j=0; j<lumFilterSize; j++)
472 {
473 Y1 += lumSrc[j][2*i] * lumFilter[j];
474 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
475 }
476 for(j=0; j<chrFilterSize; j++)
477 {
478 U += chrSrc[j][i] * chrFilter[j];
479 V += chrSrc[j][i+2048] * chrFilter[j];
480 }
481 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
482 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
483 U >>= 19;
484 V >>= 19;
485
486 Cb= clip_yuvtab_40cf[U+ 256];
487 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
488 Cr= clip_yuvtab_3343[V+ 256];
489
490 ((uint16_t*)dest)[2*i] =
5521b193
MN
491 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
492 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
493 clip_table15r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
494
495 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
496 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
497 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
498 clip_table15r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
499 }
500 }
501}
502
503
7630f2e0
MN
504//Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
505//Plain C versions
726a959a
MN
506#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
507#define COMPILE_C
508#endif
509
510#ifdef CAN_COMPILE_X86_ASM
511
512#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
513#define COMPILE_MMX
514#endif
515
516#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
517#define COMPILE_MMX2
518#endif
519
520#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
521#define COMPILE_3DNOW
522#endif
523#endif //CAN_COMPILE_X86_ASM
524
525#undef HAVE_MMX
526#undef HAVE_MMX2
527#undef HAVE_3DNOW
726a959a
MN
528
529#ifdef COMPILE_C
7630f2e0
MN
530#undef HAVE_MMX
531#undef HAVE_MMX2
532#undef HAVE_3DNOW
7630f2e0
MN
533#define RENAME(a) a ## _C
534#include "swscale_template.c"
726a959a 535#endif
397c035e 536
7630f2e0 537#ifdef CAN_COMPILE_X86_ASM
397c035e 538
7630f2e0
MN
539//X86 versions
540/*
541#undef RENAME
542#undef HAVE_MMX
543#undef HAVE_MMX2
544#undef HAVE_3DNOW
545#define ARCH_X86
546#define RENAME(a) a ## _X86
547#include "swscale_template.c"
1faf0867 548*/
7630f2e0 549//MMX versions
726a959a 550#ifdef COMPILE_MMX
7630f2e0
MN
551#undef RENAME
552#define HAVE_MMX
553#undef HAVE_MMX2
554#undef HAVE_3DNOW
7630f2e0
MN
555#define RENAME(a) a ## _MMX
556#include "swscale_template.c"
726a959a 557#endif
7630f2e0
MN
558
559//MMX2 versions
726a959a 560#ifdef COMPILE_MMX2
7630f2e0
MN
561#undef RENAME
562#define HAVE_MMX
563#define HAVE_MMX2
564#undef HAVE_3DNOW
7630f2e0
MN
565#define RENAME(a) a ## _MMX2
566#include "swscale_template.c"
726a959a 567#endif
7630f2e0
MN
568
569//3DNOW versions
726a959a 570#ifdef COMPILE_3DNOW
7630f2e0
MN
571#undef RENAME
572#define HAVE_MMX
573#undef HAVE_MMX2
574#define HAVE_3DNOW
7630f2e0
MN
575#define RENAME(a) a ## _3DNow
576#include "swscale_template.c"
726a959a 577#endif
7630f2e0
MN
578
579#endif //CAN_COMPILE_X86_ASM
580
581// minor note: the HAVE_xyz macros are messed up after this line, so don't use them
d604bab9 582
d3f41512 583
6c7506de 584// old global scaler, dont use for new code
28bf81c9
MN
585// will use sws_flags from the command line
586void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
587 int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
d1fac6cf 588 int srcW, int srcH, int dstW, int dstH){
31190492 589
28bf81c9
MN
590 static SwsContext *context=NULL;
591 int dstFormat;
28bf81c9
MN
592 int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
593
6c7506de 594 switch(dstbpp)
28bf81c9 595 {
6c7506de
MN
596 case 8 : dstFormat= IMGFMT_Y8; break;
597 case 12: dstFormat= IMGFMT_YV12; break;
598 case 15: dstFormat= IMGFMT_BGR15; break;
599 case 16: dstFormat= IMGFMT_BGR16; break;
600 case 24: dstFormat= IMGFMT_BGR24; break;
601 case 32: dstFormat= IMGFMT_BGR32; break;
602 default: return;
603 }
604
605 if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
606
b6654a54 607 context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
6c7506de
MN
608}
609
610// will use sws_flags & src_filter (from cmd line)
611SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
612{
613 int flags=0;
614 static int firstTime=1;
615
5521b193 616#ifdef ARCH_X86
6c7506de
MN
617 if(gCpuCaps.hasMMX)
618 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
5521b193 619#endif
6c7506de
MN
620 if(firstTime)
621 {
28bf81c9 622 firstTime=0;
6c7506de
MN
623 flags= SWS_PRINT_INFO;
624 }
625 else if(verbose>1) flags= SWS_PRINT_INFO;
626
627 if(src_filter.lumH) freeVec(src_filter.lumH);
628 if(src_filter.lumV) freeVec(src_filter.lumV);
629 if(src_filter.chrH) freeVec(src_filter.chrH);
630 if(src_filter.chrV) freeVec(src_filter.chrV);
631
632 if(sws_lum_gblur!=0.0){
633 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
634 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
635 }else{
636 src_filter.lumH= getIdentityVec();
637 src_filter.lumV= getIdentityVec();
638 }
c7f822d9 639
6c7506de
MN
640 if(sws_chr_gblur!=0.0){
641 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
642 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
643 }else{
644 src_filter.chrH= getIdentityVec();
645 src_filter.chrV= getIdentityVec();
646 }
5521b193 647
6c7506de
MN
648 if(sws_chr_sharpen!=0.0){
649 SwsVector *g= getConstVec(-1.0, 3);
650 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
651 g->coeff[1]=2.0;
652 addVec(id, g);
653 convVec(src_filter.chrH, id);
654 convVec(src_filter.chrV, id);
655 freeVec(g);
656 freeVec(id);
657 }
5521b193 658
6c7506de
MN
659 if(sws_lum_sharpen!=0.0){
660 SwsVector *g= getConstVec(-1.0, 3);
661 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
662 g->coeff[1]=2.0;
663 addVec(id, g);
664 convVec(src_filter.lumH, id);
665 convVec(src_filter.lumV, id);
666 freeVec(g);
667 freeVec(id);
668 }
c7f822d9 669
6c7506de
MN
670 if(sws_chr_hshift)
671 shiftVec(src_filter.chrH, sws_chr_hshift);
c7f822d9 672
6c7506de
MN
673 if(sws_chr_vshift)
674 shiftVec(src_filter.chrV, sws_chr_vshift);
5521b193 675
6c7506de
MN
676 normalizeVec(src_filter.chrH, 1.0);
677 normalizeVec(src_filter.chrV, 1.0);
678 normalizeVec(src_filter.lumH, 1.0);
679 normalizeVec(src_filter.lumV, 1.0);
28bf81c9 680
6c7506de
MN
681 if(verbose > 1) printVec(src_filter.chrH);
682 if(verbose > 1) printVec(src_filter.lumH);
28bf81c9
MN
683
684 switch(sws_flags)
685 {
686 case 0: flags|= SWS_FAST_BILINEAR; break;
687 case 1: flags|= SWS_BILINEAR; break;
688 case 2: flags|= SWS_BICUBIC; break;
689 case 3: flags|= SWS_X; break;
ff7ba856 690 case 4: flags|= SWS_POINT; break;
d8863d37 691 case 5: flags|= SWS_AREA; break;
28bf81c9
MN
692 default:flags|= SWS_BILINEAR; break;
693 }
694
6c7506de 695 return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
28bf81c9
MN
696}
697
6c7506de 698
c7f822d9
MN
699static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
700 int srcW, int dstW, int filterAlign, int one, int flags,
701 SwsVector *srcFilter, SwsVector *dstFilter)
28bf81c9
MN
702{
703 int i;
c7f822d9
MN
704 int filterSize;
705 int filter2Size;
706 int minFilterSize;
707 double *filter=NULL;
708 double *filter2=NULL;
28bf81c9
MN
709#ifdef ARCH_X86
710 if(gCpuCaps.hasMMX)
711 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
726a959a 712#endif
31190492 713
adeaecb9 714 // Note the +1 is for the MMXscaler which reads over the end
6c7506de 715 *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
6c7506de 716
28bf81c9
MN
717 if(ABS(xInc - 0x10000) <10) // unscaled
718 {
719 int i;
c7f822d9
MN
720 filterSize= 1;
721 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
722 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
28bf81c9
MN
723
724 for(i=0; i<dstW; i++)
725 {
c7f822d9
MN
726 filter[i*filterSize]=1;
727 (*filterPos)[i]=i;
28bf81c9
MN
728 }
729
730 }
ff7ba856
MN
731 else if(flags&SWS_POINT) // lame looking point sampling mode
732 {
733 int i;
734 int xDstInSrc;
735 filterSize= 1;
736 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
737
738 xDstInSrc= xInc/2 - 0x8000;
739 for(i=0; i<dstW; i++)
740 {
8a01d20c 741 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
ff7ba856
MN
742
743 (*filterPos)[i]= xx;
744 filter[i]= 1.0;
745 xDstInSrc+= xInc;
746 }
747 }
28bf81c9
MN
748 else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
749 {
750 int i;
751 int xDstInSrc;
c7f822d9
MN
752 if (flags&SWS_BICUBIC) filterSize= 4;
753 else if(flags&SWS_X ) filterSize= 4;
d8863d37 754 else filterSize= 2; // SWS_BILINEAR / SWS_AREA
28bf81c9 755// printf("%d %d %d\n", filterSize, srcW, dstW);
c7f822d9 756 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
757
758 xDstInSrc= xInc/2 - 0x8000;
759 for(i=0; i<dstW; i++)
760 {
8a01d20c 761 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
28bf81c9
MN
762 int j;
763
c7f822d9 764 (*filterPos)[i]= xx;
28bf81c9
MN
765 if((flags & SWS_BICUBIC) || (flags & SWS_X))
766 {
767 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
768 double y1,y2,y3,y4;
769 double A= -0.6;
770 if(flags & SWS_BICUBIC){
771 // Equation is from VirtualDub
772 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
773 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
774 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
775 y4 = ( + A*d*d - A*d*d*d);
776 }else{
777 // cubic interpolation (derived it myself)
778 y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
779 y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
780 y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
781 y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
782 }
783
784// printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
c7f822d9
MN
785 filter[i*filterSize + 0]= y1;
786 filter[i*filterSize + 1]= y2;
787 filter[i*filterSize + 2]= y3;
788 filter[i*filterSize + 3]= y4;
28bf81c9
MN
789// printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
790 }
791 else
792 {
d8863d37 793 //Bilinear upscale / linear interpolate / Area averaging
c7f822d9 794 for(j=0; j<filterSize; j++)
28bf81c9
MN
795 {
796 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
797 double coeff= 1.0 - d;
798 if(coeff<0) coeff=0;
799 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
c7f822d9 800 filter[i*filterSize + j]= coeff;
28bf81c9
MN
801 xx++;
802 }
803 }
804 xDstInSrc+= xInc;
805 }
806 }
807 else // downscale
808 {
809 int xDstInSrc;
81b7c056
MN
810 ASSERT(dstW <= srcW)
811
d8863d37
MN
812 if(flags&SWS_BICUBIC) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
813 else if(flags&SWS_X) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
814 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
815 else /* BILINEAR */ filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
28bf81c9 816// printf("%d %d %d\n", *filterSize, srcW, dstW);
c7f822d9 817 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
818
819 xDstInSrc= xInc/2 - 0x8000;
820 for(i=0; i<dstW; i++)
821 {
c7f822d9 822 int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
28bf81c9 823 int j;
c7f822d9
MN
824 (*filterPos)[i]= xx;
825 for(j=0; j<filterSize; j++)
28bf81c9
MN
826 {
827 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
828 double coeff;
829 if((flags & SWS_BICUBIC) || (flags & SWS_X))
830 {
831 double A= -0.75;
832// d*=2;
833 // Equation is from VirtualDub
834 if(d<1.0)
835 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
836 else if(d<2.0)
837 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
838 else
839 coeff=0.0;
840 }
d8863d37 841 else if(flags & SWS_AREA)
28bf81c9 842 {
d8863d37
MN
843 double srcPixelSize= (1<<16)/(double)xInc;
844 if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
845 else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
846 else coeff=0.0;
847 }
28bf81c9
MN
848 else
849 {
850 coeff= 1.0 - d;
851 if(coeff<0) coeff=0;
852 }
d8863d37 853// printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
c7f822d9 854 filter[i*filterSize + j]= coeff;
28bf81c9
MN
855 xx++;
856 }
857 xDstInSrc+= xInc;
858 }
859 }
860
c7f822d9
MN
861 /* apply src & dst Filter to filter -> filter2
862 free(filter);
863 */
81b7c056 864 ASSERT(filterSize>0)
c7f822d9
MN
865 filter2Size= filterSize;
866 if(srcFilter) filter2Size+= srcFilter->length - 1;
867 if(dstFilter) filter2Size+= dstFilter->length - 1;
81b7c056 868 ASSERT(filter2Size>0)
c7f822d9
MN
869 filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
870
871 for(i=0; i<dstW; i++)
872 {
873 int j;
874 SwsVector scaleFilter;
875 SwsVector *outVec;
876
877 scaleFilter.coeff= filter + i*filterSize;
878 scaleFilter.length= filterSize;
879
5cebb24b 880 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
c7f822d9
MN
881 else outVec= &scaleFilter;
882
883 ASSERT(outVec->length == filter2Size)
884 //FIXME dstFilter
885
886 for(j=0; j<outVec->length; j++)
887 {
888 filter2[i*filter2Size + j]= outVec->coeff[j];
889 }
890
891 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
892
893 if(outVec != &scaleFilter) freeVec(outVec);
894 }
895 free(filter); filter=NULL;
896
897 /* try to reduce the filter-size (step1 find size and shift left) */
898 // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
899 minFilterSize= 0;
900 for(i=dstW-1; i>=0; i--)
901 {
902 int min= filter2Size;
903 int j;
904 double cutOff=0.0;
905
906 /* get rid off near zero elements on the left by shifting left */
907 for(j=0; j<filter2Size; j++)
908 {
909 int k;
910 cutOff += ABS(filter2[i*filter2Size]);
911
912 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
913
914 /* preserve Monotonicity because the core cant handle the filter otherwise */
915 if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
916
917 // Move filter coeffs left
918 for(k=1; k<filter2Size; k++)
919 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
920 filter2[i*filter2Size + k - 1]= 0.0;
921 (*filterPos)[i]++;
922 }
923
924 cutOff=0.0;
925 /* count near zeros on the right */
926 for(j=filter2Size-1; j>0; j--)
927 {
928 cutOff += ABS(filter2[i*filter2Size + j]);
929
930 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
931 min--;
932 }
933
934 if(min>minFilterSize) minFilterSize= min;
935 }
936
81b7c056 937 ASSERT(minFilterSize > 0)
6c7506de 938 filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
81b7c056 939 ASSERT(filterSize > 0)
6c7506de
MN
940 filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
941 *outFilterSize= filterSize;
942
943 if((flags&SWS_PRINT_INFO) && verbose)
944 printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
c7f822d9
MN
945 /* try to reduce the filter-size (step2 reduce it) */
946 for(i=0; i<dstW; i++)
947 {
948 int j;
949
6c7506de
MN
950 for(j=0; j<filterSize; j++)
951 {
952 if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
953 else filter[i*filterSize + j]= filter2[i*filter2Size + j];
954 }
c7f822d9 955 }
6c7506de
MN
956 free(filter2); filter2=NULL;
957
c7f822d9
MN
958
959 //FIXME try to align filterpos if possible
960
28bf81c9
MN
961 //fix borders
962 for(i=0; i<dstW; i++)
963 {
964 int j;
c7f822d9 965 if((*filterPos)[i] < 0)
28bf81c9
MN
966 {
967 // Move filter coeffs left to compensate for filterPos
6c7506de 968 for(j=1; j<filterSize; j++)
28bf81c9 969 {
c7f822d9 970 int left= MAX(j + (*filterPos)[i], 0);
6c7506de
MN
971 filter[i*filterSize + left] += filter[i*filterSize + j];
972 filter[i*filterSize + j]=0;
28bf81c9 973 }
c7f822d9 974 (*filterPos)[i]= 0;
28bf81c9
MN
975 }
976
6c7506de 977 if((*filterPos)[i] + filterSize > srcW)
28bf81c9 978 {
6c7506de 979 int shift= (*filterPos)[i] + filterSize - srcW;
28bf81c9 980 // Move filter coeffs right to compensate for filterPos
6c7506de 981 for(j=filterSize-2; j>=0; j--)
28bf81c9 982 {
6c7506de
MN
983 int right= MIN(j + shift, filterSize-1);
984 filter[i*filterSize +right] += filter[i*filterSize +j];
985 filter[i*filterSize +j]=0;
28bf81c9 986 }
6c7506de 987 (*filterPos)[i]= srcW - filterSize;
28bf81c9
MN
988 }
989 }
990
6c7506de
MN
991 // Note the +1 is for the MMXscaler which reads over the end
992 *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
993 memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
c7f822d9
MN
994
995 /* Normalize & Store in outFilter */
28bf81c9
MN
996 for(i=0; i<dstW; i++)
997 {
998 int j;
999 double sum=0;
1000 double scale= one;
6c7506de 1001 for(j=0; j<filterSize; j++)
28bf81c9 1002 {
6c7506de 1003 sum+= filter[i*filterSize + j];
28bf81c9
MN
1004 }
1005 scale/= sum;
6c7506de 1006 for(j=0; j<filterSize; j++)
28bf81c9 1007 {
6c7506de 1008 (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
28bf81c9
MN
1009 }
1010 }
adeaecb9
MN
1011
1012 (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1013 for(i=0; i<*outFilterSize; i++)
1014 {
1015 int j= dstW*(*outFilterSize);
1016 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1017 }
c7f822d9 1018
6c7506de 1019 free(filter);
7630f2e0 1020}
31190492 1021
28bf81c9
MN
1022#ifdef ARCH_X86
1023static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
1024{
1025 uint8_t *fragment;
1026 int imm8OfPShufW1;
1027 int imm8OfPShufW2;
1028 int fragmentLength;
1029
1030 int xpos, i;
1031
1032 // create an optimized horizontal scaling routine
1033
1034 //code fragment
1035
1036 asm volatile(
1037 "jmp 9f \n\t"
1038 // Begin
1039 "0: \n\t"
1040 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
1041 "movq %%mm0, %%mm1 \n\t"
1042 "psrlq $8, %%mm0 \n\t"
1043 "punpcklbw %%mm7, %%mm1 \n\t"
1044 "movq %%mm2, %%mm3 \n\t"
1045 "punpcklbw %%mm7, %%mm0 \n\t"
1046 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
1047 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1048 "1: \n\t"
1049 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
1050 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1051 "2: \n\t"
1052 "psrlw $9, %%mm3 \n\t"
1053 "psubw %%mm1, %%mm0 \n\t"
1054 "pmullw %%mm3, %%mm0 \n\t"
1055 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
1056 "psllw $7, %%mm1 \n\t"
1057 "paddw %%mm1, %%mm0 \n\t"
1058
1059 "movq %%mm0, (%%edi, %%eax) \n\t"
1060
1061 "addl $8, %%eax \n\t"
1062 // End
1063 "9: \n\t"
1064// "int $3\n\t"
1065 "leal 0b, %0 \n\t"
1066 "leal 1b, %1 \n\t"
1067 "leal 2b, %2 \n\t"
1068 "decl %1 \n\t"
1069 "decl %2 \n\t"
1070 "subl %0, %1 \n\t"
1071 "subl %0, %2 \n\t"
1072 "leal 9b, %3 \n\t"
1073 "subl %0, %3 \n\t"
1074 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
1075 "=r" (fragmentLength)
1076 );
1077
1078 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1079
1080 for(i=0; i<dstW/8; i++)
1081 {
1082 int xx=xpos>>16;
1083
1084 if((i&3) == 0)
1085 {
1086 int a=0;
1087 int b=((xpos+xInc)>>16) - xx;
1088 int c=((xpos+xInc*2)>>16) - xx;
1089 int d=((xpos+xInc*3)>>16) - xx;
1090
1091 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1092
1093 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1094 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1095 a | (b<<2) | (c<<4) | (d<<6);
1096
1097 // if we dont need to read 8 bytes than dont :), reduces the chance of
1098 // crossing a cache line
1099 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1100
1101 funnyCode[fragmentLength*(i+4)/4]= RET;
1102 }
1103 xpos+=xInc;
1104 }
1105}
1106#endif // ARCH_X86
1107
1108//FIXME remove
/* Intentionally empty: this function performs no work. */
void SwScale_Init()
{
}
1111
1112static void globalInit(){
31190492
A
1113 // generating tables:
1114 int i;
c1b0bfb4
MN
1115 for(i=0; i<768; i++){
1116 int c= MIN(MAX(i-256, 0), 255);
1117 clip_table[i]=c;
1118 yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1119 yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1120 yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1121 yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1122 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
31190492
A
1123 }
1124
b18ea156
MN
1125 for(i=0; i<768; i++)
1126 {
28bf81c9 1127 int v= clip_table[i];
daa57641
MN
1128 clip_table16b[i]= v>>3;
1129 clip_table16g[i]= (v<<3)&0x07E0;
1130 clip_table16r[i]= (v<<8)&0xF800;
1131 clip_table15b[i]= v>>3;
1132 clip_table15g[i]= (v<<2)&0x03E0;
1133 clip_table15r[i]= (v<<7)&0x7C00;
b18ea156 1134 }
c1b0bfb4 1135
28bf81c9
MN
1136cpuCaps= gCpuCaps;
1137
1138#ifdef RUNTIME_CPUDETECT
1139#ifdef CAN_COMPILE_X86_ASM
1140 // ordered per speed fasterst first
1141 if(gCpuCaps.hasMMX2)
1142 swScale= swScale_MMX2;
1143 else if(gCpuCaps.has3DNow)
7f56a527 1144 swScale= swScale_3DNow;
28bf81c9
MN
1145 else if(gCpuCaps.hasMMX)
1146 swScale= swScale_MMX;
1147 else
1148 swScale= swScale_C;
1149
1150#else
1151 swScale= swScale_C;
1152 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1153#endif
1154#else //RUNTIME_CPUDETECT
1155#ifdef HAVE_MMX2
1156 swScale= swScale_MMX2;
1157 cpuCaps.has3DNow = 0;
1158#elif defined (HAVE_3DNOW)
7f56a527 1159 swScale= swScale_3DNow;
28bf81c9
MN
1160 cpuCaps.hasMMX2 = 0;
1161#elif defined (HAVE_MMX)
1162 swScale= swScale_MMX;
1163 cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1164#else
1165 swScale= swScale_C;
1166 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1167#endif
1168#endif //!RUNTIME_CPUDETECT
31190492 1169}
7630f2e0 1170
37079906
MN
1171/* Wrapper functions for yuv2bgr */
1172static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
b6654a54
MN
1173 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1174 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
37079906
MN
1175
1176 if(c->srcFormat==IMGFMT_YV12)
b6654a54 1177 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
37079906 1178 else /* I420 & IYUV */
b6654a54
MN
1179 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1180}
1181
b935781b
MN
1182static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1183 int srcSliceH, uint8_t* dst[], int dstStride[]){
1184
1185 if(dstStride[0]*3==srcStride[0]*4)
4bb3fa5e 1186 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
b935781b
MN
1187 else
1188 {
1189 int i;
1190 uint8_t *srcPtr= src[0];
1191 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1192
1193 for(i=0; i<srcSliceH; i++)
1194 {
4bb3fa5e 1195 rgb24to32(srcPtr, dstPtr, c->srcW*3);
b935781b
MN
1196 srcPtr+= srcStride[0];
1197 dstPtr+= dstStride[0];
1198 }
1199 }
1200}
1201
1202static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1203 int srcSliceH, uint8_t* dst[], int dstStride[]){
1204
1205 if(dstStride[0]*4==srcStride[0]*3)
4bb3fa5e 1206 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
b935781b
MN
1207 else
1208 {
1209 int i;
1210 uint8_t *srcPtr= src[0];
1211 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1212
1213 for(i=0; i<srcSliceH; i++)
1214 {
4bb3fa5e
MN
1215 rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1216 srcPtr+= srcStride[0];
1217 dstPtr+= dstStride[0];
1218 }
1219 }
1220}
1221
1222static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1223 int srcSliceH, uint8_t* dst[], int dstStride[]){
1224
1225 if(dstStride[0]==srcStride[0])
1226 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1227 else
1228 {
1229 int i;
1230 uint8_t *srcPtr= src[0];
1231 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1232
1233 for(i=0; i<srcSliceH; i++)
1234 {
1235 rgb15to16(srcPtr, dstPtr, c->srcW<<1);
b935781b
MN
1236 srcPtr+= srcStride[0];
1237 dstPtr+= dstStride[0];
1238 }
1239 }
1240}
1241
ec22603f
MN
1242static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1243 int srcSliceH, uint8_t* dst[], int dstStride[]){
1244
1245 rgb24toyv12(
1246 src[0],
1247 dst[0]+ srcSliceY *dstStride[0],
1248 dst[1]+(srcSliceY>>1)*dstStride[1],
1249 dst[2]+(srcSliceY>>1)*dstStride[2],
1250 c->srcW, srcSliceH,
1251 dstStride[0], dstStride[1], srcStride[0]);
1252}
1253
b935781b 1254
b6654a54
MN
1255/* unscaled copy like stuff (assumes nearly identical formats) */
1256static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1257 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1258
1259 int srcStride[3];
1260 uint8_t *src[3];
1261 uint8_t *dst[3];
1262
1263 if(c->srcFormat == IMGFMT_I420){
1264 src[0]= srcParam[0];
1265 src[1]= srcParam[2];
1266 src[2]= srcParam[1];
1267 srcStride[0]= srcStrideParam[0];
1268 srcStride[1]= srcStrideParam[2];
1269 srcStride[2]= srcStrideParam[1];
1270 }
1271 else if(c->srcFormat==IMGFMT_YV12){
1272 src[0]= srcParam[0];
1273 src[1]= srcParam[1];
1274 src[2]= srcParam[2];
1275 srcStride[0]= srcStrideParam[0];
1276 srcStride[1]= srcStrideParam[1];
1277 srcStride[2]= srcStrideParam[2];
1278 }
1279 else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
1280 src[0]= srcParam[0];
1281 src[1]=
1282 src[2]= NULL;
1283 srcStride[0]= srcStrideParam[0];
1284 srcStride[1]=
1285 srcStride[2]= 0;
1286 }
1287
1288 if(c->dstFormat == IMGFMT_I420){
1289 dst[0]= dstParam[0];
1290 dst[1]= dstParam[2];
1291 dst[2]= dstParam[1];
1292
1293 }else{
1294 dst[0]= dstParam[0];
1295 dst[1]= dstParam[1];
1296 dst[2]= dstParam[2];
1297 }
1298
1299 if(isPacked(c->srcFormat))
1300 {
1301 if(dstStride[0]==srcStride[0])
1302 memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1303 else
1304 {
1305 int i;
1306 uint8_t *srcPtr= src[0];
1307 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
a861d4d7
MN
1308 int length=0;
1309
1310 /* universal length finder */
9bd8bd1a
MN
1311 while(length+c->srcW <= ABS(dstStride[0])
1312 && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
a861d4d7 1313 ASSERT(length!=0);
b6654a54
MN
1314
1315 for(i=0; i<srcSliceH; i++)
1316 {
1317 memcpy(dstPtr, srcPtr, length);
1318 srcPtr+= srcStride[0];
1319 dstPtr+= dstStride[0];
1320 }
1321 }
1322 }
1323 else
1324 { /* Planar YUV */
1325 int plane;
1326 for(plane=0; plane<3; plane++)
1327 {
1328 int length= plane==0 ? c->srcW : ((c->srcW+1)>>1);
1329 int y= plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1330 int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
a861d4d7 1331
b6654a54
MN
1332 if(dstStride[plane]==srcStride[plane])
1333 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1334 else
1335 {
1336 int i;
1337 uint8_t *srcPtr= src[plane];
1338 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1339 for(i=0; i<height; i++)
1340 {
1341 memcpy(dstPtr, srcPtr, length);
1342 srcPtr+= srcStride[plane];
1343 dstPtr+= dstStride[plane];
1344 }
1345 }
1346 }
1347 }
37079906 1348}
28bf81c9
MN
1349
1350SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1351 SwsFilter *srcFilter, SwsFilter *dstFilter){
1352
28bf81c9
MN
1353 SwsContext *c;
1354 int i;
37079906 1355 int usesFilter;
c7f822d9
MN
1356 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1357
5cebb24b
MN
1358#ifdef ARCH_X86
1359 if(gCpuCaps.hasMMX)
1360 asm volatile("emms\n\t"::: "memory");
1361#endif
1362
28bf81c9
MN
1363 if(swScale==NULL) globalInit();
1364
6ff0ad6b
MN
1365 /* avoid dupplicate Formats, so we dont need to check to much */
1366 if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1367 if(srcFormat==IMGFMT_Y8) srcFormat=IMGFMT_Y800;
8a01d20c
MN
1368 if(dstFormat==IMGFMT_Y8) dstFormat=IMGFMT_Y800;
1369
b81cf274
MN
1370 if(!isSupportedIn(srcFormat))
1371 {
1372 fprintf(stderr, "swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1373 return NULL;
1374 }
1375 if(!isSupportedOut(dstFormat))
1376 {
1377 fprintf(stderr, "swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1378 return NULL;
1379 }
6ff0ad6b 1380
28bf81c9 1381 /* sanity check */
b81cf274
MN
1382 if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1383 {
1384 fprintf(stderr, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
1385 srcW, srcH, dstW, dstH);
1386 return NULL;
1387 }
28bf81c9 1388
c7f822d9
MN
1389 if(!dstFilter) dstFilter= &dummyFilter;
1390 if(!srcFilter) srcFilter= &dummyFilter;
1391
28bf81c9 1392 c= memalign(64, sizeof(SwsContext));
c7f822d9 1393 memset(c, 0, sizeof(SwsContext));
28bf81c9
MN
1394
1395 c->srcW= srcW;
1396 c->srcH= srcH;
1397 c->dstW= dstW;
1398 c->dstH= dstH;
5521b193
MN
1399 c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1400 c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
28bf81c9
MN
1401 c->flags= flags;
1402 c->dstFormat= dstFormat;
1403 c->srcFormat= srcFormat;
1404
37079906
MN
1405 usesFilter=0;
1406 if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1407 if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1408 if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1409 if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1410 if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1411 if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1412 if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1413 if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1414
b935781b 1415 /* unscaled special Cases */
37079906
MN
1416 if(srcW==dstW && srcH==dstH && !usesFilter)
1417 {
1418 /* yuv2bgr */
1419 if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1420 {
1421 // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1e1c4fe9 1422#ifdef WORDS_BIGENDIAN
daa57641
MN
1423 if(dstFormat==IMGFMT_BGR32)
1424 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1425 else
1426 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 1427#else
b6654a54 1428 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 1429#endif
37079906 1430 c->swScale= planarYuvToBgr;
b6654a54
MN
1431
1432 if(flags&SWS_PRINT_INFO)
1433 printf("SwScaler: using unscaled %s -> %s special converter\n",
1434 vo_format_name(srcFormat), vo_format_name(dstFormat));
1435 return c;
1436 }
1437
1438 /* simple copy */
1439 if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1440 {
1441 c->swScale= simpleCopy;
1442
37079906
MN
1443 if(flags&SWS_PRINT_INFO)
1444 printf("SwScaler: using unscaled %s -> %s special converter\n",
1445 vo_format_name(srcFormat), vo_format_name(dstFormat));
1446 return c;
1447 }
b935781b
MN
1448
1449 /* bgr32to24 & rgb32to24*/
1450 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1451 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1452 {
1453 c->swScale= bgr32to24Wrapper;
1454
1455 if(flags&SWS_PRINT_INFO)
1456 printf("SwScaler: using unscaled %s -> %s special converter\n",
1457 vo_format_name(srcFormat), vo_format_name(dstFormat));
1458 return c;
1459 }
1460
1461 /* bgr24to32 & rgb24to32*/
1462 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1463 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1464 {
1465 c->swScale= bgr24to32Wrapper;
1466
1467 if(flags&SWS_PRINT_INFO)
1468 printf("SwScaler: using unscaled %s -> %s special converter\n",
1469 vo_format_name(srcFormat), vo_format_name(dstFormat));
1470 return c;
1471 }
4bb3fa5e
MN
1472
1473 /* bgr15to16 */
1474 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1475 {
1476 c->swScale= bgr15to16Wrapper;
1477
1478 if(flags&SWS_PRINT_INFO)
1479 printf("SwScaler: using unscaled %s -> %s special converter\n",
1480 vo_format_name(srcFormat), vo_format_name(dstFormat));
1481 return c;
1482 }
ec22603f
MN
1483
1484 /* bgr24toYV12 */
1485 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1486 {
1487 c->swScale= bgr24toyv12Wrapper;
1488
1489 if(flags&SWS_PRINT_INFO)
1490 printf("SwScaler: using unscaled %s -> %s special converter\n",
1491 vo_format_name(srcFormat), vo_format_name(dstFormat));
1492 return c;
1493 }
37079906
MN
1494 }
1495
28bf81c9
MN
1496 if(cpuCaps.hasMMX2)
1497 {
1498 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1499 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1500 {
1501 if(flags&SWS_PRINT_INFO)
1502 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1503 }
1504 }
1505 else
1506 c->canMMX2BeUsed=0;
1507
1e621b18
MN
1508
1509 /* dont use full vertical UV input/internaly if the source doesnt even have it */
1510 if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1511 /* dont use full horizontal UV input if the source doesnt even have it */
1512 if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1513 /* dont use full horizontal UV internally if the destination doesnt even have it */
1514 if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1515
1516 if(flags&SWS_FULL_CHR_H_INP) c->chrSrcW= srcW;
1517 else c->chrSrcW= (srcW+1)>>1;
1518
1519 if(flags&SWS_FULL_CHR_H_INT) c->chrDstW= dstW;
1520 else c->chrDstW= (dstW+1)>>1;
1521
1522 if(flags&SWS_FULL_CHR_V) c->chrSrcH= srcH;
1523 else c->chrSrcH= (srcH+1)>>1;
1524
1525 if(isHalfChrV(dstFormat)) c->chrDstH= (dstH+1)>>1;
1526 else c->chrDstH= dstH;
1527
1528 c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1529 c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1530
1531
28bf81c9
MN
1532 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1533 // but only for the FAST_BILINEAR mode otherwise do correct scaling
1534 // n-2 is the last chrominance sample available
1535 // this is not perfect, but noone shuld notice the difference, the more correct variant
1536 // would be like the vertical one, but that would require some special code for the
1537 // first and last pixel
1538 if(flags&SWS_FAST_BILINEAR)
1539 {
1e621b18
MN
1540 if(c->canMMX2BeUsed)
1541 {
1542 c->lumXInc+= 20;
1543 c->chrXInc+= 20;
1544 }
28bf81c9 1545 //we dont use the x86asm scaler if mmx is available
1e621b18
MN
1546 else if(cpuCaps.hasMMX)
1547 {
1548 c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1549 c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1550 }
28bf81c9
MN
1551 }
1552
28bf81c9
MN
1553 /* precalculate horizontal scaler filter coefficients */
1554 {
1555 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1556
c7f822d9
MN
1557 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1558 srcW , dstW, filterAlign, 1<<14, flags,
1559 srcFilter->lumH, dstFilter->lumH);
1560 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1561 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1562 srcFilter->chrH, dstFilter->chrH);
28bf81c9
MN
1563
1564#ifdef ARCH_X86
1565// cant downscale !!!
1566 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1567 {
1568 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
1569 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1570 }
1571#endif
1572 } // Init Horizontal stuff
1573
1574
1575
1576 /* precalculate vertical scaler filter coefficients */
c7f822d9
MN
1577 initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1578 srcH , dstH, 1, (1<<12)-4, flags,
1579 srcFilter->lumV, dstFilter->lumV);
1580 initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1581 (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1582 srcFilter->chrV, dstFilter->chrV);
28bf81c9
MN
1583
1584 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1585 c->vLumBufSize= c->vLumFilterSize;
1586 c->vChrBufSize= c->vChrFilterSize;
1587 for(i=0; i<dstH; i++)
1588 {
1589 int chrI= i*c->chrDstH / dstH;
1590 int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
1591 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1592 nextSlice&= ~1; // Slices start at even boundaries
1593 if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
1594 c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
1595 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1596 c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1597 }
1598
1599 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
c7f822d9
MN
1600 c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1601 c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
6c7506de 1602 //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
28bf81c9
MN
1603 for(i=0; i<c->vLumBufSize; i++)
1604 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1605 for(i=0; i<c->vChrBufSize; i++)
1606 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1607
1608 //try to avoid drawing green stuff between the right end and the stride end
1609 for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1610 for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1611
1612 ASSERT(c->chrDstH <= dstH)
28bf81c9
MN
1613
1614 // pack filter data for mmx code
1615 if(cpuCaps.hasMMX)
1616 {
c7f822d9
MN
1617 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize* dstH*4*sizeof(int16_t));
1618 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
28bf81c9
MN
1619 for(i=0; i<c->vLumFilterSize*dstH; i++)
1620 c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1621 c->vLumFilter[i];
1622 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1623 c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1624 c->vChrFilter[i];
1625 }
1626
1627 if(flags&SWS_PRINT_INFO)
1628 {
1629#ifdef DITHER1XBPP
5521b193
MN
1630 char *dither= " dithered";
1631#else
1632 char *dither= "";
28bf81c9
MN
1633#endif
1634 if(flags&SWS_FAST_BILINEAR)
17470314 1635 fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler, ");
28bf81c9 1636 else if(flags&SWS_BILINEAR)
17470314 1637 fprintf(stderr, "\nSwScaler: BILINEAR scaler, ");
28bf81c9 1638 else if(flags&SWS_BICUBIC)
17470314 1639 fprintf(stderr, "\nSwScaler: BICUBIC scaler, ");
1e621b18 1640 else if(flags&SWS_X)
17470314 1641 fprintf(stderr, "\nSwScaler: Experimental scaler, ");
ff7ba856 1642 else if(flags&SWS_POINT)
17470314 1643 fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler, ");
d8863d37 1644 else if(flags&SWS_AREA)
17470314 1645 fprintf(stderr, "\nSwScaler: Area Averageing scaler, ");
28bf81c9
MN
1646 else
1647 fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1648
17470314
MN
1649 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1650 fprintf(stderr, "from %s to%s %s ",
1651 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
28bf81c9 1652 else
17470314
MN
1653 fprintf(stderr, "from %s to %s ",
1654 vo_format_name(srcFormat), vo_format_name(dstFormat));
28bf81c9
MN
1655
1656 if(cpuCaps.hasMMX2)
1657 fprintf(stderr, "using MMX2\n");
1658 else if(cpuCaps.has3DNow)
1659 fprintf(stderr, "using 3DNOW\n");
1660 else if(cpuCaps.hasMMX)
1661 fprintf(stderr, "using MMX\n");
1662 else
1663 fprintf(stderr, "using C\n");
1664 }
1665
1666 if((flags & SWS_PRINT_INFO) && verbose)
1667 {
1668 if(cpuCaps.hasMMX)
1669 {
1670 if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1671 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1672 else
1673 {
1674 if(c->hLumFilterSize==4)
1675 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1676 else if(c->hLumFilterSize==8)
1677 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1678 else
1679 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1680
1681 if(c->hChrFilterSize==4)
1682 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1683 else if(c->hChrFilterSize==8)
1684 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1685 else
1686 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1687 }
1688 }
1689 else
1690 {
1691#ifdef ARCH_X86
1692 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1693#else
1694 if(flags & SWS_FAST_BILINEAR)
1695 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1696 else
1697 printf("SwScaler: using C scaler for horizontal scaling\n");
1698#endif
1699 }
6c7506de 1700 if(isPlanarYUV(dstFormat))
28bf81c9
MN
1701 {
1702 if(c->vLumFilterSize==1)
6c7506de 1703 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 1704 else
6c7506de 1705 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
1706 }
1707 else
1708 {
1709 if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1710 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1711 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1712 else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1713 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1714 else
1715 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1716 }
1717
1718 if(dstFormat==IMGFMT_BGR24)
1719 printf("SwScaler: using %s YV12->BGR24 Converter\n",
1720 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
fd284805
MN
1721 else if(dstFormat==IMGFMT_BGR32)
1722 printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1723 else if(dstFormat==IMGFMT_BGR16)
1724 printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1725 else if(dstFormat==IMGFMT_BGR15)
1726 printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
1727
1728 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1729 }
1e621b18
MN
1730 if((flags & SWS_PRINT_INFO) && verbose>1)
1731 {
1732 printf("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1733 c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1734 printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1735 c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1736 }
37079906
MN
1737
1738 c->swScale= swScale;
28bf81c9
MN
1739 return c;
1740}
1741
1742/**
1743 * returns a normalized gaussian curve used to filter stuff
1744 * quality=3 is high quality, lower is lower quality
1745 */
c7f822d9
MN
1746
1747SwsVector *getGaussianVec(double variance, double quality){
28bf81c9
MN
1748 const int length= (int)(variance*quality + 0.5) | 1;
1749 int i;
1750 double *coeff= memalign(sizeof(double), length*sizeof(double));
1751 double middle= (length-1)*0.5;
c7f822d9
MN
1752 SwsVector *vec= malloc(sizeof(SwsVector));
1753
1754 vec->coeff= coeff;
1755 vec->length= length;
28bf81c9
MN
1756
1757 for(i=0; i<length; i++)
1758 {
1759 double dist= i-middle;
1760 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1761 }
1762
c7f822d9
MN
1763 normalizeVec(vec, 1.0);
1764
1765 return vec;
28bf81c9
MN
1766}
1767
5521b193
MN
1768SwsVector *getConstVec(double c, int length){
1769 int i;
1770 double *coeff= memalign(sizeof(double), length*sizeof(double));
1771 SwsVector *vec= malloc(sizeof(SwsVector));
1772
1773 vec->coeff= coeff;
1774 vec->length= length;
1775
1776 for(i=0; i<length; i++)
1777 coeff[i]= c;
1778
1779 return vec;
1780}
1781
1782
c7f822d9
MN
1783SwsVector *getIdentityVec(void){
1784 double *coeff= memalign(sizeof(double), sizeof(double));
1785 SwsVector *vec= malloc(sizeof(SwsVector));
1786 coeff[0]= 1.0;
1787
1788 vec->coeff= coeff;
1789 vec->length= 1;
1790
1791 return vec;
1792}
1793
1794void normalizeVec(SwsVector *a, double height){
28bf81c9
MN
1795 int i;
1796 double sum=0;
1797 double inv;
1798
c7f822d9
MN
1799 for(i=0; i<a->length; i++)
1800 sum+= a->coeff[i];
28bf81c9
MN
1801
1802 inv= height/sum;
1803
c7f822d9
MN
1804 for(i=0; i<a->length; i++)
1805 a->coeff[i]*= height;
28bf81c9
MN
1806}
1807
c7f822d9
MN
1808void scaleVec(SwsVector *a, double scalar){
1809 int i;
1810
1811 for(i=0; i<a->length; i++)
1812 a->coeff[i]*= scalar;
1813}
1814
5cebb24b 1815static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
c7f822d9 1816 int length= a->length + b->length - 1;
28bf81c9
MN
1817 double *coeff= memalign(sizeof(double), length*sizeof(double));
1818 int i, j;
c7f822d9
MN
1819 SwsVector *vec= malloc(sizeof(SwsVector));
1820
1821 vec->coeff= coeff;
1822 vec->length= length;
28bf81c9
MN
1823
1824 for(i=0; i<length; i++) coeff[i]= 0.0;
1825
c7f822d9 1826 for(i=0; i<a->length; i++)
28bf81c9 1827 {
c7f822d9 1828 for(j=0; j<b->length; j++)
28bf81c9 1829 {
c7f822d9 1830 coeff[i+j]+= a->coeff[i]*b->coeff[j];
28bf81c9
MN
1831 }
1832 }
1833
c7f822d9 1834 return vec;
28bf81c9
MN
1835}
1836
5cebb24b 1837static SwsVector *sumVec(SwsVector *a, SwsVector *b){
c7f822d9 1838 int length= MAX(a->length, b->length);
28bf81c9
MN
1839 double *coeff= memalign(sizeof(double), length*sizeof(double));
1840 int i;
c7f822d9
MN
1841 SwsVector *vec= malloc(sizeof(SwsVector));
1842
1843 vec->coeff= coeff;
1844 vec->length= length;
28bf81c9
MN
1845
1846 for(i=0; i<length; i++) coeff[i]= 0.0;
1847
c7f822d9
MN
1848 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1849 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1850
1851 return vec;
28bf81c9 1852}
c7f822d9 1853
5cebb24b 1854static SwsVector *diffVec(SwsVector *a, SwsVector *b){
c7f822d9
MN
1855 int length= MAX(a->length, b->length);
1856 double *coeff= memalign(sizeof(double), length*sizeof(double));
1857 int i;
1858 SwsVector *vec= malloc(sizeof(SwsVector));
1859
1860 vec->coeff= coeff;
1861 vec->length= length;
1862
1863 for(i=0; i<length; i++) coeff[i]= 0.0;
1864
1865 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1866 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1867
1868 return vec;
1869}
1870
1871/* shift left / or right if "shift" is negative */
5cebb24b 1872static SwsVector *getShiftedVec(SwsVector *a, int shift){
c7f822d9
MN
1873 int length= a->length + ABS(shift)*2;
1874 double *coeff= memalign(sizeof(double), length*sizeof(double));
ff7ba856 1875 int i;
c7f822d9
MN
1876 SwsVector *vec= malloc(sizeof(SwsVector));
1877
1878 vec->coeff= coeff;
1879 vec->length= length;
1880
1881 for(i=0; i<length; i++) coeff[i]= 0.0;
1882
1883 for(i=0; i<a->length; i++)
1884 {
1885 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1886 }
1887
1888 return vec;
1889}
1890
5cebb24b
MN
1891void shiftVec(SwsVector *a, int shift){
1892 SwsVector *shifted= getShiftedVec(a, shift);
1893 free(a->coeff);
1894 a->coeff= shifted->coeff;
1895 a->length= shifted->length;
1896 free(shifted);
1897}
1898
1899void addVec(SwsVector *a, SwsVector *b){
1900 SwsVector *sum= sumVec(a, b);
1901 free(a->coeff);
1902 a->coeff= sum->coeff;
1903 a->length= sum->length;
1904 free(sum);
1905}
1906
1907void subVec(SwsVector *a, SwsVector *b){
1908 SwsVector *diff= diffVec(a, b);
1909 free(a->coeff);
1910 a->coeff= diff->coeff;
1911 a->length= diff->length;
1912 free(diff);
1913}
1914
1915void convVec(SwsVector *a, SwsVector *b){
1916 SwsVector *conv= getConvVec(a, b);
1917 free(a->coeff);
1918 a->coeff= conv->coeff;
1919 a->length= conv->length;
1920 free(conv);
1921}
1922
1923SwsVector *cloneVec(SwsVector *a){
1924 double *coeff= memalign(sizeof(double), a->length*sizeof(double));
1925 int i;
1926 SwsVector *vec= malloc(sizeof(SwsVector));
1927
1928 vec->coeff= coeff;
1929 vec->length= a->length;
1930
1931 for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
1932
1933 return vec;
1934}
1935
c7f822d9
MN
1936void printVec(SwsVector *a){
1937 int i;
1938 double max=0;
1939 double min=0;
1940 double range;
1941
1942 for(i=0; i<a->length; i++)
1943 if(a->coeff[i]>max) max= a->coeff[i];
1944
1945 for(i=0; i<a->length; i++)
1946 if(a->coeff[i]<min) min= a->coeff[i];
1947
1948 range= max - min;
1949
1950 for(i=0; i<a->length; i++)
1951 {
1952 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
1953 printf("%1.3f ", a->coeff[i]);
1954 for(;x>0; x--) printf(" ");
1955 printf("|\n");
1956 }
1957}
1958
1959void freeVec(SwsVector *a){
1960 if(!a) return;
1961 if(a->coeff) free(a->coeff);
1962 a->coeff=NULL;
1963 a->length=0;
1964 free(a);
1965}
1966
1967void freeSwsContext(SwsContext *c){
1968 int i;
1969
1970 if(!c) return;
1971
1972 if(c->lumPixBuf)
1973 {
6c7506de 1974 for(i=0; i<c->vLumBufSize; i++)
c7f822d9
MN
1975 {
1976 if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
1977 c->lumPixBuf[i]=NULL;
1978 }
1979 free(c->lumPixBuf);
1980 c->lumPixBuf=NULL;
1981 }
1982
1983 if(c->chrPixBuf)
1984 {
6c7506de 1985 for(i=0; i<c->vChrBufSize; i++)
c7f822d9
MN
1986 {
1987 if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
1988 c->chrPixBuf[i]=NULL;
1989 }
1990 free(c->chrPixBuf);
1991 c->chrPixBuf=NULL;
1992 }
1993
1994 if(c->vLumFilter) free(c->vLumFilter);
1995 c->vLumFilter = NULL;
1996 if(c->vChrFilter) free(c->vChrFilter);
1997 c->vChrFilter = NULL;
1998 if(c->hLumFilter) free(c->hLumFilter);
1999 c->hLumFilter = NULL;
2000 if(c->hChrFilter) free(c->hChrFilter);
2001 c->hChrFilter = NULL;
2002
2003 if(c->vLumFilterPos) free(c->vLumFilterPos);
2004 c->vLumFilterPos = NULL;
2005 if(c->vChrFilterPos) free(c->vChrFilterPos);
2006 c->vChrFilterPos = NULL;
2007 if(c->hLumFilterPos) free(c->hLumFilterPos);
2008 c->hLumFilterPos = NULL;
2009 if(c->hChrFilterPos) free(c->hChrFilterPos);
2010 c->hChrFilterPos = NULL;
2011
2012 if(c->lumMmxFilter) free(c->lumMmxFilter);
2013 c->lumMmxFilter = NULL;
2014 if(c->chrMmxFilter) free(c->chrMmxFilter);
2015 c->chrMmxFilter = NULL;
2016
2017 free(c);
2018}
2019
7f56a527 2020