more mpeg4 fourcc's
[libav.git] / postproc / swscale.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
/*
  supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
  supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32, Y8, Y800
  BGR15/16 support dithering

  unscaled special converters
  YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
  YV12/I420/IYUV -> YV12/I420/IYUV
  YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
  BGR15 -> BGR16
*/
32
/*
tested special converters
  YV12/I420 -> BGR16
  YV12 -> YV12
  BGR15 -> BGR16
  BGR16 -> BGR16

untested special converters
  YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
  YV12/I420 -> YV12/I420
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
  BGR24 -> YV12
*/
48
d3f41512 49#include <inttypes.h>
dda87e9f 50#include <string.h>
077ea8a7 51#include <math.h>
c1b0bfb4 52#include <stdio.h>
d3f41512 53#include "../config.h"
9b464428 54#include "../mangle.h"
81b7c056 55#include <assert.h>
c1b0bfb4
MN
56#ifdef HAVE_MALLOC_H
57#include <malloc.h>
b6acbc3c
BS
58#else
59#include <stdlib.h>
c1b0bfb4 60#endif
d604bab9 61#include "swscale.h"
7630f2e0 62#include "../cpudetect.h"
a861d4d7 63#include "../bswap.h"
28bf81c9 64#include "../libvo/img_format.h"
37079906 65#include "rgb2rgb.h"
b0db4198 66#include "../libvo/fastmemcpy.h"
4a53a912 67#include "../mp_msg.h"
0d9f3d85
A
68
/* Logging shorthands: forward printf-style arguments to mp_msg() under the
   MSGT_SWS module tag, one macro per message level.
   Uses the GNU named-variadic form (args...) with ## comma deletion. */
#define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
#define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
#define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
#define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
#define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
#define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
75
#undef MOVNTQ
#undef PAVGB

/* Debug toggles: uncomment to force a particular code path at build time. */
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86
//#define WORDS_BIGENDIAN
#define DITHER1XBPP

#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit

#define RET 0xC3 //near return opcode for X86

/* NOTE: both ASSERT variants expand to a complete statement (the trailing ';'
   is part of the macro); call sites in this file are written without one. */
#ifdef MP_DEBUG
#define ASSERT(x) assert(x);
#else
#define ASSERT(x) ;
#endif

/* Use the libm constant when <math.h> provides it, otherwise a literal. */
#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif
c1b0bfb4 101
/* Pixel-format classification helpers. Each macro takes an IMGFMT_* code and
   evaluates to non-zero when the format belongs to the named class.
   The argument is evaluated several times, so pass side-effect-free values. */
//FIXME replace this with something faster
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9)
#define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
#define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
#define isGray(x)      ((x)==IMGFMT_Y800)
#define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
			|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
			|| (x)==IMGFMT_Y800)
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
			|| (x)==IMGFMT_Y800)
#define isRGB(x)       (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define isBGR(x)       (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
#define isPacked(x)    ((x)==IMGFMT_YUY2 || isRGB(x) || isBGR(x))
6ff0ad6b
MN
118
/* RGB -> YUV conversion weights as fixed-point integers scaled by
   2^RGB2YUV_SHIFT. Naming: first letter = source channel (B/G/R),
   second letter = destination component (Y/U/V).
   The "+0.5" followed by an (int) cast truncates toward zero, so the negative
   weights are not rounded to nearest; kept as-is to preserve existing output. */
#define RGB2YUV_SHIFT 16
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

extern int verbose; // defined in mplayer.c
783e9cc9
MN
/*
NOTES
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
deglobalize yuv2rgb*.c
*/
31190492 146
/* Small arithmetic helpers. These are plain macros: every argument may be
   evaluated more than once, so never pass expressions with side effects. */
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
d604bab9 150
7630f2e0
MN
#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif

#ifdef CAN_COMPILE_X86_ASM
/* 8-byte aligned 64-bit constants and scratch variables, compiled only for
   x86 builds. Presumably referenced by name from the inline asm in
   swscale_template.c -- do not rename (verify against the template). */
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;

/* Per-call dither values (written at run time, hence not initialized here). */
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};

/* Channel bit masks for four packed 16 bpp (5-6-5) / 15 bpp (5-5-5) pixels. */
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

/* BGR -> YUV weights packed as 16-bit lanes; precision selected by FAST_BGR2YV12. */
#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;

// FIXME remove
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
783e9cc9
MN
213
// clipping helper table for C implementations:
static unsigned char clip_table[768];

// per-channel clip tables for 16 bpp and 15 bpp output (values are OR-ed together)
static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables (256 entries: index with an in-range 0..255 sample only):
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];
// Needed for cubic scaler to catch overflows
// (768 entries: callers in this file index them with sample+256)
static int clip_yuvtab_2568[768];
static int clip_yuvtab_3343[768];
static int clip_yuvtab_0c92[768];
static int clip_yuvtab_1a1e[768];
static int clip_yuvtab_40cf[768];
236
28bf81c9 237//global sws_flags from the command line
1f347f22 238int sws_flags=2;
077ea8a7 239
5cebb24b
MN
240//global srcFilter
241SwsFilter src_filter= {NULL, NULL, NULL, NULL};
242
243float sws_lum_gblur= 0.0;
244float sws_chr_gblur= 0.0;
245int sws_chr_vshift= 0;
246int sws_chr_hshift= 0;
5521b193
MN
247float sws_chr_sharpen= 0.0;
248float sws_lum_sharpen= 0.0;
5cebb24b 249
28bf81c9
MN
250/* cpuCaps combined from cpudetect and whats actually compiled in
251 (if there is no support for something compiled in it wont appear here) */
252static CpuCaps cpuCaps;
d3f41512 253
28bf81c9
MN
254void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
255 int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
2ff198c1 256
5cebb24b
MN
257static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
258
#ifdef CAN_COMPILE_X86_ASM
/* Touches every file-scope asm constant once from C so the compiler does not
   report them as unused. The computed value is meaningless and is discarded. */
void in_asm_used_var_warning_killer(void)
{
	volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
	 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
	 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
	if(i) i=0;
}
#endif
d604bab9 268
e3d2500f
MN
/**
 * Plain C vertical filter that writes planar 8-bit YUV output.
 *
 * Luma: for each of the dstW output samples, accumulates
 * lumSrc[j][i]*lumFilter[j] over the lumFilterSize taps, shifts the sum right
 * by 19 and clamps it to 0..255 before storing to dest[i].
 *
 * Chroma: only when uDest is non-NULL. Produces dstW>>1 samples per plane the
 * same way; U is read from chrSrc[j][i] and V from chrSrc[j][i + 2048], i.e.
 * both live in the same row buffer with V 2048 entries after U.
 * vDest is written whenever uDest is non-NULL, so pass both or neither.
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
	//FIXME Optimize (just quickly written, not optimized)
	int i;
	for(i=0; i<dstW; i++)
	{
		int val=0;
		int j;
		for(j=0; j<lumFilterSize; j++)
			val += lumSrc[j][i] * lumFilter[j];

		val >>= 19;
		dest[i]= val<0 ? 0 : (val>255 ? 255 : val);
	}

	if(uDest != NULL)
		for(i=0; i<(dstW>>1); i++)
		{
			int u=0;
			int v=0;
			int j;
			for(j=0; j<chrFilterSize; j++)
			{
				u += chrSrc[j][i] * chrFilter[j];
				v += chrSrc[j][i + 2048] * chrFilter[j];
			}

			u >>= 19;
			v >>= 19;
			uDest[i]= u<0 ? 0 : (u>255 ? 255 : u);
			vDest[i]= v<0 ? 0 : (v>255 ? 255 : v);
		}
}
301
302static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
303 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
28bf81c9 304 uint8_t *dest, int dstW, int dstFormat)
e3d2500f 305{
28bf81c9 306 if(dstFormat==IMGFMT_BGR32)
e3d2500f 307 {
2ba1bff0 308 int i;
df3c183a
MN
309#ifdef WORDS_BIGENDIAN
310 dest++;
311#endif
e3d2500f
MN
312 for(i=0; i<(dstW>>1); i++){
313 int j;
314 int Y1=0;
315 int Y2=0;
316 int U=0;
317 int V=0;
318 int Cb, Cr, Cg;
319 for(j=0; j<lumFilterSize; j++)
320 {
321 Y1 += lumSrc[j][2*i] * lumFilter[j];
322 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
323 }
324 for(j=0; j<chrFilterSize; j++)
325 {
326 U += chrSrc[j][i] * chrFilter[j];
327 V += chrSrc[j][i+2048] * chrFilter[j];
328 }
329 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
330 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
331 U >>= 19;
332 V >>= 19;
333
334 Cb= clip_yuvtab_40cf[U+ 256];
335 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
336 Cr= clip_yuvtab_3343[V+ 256];
337
338 dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
339 dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
340 dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
341
342 dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
343 dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
344 dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
345 }
346 }
28bf81c9 347 else if(dstFormat==IMGFMT_BGR24)
e3d2500f
MN
348 {
349 int i;
350 for(i=0; i<(dstW>>1); i++){
351 int j;
352 int Y1=0;
353 int Y2=0;
354 int U=0;
355 int V=0;
356 int Cb, Cr, Cg;
357 for(j=0; j<lumFilterSize; j++)
358 {
359 Y1 += lumSrc[j][2*i] * lumFilter[j];
360 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
361 }
362 for(j=0; j<chrFilterSize; j++)
363 {
364 U += chrSrc[j][i] * chrFilter[j];
365 V += chrSrc[j][i+2048] * chrFilter[j];
366 }
367 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
368 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
369 U >>= 19;
370 V >>= 19;
371
372 Cb= clip_yuvtab_40cf[U+ 256];
373 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
374 Cr= clip_yuvtab_3343[V+ 256];
375
376 dest[0]=clip_table[((Y1 + Cb) >>13)];
377 dest[1]=clip_table[((Y1 + Cg) >>13)];
378 dest[2]=clip_table[((Y1 + Cr) >>13)];
379
380 dest[3]=clip_table[((Y2 + Cb) >>13)];
381 dest[4]=clip_table[((Y2 + Cg) >>13)];
382 dest[5]=clip_table[((Y2 + Cr) >>13)];
383 dest+=6;
384 }
385 }
28bf81c9 386 else if(dstFormat==IMGFMT_BGR16)
e3d2500f
MN
387 {
388 int i;
5521b193
MN
389#ifdef DITHER1XBPP
390 static int ditherb1=1<<14;
391 static int ditherg1=1<<13;
392 static int ditherr1=2<<14;
393 static int ditherb2=3<<14;
394 static int ditherg2=3<<13;
395 static int ditherr2=0<<14;
396
397 ditherb1 ^= (1^2)<<14;
398 ditherg1 ^= (1^2)<<13;
399 ditherr1 ^= (1^2)<<14;
400 ditherb2 ^= (3^0)<<14;
401 ditherg2 ^= (3^0)<<13;
402 ditherr2 ^= (3^0)<<14;
403#else
404 const int ditherb1=0;
405 const int ditherg1=0;
406 const int ditherr1=0;
407 const int ditherb2=0;
408 const int ditherg2=0;
409 const int ditherr2=0;
410#endif
e3d2500f
MN
411 for(i=0; i<(dstW>>1); i++){
412 int j;
413 int Y1=0;
414 int Y2=0;
415 int U=0;
416 int V=0;
417 int Cb, Cr, Cg;
418 for(j=0; j<lumFilterSize; j++)
419 {
420 Y1 += lumSrc[j][2*i] * lumFilter[j];
421 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
422 }
423 for(j=0; j<chrFilterSize; j++)
424 {
425 U += chrSrc[j][i] * chrFilter[j];
426 V += chrSrc[j][i+2048] * chrFilter[j];
427 }
428 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
429 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
430 U >>= 19;
431 V >>= 19;
432
433 Cb= clip_yuvtab_40cf[U+ 256];
434 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
435 Cr= clip_yuvtab_3343[V+ 256];
436
437 ((uint16_t*)dest)[2*i] =
5521b193
MN
438 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
439 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
440 clip_table16r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
441
442 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
443 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
444 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
445 clip_table16r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
446 }
447 }
28bf81c9 448 else if(dstFormat==IMGFMT_BGR15)
e3d2500f
MN
449 {
450 int i;
5521b193
MN
451#ifdef DITHER1XBPP
452 static int ditherb1=1<<14;
453 static int ditherg1=1<<14;
454 static int ditherr1=2<<14;
455 static int ditherb2=3<<14;
456 static int ditherg2=3<<14;
457 static int ditherr2=0<<14;
458
459 ditherb1 ^= (1^2)<<14;
460 ditherg1 ^= (1^2)<<14;
461 ditherr1 ^= (1^2)<<14;
462 ditherb2 ^= (3^0)<<14;
463 ditherg2 ^= (3^0)<<14;
464 ditherr2 ^= (3^0)<<14;
465#else
466 const int ditherb1=0;
467 const int ditherg1=0;
468 const int ditherr1=0;
469 const int ditherb2=0;
470 const int ditherg2=0;
471 const int ditherr2=0;
472#endif
e3d2500f
MN
473 for(i=0; i<(dstW>>1); i++){
474 int j;
475 int Y1=0;
476 int Y2=0;
477 int U=0;
478 int V=0;
479 int Cb, Cr, Cg;
480 for(j=0; j<lumFilterSize; j++)
481 {
482 Y1 += lumSrc[j][2*i] * lumFilter[j];
483 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
484 }
485 for(j=0; j<chrFilterSize; j++)
486 {
487 U += chrSrc[j][i] * chrFilter[j];
488 V += chrSrc[j][i+2048] * chrFilter[j];
489 }
490 Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
491 Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
492 U >>= 19;
493 V >>= 19;
494
495 Cb= clip_yuvtab_40cf[U+ 256];
496 Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
497 Cr= clip_yuvtab_3343[V+ 256];
498
499 ((uint16_t*)dest)[2*i] =
5521b193
MN
500 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
501 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
502 clip_table15r[(Y1 + Cr + ditherr1) >>13];
e3d2500f
MN
503
504 ((uint16_t*)dest)[2*i+1] =
5521b193
MN
505 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
506 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
507 clip_table15r[(Y2 + Cr + ditherr2) >>13];
e3d2500f
MN
508 }
509 }
510}
511
512
7630f2e0
MN
513//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
514//Plain C versions
726a959a
MN
515#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
516#define COMPILE_C
517#endif
518
519#ifdef CAN_COMPILE_X86_ASM
520
521#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
522#define COMPILE_MMX
523#endif
524
525#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
526#define COMPILE_MMX2
527#endif
528
529#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
530#define COMPILE_3DNOW
531#endif
532#endif //CAN_COMPILE_X86_ASM
533
534#undef HAVE_MMX
535#undef HAVE_MMX2
536#undef HAVE_3DNOW
726a959a
MN
537
538#ifdef COMPILE_C
7630f2e0
MN
539#undef HAVE_MMX
540#undef HAVE_MMX2
541#undef HAVE_3DNOW
7630f2e0
MN
542#define RENAME(a) a ## _C
543#include "swscale_template.c"
726a959a 544#endif
397c035e 545
7630f2e0 546#ifdef CAN_COMPILE_X86_ASM
397c035e 547
7630f2e0
MN
548//X86 versions
549/*
550#undef RENAME
551#undef HAVE_MMX
552#undef HAVE_MMX2
553#undef HAVE_3DNOW
554#define ARCH_X86
555#define RENAME(a) a ## _X86
556#include "swscale_template.c"
1faf0867 557*/
7630f2e0 558//MMX versions
726a959a 559#ifdef COMPILE_MMX
7630f2e0
MN
560#undef RENAME
561#define HAVE_MMX
562#undef HAVE_MMX2
563#undef HAVE_3DNOW
7630f2e0
MN
564#define RENAME(a) a ## _MMX
565#include "swscale_template.c"
726a959a 566#endif
7630f2e0
MN
567
568//MMX2 versions
726a959a 569#ifdef COMPILE_MMX2
7630f2e0
MN
570#undef RENAME
571#define HAVE_MMX
572#define HAVE_MMX2
573#undef HAVE_3DNOW
7630f2e0
MN
574#define RENAME(a) a ## _MMX2
575#include "swscale_template.c"
726a959a 576#endif
7630f2e0
MN
577
578//3DNOW versions
726a959a 579#ifdef COMPILE_3DNOW
7630f2e0
MN
580#undef RENAME
581#define HAVE_MMX
582#undef HAVE_MMX2
583#define HAVE_3DNOW
7630f2e0
MN
584#define RENAME(a) a ## _3DNow
585#include "swscale_template.c"
726a959a 586#endif
7630f2e0
MN
587
588#endif //CAN_COMPILE_X86_ASM
589
590// minor note: the HAVE_xyz is messed up after that line so dont use it
d604bab9 591
d3f41512 592
6c7506de 593// old global scaler, dont use for new code
28bf81c9
MN
594// will use sws_flags from the command line
595void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
596 int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
d1fac6cf 597 int srcW, int srcH, int dstW, int dstH){
31190492 598
28bf81c9
MN
599 static SwsContext *context=NULL;
600 int dstFormat;
28bf81c9
MN
601 int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
602
6c7506de 603 switch(dstbpp)
28bf81c9 604 {
6c7506de
MN
605 case 8 : dstFormat= IMGFMT_Y8; break;
606 case 12: dstFormat= IMGFMT_YV12; break;
607 case 15: dstFormat= IMGFMT_BGR15; break;
608 case 16: dstFormat= IMGFMT_BGR16; break;
609 case 24: dstFormat= IMGFMT_BGR24; break;
610 case 32: dstFormat= IMGFMT_BGR32; break;
611 default: return;
612 }
613
614 if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
615
b6654a54 616 context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
6c7506de
MN
617}
618
619// will use sws_flags & src_filter (from cmd line)
620SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
621{
622 int flags=0;
623 static int firstTime=1;
624
5521b193 625#ifdef ARCH_X86
6c7506de
MN
626 if(gCpuCaps.hasMMX)
627 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
5521b193 628#endif
6c7506de
MN
629 if(firstTime)
630 {
28bf81c9 631 firstTime=0;
6c7506de
MN
632 flags= SWS_PRINT_INFO;
633 }
634 else if(verbose>1) flags= SWS_PRINT_INFO;
635
636 if(src_filter.lumH) freeVec(src_filter.lumH);
637 if(src_filter.lumV) freeVec(src_filter.lumV);
638 if(src_filter.chrH) freeVec(src_filter.chrH);
639 if(src_filter.chrV) freeVec(src_filter.chrV);
640
641 if(sws_lum_gblur!=0.0){
642 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
643 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
644 }else{
645 src_filter.lumH= getIdentityVec();
646 src_filter.lumV= getIdentityVec();
647 }
c7f822d9 648
6c7506de
MN
649 if(sws_chr_gblur!=0.0){
650 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
651 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
652 }else{
653 src_filter.chrH= getIdentityVec();
654 src_filter.chrV= getIdentityVec();
655 }
5521b193 656
6c7506de
MN
657 if(sws_chr_sharpen!=0.0){
658 SwsVector *g= getConstVec(-1.0, 3);
659 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
660 g->coeff[1]=2.0;
661 addVec(id, g);
662 convVec(src_filter.chrH, id);
663 convVec(src_filter.chrV, id);
664 freeVec(g);
665 freeVec(id);
666 }
5521b193 667
6c7506de
MN
668 if(sws_lum_sharpen!=0.0){
669 SwsVector *g= getConstVec(-1.0, 3);
670 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
671 g->coeff[1]=2.0;
672 addVec(id, g);
673 convVec(src_filter.lumH, id);
674 convVec(src_filter.lumV, id);
675 freeVec(g);
676 freeVec(id);
677 }
c7f822d9 678
6c7506de
MN
679 if(sws_chr_hshift)
680 shiftVec(src_filter.chrH, sws_chr_hshift);
c7f822d9 681
6c7506de
MN
682 if(sws_chr_vshift)
683 shiftVec(src_filter.chrV, sws_chr_vshift);
5521b193 684
6c7506de
MN
685 normalizeVec(src_filter.chrH, 1.0);
686 normalizeVec(src_filter.chrV, 1.0);
687 normalizeVec(src_filter.lumH, 1.0);
688 normalizeVec(src_filter.lumV, 1.0);
28bf81c9 689
6c7506de
MN
690 if(verbose > 1) printVec(src_filter.chrH);
691 if(verbose > 1) printVec(src_filter.lumH);
28bf81c9
MN
692
693 switch(sws_flags)
694 {
695 case 0: flags|= SWS_FAST_BILINEAR; break;
696 case 1: flags|= SWS_BILINEAR; break;
697 case 2: flags|= SWS_BICUBIC; break;
698 case 3: flags|= SWS_X; break;
ff7ba856 699 case 4: flags|= SWS_POINT; break;
d8863d37 700 case 5: flags|= SWS_AREA; break;
28bf81c9
MN
701 default:flags|= SWS_BILINEAR; break;
702 }
703
6c7506de 704 return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
28bf81c9
MN
705}
706
6c7506de 707
c7f822d9
MN
708static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
709 int srcW, int dstW, int filterAlign, int one, int flags,
710 SwsVector *srcFilter, SwsVector *dstFilter)
28bf81c9
MN
711{
712 int i;
c7f822d9
MN
713 int filterSize;
714 int filter2Size;
715 int minFilterSize;
716 double *filter=NULL;
717 double *filter2=NULL;
28bf81c9
MN
718#ifdef ARCH_X86
719 if(gCpuCaps.hasMMX)
720 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
726a959a 721#endif
31190492 722
adeaecb9 723 // Note the +1 is for the MMXscaler which reads over the end
6c7506de 724 *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
6c7506de 725
28bf81c9
MN
726 if(ABS(xInc - 0x10000) <10) // unscaled
727 {
728 int i;
c7f822d9
MN
729 filterSize= 1;
730 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
731 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
28bf81c9
MN
732
733 for(i=0; i<dstW; i++)
734 {
c7f822d9
MN
735 filter[i*filterSize]=1;
736 (*filterPos)[i]=i;
28bf81c9
MN
737 }
738
739 }
ff7ba856
MN
740 else if(flags&SWS_POINT) // lame looking point sampling mode
741 {
742 int i;
743 int xDstInSrc;
744 filterSize= 1;
745 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
746
747 xDstInSrc= xInc/2 - 0x8000;
748 for(i=0; i<dstW; i++)
749 {
8a01d20c 750 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
ff7ba856
MN
751
752 (*filterPos)[i]= xx;
753 filter[i]= 1.0;
754 xDstInSrc+= xInc;
755 }
756 }
28bf81c9
MN
757 else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
758 {
759 int i;
760 int xDstInSrc;
c7f822d9
MN
761 if (flags&SWS_BICUBIC) filterSize= 4;
762 else if(flags&SWS_X ) filterSize= 4;
d8863d37 763 else filterSize= 2; // SWS_BILINEAR / SWS_AREA
c7f822d9 764 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
765
766 xDstInSrc= xInc/2 - 0x8000;
767 for(i=0; i<dstW; i++)
768 {
8a01d20c 769 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
28bf81c9
MN
770 int j;
771
c7f822d9 772 (*filterPos)[i]= xx;
28bf81c9
MN
773 if((flags & SWS_BICUBIC) || (flags & SWS_X))
774 {
775 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
776 double y1,y2,y3,y4;
777 double A= -0.6;
778 if(flags & SWS_BICUBIC){
779 // Equation is from VirtualDub
780 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
781 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
782 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
783 y4 = ( + A*d*d - A*d*d*d);
784 }else{
785 // cubic interpolation (derived it myself)
786 y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
787 y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
788 y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
789 y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
790 }
791
c7f822d9
MN
792 filter[i*filterSize + 0]= y1;
793 filter[i*filterSize + 1]= y2;
794 filter[i*filterSize + 2]= y3;
795 filter[i*filterSize + 3]= y4;
28bf81c9
MN
796 }
797 else
798 {
d8863d37 799 //Bilinear upscale / linear interpolate / Area averaging
c7f822d9 800 for(j=0; j<filterSize; j++)
28bf81c9
MN
801 {
802 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
803 double coeff= 1.0 - d;
804 if(coeff<0) coeff=0;
c7f822d9 805 filter[i*filterSize + j]= coeff;
28bf81c9
MN
806 xx++;
807 }
808 }
809 xDstInSrc+= xInc;
810 }
811 }
812 else // downscale
813 {
814 int xDstInSrc;
81b7c056
MN
815 ASSERT(dstW <= srcW)
816
d8863d37
MN
817 if(flags&SWS_BICUBIC) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
818 else if(flags&SWS_X) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
819 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
820 else /* BILINEAR */ filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
c7f822d9 821 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
822
823 xDstInSrc= xInc/2 - 0x8000;
824 for(i=0; i<dstW; i++)
825 {
c7f822d9 826 int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
28bf81c9 827 int j;
c7f822d9
MN
828 (*filterPos)[i]= xx;
829 for(j=0; j<filterSize; j++)
28bf81c9
MN
830 {
831 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
832 double coeff;
833 if((flags & SWS_BICUBIC) || (flags & SWS_X))
834 {
835 double A= -0.75;
836// d*=2;
837 // Equation is from VirtualDub
838 if(d<1.0)
839 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
840 else if(d<2.0)
841 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
842 else
843 coeff=0.0;
844 }
d8863d37 845 else if(flags & SWS_AREA)
28bf81c9 846 {
d8863d37
MN
847 double srcPixelSize= (1<<16)/(double)xInc;
848 if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
849 else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
850 else coeff=0.0;
851 }
28bf81c9
MN
852 else
853 {
854 coeff= 1.0 - d;
855 if(coeff<0) coeff=0;
856 }
c7f822d9 857 filter[i*filterSize + j]= coeff;
28bf81c9
MN
858 xx++;
859 }
860 xDstInSrc+= xInc;
861 }
862 }
863
c7f822d9
MN
864 /* apply src & dst Filter to filter -> filter2
865 free(filter);
866 */
81b7c056 867 ASSERT(filterSize>0)
c7f822d9
MN
868 filter2Size= filterSize;
869 if(srcFilter) filter2Size+= srcFilter->length - 1;
870 if(dstFilter) filter2Size+= dstFilter->length - 1;
81b7c056 871 ASSERT(filter2Size>0)
c7f822d9
MN
872 filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
873
874 for(i=0; i<dstW; i++)
875 {
876 int j;
877 SwsVector scaleFilter;
878 SwsVector *outVec;
879
880 scaleFilter.coeff= filter + i*filterSize;
881 scaleFilter.length= filterSize;
882
5cebb24b 883 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
c7f822d9
MN
884 else outVec= &scaleFilter;
885
886 ASSERT(outVec->length == filter2Size)
887 //FIXME dstFilter
888
889 for(j=0; j<outVec->length; j++)
890 {
891 filter2[i*filter2Size + j]= outVec->coeff[j];
892 }
893
894 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
895
896 if(outVec != &scaleFilter) freeVec(outVec);
897 }
898 free(filter); filter=NULL;
899
900 /* try to reduce the filter-size (step1 find size and shift left) */
901 // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
902 minFilterSize= 0;
903 for(i=dstW-1; i>=0; i--)
904 {
905 int min= filter2Size;
906 int j;
907 double cutOff=0.0;
908
909 /* get rid off near zero elements on the left by shifting left */
910 for(j=0; j<filter2Size; j++)
911 {
912 int k;
913 cutOff += ABS(filter2[i*filter2Size]);
914
915 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
916
917 /* preserve Monotonicity because the core cant handle the filter otherwise */
918 if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
919
920 // Move filter coeffs left
921 for(k=1; k<filter2Size; k++)
922 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
923 filter2[i*filter2Size + k - 1]= 0.0;
924 (*filterPos)[i]++;
925 }
926
927 cutOff=0.0;
928 /* count near zeros on the right */
929 for(j=filter2Size-1; j>0; j--)
930 {
931 cutOff += ABS(filter2[i*filter2Size + j]);
932
933 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
934 min--;
935 }
936
937 if(min>minFilterSize) minFilterSize= min;
938 }
939
81b7c056 940 ASSERT(minFilterSize > 0)
6c7506de 941 filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
81b7c056 942 ASSERT(filterSize > 0)
6c7506de
MN
943 filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
944 *outFilterSize= filterSize;
945
4a53a912 946 if(flags&SWS_PRINT_INFO)
0d9f3d85 947 MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
c7f822d9
MN
948 /* try to reduce the filter-size (step2 reduce it) */
949 for(i=0; i<dstW; i++)
950 {
951 int j;
952
6c7506de
MN
953 for(j=0; j<filterSize; j++)
954 {
955 if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
956 else filter[i*filterSize + j]= filter2[i*filter2Size + j];
957 }
c7f822d9 958 }
6c7506de
MN
959 free(filter2); filter2=NULL;
960
c7f822d9
MN
961
962 //FIXME try to align filterpos if possible
963
28bf81c9
MN
964 //fix borders
965 for(i=0; i<dstW; i++)
966 {
967 int j;
c7f822d9 968 if((*filterPos)[i] < 0)
28bf81c9
MN
969 {
970 // Move filter coeffs left to compensate for filterPos
6c7506de 971 for(j=1; j<filterSize; j++)
28bf81c9 972 {
c7f822d9 973 int left= MAX(j + (*filterPos)[i], 0);
6c7506de
MN
974 filter[i*filterSize + left] += filter[i*filterSize + j];
975 filter[i*filterSize + j]=0;
28bf81c9 976 }
c7f822d9 977 (*filterPos)[i]= 0;
28bf81c9
MN
978 }
979
6c7506de 980 if((*filterPos)[i] + filterSize > srcW)
28bf81c9 981 {
6c7506de 982 int shift= (*filterPos)[i] + filterSize - srcW;
28bf81c9 983 // Move filter coeffs right to compensate for filterPos
6c7506de 984 for(j=filterSize-2; j>=0; j--)
28bf81c9 985 {
6c7506de
MN
986 int right= MIN(j + shift, filterSize-1);
987 filter[i*filterSize +right] += filter[i*filterSize +j];
988 filter[i*filterSize +j]=0;
28bf81c9 989 }
6c7506de 990 (*filterPos)[i]= srcW - filterSize;
28bf81c9
MN
991 }
992 }
993
6c7506de
MN
994 // Note the +1 is for the MMXscaler which reads over the end
995 *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
996 memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
c7f822d9
MN
997
998 /* Normalize & Store in outFilter */
28bf81c9
MN
999 for(i=0; i<dstW; i++)
1000 {
1001 int j;
1002 double sum=0;
1003 double scale= one;
6c7506de 1004 for(j=0; j<filterSize; j++)
28bf81c9 1005 {
6c7506de 1006 sum+= filter[i*filterSize + j];
28bf81c9
MN
1007 }
1008 scale/= sum;
6c7506de 1009 for(j=0; j<filterSize; j++)
28bf81c9 1010 {
6c7506de 1011 (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
28bf81c9
MN
1012 }
1013 }
adeaecb9
MN
1014
1015 (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1016 for(i=0; i<*outFilterSize; i++)
1017 {
1018 int j= dstW*(*outFilterSize);
1019 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1020 }
c7f822d9 1021
6c7506de 1022 free(filter);
7630f2e0 1023}
31190492 1024
28bf81c9 1025#ifdef ARCH_X86
b7dc6f66 1026static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
28bf81c9 1027{
b7dc6f66
MN
1028 uint8_t *fragmentA;
1029 int imm8OfPShufW1A;
1030 int imm8OfPShufW2A;
1031 int fragmentLengthA;
1032 uint8_t *fragmentB;
1033 int imm8OfPShufW1B;
1034 int imm8OfPShufW2B;
1035 int fragmentLengthB;
1036 int fragmentPos;
28bf81c9
MN
1037
1038 int xpos, i;
1039
1040 // create an optimized horizontal scaling routine
1041
1042 //code fragment
1043
1044 asm volatile(
1045 "jmp 9f \n\t"
1046 // Begin
1047 "0: \n\t"
b7dc6f66
MN
1048 "movq (%%edx, %%eax), %%mm3 \n\t"
1049 "movd (%%ecx, %%esi), %%mm0 \n\t"
1050 "movd 1(%%ecx, %%esi), %%mm1 \n\t"
1051 "punpcklbw %%mm7, %%mm1 \n\t"
1052 "punpcklbw %%mm7, %%mm0 \n\t"
28bf81c9
MN
1053 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1054 "1: \n\t"
28bf81c9
MN
1055 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1056 "2: \n\t"
28bf81c9 1057 "psubw %%mm1, %%mm0 \n\t"
b7dc6f66 1058 "movl 8(%%ebx, %%eax), %%esi \n\t"
28bf81c9 1059 "pmullw %%mm3, %%mm0 \n\t"
28bf81c9
MN
1060 "psllw $7, %%mm1 \n\t"
1061 "paddw %%mm1, %%mm0 \n\t"
1062
1063 "movq %%mm0, (%%edi, %%eax) \n\t"
1064
1065 "addl $8, %%eax \n\t"
1066 // End
1067 "9: \n\t"
1068// "int $3\n\t"
1069 "leal 0b, %0 \n\t"
1070 "leal 1b, %1 \n\t"
1071 "leal 2b, %2 \n\t"
1072 "decl %1 \n\t"
1073 "decl %2 \n\t"
1074 "subl %0, %1 \n\t"
1075 "subl %0, %2 \n\t"
1076 "leal 9b, %3 \n\t"
1077 "subl %0, %3 \n\t"
b7dc6f66
MN
1078
1079
1080 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1081 "=r" (fragmentLengthA)
28bf81c9
MN
1082 );
1083
b7dc6f66
MN
1084 asm volatile(
1085 "jmp 9f \n\t"
1086 // Begin
1087 "0: \n\t"
1088 "movq (%%edx, %%eax), %%mm3 \n\t"
1089 "movd (%%ecx, %%esi), %%mm0 \n\t"
1090 "punpcklbw %%mm7, %%mm0 \n\t"
1091 "pshufw $0xFF, %%mm0, %%mm1 \n\t"
1092 "1: \n\t"
1093 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1094 "2: \n\t"
1095 "psubw %%mm1, %%mm0 \n\t"
1096 "movl 8(%%ebx, %%eax), %%esi \n\t"
1097 "pmullw %%mm3, %%mm0 \n\t"
1098 "psllw $7, %%mm1 \n\t"
1099 "paddw %%mm1, %%mm0 \n\t"
1100
1101 "movq %%mm0, (%%edi, %%eax) \n\t"
28bf81c9 1102
b7dc6f66
MN
1103 "addl $8, %%eax \n\t"
1104 // End
1105 "9: \n\t"
1106// "int $3\n\t"
1107 "leal 0b, %0 \n\t"
1108 "leal 1b, %1 \n\t"
1109 "leal 2b, %2 \n\t"
1110 "decl %1 \n\t"
1111 "decl %2 \n\t"
1112 "subl %0, %1 \n\t"
1113 "subl %0, %2 \n\t"
1114 "leal 9b, %3 \n\t"
1115 "subl %0, %3 \n\t"
1116
1117
1118 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1119 "=r" (fragmentLengthB)
1120 );
1121
1122 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1123 fragmentPos=0;
1124
1125 for(i=0; i<dstW/numSplits; i++)
28bf81c9
MN
1126 {
1127 int xx=xpos>>16;
1128
1129 if((i&3) == 0)
1130 {
1131 int a=0;
1132 int b=((xpos+xInc)>>16) - xx;
1133 int c=((xpos+xInc*2)>>16) - xx;
1134 int d=((xpos+xInc*3)>>16) - xx;
1135
b7dc6f66
MN
1136 filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
1137 filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
1138 filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1139 filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1140 filterPos[i/2]= xx;
1141
1142 if(d+1<4)
1143 {
1144 int maxShift= 3-(d+1);
1145 int shift=0;
1146
1147 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1148
1149 funnyCode[fragmentPos + imm8OfPShufW1B]=
1150 (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1151 funnyCode[fragmentPos + imm8OfPShufW2B]=
1152 a | (b<<2) | (c<<4) | (d<<6);
1153
1154 if(i+3>=dstW) shift=maxShift; //avoid overread
1155 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1156
1157 if(shift && i>=shift)
1158 {
1159 funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1160 funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1161 filterPos[i/2]-=shift;
1162 }
1163
1164 fragmentPos+= fragmentLengthB;
1165 }
1166 else
1167 {
1168 int maxShift= 3-d;
1169 int shift=0;
1170
1171 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
28bf81c9 1172
b7dc6f66
MN
1173 funnyCode[fragmentPos + imm8OfPShufW1A]=
1174 funnyCode[fragmentPos + imm8OfPShufW2A]=
1175 a | (b<<2) | (c<<4) | (d<<6);
28bf81c9 1176
b7dc6f66
MN
1177 if(i+4>=dstW) shift=maxShift; //avoid overread
1178 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
28bf81c9 1179
b7dc6f66
MN
1180 if(shift && i>=shift)
1181 {
1182 funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1183 funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1184 filterPos[i/2]-=shift;
1185 }
1186
1187 fragmentPos+= fragmentLengthA;
1188 }
1189
1190 funnyCode[fragmentPos]= RET;
28bf81c9
MN
1191 }
1192 xpos+=xInc;
1193 }
b7dc6f66 1194 filterPos[i/2]= xpos>>16; // needed to jump to the next part
28bf81c9
MN
1195}
1196#endif // ARCH_X86
1197
1198//FIXME remove
/**
 * Legacy initialisation entry point; intentionally does nothing.
 */
void SwScale_Init(){
	/* nothing to do */
}
1201
1202static void globalInit(){
31190492
A
1203 // generating tables:
1204 int i;
c1b0bfb4
MN
1205 for(i=0; i<768; i++){
1206 int c= MIN(MAX(i-256, 0), 255);
1207 clip_table[i]=c;
1208 yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1209 yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1210 yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1211 yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1212 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
31190492
A
1213 }
1214
b18ea156
MN
1215 for(i=0; i<768; i++)
1216 {
28bf81c9 1217 int v= clip_table[i];
daa57641
MN
1218 clip_table16b[i]= v>>3;
1219 clip_table16g[i]= (v<<3)&0x07E0;
1220 clip_table16r[i]= (v<<8)&0xF800;
1221 clip_table15b[i]= v>>3;
1222 clip_table15g[i]= (v<<2)&0x03E0;
1223 clip_table15r[i]= (v<<7)&0x7C00;
b18ea156 1224 }
c1b0bfb4 1225
28bf81c9
MN
1226cpuCaps= gCpuCaps;
1227
1228#ifdef RUNTIME_CPUDETECT
1229#ifdef CAN_COMPILE_X86_ASM
1230 // ordered per speed fasterst first
1231 if(gCpuCaps.hasMMX2)
1232 swScale= swScale_MMX2;
1233 else if(gCpuCaps.has3DNow)
7f56a527 1234 swScale= swScale_3DNow;
28bf81c9
MN
1235 else if(gCpuCaps.hasMMX)
1236 swScale= swScale_MMX;
1237 else
1238 swScale= swScale_C;
1239
1240#else
1241 swScale= swScale_C;
1242 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1243#endif
1244#else //RUNTIME_CPUDETECT
1245#ifdef HAVE_MMX2
1246 swScale= swScale_MMX2;
1247 cpuCaps.has3DNow = 0;
1248#elif defined (HAVE_3DNOW)
7f56a527 1249 swScale= swScale_3DNow;
28bf81c9
MN
1250 cpuCaps.hasMMX2 = 0;
1251#elif defined (HAVE_MMX)
1252 swScale= swScale_MMX;
1253 cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1254#else
1255 swScale= swScale_C;
1256 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1257#endif
1258#endif //!RUNTIME_CPUDETECT
31190492 1259}
7630f2e0 1260
0d9f3d85
A
1261static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1262 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1263 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1264 /* Copy Y plane */
1265 if(dstStride[0]==srcStride[0])
1266 memcpy(dst, src[0], srcSliceH*dstStride[0]);
1267 else
1268 {
1269 int i;
1270 uint8_t *srcPtr= src[0];
1271 uint8_t *dstPtr= dst;
1272 for(i=0; i<srcSliceH; i++)
1273 {
1274 memcpy(dstPtr, srcPtr, srcStride[0]);
1275 srcPtr+= srcStride[0];
1276 dstPtr+= dstStride[0];
1277 }
1278 }
1279 dst = dstParam[1] + dstStride[1]*srcSliceY;
1280 if(c->srcFormat==IMGFMT_YV12)
1281 interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
1282 else /* I420 & IYUV */
1283 interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
1284}
1285
1286
37079906
MN
1287/* Warper functions for yuv2bgr */
1288static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
b6654a54
MN
1289 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1290 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
37079906
MN
1291
1292 if(c->srcFormat==IMGFMT_YV12)
b6654a54 1293 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
37079906 1294 else /* I420 & IYUV */
b6654a54
MN
1295 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1296}
1297
44c1035c 1298static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
0d9f3d85
A
1299 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1300 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1301
1302 if(c->srcFormat==IMGFMT_YV12)
1303 yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1304 else /* I420 & IYUV */
1305 yv12toyuy2( src[0],src[2],src[1],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1306}
1307
b935781b
MN
1308static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1309 int srcSliceH, uint8_t* dst[], int dstStride[]){
1310
1311 if(dstStride[0]*3==srcStride[0]*4)
4bb3fa5e 1312 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
b935781b
MN
1313 else
1314 {
1315 int i;
1316 uint8_t *srcPtr= src[0];
1317 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1318
1319 for(i=0; i<srcSliceH; i++)
1320 {
4bb3fa5e 1321 rgb24to32(srcPtr, dstPtr, c->srcW*3);
b935781b
MN
1322 srcPtr+= srcStride[0];
1323 dstPtr+= dstStride[0];
1324 }
1325 }
1326}
1327
0d9f3d85
A
1328static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1329 int srcSliceH, uint8_t* dst[], int dstStride[]){
1330
1331 if(dstStride[0]*3==srcStride[0]*2)
1332 rgb24to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1333 else
1334 {
1335 int i;
1336 uint8_t *srcPtr= src[0];
1337 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1338
1339 for(i=0; i<srcSliceH; i++)
1340 {
1341 rgb24to16(srcPtr, dstPtr, c->srcW*3);
1342 srcPtr+= srcStride[0];
1343 dstPtr+= dstStride[0];
1344 }
1345 }
1346}
1347
1348static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1349 int srcSliceH, uint8_t* dst[], int dstStride[]){
1350
1351 if(dstStride[0]*3==srcStride[0]*2)
1352 rgb24to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1353 else
1354 {
1355 int i;
1356 uint8_t *srcPtr= src[0];
1357 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1358
1359 for(i=0; i<srcSliceH; i++)
1360 {
1361 rgb24to15(srcPtr, dstPtr, c->srcW*3);
1362 srcPtr+= srcStride[0];
1363 dstPtr+= dstStride[0];
1364 }
1365 }
1366}
1367
b935781b
MN
1368static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1369 int srcSliceH, uint8_t* dst[], int dstStride[]){
1370
1371 if(dstStride[0]*4==srcStride[0]*3)
4bb3fa5e 1372 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
b935781b
MN
1373 else
1374 {
1375 int i;
1376 uint8_t *srcPtr= src[0];
1377 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1378
1379 for(i=0; i<srcSliceH; i++)
1380 {
4bb3fa5e
MN
1381 rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1382 srcPtr+= srcStride[0];
1383 dstPtr+= dstStride[0];
1384 }
1385 }
1386}
1387
0d9f3d85
A
1388static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1389 int srcSliceH, uint8_t* dst[], int dstStride[]){
1390
1391 if(dstStride[0]*4==srcStride[0]*2)
1392 rgb32to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1393 else
1394 {
1395 int i;
1396 uint8_t *srcPtr= src[0];
1397 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1398
1399 for(i=0; i<srcSliceH; i++)
1400 {
1401 rgb32to16(srcPtr, dstPtr, c->srcW<<2);
1402 srcPtr+= srcStride[0];
1403 dstPtr+= dstStride[0];
1404 }
1405 }
1406}
1407
1408static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1409 int srcSliceH, uint8_t* dst[], int dstStride[]){
1410
1411 if(dstStride[0]*4==srcStride[0]*2)
1412 rgb32to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1413 else
1414 {
1415 int i;
1416 uint8_t *srcPtr= src[0];
1417 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1418
1419 for(i=0; i<srcSliceH; i++)
1420 {
1421 rgb32to15(srcPtr, dstPtr, c->srcW<<2);
1422 srcPtr+= srcStride[0];
1423 dstPtr+= dstStride[0];
1424 }
1425 }
1426}
1427
4bb3fa5e
MN
1428static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1429 int srcSliceH, uint8_t* dst[], int dstStride[]){
1430
1431 if(dstStride[0]==srcStride[0])
1432 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1433 else
1434 {
1435 int i;
1436 uint8_t *srcPtr= src[0];
1437 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1438
1439 for(i=0; i<srcSliceH; i++)
1440 {
1441 rgb15to16(srcPtr, dstPtr, c->srcW<<1);
b935781b
MN
1442 srcPtr+= srcStride[0];
1443 dstPtr+= dstStride[0];
1444 }
1445 }
1446}
1447
0d9f3d85
A
1448static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1449 int srcSliceH, uint8_t* dst[], int dstStride[]){
1450
1451 if(dstStride[0]*2==srcStride[0]*3)
1452 rgb15to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1453 else
1454 {
1455 int i;
1456 uint8_t *srcPtr= src[0];
1457 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1458
1459 for(i=0; i<srcSliceH; i++)
1460 {
1461 rgb15to24(srcPtr, dstPtr, c->srcW<<1);
1462 srcPtr+= srcStride[0];
1463 dstPtr+= dstStride[0];
1464 }
1465 }
1466}
1467
1468static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1469 int srcSliceH, uint8_t* dst[], int dstStride[]){
1470
1471 if(dstStride[0]*2==srcStride[0]*4)
1472 rgb15to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1473 else
1474 {
1475 int i;
1476 uint8_t *srcPtr= src[0];
1477 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1478
1479 for(i=0; i<srcSliceH; i++)
1480 {
1481 rgb15to32(srcPtr, dstPtr, c->srcW<<1);
1482 srcPtr+= srcStride[0];
1483 dstPtr+= dstStride[0];
1484 }
1485 }
1486}
1487
1488static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1489 int srcSliceH, uint8_t* dst[], int dstStride[]){
1490
1491 if(dstStride[0]*2==srcStride[0]*3)
1492 rgb16to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1493 else
1494 {
1495 int i;
1496 uint8_t *srcPtr= src[0];
1497 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1498
1499 for(i=0; i<srcSliceH; i++)
1500 {
1501 rgb16to24(srcPtr, dstPtr, c->srcW<<1);
1502 srcPtr+= srcStride[0];
1503 dstPtr+= dstStride[0];
1504 }
1505 }
1506}
1507
1508static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1509 int srcSliceH, uint8_t* dst[], int dstStride[]){
1510
1511 if(dstStride[0]*2==srcStride[0]*4)
1512 rgb16to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1513 else
1514 {
1515 int i;
1516 uint8_t *srcPtr= src[0];
1517 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1518
1519 for(i=0; i<srcSliceH; i++)
1520 {
1521 rgb16to32(srcPtr, dstPtr, c->srcW<<1);
1522 srcPtr+= srcStride[0];
1523 dstPtr+= dstStride[0];
1524 }
1525 }
1526}
1527
ec22603f
MN
1528static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1529 int srcSliceH, uint8_t* dst[], int dstStride[]){
1530
1531 rgb24toyv12(
1532 src[0],
1533 dst[0]+ srcSliceY *dstStride[0],
1534 dst[1]+(srcSliceY>>1)*dstStride[1],
1535 dst[2]+(srcSliceY>>1)*dstStride[2],
1536 c->srcW, srcSliceH,
1537 dstStride[0], dstStride[1], srcStride[0]);
1538}
1539
44c1035c
MN
1540/**
1541 * bring pointers in YUV order instead of YVU
1542 */
1543static inline void orderYUV(SwsContext *c, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
1544 if(c->srcFormat == IMGFMT_YV12){
1545 sortedP[0]= p[0];
1546 sortedP[1]= p[1];
1547 sortedP[2]= p[2];
1548 sortedStride[0]= sortedStride[0];
1549 sortedStride[1]= sortedStride[1];
1550 sortedStride[2]= sortedStride[2];
1551 }
1552 else if(isPacked(c->srcFormat) || isGray(c->srcFormat))
1553 {
1554 sortedP[0]= p[0];
1555 sortedP[1]=
1556 sortedP[2]= NULL;
1557 sortedStride[0]= sortedStride[0];
1558 sortedStride[1]=
1559 sortedStride[2]= 0;
1560 }
1561 else /* I420 */
1562 {
1563 sortedP[0]= p[0];
1564 sortedP[1]= p[2];
1565 sortedP[2]= p[1];
1566 sortedStride[0]= sortedStride[0];
1567 sortedStride[1]= sortedStride[2];
1568 sortedStride[2]= sortedStride[1];
1569 }
1570}
b935781b 1571
b6654a54
MN
1572/* unscaled copy like stuff (assumes nearly identical formats) */
1573static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
44c1035c 1574 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
b6654a54
MN
1575
1576 int srcStride[3];
44c1035c 1577 int dstStride[3];
b6654a54
MN
1578 uint8_t *src[3];
1579 uint8_t *dst[3];
1580
44c1035c
MN
1581 orderYUV(c, src, srcStride, srcParam, srcStrideParam);
1582 orderYUV(c, dst, dstStride, dstParam, dstStrideParam);
b6654a54
MN
1583
1584 if(isPacked(c->srcFormat))
1585 {
1586 if(dstStride[0]==srcStride[0])
1587 memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1588 else
1589 {
1590 int i;
1591 uint8_t *srcPtr= src[0];
1592 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
a861d4d7
MN
1593 int length=0;
1594
1595 /* universal length finder */
9bd8bd1a
MN
1596 while(length+c->srcW <= ABS(dstStride[0])
1597 && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
a861d4d7 1598 ASSERT(length!=0);
b6654a54
MN
1599
1600 for(i=0; i<srcSliceH; i++)
1601 {
1602 memcpy(dstPtr, srcPtr, length);
1603 srcPtr+= srcStride[0];
1604 dstPtr+= dstStride[0];
1605 }
1606 }
1607 }
1608 else
44c1035c 1609 { /* Planar YUV or gray */
b6654a54
MN
1610 int plane;
1611 for(plane=0; plane<3; plane++)
1612 {
44c1035c
MN
1613 int length= plane==0 ? c->srcW : ((c->srcW+1)>>1);
1614 int y= plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1615 int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
1616
1617 if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
0d9f3d85 1618 {
44c1035c
MN
1619 if(!isGray(c->dstFormat))
1620 memset(dst[plane], 0, dstStride[plane]*height);
0d9f3d85 1621 }
b6654a54
MN
1622 else
1623 {
44c1035c
MN
1624 if(dstStride[plane]==srcStride[plane])
1625 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1626 else
b6654a54 1627 {
44c1035c
MN
1628 int i;
1629 uint8_t *srcPtr= src[plane];
1630 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1631 for(i=0; i<height; i++)
1632 {
1633 memcpy(dstPtr, srcPtr, length);
1634 srcPtr+= srcStride[plane];
1635 dstPtr+= dstStride[plane];
1636 }
b6654a54
MN
1637 }
1638 }
1639 }
1640 }
37079906 1641}
28bf81c9 1642
44c1035c 1643static int remove_dup_fourcc(int fourcc)
0d9f3d85
A
1644{
1645 switch(fourcc)
1646 {
1647 case IMGFMT_IYUV: return IMGFMT_I420;
1648 case IMGFMT_Y8 : return IMGFMT_Y800;
1649 default: return fourcc;
1650 }
1651}
1652
28bf81c9
MN
1653SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1654 SwsFilter *srcFilter, SwsFilter *dstFilter){
1655
28bf81c9
MN
1656 SwsContext *c;
1657 int i;
37079906 1658 int usesFilter;
44c1035c 1659 int unscaled;
c7f822d9 1660 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
5cebb24b
MN
1661#ifdef ARCH_X86
1662 if(gCpuCaps.hasMMX)
1663 asm volatile("emms\n\t"::: "memory");
1664#endif
28bf81c9
MN
1665 if(swScale==NULL) globalInit();
1666
6ff0ad6b 1667 /* avoid dupplicate Formats, so we dont need to check to much */
0d9f3d85
A
1668 srcFormat = remove_dup_fourcc(srcFormat);
1669 dstFormat = remove_dup_fourcc(dstFormat);
44c1035c
MN
1670
1671 unscaled = (srcW == dstW && srcH == dstH);
1672
1673 if(!isSupportedIn(srcFormat))
b81cf274 1674 {
44c1035c
MN
1675 MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1676 return NULL;
1677 }
1678 if(!isSupportedOut(dstFormat))
1679 {
1680 MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1681 return NULL;
b81cf274 1682 }
44c1035c 1683
28bf81c9 1684 /* sanity check */
b81cf274
MN
1685 if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1686 {
0d9f3d85 1687 MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
b81cf274
MN
1688 srcW, srcH, dstW, dstH);
1689 return NULL;
1690 }
28bf81c9 1691
c7f822d9
MN
1692 if(!dstFilter) dstFilter= &dummyFilter;
1693 if(!srcFilter) srcFilter= &dummyFilter;
1694
28bf81c9 1695 c= memalign(64, sizeof(SwsContext));
c7f822d9 1696 memset(c, 0, sizeof(SwsContext));
28bf81c9
MN
1697
1698 c->srcW= srcW;
1699 c->srcH= srcH;
1700 c->dstW= dstW;
1701 c->dstH= dstH;
5521b193
MN
1702 c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1703 c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
28bf81c9
MN
1704 c->flags= flags;
1705 c->dstFormat= dstFormat;
1706 c->srcFormat= srcFormat;
1707
37079906
MN
1708 usesFilter=0;
1709 if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1710 if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1711 if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1712 if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1713 if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1714 if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1715 if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1716 if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1717
b935781b 1718 /* unscaled special Cases */
44c1035c 1719 if(unscaled && !usesFilter)
37079906 1720 {
0d9f3d85
A
1721 /* yv12_to_nv12 */
1722 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
1723 {
1724 c->swScale= PlanarToNV12Wrapper;
1725
1726 if(flags&SWS_PRINT_INFO)
1727 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1728 vo_format_name(srcFormat), vo_format_name(dstFormat));
1729 return c;
1730 }
1731 /* yv12_to_yuy2 */
1732 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
1733 {
44c1035c 1734 c->swScale= PlanarToYuy2Wrapper;
0d9f3d85
A
1735
1736 if(flags&SWS_PRINT_INFO)
1737 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1738 vo_format_name(srcFormat), vo_format_name(dstFormat));
1739 return c;
1740 }
37079906
MN
1741 /* yuv2bgr */
1742 if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1743 {
1744 // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1e1c4fe9 1745#ifdef WORDS_BIGENDIAN
daa57641
MN
1746 if(dstFormat==IMGFMT_BGR32)
1747 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1748 else
1749 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 1750#else
b6654a54 1751 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 1752#endif
37079906 1753 c->swScale= planarYuvToBgr;
b6654a54
MN
1754
1755 if(flags&SWS_PRINT_INFO)
0d9f3d85 1756 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
b6654a54
MN
1757 vo_format_name(srcFormat), vo_format_name(dstFormat));
1758 return c;
1759 }
1760
1761 /* simple copy */
44c1035c
MN
1762 if(srcFormat == dstFormat
1763 || ((isPlanarYUV(srcFormat)||isGray(srcFormat)) && (isPlanarYUV(dstFormat)||isGray(dstFormat))))
b6654a54
MN
1764 {
1765 c->swScale= simpleCopy;
1766
37079906 1767 if(flags&SWS_PRINT_INFO)
0d9f3d85 1768 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
37079906
MN
1769 vo_format_name(srcFormat), vo_format_name(dstFormat));
1770 return c;
1771 }
44c1035c 1772
b935781b
MN
1773 /* bgr32to24 & rgb32to24*/
1774 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1775 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1776 {
1777 c->swScale= bgr32to24Wrapper;
1778
1779 if(flags&SWS_PRINT_INFO)
0d9f3d85
A
1780 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1781 vo_format_name(srcFormat), vo_format_name(dstFormat));
1782 return c;
1783 }
1784
1785 /* bgr32to16 & rgb32to16*/
1786 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
1787 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
1788 {
1789 c->swScale= bgr32to16Wrapper;
1790
1791 if(flags&SWS_PRINT_INFO)
1792 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1793 vo_format_name(srcFormat), vo_format_name(dstFormat));
1794 return c;
1795 }
1796
1797 /* bgr32to15 & rgb32to15*/
1798 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
1799 ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
1800 {
1801 c->swScale= bgr32to15Wrapper;
1802
1803 if(flags&SWS_PRINT_INFO)
1804 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
b935781b
MN
1805 vo_format_name(srcFormat), vo_format_name(dstFormat));
1806 return c;
1807 }
1808
1809 /* bgr24to32 & rgb24to32*/
1810 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1811 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1812 {
1813 c->swScale= bgr24to32Wrapper;
1814
1815 if(flags&SWS_PRINT_INFO)
0d9f3d85
A
1816 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1817 vo_format_name(srcFormat), vo_format_name(dstFormat));
1818 return c;
1819 }
1820
1821 /* bgr24to16 & rgb24to16*/
1822 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
1823 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
1824 {
1825 c->swScale= bgr24to16Wrapper;
1826
1827 if(flags&SWS_PRINT_INFO)
1828 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1829 vo_format_name(srcFormat), vo_format_name(dstFormat));
1830 return c;
1831 }
1832
1833 /* bgr24to15 & rgb24to15*/
1834 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
1835 ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
1836 {
1837 c->swScale= bgr24to15Wrapper;
1838
1839 if(flags&SWS_PRINT_INFO)
1840 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
b935781b
MN
1841 vo_format_name(srcFormat), vo_format_name(dstFormat));
1842 return c;
1843 }
4bb3fa5e
MN
1844
1845 /* bgr15to16 */
1846 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1847 {
1848 c->swScale= bgr15to16Wrapper;
1849
1850 if(flags&SWS_PRINT_INFO)
0d9f3d85
A
1851 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1852 vo_format_name(srcFormat), vo_format_name(dstFormat));
1853 return c;
1854 }
1855
1856 /* bgr15to24 */
1857 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
1858 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
1859 {
1860 c->swScale= bgr15to24Wrapper;
1861
1862 if(flags&SWS_PRINT_INFO)
1863 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1864 vo_format_name(srcFormat), vo_format_name(dstFormat));
1865 return c;
1866 }
1867
1868 /* bgr15to32 */
1869 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
1870 ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
1871 {
1872 c->swScale= bgr15to32Wrapper;
1873
1874 if(flags&SWS_PRINT_INFO)
1875 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1876 vo_format_name(srcFormat), vo_format_name(dstFormat));
1877 return c;
1878 }
1879
1880 /* bgr16to24 */
1881 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
1882 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
1883 {
1884 c->swScale= bgr16to24Wrapper;
1885
1886 if(flags&SWS_PRINT_INFO)
1887 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1888 vo_format_name(srcFormat), vo_format_name(dstFormat));
1889 return c;
1890 }
1891
1892 /* bgr16to32 */
1893 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
1894 ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
1895 {
1896 c->swScale= bgr16to32Wrapper;
1897
1898 if(flags&SWS_PRINT_INFO)
1899 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
4bb3fa5e
MN
1900 vo_format_name(srcFormat), vo_format_name(dstFormat));
1901 return c;
1902 }
ec22603f
MN
1903
1904 /* bgr24toYV12 */
1905 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1906 {
1907 c->swScale= bgr24toyv12Wrapper;
1908
1909 if(flags&SWS_PRINT_INFO)
0d9f3d85 1910 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
ec22603f
MN
1911 vo_format_name(srcFormat), vo_format_name(dstFormat));
1912 return c;
1913 }
37079906
MN
1914 }
1915
28bf81c9
MN
1916 if(cpuCaps.hasMMX2)
1917 {
1918 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1919 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1920 {
1921 if(flags&SWS_PRINT_INFO)
0d9f3d85 1922 MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
28bf81c9
MN
1923 }
1924 }
1925 else
1926 c->canMMX2BeUsed=0;
1927
1e621b18
MN
1928
1929 /* dont use full vertical UV input/internaly if the source doesnt even have it */
1930 if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1931 /* dont use full horizontal UV input if the source doesnt even have it */
1932 if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1933 /* dont use full horizontal UV internally if the destination doesnt even have it */
1934 if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1935
1936 if(flags&SWS_FULL_CHR_H_INP) c->chrSrcW= srcW;
1937 else c->chrSrcW= (srcW+1)>>1;
1938
1939 if(flags&SWS_FULL_CHR_H_INT) c->chrDstW= dstW;
1940 else c->chrDstW= (dstW+1)>>1;
1941
1942 if(flags&SWS_FULL_CHR_V) c->chrSrcH= srcH;
1943 else c->chrSrcH= (srcH+1)>>1;
1944
1945 if(isHalfChrV(dstFormat)) c->chrDstH= (dstH+1)>>1;
1946 else c->chrDstH= dstH;
1947
1948 c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1949 c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1950
1951
28bf81c9
MN
1952 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1953 // but only for the FAST_BILINEAR mode otherwise do correct scaling
1954 // n-2 is the last chrominance sample available
1955 // this is not perfect, but noone shuld notice the difference, the more correct variant
1956 // would be like the vertical one, but that would require some special code for the
1957 // first and last pixel
1958 if(flags&SWS_FAST_BILINEAR)
1959 {
1e621b18
MN
1960 if(c->canMMX2BeUsed)
1961 {
1962 c->lumXInc+= 20;
1963 c->chrXInc+= 20;
1964 }
28bf81c9 1965 //we dont use the x86asm scaler if mmx is available
1e621b18
MN
1966 else if(cpuCaps.hasMMX)
1967 {
1968 c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1969 c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1970 }
28bf81c9
MN
1971 }
1972
28bf81c9
MN
1973 /* precalculate horizontal scaler filter coefficients */
1974 {
1975 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1976
c7f822d9
MN
1977 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1978 srcW , dstW, filterAlign, 1<<14, flags,
1979 srcFilter->lumH, dstFilter->lumH);
1980 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1981 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1982 srcFilter->chrH, dstFilter->chrH);
28bf81c9
MN
1983
1984#ifdef ARCH_X86
1985// cant downscale !!!
1986 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1987 {
b7dc6f66
MN
1988 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
1989 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
1990 c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
1991 c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
1992
1993 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
1994 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
28bf81c9
MN
1995 }
1996#endif
1997 } // Init Horizontal stuff
1998
1999
2000
2001 /* precalculate vertical scaler filter coefficients */
c7f822d9
MN
2002 initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2003 srcH , dstH, 1, (1<<12)-4, flags,
2004 srcFilter->lumV, dstFilter->lumV);
2005 initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2006 (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
2007 srcFilter->chrV, dstFilter->chrV);
28bf81c9
MN
2008
2009 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2010 c->vLumBufSize= c->vLumFilterSize;
2011 c->vChrBufSize= c->vChrFilterSize;
2012 for(i=0; i<dstH; i++)
2013 {
2014 int chrI= i*c->chrDstH / dstH;
2015 int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
2016 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
2017 nextSlice&= ~1; // Slices start at even boundaries
2018 if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
2019 c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
2020 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
2021 c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
2022 }
2023
2024 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
c7f822d9
MN
2025 c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
2026 c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
6c7506de 2027 //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
28bf81c9
MN
2028 for(i=0; i<c->vLumBufSize; i++)
2029 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
2030 for(i=0; i<c->vChrBufSize; i++)
2031 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
2032
2033 //try to avoid drawing green stuff between the right end and the stride end
2034 for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
2035 for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
2036
2037 ASSERT(c->chrDstH <= dstH)
28bf81c9
MN
2038
2039 // pack filter data for mmx code
2040 if(cpuCaps.hasMMX)
2041 {
c7f822d9
MN
2042 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize* dstH*4*sizeof(int16_t));
2043 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
28bf81c9
MN
2044 for(i=0; i<c->vLumFilterSize*dstH; i++)
2045 c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
2046 c->vLumFilter[i];
2047 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
2048 c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
2049 c->vChrFilter[i];
2050 }
2051
2052 if(flags&SWS_PRINT_INFO)
2053 {
2054#ifdef DITHER1XBPP
5521b193
MN
2055 char *dither= " dithered";
2056#else
2057 char *dither= "";
28bf81c9
MN
2058#endif
2059 if(flags&SWS_FAST_BILINEAR)
0d9f3d85 2060 MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
28bf81c9 2061 else if(flags&SWS_BILINEAR)
0d9f3d85 2062 MSG_INFO("\nSwScaler: BILINEAR scaler, ");
28bf81c9 2063 else if(flags&SWS_BICUBIC)
0d9f3d85 2064 MSG_INFO("\nSwScaler: BICUBIC scaler, ");
1e621b18 2065 else if(flags&SWS_X)
0d9f3d85 2066 MSG_INFO("\nSwScaler: Experimental scaler, ");
ff7ba856 2067 else if(flags&SWS_POINT)
0d9f3d85 2068 MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
d8863d37 2069 else if(flags&SWS_AREA)
0d9f3d85 2070 MSG_INFO("\nSwScaler: Area Averageing scaler, ");
28bf81c9 2071 else
0d9f3d85 2072 MSG_INFO("\nSwScaler: ehh flags invalid?! ");
28bf81c9 2073
0d9f3d85
A
2074 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
2075 MSG_INFO("from %s to%s %s ",
2076 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
2077 else
2078 MSG_INFO("from %s to %s ",
2079 vo_format_name(srcFormat), vo_format_name(dstFormat));
28bf81c9
MN
2080
2081 if(cpuCaps.hasMMX2)
0d9f3d85 2082 MSG_INFO("using MMX2\n");
28bf81c9 2083 else if(cpuCaps.has3DNow)
0d9f3d85 2084 MSG_INFO("using 3DNOW\n");
28bf81c9 2085 else if(cpuCaps.hasMMX)
0d9f3d85 2086 MSG_INFO("using MMX\n");
28bf81c9 2087 else
0d9f3d85 2088 MSG_INFO("using C\n");
28bf81c9
MN
2089 }
2090
2091 if((flags & SWS_PRINT_INFO) && verbose)
2092 {
2093 if(cpuCaps.hasMMX)
2094 {
2095 if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
0d9f3d85 2096 MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
28bf81c9
MN
2097 else
2098 {
2099 if(c->hLumFilterSize==4)
0d9f3d85 2100 MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9 2101 else if(c->hLumFilterSize==8)
0d9f3d85 2102 MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9 2103 else
0d9f3d85 2104 MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9
MN
2105
2106 if(c->hChrFilterSize==4)
0d9f3d85 2107 MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9 2108 else if(c->hChrFilterSize==8)
0d9f3d85 2109 MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9 2110 else
0d9f3d85 2111 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9
MN
2112 }
2113 }
2114 else
2115 {
2116#ifdef ARCH_X86
0d9f3d85 2117 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
28bf81c9
MN
2118#else
2119 if(flags & SWS_FAST_BILINEAR)
0d9f3d85 2120 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
28bf81c9 2121 else
0d9f3d85 2122 MSG_V("SwScaler: using C scaler for horizontal scaling\n");
28bf81c9
MN
2123#endif
2124 }
6c7506de 2125 if(isPlanarYUV(dstFormat))
28bf81c9
MN
2126 {
2127 if(c->vLumFilterSize==1)
0d9f3d85 2128 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2129 else
0d9f3d85 2130 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
2131 }
2132 else
2133 {
2134 if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
0d9f3d85 2135 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
28bf81c9
MN
2136 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
2137 else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
0d9f3d85 2138 MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2139 else
0d9f3d85 2140 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
2141 }
2142
2143 if(dstFormat==IMGFMT_BGR24)
0d9f3d85 2144 MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
28bf81c9 2145 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
fd284805 2146 else if(dstFormat==IMGFMT_BGR32)
0d9f3d85 2147 MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
fd284805 2148 else if(dstFormat==IMGFMT_BGR16)
0d9f3d85 2149 MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
fd284805 2150 else if(dstFormat==IMGFMT_BGR15)
0d9f3d85 2151 MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2152
0d9f3d85 2153 MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
28bf81c9 2154 }
1e621b18
MN
2155 if((flags & SWS_PRINT_INFO) && verbose>1)
2156 {
0d9f3d85 2157 MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1e621b18 2158 c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
0d9f3d85 2159 MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1e621b18
MN
2160 c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2161 }
37079906
MN
2162
2163 c->swScale= swScale;
28bf81c9
MN
2164 return c;
2165}
2166
2167/**
2168 * returns a normalized gaussian curve used to filter stuff
2169 * quality=3 is high quality, lowwer is lowwer quality
2170 */
c7f822d9
MN
2171
2172SwsVector *getGaussianVec(double variance, double quality){
28bf81c9
MN
2173 const int length= (int)(variance*quality + 0.5) | 1;
2174 int i;
2175 double *coeff= memalign(sizeof(double), length*sizeof(double));
2176 double middle= (length-1)*0.5;
c7f822d9
MN
2177 SwsVector *vec= malloc(sizeof(SwsVector));
2178
2179 vec->coeff= coeff;
2180 vec->length= length;
28bf81c9
MN
2181
2182 for(i=0; i<length; i++)
2183 {
2184 double dist= i-middle;
2185 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
2186 }
2187
c7f822d9
MN
2188 normalizeVec(vec, 1.0);
2189
2190 return vec;
28bf81c9
MN
2191}
2192
5521b193
MN
2193SwsVector *getConstVec(double c, int length){
2194 int i;
2195 double *coeff= memalign(sizeof(double), length*sizeof(double));
2196 SwsVector *vec= malloc(sizeof(SwsVector));
2197
2198 vec->coeff= coeff;
2199 vec->length= length;
2200
2201 for(i=0; i<length; i++)
2202 coeff[i]= c;
2203
2204 return vec;
2205}
2206
2207
c7f822d9
MN
2208SwsVector *getIdentityVec(void){
2209 double *coeff= memalign(sizeof(double), sizeof(double));
2210 SwsVector *vec= malloc(sizeof(SwsVector));
2211 coeff[0]= 1.0;
2212
2213 vec->coeff= coeff;
2214 vec->length= 1;
2215
2216 return vec;
2217}
2218
2219void normalizeVec(SwsVector *a, double height){
28bf81c9
MN
2220 int i;
2221 double sum=0;
2222 double inv;
2223
c7f822d9
MN
2224 for(i=0; i<a->length; i++)
2225 sum+= a->coeff[i];
28bf81c9
MN
2226
2227 inv= height/sum;
2228
c7f822d9
MN
2229 for(i=0; i<a->length; i++)
2230 a->coeff[i]*= height;
28bf81c9
MN
2231}
2232
c7f822d9
MN
2233void scaleVec(SwsVector *a, double scalar){
2234 int i;
2235
2236 for(i=0; i<a->length; i++)
2237 a->coeff[i]*= scalar;
2238}
2239
5cebb24b 2240static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
c7f822d9 2241 int length= a->length + b->length - 1;
28bf81c9
MN
2242 double *coeff= memalign(sizeof(double), length*sizeof(double));
2243 int i, j;
c7f822d9
MN
2244 SwsVector *vec= malloc(sizeof(SwsVector));
2245
2246 vec->coeff= coeff;
2247 vec->length= length;
28bf81c9
MN
2248
2249 for(i=0; i<length; i++) coeff[i]= 0.0;
2250
c7f822d9 2251 for(i=0; i<a->length; i++)
28bf81c9 2252 {
c7f822d9 2253 for(j=0; j<b->length; j++)
28bf81c9 2254 {
c7f822d9 2255 coeff[i+j]+= a->coeff[i]*b->coeff[j];
28bf81c9
MN
2256 }
2257 }
2258
c7f822d9 2259 return vec;
28bf81c9
MN
2260}
2261
5cebb24b 2262static SwsVector *sumVec(SwsVector *a, SwsVector *b){
c7f822d9 2263 int length= MAX(a->length, b->length);
28bf81c9
MN
2264 double *coeff= memalign(sizeof(double), length*sizeof(double));
2265 int i;
c7f822d9
MN
2266 SwsVector *vec= malloc(sizeof(SwsVector));
2267
2268 vec->coeff= coeff;
2269 vec->length= length;
28bf81c9
MN
2270
2271 for(i=0; i<length; i++) coeff[i]= 0.0;
2272
c7f822d9
MN
2273 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2274 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
2275
2276 return vec;
28bf81c9 2277}
c7f822d9 2278
5cebb24b 2279static SwsVector *diffVec(SwsVector *a, SwsVector *b){
c7f822d9
MN
2280 int length= MAX(a->length, b->length);
2281 double *coeff= memalign(sizeof(double), length*sizeof(double));
2282 int i;
2283 SwsVector *vec= malloc(sizeof(SwsVector));
2284
2285 vec->coeff= coeff;
2286 vec->length= length;
2287
2288 for(i=0; i<length; i++) coeff[i]= 0.0;
2289
2290 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2291 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
2292
2293 return vec;
2294}
2295
2296/* shift left / or right if "shift" is negative */
5cebb24b 2297static SwsVector *getShiftedVec(SwsVector *a, int shift){
c7f822d9
MN
2298 int length= a->length + ABS(shift)*2;
2299 double *coeff= memalign(sizeof(double), length*sizeof(double));
ff7ba856 2300 int i;
c7f822d9
MN
2301 SwsVector *vec= malloc(sizeof(SwsVector));
2302
2303 vec->coeff= coeff;
2304 vec->length= length;
2305
2306 for(i=0; i<length; i++) coeff[i]= 0.0;
2307
2308 for(i=0; i<a->length; i++)
2309 {
2310 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2311 }
2312
2313 return vec;
2314}
2315
5cebb24b
MN
2316void shiftVec(SwsVector *a, int shift){
2317 SwsVector *shifted= getShiftedVec(a, shift);
2318 free(a->coeff);
2319 a->coeff= shifted->coeff;
2320 a->length= shifted->length;
2321 free(shifted);
2322}
2323
2324void addVec(SwsVector *a, SwsVector *b){
2325 SwsVector *sum= sumVec(a, b);
2326 free(a->coeff);
2327 a->coeff= sum->coeff;
2328 a->length= sum->length;
2329 free(sum);
2330}
2331
2332void subVec(SwsVector *a, SwsVector *b){
2333 SwsVector *diff= diffVec(a, b);
2334 free(a->coeff);
2335 a->coeff= diff->coeff;
2336 a->length= diff->length;
2337 free(diff);
2338}
2339
2340void convVec(SwsVector *a, SwsVector *b){
2341 SwsVector *conv= getConvVec(a, b);
2342 free(a->coeff);
2343 a->coeff= conv->coeff;
2344 a->length= conv->length;
2345 free(conv);
2346}
2347
2348SwsVector *cloneVec(SwsVector *a){
2349 double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2350 int i;
2351 SwsVector *vec= malloc(sizeof(SwsVector));
2352
2353 vec->coeff= coeff;
2354 vec->length= a->length;
2355
2356 for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2357
2358 return vec;
2359}
2360
c7f822d9
MN
2361void printVec(SwsVector *a){
2362 int i;
2363 double max=0;
2364 double min=0;
2365 double range;
2366
2367 for(i=0; i<a->length; i++)
2368 if(a->coeff[i]>max) max= a->coeff[i];
2369
2370 for(i=0; i<a->length; i++)
2371 if(a->coeff[i]<min) min= a->coeff[i];
2372
2373 range= max - min;
2374
2375 for(i=0; i<a->length; i++)
2376 {
2377 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
0d9f3d85
A
2378 MSG_DBG2("%1.3f ", a->coeff[i]);
2379 for(;x>0; x--) MSG_DBG2(" ");
2380 MSG_DBG2("|\n");
c7f822d9
MN
2381 }
2382}
2383
2384void freeVec(SwsVector *a){
2385 if(!a) return;
2386 if(a->coeff) free(a->coeff);
2387 a->coeff=NULL;
2388 a->length=0;
2389 free(a);
2390}
2391
2392void freeSwsContext(SwsContext *c){
2393 int i;
c7f822d9
MN
2394 if(!c) return;
2395
2396 if(c->lumPixBuf)
2397 {
6c7506de 2398 for(i=0; i<c->vLumBufSize; i++)
c7f822d9
MN
2399 {
2400 if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2401 c->lumPixBuf[i]=NULL;
2402 }
2403 free(c->lumPixBuf);
2404 c->lumPixBuf=NULL;
2405 }
2406
2407 if(c->chrPixBuf)
2408 {
6c7506de 2409 for(i=0; i<c->vChrBufSize; i++)
c7f822d9
MN
2410 {
2411 if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2412 c->chrPixBuf[i]=NULL;
2413 }
2414 free(c->chrPixBuf);
2415 c->chrPixBuf=NULL;
2416 }
2417
2418 if(c->vLumFilter) free(c->vLumFilter);
2419 c->vLumFilter = NULL;
2420 if(c->vChrFilter) free(c->vChrFilter);
2421 c->vChrFilter = NULL;
2422 if(c->hLumFilter) free(c->hLumFilter);
2423 c->hLumFilter = NULL;
2424 if(c->hChrFilter) free(c->hChrFilter);
2425 c->hChrFilter = NULL;
2426
2427 if(c->vLumFilterPos) free(c->vLumFilterPos);
2428 c->vLumFilterPos = NULL;
2429 if(c->vChrFilterPos) free(c->vChrFilterPos);
2430 c->vChrFilterPos = NULL;
2431 if(c->hLumFilterPos) free(c->hLumFilterPos);
2432 c->hLumFilterPos = NULL;
2433 if(c->hChrFilterPos) free(c->hChrFilterPos);
2434 c->hChrFilterPos = NULL;
2435
2436 if(c->lumMmxFilter) free(c->lumMmxFilter);
2437 c->lumMmxFilter = NULL;
2438 if(c->chrMmxFilter) free(c->chrMmxFilter);
2439 c->chrMmxFilter = NULL;
2440
b7dc6f66
MN
2441 if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2442 c->lumMmx2Filter=NULL;
2443 if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2444 c->chrMmx2Filter=NULL;
2445 if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2446 c->lumMmx2FilterPos=NULL;
2447 if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2448 c->chrMmx2FilterPos=NULL;
2449
c7f822d9
MN
2450 free(c);
2451}
2452
7f56a527 2453