optimize
[libav.git] / postproc / swscale.c
CommitLineData
fe8054c0
MN
1/*
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
31190492 8
fe8054c0
MN
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
31190492 13
fe8054c0
MN
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
783e9cc9 18
28bf81c9 19/*
7322a67c 20 supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09
46de8b73 21 supported output formats: YV12, I420/IYUV, YUY2, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
e09d12f4 22 {BGR,RGB}{1,4,8,15,16} support dithering
a861d4d7 23
e09d12f4
MN
24 unscaled special converters (YV12=I420=IYUV, Y800=Y8)
25 YV12 -> {BGR,RGB}{1,4,8,15,16,24,32}
26 x -> x
27 YUV9 -> YV12
28 YUV9/YV12 -> Y800
29 Y800 -> YUV9/YV12
b935781b
MN
30 BGR24 -> BGR32 & RGB24 -> RGB32
31 BGR32 -> BGR24 & RGB32 -> RGB24
4bb3fa5e 32 BGR15 -> BGR16
b935781b
MN
33*/
34
35/*
e09d12f4
MN
36tested special converters (most are actually tested, but I didn't write it down ...)
37 YV12 -> BGR16
b935781b 38 YV12 -> YV12
4bb3fa5e 39 BGR15 -> BGR16
1e1c4fe9 40 BGR16 -> BGR16
e09d12f4 41 YVU9 -> YV12
b935781b
MN
42
43untested special converters
1e1c4fe9
MN
44 YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
45 YV12/I420 -> YV12/I420
46 YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
b935781b
MN
47 BGR24 -> BGR32 & RGB24 -> RGB32
48 BGR32 -> BGR24 & RGB32 -> RGB24
ec22603f 49 BGR24 -> YV12
28bf81c9
MN
50*/
51
d3f41512 52#include <inttypes.h>
dda87e9f 53#include <string.h>
077ea8a7 54#include <math.h>
c1b0bfb4 55#include <stdio.h>
d3f41512 56#include "../config.h"
9b464428 57#include "../mangle.h"
81b7c056 58#include <assert.h>
c1b0bfb4
MN
59#ifdef HAVE_MALLOC_H
60#include <malloc.h>
b6acbc3c
BS
61#else
62#include <stdlib.h>
c1b0bfb4 63#endif
d604bab9 64#include "swscale.h"
7630f2e0 65#include "../cpudetect.h"
a861d4d7 66#include "../bswap.h"
28bf81c9 67#include "../libvo/img_format.h"
37079906 68#include "rgb2rgb.h"
b0db4198 69#include "../libvo/fastmemcpy.h"
4a53a912 70#include "../mp_msg.h"
0d9f3d85
A
71
72#define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
73#define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
74#define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
75#define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
76#define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
77#define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
78
541c4eb9 79#undef MOVNTQ
7d7f78b5 80#undef PAVGB
d3f41512 81
783e9cc9 82//#undef HAVE_MMX2
7f56a527 83//#define HAVE_3DNOW
d3f41512 84//#undef HAVE_MMX
783e9cc9 85//#undef ARCH_X86
2ba1bff0 86//#define WORDS_BIGENDIAN
d604bab9 87#define DITHER1XBPP
d3f41512 88
ac6a2e45
MN
89#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
90
1e621b18 91#define RET 0xC3 //near return opcode for X86
c1b0bfb4 92
28bf81c9 93#ifdef MP_DEBUG
81b7c056 94#define ASSERT(x) assert(x);
28bf81c9 95#else
c1b0bfb4 96#define ASSERT(x) ;
28bf81c9
MN
97#endif
98
99#ifdef M_PI
100#define PI M_PI
101#else
102#define PI 3.14159265358979323846
103#endif
c1b0bfb4 104
6c7506de 105//FIXME replace this with something faster
d80e2fa2
MN
106#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9 \
107 || (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
7322a67c 108#define isYUV(x) ((x)==IMGFMT_UYVY || (x)==IMGFMT_YUY2 || isPlanarYUV(x))
44c1035c 109#define isGray(x) ((x)==IMGFMT_Y800)
cf7d1c1a
MN
110#define isRGB(x) (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
111#define isBGR(x) (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
7322a67c 112#define isSupportedIn(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
b72034dd 113 || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
a861d4d7 114 || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
d80e2fa2
MN
115 || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9\
116 || (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
46de8b73 117#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2\
d80e2fa2 118 || (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P\
cf7d1c1a 119 || isRGB(x) || isBGR(x)\
e616aa93 120 || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
7322a67c 121#define isPacked(x) ((x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY ||isRGB(x) || isBGR(x))
6ff0ad6b
MN
122
123#define RGB2YUV_SHIFT 16
1e621b18
MN
124#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
125#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
126#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
127#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
128#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
129#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
130#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
131#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
132#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
6c7506de 133
e3d2500f 134extern int verbose; // defined in mplayer.c
0481412a
MN
135extern const int32_t Inverse_Table_6_9[8][4];
136
783e9cc9
MN
137/*
138NOTES
d604bab9 139Special versions: fast Y 1:1 scaling (no interpolation in y direction)
31190492 140
783e9cc9 141TODO
d604bab9 142more intelligent misalignment avoidance for the horizontal scaler
c1b0bfb4
MN
143write special vertical cubic upscale version
144Optimize C code (yv12 / minmax)
ff7ba856 145add support for packed pixel yuv input & output
6ff0ad6b
MN
146add support for Y8 output
147optimize bgr24 & bgr32
ff7ba856 148add BGR4 output support
1e621b18 149write special BGR->BGR scaler
37079906 150deglobalize yuv2rgb*.c
783e9cc9 151*/
31190492 152
d604bab9 153#define ABS(a) ((a) > 0 ? (a) : (-(a)))
2ff198c1
MN
154#define MIN(a,b) ((a) > (b) ? (b) : (a))
155#define MAX(a,b) ((a) < (b) ? (b) : (a))
d604bab9 156
7630f2e0
MN
157#ifdef ARCH_X86
158#define CAN_COMPILE_X86_ASM
d604bab9
MN
159#endif
160
7630f2e0 161#ifdef CAN_COMPILE_X86_ASM
d604bab9 162static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
390b20a6
MN
163static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
164static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
165static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
166static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
f62255fb
MN
167static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
168static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
d604bab9
MN
169static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
170static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
171static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
077ea8a7 172static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
d604bab9
MN
173static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
174static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
175static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
6ff0ad6b 176static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
d604bab9 177
3fe8e8f0
MN
178static volatile uint64_t __attribute__((aligned(8))) b5Dither;
179static volatile uint64_t __attribute__((aligned(8))) g5Dither;
180static volatile uint64_t __attribute__((aligned(8))) g6Dither;
181static volatile uint64_t __attribute__((aligned(8))) r5Dither;
d8fa3c54
MN
182
183static uint64_t __attribute__((aligned(8))) dither4[2]={
184 0x0103010301030103LL,
185 0x0200020002000200LL,};
186
187static uint64_t __attribute__((aligned(8))) dither8[2]={
188 0x0602060206020602LL,
189 0x0004000400040004LL,};
d604bab9
MN
190
191static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
192static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
193static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
194static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
195static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
196static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
197
99d2cb72
MN
198static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
199static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
200static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
201
ac6a2e45
MN
202#ifdef FAST_BGR2YV12
203static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL;
4342fc14
MN
204static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
205static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
ac6a2e45
MN
206#else
207static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL;
4342fc14
MN
208static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
209static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
ac6a2e45
MN
210#endif
211static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
4342fc14 212static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
ac6a2e45 213static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL;
d604bab9 214#endif
783e9cc9
MN
215
216// clipping helper table for C implementations:
217static unsigned char clip_table[768];
218
28bf81c9 219//global sws_flags from the command line
1f347f22 220int sws_flags=2;
077ea8a7 221
5cebb24b
MN
222//global srcFilter
223SwsFilter src_filter= {NULL, NULL, NULL, NULL};
224
225float sws_lum_gblur= 0.0;
226float sws_chr_gblur= 0.0;
227int sws_chr_vshift= 0;
228int sws_chr_hshift= 0;
5521b193
MN
229float sws_chr_sharpen= 0.0;
230float sws_lum_sharpen= 0.0;
5cebb24b 231
28bf81c9
MN
232/* cpuCaps combined from cpudetect and whats actually compiled in
233 (if there is no support for something compiled in it wont appear here) */
234static CpuCaps cpuCaps;
d3f41512 235
28bf81c9
MN
236void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
237 int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
2ff198c1 238
5cebb24b 239static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
5859233b 240static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
cf7d1c1a 241void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);
5859233b 242
cf7d1c1a
MN
243extern const uint8_t dither_2x2_4[2][8];
244extern const uint8_t dither_2x2_8[2][8];
245extern const uint8_t dither_8x8_32[8][8];
246extern const uint8_t dither_8x8_73[8][8];
247extern const uint8_t dither_8x8_220[8][8];
5cebb24b 248
7630f2e0 249#ifdef CAN_COMPILE_X86_ASM
96034638
MN
250void in_asm_used_var_warning_killer()
251{
077ea8a7 252 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
5ac80202 253 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+
6ff0ad6b 254 M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
96034638
MN
255 if(i) i=0;
256}
257#endif
d604bab9 258
c7a810cc 259static int testFormat[]={
e616aa93 260IMGFMT_YVU9,
c7a810cc
MN
261IMGFMT_YV12,
262//IMGFMT_IYUV,
263IMGFMT_I420,
264IMGFMT_BGR15,
265IMGFMT_BGR16,
266IMGFMT_BGR24,
267IMGFMT_BGR32,
81a571a8
MN
268IMGFMT_RGB24,
269IMGFMT_RGB32,
c7a810cc
MN
270//IMGFMT_Y8,
271IMGFMT_Y800,
272//IMGFMT_YUY2,
2730
274};
275
/*
 * Returns the sum of squared differences between two 8-bit planes.
 *
 * src1/src2       top-left sample of each plane
 * stride1/stride2 distance in bytes between consecutive rows of each plane
 * w, h            size of the compared area in samples
 */
static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
	uint64_t total= 0;
	int row;

	for(row=0; row<h; row++){
		const int rowOff1= row*stride1;
		const int rowOff2= row*stride2;
		int col;

		for(col=0; col<w; col++){
			const int diff= src1[rowOff1 + col] - src2[rowOff2 + col];
			total+= diff*diff;
		}
	}
	return total;
}
288
289// test by ref -> src -> dst -> out & compare out against ref
290// ref & out are YV12
291static void doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
292 int srcW, int srcH, int dstW, int dstH, int flags){
293 uint8_t *src[3];
294 uint8_t *dst[3];
295 uint8_t *out[3];
296 int srcStride[3], dstStride[3];
297 int i;
298 uint64_t ssdY, ssdU, ssdV;
299 SwsContext *srcContext, *dstContext, *outContext;
300
301 for(i=0; i<3; i++){
e09d12f4
MN
302 // avoid stride % bpp != 0
303 if(srcFormat==IMGFMT_RGB24 || srcFormat==IMGFMT_BGR24)
304 srcStride[i]= srcW*3;
305 else
306 srcStride[i]= srcW*4;
307
308 if(dstFormat==IMGFMT_RGB24 || dstFormat==IMGFMT_BGR24)
309 dstStride[i]= dstW*3;
310 else
311 dstStride[i]= dstW*4;
312
c7a810cc
MN
313 src[i]= malloc(srcStride[i]*srcH);
314 dst[i]= malloc(dstStride[i]*dstH);
315 out[i]= malloc(refStride[i]*h);
316 }
317
318 srcContext= getSwsContext(w, h, IMGFMT_YV12, srcW, srcH, srcFormat, flags, NULL, NULL);
319 dstContext= getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL);
320 outContext= getSwsContext(dstW, dstH, dstFormat, w, h, IMGFMT_YV12, flags, NULL, NULL);
321 if(srcContext==NULL ||dstContext==NULL ||outContext==NULL){
322 printf("Failed allocating swsContext\n");
323 goto end;
324 }
325// printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
326// (int)src[0], (int)src[1], (int)src[2]);
327
328 srcContext->swScale(srcContext, ref, refStride, 0, h , src, srcStride);
329 dstContext->swScale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
330 outContext->swScale(outContext, dst, dstStride, 0, dstH, out, refStride);
331
332 ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
333 ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
334 ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
335
336 if(isGray(srcFormat) || isGray(dstFormat)) ssdU=ssdV=0; //FIXME check that output is really gray
337
338 ssdY/= w*h;
339 ssdU/= w*h/4;
340 ssdV/= w*h/4;
341
e616aa93 342 if(ssdY>100 || ssdU>50 || ssdV>50){
c7a810cc
MN
343 printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n",
344 vo_format_name(srcFormat), srcW, srcH,
345 vo_format_name(dstFormat), dstW, dstH,
346 flags,
347 ssdY, ssdU, ssdV);
348 }
349
350 end:
351
352 freeSwsContext(srcContext);
353 freeSwsContext(dstContext);
354 freeSwsContext(outContext);
355
356 for(i=0; i<3; i++){
357 free(src[i]);
358 free(dst[i]);
359 free(out[i]);
360 }
361}
362
363static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
364 int srcFormat, dstFormat, srcFormatIndex, dstFormatIndex;
365 int srcW, srcH, dstW, dstH;
366 int flags;
367
368 for(srcFormatIndex=0; ;srcFormatIndex++){
369 srcFormat= testFormat[srcFormatIndex];
370 if(!srcFormat) break;
371 for(dstFormatIndex=0; ;dstFormatIndex++){
372 dstFormat= testFormat[dstFormatIndex];
373 if(!dstFormat) break;
374 if(!isSupportedOut(dstFormat)) continue;
e09d12f4
MN
375printf("%s -> %s\n",
376 vo_format_name(srcFormat),
377 vo_format_name(dstFormat));
c7a810cc
MN
378
379 srcW= w+w/3;
380 srcH= h+h/3;
381 for(dstW=w; dstW<w*2; dstW+= dstW/3){
382 for(dstH=h; dstH<h*2; dstH+= dstH/3){
383 for(flags=1; flags<33; flags*=2)
384 doTest(src, stride, w, h, srcFormat, dstFormat,
385 srcW, srcH, dstW, dstH, flags);
386 }
387 }
388 }
389 }
390}
391
/*
 * Plain C vertical scaler for planar output.
 *
 * For every output sample the matching samples of lumFilterSize (resp.
 * chrFilterSize) input lines are multiplied by the filter coefficients,
 * summed, shifted right by 19 and clamped to 0..255.
 *
 * lumFilter, lumSrc, lumFilterSize  luma coefficients and input lines
 * chrFilter, chrSrc, chrFilterSize  chroma coefficients and input lines;
 *                                   in each chroma line the V samples are
 *                                   read 2048 entries after the U samples
 * dest           luma output, dstW samples
 * uDest, vDest   chroma outputs, chrDstW samples each; chroma is skipped
 *                entirely when uDest is NULL
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
	int x;

	for(x=0; x<dstW; x++)
	{
		int acc= 0;
		int tap;

		for(tap=0; tap<lumFilterSize; tap++)
			acc+= lumSrc[tap][x] * lumFilter[tap];

		acc>>= 19;
		if(acc<0)        acc= 0;
		else if(acc>255) acc= 255;
		dest[x]= acc;
	}

	if(uDest == NULL)
		return;

	for(x=0; x<chrDstW; x++)
	{
		int accU= 0;
		int accV= 0;
		int tap;

		for(tap=0; tap<chrFilterSize; tap++)
		{
			const int coeff= chrFilter[tap];
			accU+= chrSrc[tap][x       ] * coeff;
			accV+= chrSrc[tap][x + 2048] * coeff;
		}

		accU>>= 19;
		accV>>= 19;
		if(accU<0)        accU= 0;
		else if(accU>255) accU= 255;
		if(accV<0)        accV= 0;
		else if(accV>255) accV= 255;

		uDest[x]= accU;
		vDest[x]= accV;
	}
}
424
46de8b73 425
/*
 * Building blocks for the C packed-output vertical scalers.
 *
 * Every macro below opens a "for(i=0; i<(dstW>>1); i++){" loop over pairs of
 * output pixels and deliberately leaves that brace open: the code using the
 * macro appends the per-format store statements and the closing "}".
 * Inside the loop the macros provide i2 (= 2*i) and the values Y1, Y2 (the
 * two luma samples of the pair) and U, V (the chroma samples shared by the
 * pair; V is read 2048 entries after U in the same buffer).
 *
 *  YSCALE_YUV_2_PACKEDX_C  multi-tap filter over lumSrc/chrSrc, >>19, then
 *                          clamps to 0..255 when bit 8 of (Y1|Y2|U|V) is set;
 *                          also declares "type *r, *b, *g"
 *  YSCALE_YUV_2_PACKED2_C  blend of two lines (buf0/buf1, uvbuf0/uvbuf1)
 *                          weighted by yalpha1/yalpha and uvalpha1/uvalpha
 *  YSCALE_YUV_2_PACKED1_C  single line: luma from buf0, chroma from uvbuf1
 *  YSCALE_YUV_2_PACKED1B_C single luma line, chroma is uvbuf0+uvbuf1 (>>8)
 *
 * The matching YSCALE_YUV_2_RGB*_C(type) variants additionally point r, g
 * and b at the lookup tables selected by V, U+V and U (c->table_rV,
 * c->table_gU + c->table_gV, c->table_bU).
 */
25593e29 426#define YSCALE_YUV_2_PACKEDX_C(type) \
cf7d1c1a
MN
427 for(i=0; i<(dstW>>1); i++){\
428 int j;\
429 int Y1=0;\
430 int Y2=0;\
431 int U=0;\
432 int V=0;\
433 type *r, *b, *g;\
434 const int i2= 2*i;\
435 \
436 for(j=0; j<lumFilterSize; j++)\
437 {\
438 Y1 += lumSrc[j][i2] * lumFilter[j];\
439 Y2 += lumSrc[j][i2+1] * lumFilter[j];\
440 }\
441 for(j=0; j<chrFilterSize; j++)\
442 {\
443 U += chrSrc[j][i] * chrFilter[j];\
444 V += chrSrc[j][i+2048] * chrFilter[j];\
445 }\
446 Y1>>=19;\
447 Y2>>=19;\
448 U >>=19;\
449 V >>=19;\
450 if((Y1|Y2|U|V)&256)\
451 {\
452 if(Y1>255) Y1=255;\
453 else if(Y1<0)Y1=0;\
454 if(Y2>255) Y2=255;\
455 else if(Y2<0)Y2=0;\
456 if(U>255) U=255;\
457 else if(U<0) U=0;\
458 if(V>255) V=255;\
459 else if(V<0) V=0;\
46de8b73
MN
460 }
461
462#define YSCALE_YUV_2_RGBX_C(type) \
25593e29 463 YSCALE_YUV_2_PACKEDX_C(type)\
cf7d1c1a
MN
464 r = c->table_rV[V];\
465 g = c->table_gU[U] + c->table_gV[V];\
466 b = c->table_bU[U];\
467
25593e29 468#define YSCALE_YUV_2_PACKED2_C \
cf7d1c1a
MN
469 for(i=0; i<(dstW>>1); i++){\
470 const int i2= 2*i;\
471 int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19;\
472 int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;\
473 int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19;\
474 int V= (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;\
46de8b73
MN
475
476#define YSCALE_YUV_2_RGB2_C(type) \
25593e29 477 YSCALE_YUV_2_PACKED2_C\
cf7d1c1a
MN
478 type *r, *b, *g;\
479 r = c->table_rV[V];\
480 g = c->table_gU[U] + c->table_gV[V];\
481 b = c->table_bU[U];\
482
25593e29 483#define YSCALE_YUV_2_PACKED1_C \
cf7d1c1a
MN
484 for(i=0; i<(dstW>>1); i++){\
485 const int i2= 2*i;\
486 int Y1= buf0[i2 ]>>7;\
487 int Y2= buf0[i2+1]>>7;\
488 int U= (uvbuf1[i ])>>7;\
489 int V= (uvbuf1[i+2048])>>7;\
46de8b73
MN
490
491#define YSCALE_YUV_2_RGB1_C(type) \
25593e29 492 YSCALE_YUV_2_PACKED1_C\
cf7d1c1a
MN
493 type *r, *b, *g;\
494 r = c->table_rV[V];\
495 g = c->table_gU[U] + c->table_gV[V];\
496 b = c->table_bU[U];\
497
25593e29 498#define YSCALE_YUV_2_PACKED1B_C \
cf7d1c1a
MN
499 for(i=0; i<(dstW>>1); i++){\
500 const int i2= 2*i;\
501 int Y1= buf0[i2 ]>>7;\
502 int Y2= buf0[i2+1]>>7;\
503 int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\
504 int V= (uvbuf0[i+2048] + uvbuf1[i+2048])>>8;\
46de8b73
MN
505
506#define YSCALE_YUV_2_RGB1B_C(type) \
25593e29 507 YSCALE_YUV_2_PACKED1B_C\
cf7d1c1a
MN
508 type *r, *b, *g;\
509 r = c->table_rV[V];\
510 g = c->table_gU[U] + c->table_gV[V];\
511 b = c->table_bU[U];\
512
/*
 * Expands to a switch over c->dstFormat that writes one line of packed
 * output.
 *
 * func(type)  one of the YSCALE_YUV_2_RGB*_C macros; it opens the per-pixel-
 *             pair loop and provides i2, Y1, Y2 and the r/g/b table pointers.
 *             Each case below appends the stores and closes that loop.
 * func2       the matching YSCALE_YUV_2_PACKED*_C macro (no tables), used
 *             for YUY2 output where Y1, U, Y2, V are stored directly.
 *
 * The expansion site must provide c, dest, dstW, y and i, plus whatever the
 * chosen func/func2 reads. The 1 bpp case does not use func: it always
 * blends buf0/buf1 with yalpha1/yalpha, packs 8 pixels per output byte and
 * only handles complete groups of 8 pixels.
 *
 * 15/16 bpp use 2x2 ordered dither, 8/4/1 bpp use 8x8 ordered dither; the
 * dither row is selected by y.
 *
 * dest is advanced with a plain assignment rather than the old
 * "((uint8_t*)dest)+=6" form, because a cast is not an lvalue in C.
 * NOTE(review): this assumes dest is an object pointer at every expansion
 * site -- confirm against swscale_template.c.
 */
#define YSCALE_YUV_2_ANYRGB_C(func, func2)\
	switch(c->dstFormat)\
	{\
	case IMGFMT_BGR32:\
	case IMGFMT_RGB32:\
		func(uint32_t)\
			((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
			((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
		}		\
		break;\
	case IMGFMT_RGB24:\
		func(uint8_t)\
			((uint8_t*)dest)[0]= r[Y1];\
			((uint8_t*)dest)[1]= g[Y1];\
			((uint8_t*)dest)[2]= b[Y1];\
			((uint8_t*)dest)[3]= r[Y2];\
			((uint8_t*)dest)[4]= g[Y2];\
			((uint8_t*)dest)[5]= b[Y2];\
			dest= (void*)((uint8_t*)dest + 6);\
		}\
		break;\
	case IMGFMT_BGR24:\
		func(uint8_t)\
			((uint8_t*)dest)[0]= b[Y1];\
			((uint8_t*)dest)[1]= g[Y1];\
			((uint8_t*)dest)[2]= r[Y1];\
			((uint8_t*)dest)[3]= b[Y2];\
			((uint8_t*)dest)[4]= g[Y2];\
			((uint8_t*)dest)[5]= r[Y2];\
			dest= (void*)((uint8_t*)dest + 6);\
		}\
		break;\
	case IMGFMT_RGB16:\
	case IMGFMT_BGR16:\
		{\
			const int dr1= dither_2x2_8[y&1    ][0];\
			const int dg1= dither_2x2_4[y&1    ][0];\
			const int db1= dither_2x2_8[(y&1)^1][0];\
			const int dr2= dither_2x2_8[y&1    ][1];\
			const int dg2= dither_2x2_4[y&1    ][1];\
			const int db2= dither_2x2_8[(y&1)^1][1];\
			func(uint16_t)\
				((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
				((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
			}\
		}\
		break;\
	case IMGFMT_RGB15:\
	case IMGFMT_BGR15:\
		{\
			const int dr1= dither_2x2_8[y&1    ][0];\
			const int dg1= dither_2x2_8[y&1    ][1];\
			const int db1= dither_2x2_8[(y&1)^1][0];\
			const int dr2= dither_2x2_8[y&1    ][1];\
			const int dg2= dither_2x2_8[y&1    ][0];\
			const int db2= dither_2x2_8[(y&1)^1][1];\
			func(uint16_t)\
				((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
				((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
			}\
		}\
		break;\
	case IMGFMT_RGB8:\
	case IMGFMT_BGR8:\
		{\
			const uint8_t * const d64= dither_8x8_73[y&7];\
			const uint8_t * const d32= dither_8x8_32[y&7];\
			func(uint8_t)\
				((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
				((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
			}\
		}\
		break;\
	case IMGFMT_RGB4:\
	case IMGFMT_BGR4:\
		{\
			const uint8_t * const d64= dither_8x8_73 [y&7];\
			const uint8_t * const d128=dither_8x8_220[y&7];\
			func(uint8_t)\
				((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
				                 + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
			}\
		}\
		break;\
	case IMGFMT_RG4B:\
	case IMGFMT_BG4B:\
		{\
			const uint8_t * const d64= dither_8x8_73 [y&7];\
			const uint8_t * const d128=dither_8x8_220[y&7];\
			func(uint8_t)\
				((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
				((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
			}\
		}\
		break;\
	case IMGFMT_RGB1:\
	case IMGFMT_BGR1:\
		{\
			const uint8_t * const d128=dither_8x8_220[y&7];\
			uint8_t *g= c->table_gU[128] + c->table_gV[128];\
			for(i=0; i<dstW-7; i+=8){\
				int acc;\
				acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
				acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
				acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
				acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
				acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
				acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
				acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
				acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
				((uint8_t*)dest)[0]= acc;\
				dest= (void*)((uint8_t*)dest + 1);\
			}\
		}\
		break;\
	case IMGFMT_YUY2:\
		func2\
			((uint8_t*)dest)[2*i2+0]= Y1;\
			((uint8_t*)dest)[2*i2+1]= U;\
			((uint8_t*)dest)[2*i2+2]= Y2;\
			((uint8_t*)dest)[2*i2+3]= V;\
		}		\
		break;\
	}

683
25593e29 684static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
e3d2500f 685 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
cf7d1c1a 686 uint8_t *dest, int dstW, int y)
e3d2500f 687{
cf7d1c1a
MN
688 int i;
689 switch(c->dstFormat)
e3d2500f 690 {
cf7d1c1a
MN
691 case IMGFMT_RGB32:
692 case IMGFMT_BGR32:
693 YSCALE_YUV_2_RGBX_C(uint32_t)
694 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
695 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
e3d2500f 696 }
cf7d1c1a
MN
697 break;
698 case IMGFMT_RGB24:
699 YSCALE_YUV_2_RGBX_C(uint8_t)
700 ((uint8_t*)dest)[0]= r[Y1];
701 ((uint8_t*)dest)[1]= g[Y1];
702 ((uint8_t*)dest)[2]= b[Y1];
703 ((uint8_t*)dest)[3]= r[Y2];
704 ((uint8_t*)dest)[4]= g[Y2];
705 ((uint8_t*)dest)[5]= b[Y2];
706 ((uint8_t*)dest)+=6;
707 }
708 break;
709 case IMGFMT_BGR24:
710 YSCALE_YUV_2_RGBX_C(uint8_t)
711 ((uint8_t*)dest)[0]= b[Y1];
712 ((uint8_t*)dest)[1]= g[Y1];
713 ((uint8_t*)dest)[2]= r[Y1];
714 ((uint8_t*)dest)[3]= b[Y2];
715 ((uint8_t*)dest)[4]= g[Y2];
716 ((uint8_t*)dest)[5]= r[Y2];
717 ((uint8_t*)dest)+=6;
718 }
719 break;
720 case IMGFMT_RGB16:
721 case IMGFMT_BGR16:
722 {
723 const int dr1= dither_2x2_8[y&1 ][0];
724 const int dg1= dither_2x2_4[y&1 ][0];
725 const int db1= dither_2x2_8[(y&1)^1][0];
726 const int dr2= dither_2x2_8[y&1 ][1];
727 const int dg2= dither_2x2_4[y&1 ][1];
728 const int db2= dither_2x2_8[(y&1)^1][1];
729 YSCALE_YUV_2_RGBX_C(uint16_t)
730 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
731 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
e3d2500f 732 }
e3d2500f 733 }
cf7d1c1a
MN
734 break;
735 case IMGFMT_RGB15:
736 case IMGFMT_BGR15:
737 {
738 const int dr1= dither_2x2_8[y&1 ][0];
739 const int dg1= dither_2x2_8[y&1 ][1];
740 const int db1= dither_2x2_8[(y&1)^1][0];
741 const int dr2= dither_2x2_8[y&1 ][1];
742 const int dg2= dither_2x2_8[y&1 ][0];
743 const int db2= dither_2x2_8[(y&1)^1][1];
744 YSCALE_YUV_2_RGBX_C(uint16_t)
745 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
746 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
e3d2500f 747 }
cf7d1c1a
MN
748 }
749 break;
750 case IMGFMT_RGB8:
751 case IMGFMT_BGR8:
752 {
753 const uint8_t * const d64= dither_8x8_73[y&7];
754 const uint8_t * const d32= dither_8x8_32[y&7];
755 YSCALE_YUV_2_RGBX_C(uint8_t)
756 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
757 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
e3d2500f 758 }
e3d2500f 759 }
cf7d1c1a
MN
760 break;
761 case IMGFMT_RGB4:
762 case IMGFMT_BGR4:
763 {
764 const uint8_t * const d64= dither_8x8_73 [y&7];
765 const uint8_t * const d128=dither_8x8_220[y&7];
766 YSCALE_YUV_2_RGBX_C(uint8_t)
799fd467 767 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
f17457ac
MN
768 +((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
769 }
770 }
771 break;
772 case IMGFMT_RG4B:
773 case IMGFMT_BG4B:
774 {
775 const uint8_t * const d64= dither_8x8_73 [y&7];
776 const uint8_t * const d128=dither_8x8_220[y&7];
777 YSCALE_YUV_2_RGBX_C(uint8_t)
cf7d1c1a
MN
778 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
779 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
e3d2500f 780 }
cf7d1c1a
MN
781 }
782 break;
783 case IMGFMT_RGB1:
784 case IMGFMT_BGR1:
785 {
786 const uint8_t * const d128=dither_8x8_220[y&7];
787 uint8_t *g= c->table_gU[128] + c->table_gV[128];
788 int acc=0;
789 for(i=0; i<dstW-1; i+=2){
790 int j;
791 int Y1=0;
792 int Y2=0;
793
794 for(j=0; j<lumFilterSize; j++)
795 {
796 Y1 += lumSrc[j][i] * lumFilter[j];
797 Y2 += lumSrc[j][i+1] * lumFilter[j];
798 }
799 Y1>>=19;
800 Y2>>=19;
801 if((Y1|Y2)&256)
802 {
803 if(Y1>255) Y1=255;
804 else if(Y1<0)Y1=0;
805 if(Y2>255) Y2=255;
806 else if(Y2<0)Y2=0;
807 }
808 acc+= acc + g[Y1+d128[(i+0)&7]];
809 acc+= acc + g[Y2+d128[(i+1)&7]];
810 if((i&7)==6){
811 ((uint8_t*)dest)[0]= acc;
812 ((uint8_t*)dest)++;
813 }
e3d2500f 814 }
e3d2500f 815 }
cf7d1c1a 816 break;
46de8b73 817 case IMGFMT_YUY2:
25593e29 818 YSCALE_YUV_2_PACKEDX_C(void)
46de8b73
MN
819 ((uint8_t*)dest)[2*i2+0]= Y1;
820 ((uint8_t*)dest)[2*i2+1]= U;
821 ((uint8_t*)dest)[2*i2+2]= Y2;
822 ((uint8_t*)dest)[2*i2+3]= V;
823 }
824 break;
e3d2500f
MN
825 }
826}
827
828
7630f2e0
MN
829//Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
830//Plain C versions
726a959a
MN
831#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
832#define COMPILE_C
833#endif
834
835#ifdef CAN_COMPILE_X86_ASM
836
837#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
838#define COMPILE_MMX
839#endif
840
841#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
842#define COMPILE_MMX2
843#endif
844
845#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
846#define COMPILE_3DNOW
847#endif
848#endif //CAN_COMPILE_X86_ASM
849
850#undef HAVE_MMX
851#undef HAVE_MMX2
852#undef HAVE_3DNOW
726a959a
MN
853
854#ifdef COMPILE_C
7630f2e0
MN
855#undef HAVE_MMX
856#undef HAVE_MMX2
857#undef HAVE_3DNOW
7630f2e0
MN
858#define RENAME(a) a ## _C
859#include "swscale_template.c"
726a959a 860#endif
397c035e 861
7630f2e0 862#ifdef CAN_COMPILE_X86_ASM
397c035e 863
7630f2e0
MN
864//X86 versions
865/*
866#undef RENAME
867#undef HAVE_MMX
868#undef HAVE_MMX2
869#undef HAVE_3DNOW
870#define ARCH_X86
871#define RENAME(a) a ## _X86
872#include "swscale_template.c"
1faf0867 873*/
7630f2e0 874//MMX versions
726a959a 875#ifdef COMPILE_MMX
7630f2e0
MN
876#undef RENAME
877#define HAVE_MMX
878#undef HAVE_MMX2
879#undef HAVE_3DNOW
7630f2e0
MN
880#define RENAME(a) a ## _MMX
881#include "swscale_template.c"
726a959a 882#endif
7630f2e0
MN
883
884//MMX2 versions
726a959a 885#ifdef COMPILE_MMX2
7630f2e0
MN
886#undef RENAME
887#define HAVE_MMX
888#define HAVE_MMX2
889#undef HAVE_3DNOW
7630f2e0
MN
890#define RENAME(a) a ## _MMX2
891#include "swscale_template.c"
726a959a 892#endif
7630f2e0
MN
893
894//3DNOW versions
726a959a 895#ifdef COMPILE_3DNOW
7630f2e0
MN
896#undef RENAME
897#define HAVE_MMX
898#undef HAVE_MMX2
899#define HAVE_3DNOW
7630f2e0
MN
900#define RENAME(a) a ## _3DNow
901#include "swscale_template.c"
726a959a 902#endif
7630f2e0
MN
903
904#endif //CAN_COMPILE_X86_ASM
905
906// minor note: the HAVE_xyz is messed up after that line so dont use it
d604bab9 907
d3f41512 908
6c7506de 909// old global scaler, dont use for new code
28bf81c9
MN
910// will use sws_flags from the command line
911void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
912 int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
d1fac6cf 913 int srcW, int srcH, int dstW, int dstH){
31190492 914
28bf81c9
MN
915 static SwsContext *context=NULL;
916 int dstFormat;
28bf81c9
MN
917 int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
918
6c7506de 919 switch(dstbpp)
28bf81c9 920 {
6c7506de
MN
921 case 8 : dstFormat= IMGFMT_Y8; break;
922 case 12: dstFormat= IMGFMT_YV12; break;
923 case 15: dstFormat= IMGFMT_BGR15; break;
924 case 16: dstFormat= IMGFMT_BGR16; break;
925 case 24: dstFormat= IMGFMT_BGR24; break;
926 case 32: dstFormat= IMGFMT_BGR32; break;
927 default: return;
928 }
929
930 if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
931
b6654a54 932 context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
6c7506de
MN
933}
934
5859233b 935void swsGetFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam)
6c7506de 936{
6c7506de 937 static int firstTime=1;
5859233b 938 *flags=0;
6c7506de 939
5521b193 940#ifdef ARCH_X86
6c7506de
MN
941 if(gCpuCaps.hasMMX)
942 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
5521b193 943#endif
6c7506de
MN
944 if(firstTime)
945 {
28bf81c9 946 firstTime=0;
5859233b 947 *flags= SWS_PRINT_INFO;
6c7506de 948 }
5859233b 949 else if(verbose>1) *flags= SWS_PRINT_INFO;
6c7506de
MN
950
951 if(src_filter.lumH) freeVec(src_filter.lumH);
952 if(src_filter.lumV) freeVec(src_filter.lumV);
953 if(src_filter.chrH) freeVec(src_filter.chrH);
954 if(src_filter.chrV) freeVec(src_filter.chrV);
955
956 if(sws_lum_gblur!=0.0){
957 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
958 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
959 }else{
960 src_filter.lumH= getIdentityVec();
961 src_filter.lumV= getIdentityVec();
962 }
c7f822d9 963
6c7506de
MN
964 if(sws_chr_gblur!=0.0){
965 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
966 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
967 }else{
968 src_filter.chrH= getIdentityVec();
969 src_filter.chrV= getIdentityVec();
970 }
5521b193 971
6c7506de
MN
972 if(sws_chr_sharpen!=0.0){
973 SwsVector *g= getConstVec(-1.0, 3);
974 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
975 g->coeff[1]=2.0;
976 addVec(id, g);
977 convVec(src_filter.chrH, id);
978 convVec(src_filter.chrV, id);
979 freeVec(g);
980 freeVec(id);
981 }
5521b193 982
6c7506de
MN
983 if(sws_lum_sharpen!=0.0){
984 SwsVector *g= getConstVec(-1.0, 3);
985 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
986 g->coeff[1]=2.0;
987 addVec(id, g);
988 convVec(src_filter.lumH, id);
989 convVec(src_filter.lumV, id);
990 freeVec(g);
991 freeVec(id);
992 }
c7f822d9 993
6c7506de
MN
994 if(sws_chr_hshift)
995 shiftVec(src_filter.chrH, sws_chr_hshift);
c7f822d9 996
6c7506de
MN
997 if(sws_chr_vshift)
998 shiftVec(src_filter.chrV, sws_chr_vshift);
5521b193 999
6c7506de
MN
1000 normalizeVec(src_filter.chrH, 1.0);
1001 normalizeVec(src_filter.chrV, 1.0);
1002 normalizeVec(src_filter.lumH, 1.0);
1003 normalizeVec(src_filter.lumV, 1.0);
28bf81c9 1004
6c7506de
MN
1005 if(verbose > 1) printVec(src_filter.chrH);
1006 if(verbose > 1) printVec(src_filter.lumH);
28bf81c9
MN
1007
1008 switch(sws_flags)
1009 {
5859233b
MN
1010 case 0: *flags|= SWS_FAST_BILINEAR; break;
1011 case 1: *flags|= SWS_BILINEAR; break;
1012 case 2: *flags|= SWS_BICUBIC; break;
1013 case 3: *flags|= SWS_X; break;
1014 case 4: *flags|= SWS_POINT; break;
1015 case 5: *flags|= SWS_AREA; break;
81a571a8 1016 case 6: *flags|= SWS_BICUBLIN; break;
a86c461c
MN
1017 case 7: *flags|= SWS_GAUSS; break;
1018 case 8: *flags|= SWS_SINC; break;
1019 case 9: *flags|= SWS_LANCZOS; break;
1020 case 10:*flags|= SWS_SPLINE; break;
5859233b 1021 default:*flags|= SWS_BILINEAR; break;
28bf81c9 1022 }
5859233b
MN
1023
1024 *srcFilterParam= &src_filter;
1025 *dstFilterParam= NULL;
1026}
1027
1028// will use sws_flags & src_filter (from cmd line)
1029SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
1030{
1031 int flags;
1032 SwsFilter *dstFilterParam, *srcFilterParam;
1033 swsGetFlagsAndFilterFromCmdLine(&flags, &srcFilterParam, &dstFilterParam);
28bf81c9 1034
5859233b 1035 return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, srcFilterParam, dstFilterParam);
28bf81c9
MN
1036}
1037
a86c461c
MN
/**
 * Evaluates a piecewise cubic at distance dist.
 *
 * The first piece (0 <= dist <= 1) is a + b*dist + c*dist^2 + d*dist^3.
 * For every further unit of distance the polynomial is replaced by the next
 * piece: constant term 0 and the remaining coefficients derived from b, c, d
 * as below, evaluated at dist-1.
 */
static double getSplineCoeff(double a, double b, double c, double d, double dist)
{
	/* walk from piece to piece until dist falls inside the current one */
	while(!(dist<=1.0))
	{
		const double nextB=  b + 2.0*c + 3.0*d;
		const double nextC=  c + 3.0*d;
		const double nextD= -b - 3.0*c - 6.0*d;

		a= 0.0;
		b= nextB;
		c= nextC;
		d= nextD;
		dist= dist-1.0;
	}

	/* Horner evaluation of the selected piece */
	return ((d*dist + c)*dist + b)*dist +a;
}
6c7506de 1048
c7f822d9
MN
1049static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
1050 int srcW, int dstW, int filterAlign, int one, int flags,
1051 SwsVector *srcFilter, SwsVector *dstFilter)
28bf81c9
MN
1052{
1053 int i;
c7f822d9
MN
1054 int filterSize;
1055 int filter2Size;
1056 int minFilterSize;
1057 double *filter=NULL;
1058 double *filter2=NULL;
28bf81c9
MN
1059#ifdef ARCH_X86
1060 if(gCpuCaps.hasMMX)
1061 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
726a959a 1062#endif
31190492 1063
adeaecb9 1064 // Note the +1 is for the MMXscaler which reads over the end
6c7506de 1065 *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
6c7506de 1066
28bf81c9
MN
1067 if(ABS(xInc - 0x10000) <10) // unscaled
1068 {
1069 int i;
c7f822d9
MN
1070 filterSize= 1;
1071 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
1072 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
28bf81c9
MN
1073
1074 for(i=0; i<dstW; i++)
1075 {
c7f822d9
MN
1076 filter[i*filterSize]=1;
1077 (*filterPos)[i]=i;
28bf81c9
MN
1078 }
1079
1080 }
ff7ba856
MN
1081 else if(flags&SWS_POINT) // lame looking point sampling mode
1082 {
1083 int i;
1084 int xDstInSrc;
1085 filterSize= 1;
1086 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
1087
1088 xDstInSrc= xInc/2 - 0x8000;
1089 for(i=0; i<dstW; i++)
1090 {
8a01d20c 1091 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
ff7ba856
MN
1092
1093 (*filterPos)[i]= xx;
1094 filter[i]= 1.0;
1095 xDstInSrc+= xInc;
1096 }
1097 }
a86c461c 1098 else if((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) // bilinear upscale
28bf81c9
MN
1099 {
1100 int i;
1101 int xDstInSrc;
c7f822d9
MN
1102 if (flags&SWS_BICUBIC) filterSize= 4;
1103 else if(flags&SWS_X ) filterSize= 4;
d8863d37 1104 else filterSize= 2; // SWS_BILINEAR / SWS_AREA
c7f822d9 1105 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
28bf81c9
MN
1106
1107 xDstInSrc= xInc/2 - 0x8000;
1108 for(i=0; i<dstW; i++)
1109 {
8a01d20c 1110 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
28bf81c9
MN
1111 int j;
1112
c7f822d9 1113 (*filterPos)[i]= xx;
d8863d37 1114 //Bilinear upscale / linear interpolate / Area averaging
c7f822d9 1115 for(j=0; j<filterSize; j++)
28bf81c9
MN
1116 {
1117 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
1118 double coeff= 1.0 - d;
1119 if(coeff<0) coeff=0;
c7f822d9 1120 filter[i*filterSize + j]= coeff;
28bf81c9
MN
1121 xx++;
1122 }
28bf81c9
MN
1123 xDstInSrc+= xInc;
1124 }
1125 }
a86c461c 1126 else
28bf81c9 1127 {
a86c461c
MN
1128 double xDstInSrc;
1129 double sizeFactor, filterSizeInSrc;
1130 const double xInc1= (double)xInc / (double)(1<<16);
1131 int param= (flags&SWS_PARAM_MASK)>>SWS_PARAM_SHIFT;
1132
1133 if (flags&SWS_BICUBIC) sizeFactor= 4.0;
1134 else if(flags&SWS_X) sizeFactor= 8.0;
1135 else if(flags&SWS_AREA) sizeFactor= 1.0; //downscale only, for upscale it is bilinear
1136 else if(flags&SWS_GAUSS) sizeFactor= 8.0; // infinite ;)
1137 else if(flags&SWS_LANCZOS) sizeFactor= param ? 2.0*param : 6.0;
93768378 1138 else if(flags&SWS_SINC) sizeFactor= 20.0; // infinite ;)
a86c461c
MN
1139 else if(flags&SWS_SPLINE) sizeFactor= 20.0; // infinite ;)
1140 else if(flags&SWS_BILINEAR) sizeFactor= 2.0;
93768378
MN
1141 else {
1142 sizeFactor= 0.0; //GCC warning killer
1143 ASSERT(0)
1144 }
a86c461c
MN
1145
1146 if(xInc1 <= 1.0) filterSizeInSrc= sizeFactor; // upscale
1147 else filterSizeInSrc= sizeFactor*srcW / (double)dstW;
81b7c056 1148
a86c461c
MN
1149 filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
1150 if(filterSize > srcW-2) filterSize=srcW-2;
28bf81c9 1151
a86c461c
MN
1152 filter= (double*)memalign(16, dstW*sizeof(double)*filterSize);
1153
1154 xDstInSrc= xInc1 / 2.0 - 0.5;
28bf81c9
MN
1155 for(i=0; i<dstW; i++)
1156 {
a86c461c 1157 int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
28bf81c9 1158 int j;
c7f822d9
MN
1159 (*filterPos)[i]= xx;
1160 for(j=0; j<filterSize; j++)
28bf81c9 1161 {
a86c461c 1162 double d= ABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
28bf81c9 1163 double coeff;
a86c461c 1164 if(flags & SWS_BICUBIC)
28bf81c9 1165 {
a86c461c
MN
1166 double A= param ? -param*0.01 : -0.60;
1167
28bf81c9
MN
1168 // Equation is from VirtualDub
1169 if(d<1.0)
1170 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1171 else if(d<2.0)
1172 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
1173 else
1174 coeff=0.0;
1175 }
a86c461c
MN
1176/* else if(flags & SWS_X)
1177 {
1178 double p= param ? param*0.01 : 0.3;
1179 coeff = d ? sin(d*PI)/(d*PI) : 1.0;
1180 coeff*= pow(2.0, - p*d*d);
1181 }*/
1182 else if(flags & SWS_X)
1183 {
1184 double A= param ? param*0.1 : 1.0;
1185
1186 if(d<1.0)
1187 coeff = cos(d*PI);
1188 else
1189 coeff=-1.0;
1190 if(coeff<0.0) coeff= -pow(-coeff, A);
1191 else coeff= pow( coeff, A);
1192 coeff= coeff*0.5 + 0.5;
1193 }
d8863d37 1194 else if(flags & SWS_AREA)
28bf81c9 1195 {
a86c461c 1196 double srcPixelSize= 1.0/xInc1;
d8863d37
MN
1197 if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
1198 else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
1199 else coeff=0.0;
1200 }
a86c461c
MN
1201 else if(flags & SWS_GAUSS)
1202 {
1203 double p= param ? param*0.1 : 3.0;
1204 coeff = pow(2.0, - p*d*d);
1205 }
1206 else if(flags & SWS_SINC)
1207 {
1208 coeff = d ? sin(d*PI)/(d*PI) : 1.0;
1209 }
1210 else if(flags & SWS_LANCZOS)
1211 {
1212 double p= param ? param : 3.0;
1213 coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
1214 if(d>p) coeff=0;
1215 }
1216 else if(flags & SWS_BILINEAR)
28bf81c9
MN
1217 {
1218 coeff= 1.0 - d;
1219 if(coeff<0) coeff=0;
1220 }
a86c461c
MN
1221 else if(flags & SWS_SPLINE)
1222 {
1223 double p=-2.196152422706632;
1224 coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
1225 }
93768378
MN
1226 else {
1227 coeff= 0.0; //GCC warning killer
1228 ASSERT(0)
1229 }
a86c461c 1230
c7f822d9 1231 filter[i*filterSize + j]= coeff;
28bf81c9
MN
1232 xx++;
1233 }
a86c461c 1234 xDstInSrc+= xInc1;
28bf81c9
MN
1235 }
1236 }
1237
c7f822d9
MN
1238 /* apply src & dst Filter to filter -> filter2
1239 free(filter);
1240 */
81b7c056 1241 ASSERT(filterSize>0)
c7f822d9
MN
1242 filter2Size= filterSize;
1243 if(srcFilter) filter2Size+= srcFilter->length - 1;
1244 if(dstFilter) filter2Size+= dstFilter->length - 1;
81b7c056 1245 ASSERT(filter2Size>0)
c7f822d9
MN
1246 filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
1247
1248 for(i=0; i<dstW; i++)
1249 {
1250 int j;
1251 SwsVector scaleFilter;
1252 SwsVector *outVec;
1253
1254 scaleFilter.coeff= filter + i*filterSize;
1255 scaleFilter.length= filterSize;
1256
5cebb24b 1257 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
c7f822d9
MN
1258 else outVec= &scaleFilter;
1259
1260 ASSERT(outVec->length == filter2Size)
1261 //FIXME dstFilter
1262
1263 for(j=0; j<outVec->length; j++)
1264 {
1265 filter2[i*filter2Size + j]= outVec->coeff[j];
1266 }
1267
1268 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
1269
1270 if(outVec != &scaleFilter) freeVec(outVec);
1271 }
1272 free(filter); filter=NULL;
1273
1274 /* try to reduce the filter-size (step1 find size and shift left) */
1275 // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
1276 minFilterSize= 0;
1277 for(i=dstW-1; i>=0; i--)
1278 {
1279 int min= filter2Size;
1280 int j;
1281 double cutOff=0.0;
1282
1283 /* get rid off near zero elements on the left by shifting left */
1284 for(j=0; j<filter2Size; j++)
1285 {
1286 int k;
1287 cutOff += ABS(filter2[i*filter2Size]);
1288
1289 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1290
1291 /* preserve Monotonicity because the core cant handle the filter otherwise */
1292 if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
1293
1294 // Move filter coeffs left
1295 for(k=1; k<filter2Size; k++)
1296 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
1297 filter2[i*filter2Size + k - 1]= 0.0;
1298 (*filterPos)[i]++;
1299 }
1300
1301 cutOff=0.0;
1302 /* count near zeros on the right */
1303 for(j=filter2Size-1; j>0; j--)
1304 {
1305 cutOff += ABS(filter2[i*filter2Size + j]);
1306
1307 if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1308 min--;
1309 }
1310
1311 if(min>minFilterSize) minFilterSize= min;
1312 }
1313
81b7c056 1314 ASSERT(minFilterSize > 0)
6c7506de 1315 filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
81b7c056 1316 ASSERT(filterSize > 0)
6c7506de
MN
1317 filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
1318 *outFilterSize= filterSize;
1319
4a53a912 1320 if(flags&SWS_PRINT_INFO)
0d9f3d85 1321 MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
c7f822d9
MN
1322 /* try to reduce the filter-size (step2 reduce it) */
1323 for(i=0; i<dstW; i++)
1324 {
1325 int j;
1326
6c7506de
MN
1327 for(j=0; j<filterSize; j++)
1328 {
1329 if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
1330 else filter[i*filterSize + j]= filter2[i*filter2Size + j];
1331 }
c7f822d9 1332 }
6c7506de
MN
1333 free(filter2); filter2=NULL;
1334
c7f822d9
MN
1335
1336 //FIXME try to align filterpos if possible
1337
28bf81c9
MN
1338 //fix borders
1339 for(i=0; i<dstW; i++)
1340 {
1341 int j;
c7f822d9 1342 if((*filterPos)[i] < 0)
28bf81c9
MN
1343 {
1344 // Move filter coeffs left to compensate for filterPos
6c7506de 1345 for(j=1; j<filterSize; j++)
28bf81c9 1346 {
c7f822d9 1347 int left= MAX(j + (*filterPos)[i], 0);
6c7506de
MN
1348 filter[i*filterSize + left] += filter[i*filterSize + j];
1349 filter[i*filterSize + j]=0;
28bf81c9 1350 }
c7f822d9 1351 (*filterPos)[i]= 0;
28bf81c9
MN
1352 }
1353
6c7506de 1354 if((*filterPos)[i] + filterSize > srcW)
28bf81c9 1355 {
6c7506de 1356 int shift= (*filterPos)[i] + filterSize - srcW;
28bf81c9 1357 // Move filter coeffs right to compensate for filterPos
6c7506de 1358 for(j=filterSize-2; j>=0; j--)
28bf81c9 1359 {
6c7506de
MN
1360 int right= MIN(j + shift, filterSize-1);
1361 filter[i*filterSize +right] += filter[i*filterSize +j];
1362 filter[i*filterSize +j]=0;
28bf81c9 1363 }
6c7506de 1364 (*filterPos)[i]= srcW - filterSize;
28bf81c9
MN
1365 }
1366 }
1367
6c7506de
MN
1368 // Note the +1 is for the MMXscaler which reads over the end
1369 *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
1370 memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
c7f822d9
MN
1371
1372 /* Normalize & Store in outFilter */
28bf81c9
MN
1373 for(i=0; i<dstW; i++)
1374 {
1375 int j;
1376 double sum=0;
1377 double scale= one;
6c7506de 1378 for(j=0; j<filterSize; j++)
28bf81c9 1379 {
6c7506de 1380 sum+= filter[i*filterSize + j];
28bf81c9
MN
1381 }
1382 scale/= sum;
93768378 1383 for(j=0; j<*outFilterSize; j++)
28bf81c9 1384 {
6c7506de 1385 (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
28bf81c9
MN
1386 }
1387 }
adeaecb9
MN
1388
1389 (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1390 for(i=0; i<*outFilterSize; i++)
1391 {
1392 int j= dstW*(*outFilterSize);
1393 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1394 }
c7f822d9 1395
6c7506de 1396 free(filter);
7630f2e0 1397}
31190492 1398
#ifdef ARCH_X86
/**
 * Generates the horizontal scaling routine used by the MMX2 scaler.
 *
 * Two machine code templates are obtained from the inline asm blocks below
 * (the asm jumps over each template and only returns its address, its length
 * and the offsets of the two pshufw immediate bytes). For every group of four
 * output samples one template is copied to funnyCode and its two pshufw
 * immediates are patched for the source positions of that group. A RET is
 * stored after each copied template; the next copy overwrites it.
 *
 * @param dstW       destination width
 * @param xInc       source step per destination sample, 16.16 fixed point
 * @param funnyCode  buffer receiving the generated code
 * @param filter     receives the per-sample weights ((~fraction) >> 9)
 * @param filterPos  receives one source position per group of four samples
 *                   (index i/2), plus one more entry after the last group
 * @param numSplits  only dstW/numSplits samples are generated
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
	uint8_t *fragmentA;
	int imm8OfPShufW1A;
	int imm8OfPShufW2A;
	int fragmentLengthA;
	uint8_t *fragmentB;
	int imm8OfPShufW1B;
	int imm8OfPShufW2B;
	int fragmentLengthB;
	int fragmentPos;

	int xpos, i;

	// create an optimized horizontal scaling routine

	// template A: loads the source twice (offset 0 and offset 1)
	asm volatile(
		"jmp 9f \n\t"
	// Begin
		"0: \n\t"
		"movq (%%edx, %%eax), %%mm3 \n\t"
		"movd (%%ecx, %%esi), %%mm0 \n\t"
		"movd 1(%%ecx, %%esi), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"pshufw $0xFF, %%mm1, %%mm1 \n\t"
		"1: \n\t"
		"pshufw $0xFF, %%mm0, %%mm0 \n\t"
		"2: \n\t"
		"psubw %%mm1, %%mm0 \n\t"
		"movl 8(%%ebx, %%eax), %%esi \n\t"
		"pmullw %%mm3, %%mm0 \n\t"
		"psllw $7, %%mm1 \n\t"
		"paddw %%mm1, %%mm0 \n\t"

		"movq %%mm0, (%%edi, %%eax) \n\t"

		"addl $8, %%eax \n\t"
	// End
		"9: \n\t"
//		"int $3\n\t"
		"leal 0b, %0 \n\t"
		"leal 1b, %1 \n\t"
		"leal 2b, %2 \n\t"
		"decl %1 \n\t"
		"decl %2 \n\t"
		"subl %0, %1 \n\t"
		"subl %0, %2 \n\t"
		"leal 9b, %3 \n\t"
		"subl %0, %3 \n\t"


		:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
		"=r" (fragmentLengthA)
	);

	// template B: a single source load feeds both pshufw
	asm volatile(
		"jmp 9f \n\t"
	// Begin
		"0: \n\t"
		"movq (%%edx, %%eax), %%mm3 \n\t"
		"movd (%%ecx, %%esi), %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"pshufw $0xFF, %%mm0, %%mm1 \n\t"
		"1: \n\t"
		"pshufw $0xFF, %%mm0, %%mm0 \n\t"
		"2: \n\t"
		"psubw %%mm1, %%mm0 \n\t"
		"movl 8(%%ebx, %%eax), %%esi \n\t"
		"pmullw %%mm3, %%mm0 \n\t"
		"psllw $7, %%mm1 \n\t"
		"paddw %%mm1, %%mm0 \n\t"

		"movq %%mm0, (%%edi, %%eax) \n\t"

		"addl $8, %%eax \n\t"
	// End
		"9: \n\t"
//		"int $3\n\t"
		"leal 0b, %0 \n\t"
		"leal 1b, %1 \n\t"
		"leal 2b, %2 \n\t"
		"decl %1 \n\t"
		"decl %2 \n\t"
		"subl %0, %1 \n\t"
		"subl %0, %2 \n\t"
		"leal 9b, %3 \n\t"
		"subl %0, %3 \n\t"


		:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
		"=r" (fragmentLengthB)
	);

	xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
	fragmentPos=0;

	for(i=0; i<dstW/numSplits; i++, xpos+=xInc)
	{
		const int xx= xpos>>16;
		const int a= 0;
		int b, c, d;
		int imm1, imm2;		// values for the two pshufw immediates
		int immPos1, immPos2;	// their offsets inside the chosen template
		int maxShift, overread;
		int shift= 0;
		int fragmentLength;
		uint8_t *fragment;

		// one template instance covers four output samples
		if((i&3) != 0) continue;

		// source offsets of samples 1..3 relative to sample 0
		b= ((xpos+xInc  )>>16) - xx;
		c= ((xpos+xInc*2)>>16) - xx;
		d= ((xpos+xInc*3)>>16) - xx;

		filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
		filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
		filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
		filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
		filterPos[i/2]= xx;

		imm2= a | (b<<2) | (c<<4) | (d<<6);
		if(d+1<4)
		{
			// all samples and their right neighbours fit into one load
			fragment= fragmentB;
			fragmentLength= fragmentLengthB;
			immPos1= imm8OfPShufW1B;
			immPos2= imm8OfPShufW2B;
			imm1= (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
			maxShift= 3-(d+1);
			overread= i+3>=dstW;
		}
		else
		{
			fragment= fragmentA;
			fragmentLength= fragmentLengthA;
			immPos1= imm8OfPShufW1A;
			immPos2= imm8OfPShufW2A;
			imm1= imm2;
			maxShift= 3-d;
			overread= i+4>=dstW;
		}

		memcpy(funnyCode + fragmentPos, fragment, fragmentLength);
		funnyCode[fragmentPos + immPos1]= imm1;
		funnyCode[fragmentPos + immPos2]= imm2;

		if(overread) shift=maxShift; //avoid overread
		else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align

		if(shift && i>=shift)
		{
			// read from an earlier position and select later elements instead
			funnyCode[fragmentPos + immPos1]+= 0x55*shift;
			funnyCode[fragmentPos + immPos2]+= 0x55*shift;
			filterPos[i/2]-=shift;
		}

		fragmentPos+= fragmentLength;

		funnyCode[fragmentPos]= RET;
	}
	filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
#endif // ARCH_X86
1571
//FIXME remove
/* Empty function: the body does nothing. */
void SwScale_Init()
{
}
1575
1576static void globalInit(){
31190492
A
1577 // generating tables:
1578 int i;
c1b0bfb4
MN
1579 for(i=0; i<768; i++){
1580 int c= MIN(MAX(i-256, 0), 255);
1581 clip_table[i]=c;
b18ea156 1582 }
c1b0bfb4 1583
28bf81c9
MN
1584cpuCaps= gCpuCaps;
1585
1586#ifdef RUNTIME_CPUDETECT
1587#ifdef CAN_COMPILE_X86_ASM
1588 // ordered per speed fasterst first
1589 if(gCpuCaps.hasMMX2)
1590 swScale= swScale_MMX2;
1591 else if(gCpuCaps.has3DNow)
7f56a527 1592 swScale= swScale_3DNow;
28bf81c9
MN
1593 else if(gCpuCaps.hasMMX)
1594 swScale= swScale_MMX;
1595 else
1596 swScale= swScale_C;
1597
1598#else
1599 swScale= swScale_C;
1600 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1601#endif
1602#else //RUNTIME_CPUDETECT
1603#ifdef HAVE_MMX2
1604 swScale= swScale_MMX2;
1605 cpuCaps.has3DNow = 0;
1606#elif defined (HAVE_3DNOW)
7f56a527 1607 swScale= swScale_3DNow;
28bf81c9
MN
1608 cpuCaps.hasMMX2 = 0;
1609#elif defined (HAVE_MMX)
1610 swScale= swScale_MMX;
1611 cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1612#else
1613 swScale= swScale_C;
1614 cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1615#endif
1616#endif //!RUNTIME_CPUDETECT
31190492 1617}
7630f2e0 1618
0d9f3d85
A
1619static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1620 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1621 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1622 /* Copy Y plane */
1623 if(dstStride[0]==srcStride[0])
1624 memcpy(dst, src[0], srcSliceH*dstStride[0]);
1625 else
1626 {
1627 int i;
1628 uint8_t *srcPtr= src[0];
1629 uint8_t *dstPtr= dst;
1630 for(i=0; i<srcSliceH; i++)
1631 {
1632 memcpy(dstPtr, srcPtr, srcStride[0]);
1633 srcPtr+= srcStride[0];
1634 dstPtr+= dstStride[0];
1635 }
1636 }
1637 dst = dstParam[1] + dstStride[1]*srcSliceY;
1638 if(c->srcFormat==IMGFMT_YV12)
1639 interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
1640 else /* I420 & IYUV */
1641 interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
1642}
1643
1644
37079906
MN
1645/* Warper functions for yuv2bgr */
1646static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
b6654a54
MN
1647 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1648 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
37079906
MN
1649
1650 if(c->srcFormat==IMGFMT_YV12)
b6654a54 1651 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
37079906 1652 else /* I420 & IYUV */
b6654a54
MN
1653 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1654}
1655
44c1035c 1656static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
0d9f3d85
A
1657 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1658 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1659
1660 if(c->srcFormat==IMGFMT_YV12)
1661 yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1662 else /* I420 & IYUV */
1663 yv12toyuy2( src[0],src[2],src[1],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1664}
1665
e09d12f4
MN
1666/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
1667static void rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1668 int srcSliceH, uint8_t* dst[], int dstStride[]){
1669 const int srcFormat= c->srcFormat;
1670 const int dstFormat= c->dstFormat;
1671 const int srcBpp= ((srcFormat&0xFF) + 7)>>3;
1672 const int dstBpp= ((dstFormat&0xFF) + 7)>>3;
1673 const int srcId= (srcFormat&0xFF)>>2; // 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8
1674 const int dstId= (dstFormat&0xFF)>>2;
1675 void (*conv)(const uint8_t *src, uint8_t *dst, unsigned src_size)=NULL;
1676
1677 /* BGR -> BGR */
1678 if(isBGR(srcFormat) && isBGR(dstFormat)){
1679 switch(srcId | (dstId<<4)){
1680 case 0x34: conv= rgb16to15; break;
1681 case 0x36: conv= rgb24to15; break;
1682 case 0x38: conv= rgb32to15; break;
1683 case 0x43: conv= rgb15to16; break;
1684 case 0x46: conv= rgb24to16; break;
1685 case 0x48: conv= rgb32to16; break;
1686 case 0x63: conv= rgb15to24; break;
1687 case 0x64: conv= rgb16to24; break;
1688 case 0x68: conv= rgb32to24; break;
1689 case 0x83: conv= rgb15to32; break;
1690 case 0x84: conv= rgb16to32; break;
1691 case 0x86: conv= rgb24to32; break;
1692 default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
1693 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
b935781b 1694 }
e09d12f4
MN
1695 }else if(isBGR(srcFormat) && isRGB(dstFormat)){
1696 switch(srcId | (dstId<<4)){
1697 case 0x33: conv= rgb15tobgr15; break;
1698 case 0x34: conv= rgb16tobgr15; break;
1699 case 0x36: conv= rgb24tobgr15; break;
1700 case 0x38: conv= rgb32tobgr15; break;
1701 case 0x43: conv= rgb15tobgr16; break;
1702 case 0x44: conv= rgb16tobgr16; break;
1703 case 0x46: conv= rgb24tobgr16; break;
1704 case 0x48: conv= rgb32tobgr16; break;
1705 case 0x63: conv= rgb15tobgr24; break;
1706 case 0x64: conv= rgb16tobgr24; break;
1707 case 0x66: conv= rgb24tobgr24; break;
1708 case 0x68: conv= rgb32tobgr24; break;
1709 case 0x83: conv= rgb15tobgr32; break;
1710 case 0x84: conv= rgb16tobgr32; break;
1711 case 0x86: conv= rgb24tobgr32; break;
1712 case 0x88: conv= rgb32tobgr32; break;
1713 default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
1714 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
0d9f3d85 1715 }
e09d12f4
MN
1716 }else if(isRGB(srcFormat) && isRGB(dstFormat)){
1717 switch(srcId | (dstId<<4)){
1718 case 0x34: conv= rgb16to15; break;
1719 case 0x36: conv= rgb24to15; break;
1720 case 0x38: conv= rgb32to15; break;
1721 case 0x43: conv= rgb15to16; break;
1722 case 0x46: conv= rgb24to16; break;
1723 case 0x48: conv= rgb32to16; break;
1724 case 0x63: conv= rgb15to24; break;
1725 case 0x64: conv= rgb16to24; break;
1726 case 0x68: conv= rgb32to24; break;
1727 case 0x83: conv= rgb15to32; break;
1728 case 0x84: conv= rgb16to32; break;
1729 case 0x86: conv= rgb24to32; break;
1730 default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
1731 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
0d9f3d85 1732 }
e09d12f4
MN
1733 }else if(isRGB(srcFormat) && isBGR(dstFormat)){
1734 switch(srcId | (dstId<<4)){
1735 case 0x33: conv= rgb15tobgr15; break;
1736 case 0x34: conv= rgb16tobgr15; break;
1737 case 0x36: conv= rgb24tobgr15; break;
1738 case 0x38: conv= rgb32tobgr15; break;
1739 case 0x43: conv= rgb15tobgr16; break;
1740 case 0x44: conv= rgb16tobgr16; break;
1741 case 0x46: conv= rgb24tobgr16; break;
1742 case 0x48: conv= rgb32tobgr16; break;
1743 case 0x63: conv= rgb15tobgr24; break;
1744 case 0x64: conv= rgb16tobgr24; break;
1745 case 0x66: conv= rgb24tobgr24; break;
1746 case 0x68: conv= rgb32tobgr24; break;
1747 case 0x83: conv= rgb15tobgr32; break;
1748 case 0x84: conv= rgb16tobgr32; break;
1749 case 0x86: conv= rgb24tobgr32; break;
1750 case 0x88: conv= rgb32tobgr32; break;
1751 default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
1752 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
0d9f3d85 1753 }
e09d12f4
MN
1754 }
1755 if(dstStride[0]*srcBpp == srcStride[0]*dstBpp)
1756 conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
0d9f3d85
A
1757 else
1758 {
1759 int i;
1760 uint8_t *srcPtr= src[0];
1761 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1762
1763 for(i=0; i<srcSliceH; i++)
1764 {
e09d12f4 1765 conv(srcPtr, dstPtr, c->srcW*srcBpp);
0d9f3d85
A
1766 srcPtr+= srcStride[0];
1767 dstPtr+= dstStride[0];
1768 }
1769 }
1770}
1771
ec22603f
MN
1772static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1773 int srcSliceH, uint8_t* dst[], int dstStride[]){
1774
1775 rgb24toyv12(
1776 src[0],
1777 dst[0]+ srcSliceY *dstStride[0],
1778 dst[1]+(srcSliceY>>1)*dstStride[1],
1779 dst[2]+(srcSliceY>>1)*dstStride[2],
1780 c->srcW, srcSliceH,
1781 dstStride[0], dstStride[1], srcStride[0]);
1782}
1783
b241cbf2
MN
1784static void yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1785 int srcSliceH, uint8_t* dst[], int dstStride[]){
1786 int i;
1787
1788 /* copy Y */
1789 if(srcStride[0]==dstStride[0])
1790 memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH);
1791 else{
1792 uint8_t *srcPtr= src[0];
1793 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1794
1795 for(i=0; i<srcSliceH; i++)
1796 {
1797 memcpy(dstPtr, srcPtr, c->srcW);
1798 srcPtr+= srcStride[0];
1799 dstPtr+= dstStride[0];
1800 }
1801 }
1802
1803 if(c->dstFormat==IMGFMT_YV12){
1804 planar2x(src[1], dst[1], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[1]);
1805 planar2x(src[2], dst[2], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[2]);
1806 }else{
1807 planar2x(src[1], dst[2], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[2]);
1808 planar2x(src[2], dst[1], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[1]);
1809 }
1810}
1811
44c1035c
MN
1812/**
1813 * bring pointers in YUV order instead of YVU
1814 */
c7a810cc 1815static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
d80e2fa2
MN
1816 if(format == IMGFMT_YV12 || format == IMGFMT_YVU9
1817 || format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
44c1035c
MN
1818 sortedP[0]= p[0];
1819 sortedP[1]= p[1];
1820 sortedP[2]= p[2];
c7a810cc
MN
1821 sortedStride[0]= stride[0];
1822 sortedStride[1]= stride[1];
1823 sortedStride[2]= stride[2];
44c1035c 1824 }
c7a810cc 1825 else if(isPacked(format) || isGray(format))
44c1035c
MN
1826 {
1827 sortedP[0]= p[0];
1828 sortedP[1]=
1829 sortedP[2]= NULL;
c7a810cc 1830 sortedStride[0]= stride[0];
44c1035c
MN
1831 sortedStride[1]=
1832 sortedStride[2]= 0;
1833 }
e09d12f4 1834 else if(format == IMGFMT_I420)
44c1035c
MN
1835 {
1836 sortedP[0]= p[0];
1837 sortedP[1]= p[2];
1838 sortedP[2]= p[1];
c7a810cc
MN
1839 sortedStride[0]= stride[0];
1840 sortedStride[1]= stride[2];
1841 sortedStride[2]= stride[1];
e09d12f4
MN
1842 }else{
1843 MSG_ERR("internal error in orderYUV\n");
44c1035c
MN
1844 }
1845}
b935781b 1846
b6654a54
MN
1847/* unscaled copy like stuff (assumes nearly identical formats) */
1848static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
44c1035c 1849 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
b6654a54
MN
1850
1851 int srcStride[3];
44c1035c 1852 int dstStride[3];
b6654a54
MN
1853 uint8_t *src[3];
1854 uint8_t *dst[3];
1855
c7a810cc
MN
1856 orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
1857 orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
b6654a54
MN
1858
1859 if(isPacked(c->srcFormat))
1860 {
1861 if(dstStride[0]==srcStride[0])
1862 memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1863 else
1864 {
1865 int i;
1866 uint8_t *srcPtr= src[0];
1867 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
a861d4d7
MN
1868 int length=0;
1869
1870 /* universal length finder */
9bd8bd1a
MN
1871 while(length+c->srcW <= ABS(dstStride[0])
1872 && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
a861d4d7 1873 ASSERT(length!=0);
b6654a54
MN
1874
1875 for(i=0; i<srcSliceH; i++)
1876 {
1877 memcpy(dstPtr, srcPtr, length);
1878 srcPtr+= srcStride[0];
1879 dstPtr+= dstStride[0];
1880 }
1881 }
1882 }
1883 else
44c1035c 1884 { /* Planar YUV or gray */
b6654a54
MN
1885 int plane;
1886 for(plane=0; plane<3; plane++)
1887 {
e616aa93
MN
1888 int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
1889 int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
1890 int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
44c1035c
MN
1891
1892 if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
0d9f3d85 1893 {
44c1035c 1894 if(!isGray(c->dstFormat))
e616aa93 1895 memset(dst[plane], 128, dstStride[plane]*height);
0d9f3d85 1896 }
b6654a54
MN
1897 else
1898 {
44c1035c
MN
1899 if(dstStride[plane]==srcStride[plane])
1900 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1901 else
b6654a54 1902 {
44c1035c
MN
1903 int i;
1904 uint8_t *srcPtr= src[plane];
1905 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1906 for(i=0; i<height; i++)
1907 {
1908 memcpy(dstPtr, srcPtr, length);
1909 srcPtr+= srcStride[plane];
1910 dstPtr+= dstStride[plane];
1911 }
b6654a54
MN
1912 }
1913 }
1914 }
1915 }
37079906 1916}
28bf81c9 1917
44c1035c 1918static int remove_dup_fourcc(int fourcc)
0d9f3d85
A
1919{
1920 switch(fourcc)
1921 {
1922 case IMGFMT_IYUV: return IMGFMT_I420;
1923 case IMGFMT_Y8 : return IMGFMT_Y800;
0c51ef97 1924 case IMGFMT_IF09: return IMGFMT_YVU9;
0d9f3d85
A
1925 default: return fourcc;
1926 }
1927}
1928
c7a810cc
MN
1929static void getSubSampleFactors(int *h, int *v, int format){
1930 switch(format){
7322a67c 1931 case IMGFMT_UYVY:
c7a810cc
MN
1932 case IMGFMT_YUY2:
1933 *h=1;
1934 *v=0;
1935 break;
1936 case IMGFMT_YV12:
1937 case IMGFMT_I420:
e616aa93 1938 case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
c7a810cc
MN
1939 *h=1;
1940 *v=1;
1941 break;
1942 case IMGFMT_YVU9:
1943 *h=2;
1944 *v=2;
1945 break;
d80e2fa2
MN
1946 case IMGFMT_444P:
1947 *h=0;
1948 *v=0;
1949 break;
1950 case IMGFMT_422P:
1951 *h=1;
1952 *v=0;
1953 break;
1954 case IMGFMT_411P:
1955 *h=2;
1956 *v=0;
1957 break;
c7a810cc
MN
1958 default:
1959 *h=0;
1960 *v=0;
1961 break;
1962 }
1963}
1964
0481412a
MN
1965static uint16_t roundToInt16(float f){
1966 if(f<-0x7FFF) f= -0x7FFF;
1967 else if(f> 0x7FFF) f= 0x7FFF;
1968
1969 return (int)floor(f + 0.5);
1970}
1971
1972/**
1973 * @param colorspace colorspace
1974 * @param fullRange if 1 then the luma range is 0..255 if 0 its 16..235
1975 */
1976void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, float brightness, float contrast, float saturation){
1977
1978 float crv = Inverse_Table_6_9[colorspace][0]/65536.0;
1979 float cbu = Inverse_Table_6_9[colorspace][1]/65536.0;
1980 float cgu = -Inverse_Table_6_9[colorspace][2]/65536.0;
1981 float cgv = -Inverse_Table_6_9[colorspace][3]/65536.0;
1982 float cy = 1.0;
1983 float oy = 0;
1984
1985 c->uOffset= 0x0400040004000400LL;
1986 c->vOffset= 0x0400040004000400LL;
1987
1988 if(!fullRange){
1989 cy= (cy*255.0) / 219.0;
1990 oy= 16.0;
1991 }
1992
1993 cy *= contrast;
1994 crv*= contrast * saturation;
1995 cbu*= contrast * saturation;
1996 cgu*= contrast * saturation;
1997 cgv*= contrast * saturation;
1998
1999 oy -= 256.0*brightness;
2000
2001 c->yCoeff= roundToInt16(cy *8192) * 0x0001000100010001ULL;
2002 c->vrCoeff= roundToInt16(crv*8192) * 0x0001000100010001ULL;
2003 c->ubCoeff= roundToInt16(cbu*8192) * 0x0001000100010001ULL;
2004 c->vgCoeff= roundToInt16(cgv*8192) * 0x0001000100010001ULL;
2005 c->ugCoeff= roundToInt16(cgu*8192) * 0x0001000100010001ULL;
2006 c->yOffset= roundToInt16(oy * 8) * 0x0001000100010001ULL;
2007}
2008
28bf81c9
MN
2009SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
2010 SwsFilter *srcFilter, SwsFilter *dstFilter){
2011
28bf81c9
MN
2012 SwsContext *c;
2013 int i;
37079906 2014 int usesFilter;
e09d12f4 2015 int unscaled, needsDither;
c7f822d9 2016 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
5cebb24b
MN
2017#ifdef ARCH_X86
2018 if(gCpuCaps.hasMMX)
2019 asm volatile("emms\n\t"::: "memory");
2020#endif
28bf81c9 2021 if(swScale==NULL) globalInit();
e616aa93 2022//srcFormat= IMGFMT_Y800;
e09d12f4 2023//dstFormat= IMGFMT_Y800;
6ff0ad6b 2024 /* avoid dupplicate Formats, so we dont need to check to much */
0d9f3d85
A
2025 srcFormat = remove_dup_fourcc(srcFormat);
2026 dstFormat = remove_dup_fourcc(dstFormat);
44c1035c
MN
2027
2028 unscaled = (srcW == dstW && srcH == dstH);
e09d12f4
MN
2029 needsDither= (isBGR(dstFormat) || isRGB(dstFormat))
2030 && (dstFormat&0xFF)<24
2031 && ((dstFormat&0xFF)<(srcFormat&0xFF) || (!(isRGB(srcFormat) || isBGR(srcFormat))));
44c1035c
MN
2032
2033 if(!isSupportedIn(srcFormat))
b81cf274 2034 {
44c1035c
MN
2035 MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
2036 return NULL;
2037 }
2038 if(!isSupportedOut(dstFormat))
2039 {
2040 MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
2041 return NULL;
b81cf274 2042 }
44c1035c 2043
28bf81c9 2044 /* sanity check */
b81cf274
MN
2045 if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
2046 {
0d9f3d85 2047 MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
b81cf274
MN
2048 srcW, srcH, dstW, dstH);
2049 return NULL;
2050 }
28bf81c9 2051
c7f822d9
MN
2052 if(!dstFilter) dstFilter= &dummyFilter;
2053 if(!srcFilter) srcFilter= &dummyFilter;
2054
28bf81c9 2055 c= memalign(64, sizeof(SwsContext));
c7f822d9 2056 memset(c, 0, sizeof(SwsContext));
28bf81c9
MN
2057
2058 c->srcW= srcW;
2059 c->srcH= srcH;
2060 c->dstW= dstW;
2061 c->dstH= dstH;
5521b193
MN
2062 c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
2063 c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
28bf81c9
MN
2064 c->flags= flags;
2065 c->dstFormat= dstFormat;
2066 c->srcFormat= srcFormat;
2067
0481412a
MN
2068 setInputColorspaceDetails(c, SWS_CS_DEFAULT, 0, 0.0, 1.0, 1.0);
2069
37079906
MN
2070 usesFilter=0;
2071 if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
2072 if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
2073 if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
2074 if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
2075 if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
2076 if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
2077 if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
2078 if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
e616aa93
MN
2079
2080 getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
2081 getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
2082
2083 // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
2084 if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
2085
5859233b
MN
2086 // drop some chroma lines if the user wants it
2087 c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
2088 c->chrSrcVSubSample+= c->vChrDrop;
e616aa93 2089
5859233b 2090 // drop every 2. pixel for chroma calculation unless user wants full chroma
e616aa93
MN
2091 if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP))
2092 c->chrSrcHSubSample=1;
2093
2094 c->chrIntHSubSample= c->chrDstHSubSample;
2095 c->chrIntVSubSample= c->chrSrcVSubSample;
2096
2097 // note the -((-x)>>y) is so that we allways round toward +inf
2098 c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
2099 c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
2100 c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
2101 c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
37079906 2102
cf7d1c1a
MN
2103 if(isBGR(dstFormat))
2104 c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_RGB, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
2105 if(isRGB(dstFormat))
2106 c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_BGR, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
2107
b935781b 2108 /* unscaled special Cases */
44c1035c 2109 if(unscaled && !usesFilter)
37079906 2110 {
0d9f3d85
A
2111 /* yv12_to_nv12 */
2112 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
2113 {
2114 c->swScale= PlanarToNV12Wrapper;
2115
2116 if(flags&SWS_PRINT_INFO)
2117 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2118 vo_format_name(srcFormat), vo_format_name(dstFormat));
2119 return c;
2120 }
37079906 2121 /* yuv2bgr */
e616aa93 2122 if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
37079906
MN
2123 {
2124 // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
cf7d1c1a 2125 //FIXME rgb vs. bgr ?
1e1c4fe9 2126#ifdef WORDS_BIGENDIAN
daa57641
MN
2127 if(dstFormat==IMGFMT_BGR32)
2128 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
2129 else
2130 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 2131#else
b6654a54 2132 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1e1c4fe9 2133#endif
37079906 2134 c->swScale= planarYuvToBgr;
b6654a54
MN
2135
2136 if(flags&SWS_PRINT_INFO)
0d9f3d85 2137 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
b6654a54
MN
2138 vo_format_name(srcFormat), vo_format_name(dstFormat));
2139 return c;
2140 }
b241cbf2 2141
b6654a54 2142 /* simple copy */
e616aa93
MN
2143 if( srcFormat == dstFormat
2144 || (srcFormat==IMGFMT_YV12 && dstFormat==IMGFMT_I420)
2145 || (srcFormat==IMGFMT_I420 && dstFormat==IMGFMT_YV12)
2146 || (isPlanarYUV(srcFormat) && isGray(dstFormat))
2147 || (isPlanarYUV(dstFormat) && isGray(srcFormat))
2148 )
b6654a54
MN
2149 {
2150 c->swScale= simpleCopy;
2151
37079906 2152 if(flags&SWS_PRINT_INFO)
0d9f3d85 2153 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
37079906
MN
2154 vo_format_name(srcFormat), vo_format_name(dstFormat));
2155 return c;
2156 }
b241cbf2
MN
2157
2158 if( srcFormat==IMGFMT_YVU9 && (dstFormat==IMGFMT_YV12 || dstFormat==IMGFMT_I420) )
2159 {
2160 c->swScale= yvu9toyv12Wrapper;
2161
2162 if(flags&SWS_PRINT_INFO)
2163 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2164 vo_format_name(srcFormat), vo_format_name(dstFormat));
2165 return c;
2166 }
2167
ec22603f
MN
2168 /* bgr24toYV12 */
2169 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
ec22603f 2170 c->swScale= bgr24toyv12Wrapper;
e09d12f4
MN
2171
2172 /* rgb/bgr -> rgb/bgr (no dither needed forms) */
2173 if( (isBGR(srcFormat) || isRGB(srcFormat))
2174 && (isBGR(dstFormat) || isRGB(dstFormat))
2175 && !needsDither)
2176 c->swScale= rgb2rgbWrapper;
2177
2178 /* LQ converters if -sws 0 or -sws 4*/
2179 if(c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){
2180 /* rgb/bgr -> rgb/bgr (dither needed forms) */
2181 if( (isBGR(srcFormat) || isRGB(srcFormat))
2182 && (isBGR(dstFormat) || isRGB(dstFormat))
2183 && needsDither)
2184 c->swScale= rgb2rgbWrapper;
2ce486d8
MN
2185
2186 /* yv12_to_yuy2 */
2187 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
2188 {
2189 c->swScale= PlanarToYuy2Wrapper;
2190
2191 if(flags&SWS_PRINT_INFO)
2192 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2193 vo_format_name(srcFormat), vo_format_name(dstFormat));
2194 return c;
2195 }
e09d12f4 2196 }
ec22603f 2197
e09d12f4 2198 if(c->swScale){
ec22603f 2199 if(flags&SWS_PRINT_INFO)
0d9f3d85 2200 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
ec22603f
MN
2201 vo_format_name(srcFormat), vo_format_name(dstFormat));
2202 return c;
2203 }
37079906
MN
2204 }
2205
28bf81c9
MN
2206 if(cpuCaps.hasMMX2)
2207 {
2208 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2209 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
2210 {
2211 if(flags&SWS_PRINT_INFO)
0d9f3d85 2212 MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
28bf81c9
MN
2213 }
2214 }
2215 else
2216 c->canMMX2BeUsed=0;
2217
1e621b18
MN
2218 c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2219 c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2220
28bf81c9
MN
2221 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2222 // but only for the FAST_BILINEAR mode otherwise do correct scaling
2223 // n-2 is the last chrominance sample available
2224 // this is not perfect, but noone shuld notice the difference, the more correct variant
2225 // would be like the vertical one, but that would require some special code for the
2226 // first and last pixel
2227 if(flags&SWS_FAST_BILINEAR)
2228 {
1e621b18
MN
2229 if(c->canMMX2BeUsed)
2230 {
2231 c->lumXInc+= 20;
2232 c->chrXInc+= 20;
2233 }
28bf81c9 2234 //we dont use the x86asm scaler if mmx is available
1e621b18
MN
2235 else if(cpuCaps.hasMMX)
2236 {
2237 c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2238 c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2239 }
28bf81c9
MN
2240 }
2241
28bf81c9
MN
2242 /* precalculate horizontal scaler filter coefficients */
2243 {
2244 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
2245
c7f822d9 2246 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
81a571a8
MN
2247 srcW , dstW, filterAlign, 1<<14,
2248 (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
c7f822d9
MN
2249 srcFilter->lumH, dstFilter->lumH);
2250 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
81a571a8
MN
2251 c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
2252 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
c7f822d9 2253 srcFilter->chrH, dstFilter->chrH);
28bf81c9
MN
2254
2255#ifdef ARCH_X86
2256// cant downscale !!!
2257 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2258 {
b7dc6f66
MN
2259 c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
2260 c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
2261 c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
2262 c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
2263
2264 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2265 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
28bf81c9
MN
2266 }
2267#endif
2268 } // Init Horizontal stuff
2269
2270
2271
2272 /* precalculate vertical scaler filter coefficients */
c7f822d9 2273 initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
81a571a8
MN
2274 srcH , dstH, 1, (1<<12)-4,
2275 (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
c7f822d9
MN
2276 srcFilter->lumV, dstFilter->lumV);
2277 initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
81a571a8
MN
2278 c->chrSrcH, c->chrDstH, 1, (1<<12)-4,
2279 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
2280 srcFilter->chrV, dstFilter->chrV);
28bf81c9
MN
2281
2282 // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2283 c->vLumBufSize= c->vLumFilterSize;
2284 c->vChrBufSize= c->vChrFilterSize;
2285 for(i=0; i<dstH; i++)
2286 {
2287 int chrI= i*c->chrDstH / dstH;
2288 int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
e616aa93
MN
2289 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
2290 nextSlice&= ~3; // Slices start at boundaries which are divisable through 4
28bf81c9
MN
2291 if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
2292 c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
e616aa93
MN
2293 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
2294 c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
28bf81c9
MN
2295 }
2296
2297 // allocate pixbufs (we use dynamic allocation because otherwise we would need to
c7f822d9
MN
2298 c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
2299 c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
6c7506de 2300 //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
28bf81c9
MN
2301 for(i=0; i<c->vLumBufSize; i++)
2302 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
2303 for(i=0; i<c->vChrBufSize; i++)
2304 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
2305
2306 //try to avoid drawing green stuff between the right end and the stride end
2307 for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
2308 for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
2309
2310 ASSERT(c->chrDstH <= dstH)
28bf81c9 2311
28bf81c9
MN
2312 if(flags&SWS_PRINT_INFO)
2313 {
2314#ifdef DITHER1XBPP
5521b193
MN
2315 char *dither= " dithered";
2316#else
2317 char *dither= "";
28bf81c9
MN
2318#endif
2319 if(flags&SWS_FAST_BILINEAR)
0d9f3d85 2320 MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
28bf81c9 2321 else if(flags&SWS_BILINEAR)
0d9f3d85 2322 MSG_INFO("\nSwScaler: BILINEAR scaler, ");
28bf81c9 2323 else if(flags&SWS_BICUBIC)
0d9f3d85 2324 MSG_INFO("\nSwScaler: BICUBIC scaler, ");
1e621b18 2325 else if(flags&SWS_X)
0d9f3d85 2326 MSG_INFO("\nSwScaler: Experimental scaler, ");
ff7ba856 2327 else if(flags&SWS_POINT)
0d9f3d85 2328 MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
d8863d37 2329 else if(flags&SWS_AREA)
0d9f3d85 2330 MSG_INFO("\nSwScaler: Area Averageing scaler, ");
81a571a8 2331 else if(flags&SWS_BICUBLIN)
a86c461c
MN
2332 MSG_INFO("\nSwScaler: luma BICUBIC / chroma BILINEAR scaler, ");
2333 else if(flags&SWS_GAUSS)
2334 MSG_INFO("\nSwScaler: Gaussian scaler, ");
2335 else if(flags&SWS_SINC)
2336 MSG_INFO("\nSwScaler: Sinc scaler, ");
2337 else if(flags&SWS_LANCZOS)
2338 MSG_INFO("\nSwScaler: Lanczos scaler, ");
2339 else if(flags&SWS_SPLINE)
2340 MSG_INFO("\nSwScaler: Bicubic spline scaler, ");
28bf81c9 2341 else
0d9f3d85 2342 MSG_INFO("\nSwScaler: ehh flags invalid?! ");
28bf81c9 2343
0d9f3d85
A
2344 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
2345 MSG_INFO("from %s to%s %s ",
2346 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
2347 else
2348 MSG_INFO("from %s to %s ",
2349 vo_format_name(srcFormat), vo_format_name(dstFormat));
28bf81c9
MN
2350
2351 if(cpuCaps.hasMMX2)
0d9f3d85 2352 MSG_INFO("using MMX2\n");
28bf81c9 2353 else if(cpuCaps.has3DNow)
0d9f3d85 2354 MSG_INFO("using 3DNOW\n");
28bf81c9 2355 else if(cpuCaps.hasMMX)
0d9f3d85 2356 MSG_INFO("using MMX\n");
28bf81c9 2357 else
0d9f3d85 2358 MSG_INFO("using C\n");
28bf81c9
MN
2359 }
2360
a749913f 2361 if((flags & SWS_PRINT_INFO) && verbose>0)
28bf81c9
MN
2362 {
2363 if(cpuCaps.hasMMX)
2364 {
2365 if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
0d9f3d85 2366 MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
28bf81c9
MN
2367 else
2368 {
2369 if(c->hLumFilterSize==4)
0d9f3d85 2370 MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9 2371 else if(c->hLumFilterSize==8)
0d9f3d85 2372 MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9 2373 else
0d9f3d85 2374 MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
28bf81c9
MN
2375
2376 if(c->hChrFilterSize==4)
0d9f3d85 2377 MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9 2378 else if(c->hChrFilterSize==8)
0d9f3d85 2379 MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9 2380 else
0d9f3d85 2381 MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
28bf81c9
MN
2382 }
2383 }
2384 else
2385 {
2386#ifdef ARCH_X86
0d9f3d85 2387 MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
28bf81c9
MN
2388#else
2389 if(flags & SWS_FAST_BILINEAR)
0d9f3d85 2390 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
28bf81c9 2391 else
0d9f3d85 2392 MSG_V("SwScaler: using C scaler for horizontal scaling\n");
28bf81c9
MN
2393#endif
2394 }
6c7506de 2395 if(isPlanarYUV(dstFormat))
28bf81c9
MN
2396 {
2397 if(c->vLumFilterSize==1)
0d9f3d85 2398 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2399 else
0d9f3d85 2400 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
2401 }
2402 else
2403 {
2404 if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
0d9f3d85 2405 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
28bf81c9
MN
2406 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
2407 else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
0d9f3d85 2408 MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2409 else
0d9f3d85 2410 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9
MN
2411 }
2412
2413 if(dstFormat==IMGFMT_BGR24)
0d9f3d85 2414 MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
28bf81c9 2415 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
fd284805 2416 else if(dstFormat==IMGFMT_BGR32)
0d9f3d85 2417 MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
fd284805 2418 else if(dstFormat==IMGFMT_BGR16)
0d9f3d85 2419 MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
fd284805 2420 else if(dstFormat==IMGFMT_BGR15)
0d9f3d85 2421 MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
28bf81c9 2422
0d9f3d85 2423 MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
28bf81c9 2424 }
1e621b18
MN
2425 if((flags & SWS_PRINT_INFO) && verbose>1)
2426 {
0d9f3d85 2427 MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1e621b18 2428 c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
0d9f3d85 2429 MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1e621b18
MN
2430 c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2431 }
37079906
MN
2432
2433 c->swScale= swScale;
28bf81c9
MN
2434 return c;
2435}
2436
2437/**
2438 * returns a normalized gaussian curve used to filter stuff
2439 * quality=3 is high quality, lowwer is lowwer quality
2440 */
c7f822d9
MN
2441
2442SwsVector *getGaussianVec(double variance, double quality){
28bf81c9
MN
2443 const int length= (int)(variance*quality + 0.5) | 1;
2444 int i;
2445 double *coeff= memalign(sizeof(double), length*sizeof(double));
2446 double middle= (length-1)*0.5;
c7f822d9
MN
2447 SwsVector *vec= malloc(sizeof(SwsVector));
2448
2449 vec->coeff= coeff;
2450 vec->length= length;
28bf81c9
MN
2451
2452 for(i=0; i<length; i++)
2453 {
2454 double dist= i-middle;
2455 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
2456 }
2457
c7f822d9
MN
2458 normalizeVec(vec, 1.0);
2459
2460 return vec;
28bf81c9
MN
2461}
2462
5521b193
MN
2463SwsVector *getConstVec(double c, int length){
2464 int i;
2465 double *coeff= memalign(sizeof(double), length*sizeof(double));
2466 SwsVector *vec= malloc(sizeof(SwsVector));
2467
2468 vec->coeff= coeff;
2469 vec->length= length;
2470
2471 for(i=0; i<length; i++)
2472 coeff[i]= c;
2473
2474 return vec;
2475}
2476
2477
c7f822d9
MN
2478SwsVector *getIdentityVec(void){
2479 double *coeff= memalign(sizeof(double), sizeof(double));
2480 SwsVector *vec= malloc(sizeof(SwsVector));
2481 coeff[0]= 1.0;
2482
2483 vec->coeff= coeff;
2484 vec->length= 1;
2485
2486 return vec;
2487}
2488
2489void normalizeVec(SwsVector *a, double height){
28bf81c9
MN
2490 int i;
2491 double sum=0;
2492 double inv;
2493
c7f822d9
MN
2494 for(i=0; i<a->length; i++)
2495 sum+= a->coeff[i];
28bf81c9
MN
2496
2497 inv= height/sum;
2498
c7f822d9 2499 for(i=0; i<a->length; i++)
8664c807 2500 a->coeff[i]*= inv;
28bf81c9
MN
2501}
2502
c7f822d9
MN
2503void scaleVec(SwsVector *a, double scalar){
2504 int i;
2505
2506 for(i=0; i<a->length; i++)
2507 a->coeff[i]*= scalar;
2508}
2509
5cebb24b 2510static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
c7f822d9 2511 int length= a->length + b->length - 1;
28bf81c9
MN
2512 double *coeff= memalign(sizeof(double), length*sizeof(double));
2513 int i, j;
c7f822d9
MN
2514 SwsVector *vec= malloc(sizeof(SwsVector));
2515
2516 vec->coeff= coeff;
2517 vec->length= length;
28bf81c9
MN
2518
2519 for(i=0; i<length; i++) coeff[i]= 0.0;
2520
c7f822d9 2521 for(i=0; i<a->length; i++)
28bf81c9 2522 {
c7f822d9 2523 for(j=0; j<b->length; j++)
28bf81c9 2524 {
c7f822d9 2525 coeff[i+j]+= a->coeff[i]*b->coeff[j];
28bf81c9
MN
2526 }
2527 }
2528
c7f822d9 2529 return vec;
28bf81c9
MN
2530}
2531
5cebb24b 2532static SwsVector *sumVec(SwsVector *a, SwsVector *b){
c7f822d9 2533 int length= MAX(a->length, b->length);
28bf81c9
MN
2534 double *coeff= memalign(sizeof(double), length*sizeof(double));
2535 int i;
c7f822d9
MN
2536 SwsVector *vec= malloc(sizeof(SwsVector));
2537
2538 vec->coeff= coeff;
2539 vec->length= length;
28bf81c9
MN
2540
2541 for(i=0; i<length; i++) coeff[i]= 0.0;
2542
c7f822d9
MN
2543 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2544 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
2545
2546 return vec;
28bf81c9 2547}
c7f822d9 2548
5cebb24b 2549static SwsVector *diffVec(SwsVector *a, SwsVector *b){
c7f822d9
MN
2550 int length= MAX(a->length, b->length);
2551 double *coeff= memalign(sizeof(double), length*sizeof(double));
2552 int i;
2553 SwsVector *vec= malloc(sizeof(SwsVector));
2554
2555 vec->coeff= coeff;
2556 vec->length= length;
2557
2558 for(i=0; i<length; i++) coeff[i]= 0.0;
2559
2560 for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2561 for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
2562
2563 return vec;
2564}
2565
2566/* shift left / or right if "shift" is negative */
5cebb24b 2567static SwsVector *getShiftedVec(SwsVector *a, int shift){
c7f822d9
MN
2568 int length= a->length + ABS(shift)*2;
2569 double *coeff= memalign(sizeof(double), length*sizeof(double));
ff7ba856 2570 int i;
c7f822d9
MN
2571 SwsVector *vec= malloc(sizeof(SwsVector));
2572
2573 vec->coeff= coeff;
2574 vec->length= length;
2575
2576 for(i=0; i<length; i++) coeff[i]= 0.0;
2577
2578 for(i=0; i<a->length; i++)
2579 {
2580 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2581 }
2582
2583 return vec;
2584}
2585
5cebb24b
MN
2586void shiftVec(SwsVector *a, int shift){
2587 SwsVector *shifted= getShiftedVec(a, shift);
2588 free(a->coeff);
2589 a->coeff= shifted->coeff;
2590 a->length= shifted->length;
2591 free(shifted);
2592}
2593
2594void addVec(SwsVector *a, SwsVector *b){
2595 SwsVector *sum= sumVec(a, b);
2596 free(a->coeff);
2597 a->coeff= sum->coeff;
2598 a->length= sum->length;
2599 free(sum);
2600}
2601
2602void subVec(SwsVector *a, SwsVector *b){
2603 SwsVector *diff= diffVec(a, b);
2604 free(a->coeff);
2605 a->coeff= diff->coeff;
2606 a->length= diff->length;
2607 free(diff);
2608}
2609
2610void convVec(SwsVector *a, SwsVector *b){
2611 SwsVector *conv= getConvVec(a, b);
2612 free(a->coeff);
2613 a->coeff= conv->coeff;
2614 a->length= conv->length;
2615 free(conv);
2616}
2617
2618SwsVector *cloneVec(SwsVector *a){
2619 double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2620 int i;
2621 SwsVector *vec= malloc(sizeof(SwsVector));
2622
2623 vec->coeff= coeff;
2624 vec->length= a->length;
2625
2626 for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2627
2628 return vec;
2629}
2630
c7f822d9
MN
2631void printVec(SwsVector *a){
2632 int i;
2633 double max=0;
2634 double min=0;
2635 double range;
2636
2637 for(i=0; i<a->length; i++)
2638 if(a->coeff[i]>max) max= a->coeff[i];
2639
2640 for(i=0; i<a->length; i++)
2641 if(a->coeff[i]<min) min= a->coeff[i];
2642
2643 range= max - min;
2644
2645 for(i=0; i<a->length; i++)
2646 {
2647 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
0d9f3d85
A
2648 MSG_DBG2("%1.3f ", a->coeff[i]);
2649 for(;x>0; x--) MSG_DBG2(" ");
2650 MSG_DBG2("|\n");
c7f822d9
MN
2651 }
2652}
2653
2654void freeVec(SwsVector *a){
2655 if(!a) return;
2656 if(a->coeff) free(a->coeff);
2657 a->coeff=NULL;
2658 a->length=0;
2659 free(a);
2660}
2661
2662void freeSwsContext(SwsContext *c){
2663 int i;
c7f822d9
MN
2664 if(!c) return;
2665
2666 if(c->lumPixBuf)
2667 {
6c7506de 2668 for(i=0; i<c->vLumBufSize; i++)
c7f822d9
MN
2669 {
2670 if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2671 c->lumPixBuf[i]=NULL;
2672 }
2673 free(c->lumPixBuf);
2674 c->lumPixBuf=NULL;
2675 }
2676
2677 if(c->chrPixBuf)
2678 {
6c7506de 2679 for(i=0; i<c->vChrBufSize; i++)
c7f822d9
MN
2680 {
2681 if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2682 c->chrPixBuf[i]=NULL;
2683 }
2684 free(c->chrPixBuf);
2685 c->chrPixBuf=NULL;
2686 }
2687
2688 if(c->vLumFilter) free(c->vLumFilter);
2689 c->vLumFilter = NULL;
2690 if(c->vChrFilter) free(c->vChrFilter);
2691 c->vChrFilter = NULL;
2692 if(c->hLumFilter) free(c->hLumFilter);
2693 c->hLumFilter = NULL;
2694 if(c->hChrFilter) free(c->hChrFilter);
2695 c->hChrFilter = NULL;
2696
2697 if(c->vLumFilterPos) free(c->vLumFilterPos);
2698 c->vLumFilterPos = NULL;
2699 if(c->vChrFilterPos) free(c->vChrFilterPos);
2700 c->vChrFilterPos = NULL;
2701 if(c->hLumFilterPos) free(c->hLumFilterPos);
2702 c->hLumFilterPos = NULL;
2703 if(c->hChrFilterPos) free(c->hChrFilterPos);
2704 c->hChrFilterPos = NULL;
2705
b7dc6f66
MN
2706 if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2707 c->lumMmx2Filter=NULL;
2708 if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2709 c->chrMmx2Filter=NULL;
2710 if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2711 c->lumMmx2FilterPos=NULL;
2712 if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2713 c->chrMmx2FilterPos=NULL;
cf7d1c1a
MN
2714 if(c->yuvTable) free(c->yuvTable);
2715 c->yuvTable=NULL;
b7dc6f66 2716
c7f822d9
MN
2717 free(c);
2718}
2719