dithering info wasnt displayed
[libav.git] / postproc / swscale.c
CommitLineData
31190492
A
1
2// Software scaling and colorspace conversion routines for MPlayer
3
afa569af 4// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
783e9cc9 5// current version mostly by Michael Niedermayer (michaelni@gmx.at)
d604bab9 6// the parts written by michael are under GNU GPL
783e9cc9 7
d3f41512 8#include <inttypes.h>
dda87e9f 9#include <string.h>
077ea8a7 10#include <math.h>
c1b0bfb4 11#include <stdio.h>
d3f41512 12#include "../config.h"
c1b0bfb4
MN
13#ifdef HAVE_MALLOC_H
14#include <malloc.h>
15#endif
d604bab9 16#include "swscale.h"
7630f2e0 17#include "../cpudetect.h"
541c4eb9 18#undef MOVNTQ
7d7f78b5 19#undef PAVGB
d3f41512 20
783e9cc9 21//#undef HAVE_MMX2
d3f41512 22//#undef HAVE_MMX
783e9cc9 23//#undef ARCH_X86
d604bab9
MN
24#define DITHER1XBPP
25int fullUVIpol=0;
26//disables the unscaled height version
27int allwaysIpol=0;
d3f41512
MN
28
29#define RET 0xC3 //near return opcode
c1b0bfb4
MN
30
31//#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
32#define ASSERT(x) ;
33
34
783e9cc9
MN
35/*
36NOTES
d3f41512 37
d604bab9
MN
38known BUGS with known cause (no bugreports please!, but patches are welcome :) )
39horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
40
c1b0bfb4 41Supported output formats BGR15 BGR16 BGR24 BGR32, YV12
d604bab9
MN
42BGR15 & BGR16 MMX versions support dithering
43Special versions: fast Y 1:1 scaling (no interpolation in y direction)
31190492 44
783e9cc9 45TODO
d604bab9 46more intelligent misalignment avoidance for the horizontal scaler
02a0a992
MN
47dither in C
48change the distance of the u & v buffer
c1b0bfb4
MN
49Move static / global vars into a struct so multiple scalers can be used
50write special vertical cubic upscale version
51Optimize C code (yv12 / minmax)
783e9cc9 52*/
31190492 53
// Small arithmetic helpers.
// They are plain textual macros: an argument can be evaluated more than once, so never
// pass an expression with side effects (i++, a function call that modifies state, ...).
#define ABS(a) (!((a) > 0) ? (-(a)) : (a))
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))
d604bab9 57
7630f2e0
MN
58#ifdef ARCH_X86
59#define CAN_COMPILE_X86_ASM
d604bab9
MN
60#endif
61
7630f2e0 62#ifdef CAN_COMPILE_X86_ASM
// ---- x86 build: 64-bit constants and work buffers, all declared with 8-byte alignment ----
// NOTE(review): within this file the constants below are only touched by
// in_asm_used_var_warning_killer(); presumably they are referenced by name from inline asm
// in swscale_template.c - confirm before renaming or removing any of them.
//
// YUV->RGB coefficients, each a 16-bit value repeated in all four words of the quadword.
// 0x2568, 0x3343 and 0x40cf are the multipliers SwScale_Init() uses for its yuvtab_* tables;
// 0xE5E2 and 0xF36E are -0x1a1e and -0x0c92 in 16-bit two's complement.
d604bab9 63static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
390b20a6
MN
64static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
65static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
66static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
67static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
f62255fb
MN
// per-byte masks: 0xF8 keeps the top 5 bits of every byte, 0xFC the top 6 bits
68static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
69static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
d604bab9
MN
// small constants replicated into each of the four 16-bit words (name = hex value)
70static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
71static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
72static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
077ea8a7 73static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
d604bab9
MN
// byte-select masks: each digit of the name stands for one byte, most significant first
// (1 = 0xFF, 0 = 0x00)
74static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
75static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
76static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
77
3fe8e8f0
MN
// current dither values; no initializer here, and nothing in the code visible in this
// part of the file writes them.
// NOTE(review): presumably loaded from dither4[] / dither8[] by the scaler - confirm
78static volatile uint64_t __attribute__((aligned(8))) b5Dither;
79static volatile uint64_t __attribute__((aligned(8))) g5Dither;
80static volatile uint64_t __attribute__((aligned(8))) g6Dither;
81static volatile uint64_t __attribute__((aligned(8))) r5Dither;
d8fa3c54
MN
82
// two-entry dither pattern tables, four 16-bit values per entry
83static uint64_t __attribute__((aligned(8))) dither4[2]={
84 0x0103010301030103LL,
85 0x0200020002000200LL,};
86
87static uint64_t __attribute__((aligned(8))) dither8[2]={
88 0x0602060206020602LL,
89 0x0004000400040004LL,};
d604bab9
MN
90
// channel bit-field masks, repeated for four 16-bit pixels:
// *16Mask = 5-6-5 layout (0x001F / 0x07E0 / 0xF800), *15Mask = 5-5-5 layout (0x001F / 0x03E0 / 0x7C00)
91static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
92static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
93static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
94static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
95static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
96static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
97
99d2cb72
MN
// masks that select every third byte of a quadword, at the three possible phases
// (counting from the least significant byte: A = bytes 0,3,6  B = bytes 1,4,7  C = bytes 2,5)
98static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
99static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
100static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
101
d604bab9
MN
// uninitialized 64-bit scratch variables
102static uint64_t __attribute__((aligned(8))) temp0;
103static uint64_t __attribute__((aligned(8))) asm_yalpha1;
104static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
783e9cc9 105
c1b0bfb4
MN
// arrays of pointers to int16_t line buffers (luma / chroma), 2000 slots each
106static int16_t __attribute__((aligned(8))) *lumPixBuf[2000];
107static int16_t __attribute__((aligned(8))) *chrPixBuf[2000];
077ea8a7
MN
// horizontal filter storage: 8000 coefficients and 2000 positions per plane type (fixed capacity)
108static int16_t __attribute__((aligned(8))) hLumFilter[8000];
109static int16_t __attribute__((aligned(8))) hLumFilterPos[2000];
110static int16_t __attribute__((aligned(8))) hChrFilter[8000];
111static int16_t __attribute__((aligned(8))) hChrFilterPos[2000];
c1b0bfb4
MN
// vertical filter storage, same capacities as the horizontal ones
112static int16_t __attribute__((aligned(8))) vLumFilter[8000];
113static int16_t __attribute__((aligned(8))) vLumFilterPos[2000];
114static int16_t __attribute__((aligned(8))) vChrFilter[8000];
115static int16_t __attribute__((aligned(8))) vChrFilterPos[2000];
116
117// Contain simply the values from v(Lum|Chr)Filter just nicely packed for mmx
118//FIXME these are very likely too small / 8000 caused problems with 480x480
119static int16_t __attribute__((aligned(8))) lumMmxFilter[16000];
120static int16_t __attribute__((aligned(8))) chrMmxFilter[16000];
d604bab9 121#else
// ---- non-x86 build: the same buffers, declared without the alignment attribute ----
c1b0bfb4
MN
122static int16_t *lumPixBuf[2000];
123static int16_t *chrPixBuf[2000];
077ea8a7
MN
124static int16_t hLumFilter[8000];
125static int16_t hLumFilterPos[2000];
126static int16_t hChrFilter[8000];
127static int16_t hChrFilterPos[2000];
c1b0bfb4
MN
128static int16_t vLumFilter[8000];
129static int16_t vLumFilterPos[2000];
130static int16_t vChrFilter[8000];
131static int16_t vChrFilterPos[2000];
132//FIXME just dummy vars
// one-element placeholders so the names exist in this configuration too
133static int16_t lumMmxFilter[1];
134static int16_t chrMmxFilter[1];
d604bab9 135#endif
783e9cc9
MN
136
137// clipping helper table for C implementations:
138static unsigned char clip_table[768];
139
b18ea156
MN
140static unsigned short clip_table16b[768];
141static unsigned short clip_table16g[768];
142static unsigned short clip_table16r[768];
143static unsigned short clip_table15b[768];
144static unsigned short clip_table15g[768];
145static unsigned short clip_table15r[768];
146
783e9cc9
MN
147// yuv->rgb conversion tables:
148static int yuvtab_2568[256];
149static int yuvtab_3343[256];
150static int yuvtab_0c92[256];
151static int yuvtab_1a1e[256];
152static int yuvtab_40cf[256];
c1b0bfb4
MN
153// Needed for cubic scaler to catch overflows
154static int clip_yuvtab_2568[768];
155static int clip_yuvtab_3343[768];
156static int clip_yuvtab_0c92[768];
157static int clip_yuvtab_1a1e[768];
158static int clip_yuvtab_40cf[768];
159
160static int hLumFilterSize=0;
161static int hChrFilterSize=0;
162static int vLumFilterSize=0;
163static int vChrFilterSize=0;
164static int vLumBufSize=0;
165static int vChrBufSize=0;
077ea8a7
MN
166
167int sws_flags=0;
168
7630f2e0 169#ifdef CAN_COMPILE_X86_ASM
d3f41512
MN
170static uint8_t funnyYCode[10000];
171static uint8_t funnyUVCode[10000];
96034638 172#endif
d3f41512 173
2ff198c1
MN
174static int canMMX2BeUsed=0;
175
7630f2e0 176#ifdef CAN_COMPILE_X86_ASM
96034638
MN
177void in_asm_used_var_warning_killer()
178{
077ea8a7 179 volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
d8fa3c54 180 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
077ea8a7 181 M24A+M24B+M24C+w02 + funnyYCode[0]+ funnyUVCode[0]+b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
96034638
MN
182 if(i) i=0;
183}
184#endif
d604bab9 185
7630f2e0
MN
186//Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
187//Plain C versions
726a959a
MN
// Decide which variants of swscale_template.c get compiled into this file.
// With RUNTIME_CPUDETECT every variant is built (the x86 ones only if CAN_COMPILE_X86_ASM).
// The plain C variant is additionally built whenever HAVE_MMX is not set.
188#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
189#define COMPILE_C
190#endif
191
192#ifdef CAN_COMPILE_X86_ASM
193
// Without RUNTIME_CPUDETECT at most one x86 variant is selected:
// plain MMX only when neither 3DNow nor MMX2 is available ...
194#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
195#define COMPILE_MMX
196#endif
197
// ... MMX2 whenever it is available ...
198#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
199#define COMPILE_MMX2
200#endif
201
// ... and 3DNow only if MMX2 is not (MMX2 takes priority, there is no combined variant)
202#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
203#define COMPILE_3DNOW
204#endif
205#endif //CAN_COMPILE_X86_ASM
206
// From here on the feature macros no longer carry their original configuration values:
// they are cleared so that they can be set individually before each inclusion of the template.
207#undef HAVE_MMX
208#undef HAVE_MMX2
209#undef HAVE_3DNOW
210#undef ARCH_X86
211
212#ifdef COMPILE_C
7630f2e0
MN
213#undef HAVE_MMX
214#undef HAVE_MMX2
215#undef HAVE_3DNOW
216#undef ARCH_X86
217#define RENAME(a) a ## _C
218#include "swscale_template.c"
726a959a 219#endif
397c035e 220
7630f2e0 221#ifdef CAN_COMPILE_X86_ASM
397c035e 222
7630f2e0
MN
223//X86 versions
224/*
225#undef RENAME
226#undef HAVE_MMX
227#undef HAVE_MMX2
228#undef HAVE_3DNOW
229#define ARCH_X86
230#define RENAME(a) a ## _X86
231#include "swscale_template.c"
1faf0867 232*/
7630f2e0 233//MMX versions
726a959a 234#ifdef COMPILE_MMX
7630f2e0
MN
235#undef RENAME
236#define HAVE_MMX
237#undef HAVE_MMX2
238#undef HAVE_3DNOW
239#define ARCH_X86
240#define RENAME(a) a ## _MMX
241#include "swscale_template.c"
726a959a 242#endif
7630f2e0
MN
243
244//MMX2 versions
726a959a 245#ifdef COMPILE_MMX2
7630f2e0
MN
246#undef RENAME
247#define HAVE_MMX
248#define HAVE_MMX2
249#undef HAVE_3DNOW
250#define ARCH_X86
251#define RENAME(a) a ## _MMX2
252#include "swscale_template.c"
726a959a 253#endif
7630f2e0
MN
254
255//3DNOW versions
726a959a 256#ifdef COMPILE_3DNOW
7630f2e0
MN
257#undef RENAME
258#define HAVE_MMX
259#undef HAVE_MMX2
260#define HAVE_3DNOW
261#define ARCH_X86
262#define RENAME(a) a ## _3DNow
263#include "swscale_template.c"
726a959a 264#endif
7630f2e0
MN
265
266#endif //CAN_COMPILE_X86_ASM
267
268// minor note: the HAVE_xyz macros are messed up after that line, so don't use them
d604bab9 269
d3f41512 270
38858470 271// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
31190492
A
272// *** Note: it's called multiple times while decoding a frame, first time y==0
273// *** Designed to upscale, but may work for downscale too.
7630f2e0 274// switching the cpu type during a sliced drawing can have bad effects, like sig11
d1fac6cf
MN
// Public entry point for scaling / converting one slice of a YV12 picture.
// It is only a dispatcher: all eleven arguments are passed through unchanged to one of the
// SwScale_YV12slice_{C,MMX,MMX2,3DNow} variants (the suffix is the RENAME() used when
// swscale_template.c is included).
//  - RUNTIME_CPUDETECT with x86 asm available: chosen on every call from gCpuCaps,
//    fastest variant first (MMX2, then 3DNow, then MMX, then plain C)
//  - RUNTIME_CPUDETECT without x86 asm: always the C variant
//  - otherwise: fixed at compile time by HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX
// NOTE: earlier in this file HAVE_* are #undef'ed and re-#defined around each inclusion of
// the template, so the compile-time branch tests the state left by the last inclusion,
// not the original configuration values.
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY ,
			     int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
			     int srcW, int srcH, int dstW, int dstH){

// every variant takes exactly the parameter list of this wrapper
#define SWS_SLICE_ARGS srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH

#if defined (RUNTIME_CPUDETECT) && defined (CAN_COMPILE_X86_ASM)
	// ordered by speed, fastest first
	if(gCpuCaps.hasMMX2){
		SwScale_YV12slice_MMX2(SWS_SLICE_ARGS);
	}else if(gCpuCaps.has3DNow){
		SwScale_YV12slice_3DNow(SWS_SLICE_ARGS);
	}else if(gCpuCaps.hasMMX){
		SwScale_YV12slice_MMX(SWS_SLICE_ARGS);
	}else{
		SwScale_YV12slice_C(SWS_SLICE_ARGS);
	}
#elif defined (RUNTIME_CPUDETECT)
	SwScale_YV12slice_C(SWS_SLICE_ARGS);
#elif defined (HAVE_MMX2)
	SwScale_YV12slice_MMX2(SWS_SLICE_ARGS);
#elif defined (HAVE_3DNOW)
	SwScale_YV12slice_3DNow(SWS_SLICE_ARGS);
#elif defined (HAVE_MMX)
	SwScale_YV12slice_MMX(SWS_SLICE_ARGS);
#else
	SwScale_YV12slice_C(SWS_SLICE_ARGS);
#endif

#undef SWS_SLICE_ARGS
}
31190492
A
306
307void SwScale_Init(){
308 // generating tables:
309 int i;
c1b0bfb4
MN
310 for(i=0; i<768; i++){
311 int c= MIN(MAX(i-256, 0), 255);
312 clip_table[i]=c;
313 yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
314 yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
315 yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
316 yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
317 yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
31190492
A
318 }
319
b18ea156
MN
320 for(i=0; i<768; i++)
321 {
322 int v= clip_table[i];
323 clip_table16b[i]= v>>3;
324 clip_table16g[i]= (v<<3)&0x07E0;
325 clip_table16r[i]= (v<<8)&0xF800;
326 clip_table15b[i]= v>>3;
327 clip_table15g[i]= (v<<2)&0x03E0;
328 clip_table15r[i]= (v<<7)&0x7C00;
329 }
c1b0bfb4 330
31190492 331}
7630f2e0 332