runtime cpu detect optional (compiles faster)
[libav.git] / postproc / swscale.c
CommitLineData
31190492
A
1
2// Software scaling and colorspace conversion routines for MPlayer
3
afa569af 4// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
783e9cc9 5// current version mostly by Michael Niedermayer (michaelni@gmx.at)
d604bab9 6// the parts written by michael are under GNU GPL
783e9cc9 7
d3f41512 8#include <inttypes.h>
dda87e9f 9#include <string.h>
d3f41512 10#include "../config.h"
d604bab9 11#include "swscale.h"
7630f2e0 12#include "../cpudetect.h"
541c4eb9 13#undef MOVNTQ
7d7f78b5 14#undef PAVGB
d3f41512 15
783e9cc9 16//#undef HAVE_MMX2
d3f41512 17//#undef HAVE_MMX
783e9cc9 18//#undef ARCH_X86
d604bab9
MN
// Compile-time switch; presumably enables dithering for the 15/16 bpp
// outputs (see NOTES) -- confirm against swscale_template.c.
#define DITHER1XBPP
// Run-time option, off by default. NOTE(review): name suggests "full
// chroma (UV) interpolation"; not used in this part of the file.
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
783e9cc9
MN
25/*
26NOTES
d3f41512 27
d604bab9
MN
28known BUGS with known cause (no bugreports please!, but patches are welcome :) )
29horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
30
a525ce8d 31Supported output formats BGR15 BGR16 BGR24 BGR32
d604bab9
MN
32BGR15 & BGR16 MMX versions support dithering
33Special versions: fast Y 1:1 scaling (no interpolation in y direction)
31190492 34
783e9cc9 35TODO
d604bab9 36more intelligent misalignment avoidance for the horizontal scaler
1faf0867 37bicubic scaler
02a0a992
MN
38dither in C
39change the distance of the u & v buffer
7630f2e0 40how to differentiate between x86 and C at runtime ?! (using C for now)
783e9cc9 41*/
31190492 42
// Generic arithmetic helpers.
// Caution: each macro may evaluate an argument twice, so never pass an
// expression with side effects (e.g. ABS(i++)).
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
d604bab9 46
7630f2e0
MN
// The x86 asm variants are only compiled when the build targets x86.
#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif

#ifdef CAN_COMPILE_X86_ASM
// 8-byte aligned 64-bit constants. The coefficient and wXX values hold the
// same 16-bit word repeated four times, the bXX values the same byte
// repeated eight times.
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

// Dither values, not initialised here. NOTE(review): presumably rewritten
// per output line from dither4/dither8 by the template code -- confirm.
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};

// Per-channel bit masks for four packed 16 bpp (5:6:5) and 15 bpp (5:5:5) pixels.
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

// Byte-select masks with a period of three bytes (24 bpp packing).
static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

// Scratch variables shared with the asm code.
static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
// Same line buffers without the alignment requirement of the asm code.
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif
783e9cc9
MN
102
// clipping helper table for C implementations:
// filled by SwScale_Init(): entries 0..255 are 0, 256..511 are 0..255,
// 512..767 are 255.
static unsigned char clip_table[768];

// Same clipping, with the result already masked and shifted into the
// blue/green/red field of a 16 bpp (5:6:5) or 15 bpp (5:5:5) pixel.
static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
// one precomputed product per 8-bit sample value, filled by SwScale_Init();
// the suffix is the hexadecimal coefficient used there.
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];
119
#ifdef CAN_COMPILE_X86_ASM
// Byte buffers. NOTE(review): together with RET (near-return opcode) these
// look like storage for run-time generated scaler code -- confirm in
// swscale_template.c.
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif

// Starts out as 0 (MMX2 path not selected); not modified in this part of the file.
static int canMMX2BeUsed=0;
126
#ifdef CAN_COMPILE_X86_ASM
// Never meant to do anything useful: it references every static constant
// that is otherwise only used from inline asm, so the compiler does not
// emit "defined but not used" warnings for them.
void in_asm_used_var_warning_killer()
{
	int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
	 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
	 M24A+M24B+M24C;
	// consume i so that it is not reported as unused either
	if(i) i=0;
}
#endif
d604bab9 136
7630f2e0
MN
137//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
138//Plain C versions
139#undef HAVE_MMX
140#undef HAVE_MMX2
141#undef HAVE_3DNOW
142#undef ARCH_X86
143#define RENAME(a) a ## _C
144#include "swscale_template.c"
397c035e 145
7630f2e0 146#ifdef CAN_COMPILE_X86_ASM
397c035e 147
7630f2e0
MN
148//X86 versions
149/*
150#undef RENAME
151#undef HAVE_MMX
152#undef HAVE_MMX2
153#undef HAVE_3DNOW
154#define ARCH_X86
155#define RENAME(a) a ## _X86
156#include "swscale_template.c"
1faf0867 157*/
7630f2e0
MN
158//MMX versions
159#undef RENAME
160#define HAVE_MMX
161#undef HAVE_MMX2
162#undef HAVE_3DNOW
163#define ARCH_X86
164#define RENAME(a) a ## _MMX
165#include "swscale_template.c"
166
167//MMX2 versions
168#undef RENAME
169#define HAVE_MMX
170#define HAVE_MMX2
171#undef HAVE_3DNOW
172#define ARCH_X86
173#define RENAME(a) a ## _MMX2
174#include "swscale_template.c"
175
176//3DNOW versions
177#undef RENAME
178#define HAVE_MMX
179#undef HAVE_MMX2
180#define HAVE_3DNOW
181#define ARCH_X86
182#define RENAME(a) a ## _3DNow
183#include "swscale_template.c"
184
185#endif //CAN_COMPILE_X86_ASM
186
187// minor note: the HAVE_xyz is messed up after that line so dont use it
d604bab9 188
d3f41512 189
// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
// switching the cpu type during a sliced drawing can have bad effects, like sig11
//
// This function does no scaling itself: it forwards all arguments unchanged
// to one of the SwScale_YV12slice_<variant> functions generated from
// swscale_template.c. With x86 asm compiled in, the variant is chosen on
// every call from the gCpuCaps flags; otherwise the C variant is always used.
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
			     uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
			     unsigned int s_xinc,unsigned int s_yinc){

#ifdef CAN_COMPILE_X86_ASM
	// ordered per speed, fastest first
	if(gCpuCaps.hasMMX2)
		SwScale_YV12slice_MMX2(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
	else if(gCpuCaps.has3DNow)
		SwScale_YV12slice_3DNow(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
	else if(gCpuCaps.hasMMX)
		SwScale_YV12slice_MMX(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
	else
		SwScale_YV12slice_C(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
#else
	SwScale_YV12slice_C(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp, s_xinc, s_yinc);
#endif

}
31190492
A
219
220void SwScale_Init(){
221 // generating tables:
222 int i;
223 for(i=0;i<256;i++){
224 clip_table[i]=0;
225 clip_table[i+256]=i;
226 clip_table[i+512]=255;
227 yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
228 yuvtab_3343[i]=0x3343*(i-128);
229 yuvtab_0c92[i]=-0x0c92*(i-128);
230 yuvtab_1a1e[i]=-0x1a1e*(i-128);
231 yuvtab_40cf[i]=0x40cf*(i-128);
232 }
233
b18ea156
MN
234 for(i=0; i<768; i++)
235 {
236 int v= clip_table[i];
237 clip_table16b[i]= v>>3;
238 clip_table16g[i]= (v<<3)&0x07E0;
239 clip_table16r[i]= (v<<8)&0xF800;
240 clip_table15b[i]= v>>3;
241 clip_table15g[i]= (v<<2)&0x03E0;
242 clip_table15r[i]= (v<<7)&0x7C00;
243 }
31190492 244}
7630f2e0 245