// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

/*
 supported input formats: YV12 (grayscale soon too)
 supported output formats: YV12, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
*/

#include <inttypes.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "../config.h"
#include "../mangle.h"
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "swscale.h"
#include "../cpudetect.h"
#include "../libvo/img_format.h"
#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP

#define RET 0xC3 // near return opcode

#ifdef MP_DEBUG
#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#else
#define ASSERT(x) ;
#endif

#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif

extern int verbose; // defined in mplayer.c
/*
NOTES

known BUGS with known cause (no bug reports please, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too many (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32, YV12
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
dither in C
change the distance of the u & v buffers
move static / global vars into a struct so multiple scalers can be used
write a special vertical cubic upscale version
optimize the C code (yv12 / minmax)
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif

#ifdef CAN_COMPILE_X86_ASM
static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};

static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;

// FIXME remove
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];
// needed by the cubic scaler to catch overflows
static int clip_yuvtab_2568[768];
static int clip_yuvtab_3343[768];
static int clip_yuvtab_0c92[768];
static int clip_yuvtab_1a1e[768];
static int clip_yuvtab_40cf[768];

// global sws_flags from the command line
int sws_flags=0;

/* cpuCaps combined from cpudetect and what is actually compiled in
   (if support for something is not compiled in, it won't appear here) */
static CpuCaps cpuCaps;

void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
                int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;

#ifdef CAN_COMPILE_X86_ASM
void in_asm_used_var_warning_killer()
{
    volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
        bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
        M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
    if(i) i=0;
}
#endif

static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
    //FIXME optimize (written quickly, not optimized yet)
    int i;
    for(i=0; i<dstW; i++)
    {
        int val=0;
        int j;
        for(j=0; j<lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];

        dest[i]= MIN(MAX(val>>19, 0), 255);
    }

    if(uDest != NULL)
        for(i=0; i<(dstW>>1); i++)
        {
            int u=0;
            int v=0;
            int j;
            for(j=0; j<chrFilterSize; j++)
            {
                u += chrSrc[j][i] * chrFilter[j];
                v += chrSrc[j][i + 2048] * chrFilter[j];
            }

            uDest[i]= MIN(MAX(u>>19, 0), 255);
            vDest[i]= MIN(MAX(v>>19, 0), 255);
        }
}
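/* Implementation note (an inference from initFilter() below, which builds the
   vertical filters with one=(1<<12)-4): the coefficients of each filter sum to
   roughly 1<<12, and the horizontally scaled intermediate lines appear to carry
   about 7 fractional bits, hence the >>19 and the clip to 0..255 above. */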

static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, int dstW, int dstFormat)
{
    if(dstFormat==IMGFMT_BGR32)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];

            dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    else if(dstFormat==IMGFMT_BGR24)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];

            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstFormat==IMGFMT_BGR16)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            ((uint16_t*)dest)[2*i] =
                clip_table16b[(Y1 + Cb) >>13] |
                clip_table16g[(Y1 + Cg) >>13] |
                clip_table16r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[2*i+1] =
                clip_table16b[(Y2 + Cb) >>13] |
                clip_table16g[(Y2 + Cg) >>13] |
                clip_table16r[(Y2 + Cr) >>13];
        }
    }
    else if(dstFormat==IMGFMT_BGR15)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            ((uint16_t*)dest)[2*i] =
                clip_table15b[(Y1 + Cb) >>13] |
                clip_table15g[(Y1 + Cg) >>13] |
                clip_table15r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[2*i+1] =
                clip_table15b[(Y2 + Cb) >>13] |
                clip_table15g[(Y2 + Cg) >>13] |
                clip_table15r[(Y2 + Cr) >>13];
        }
    }
}


// Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one.
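// How the per-CPU versions below are generated: each block redefines RENAME()
// to append a suffix and then #includes swscale_template.c, so RENAME(swScale)
// expands to swScale_C, swScale_MMX, swScale_MMX2 or swScale_3DNow depending on
// the block; globalInit() later picks one of them through the swScale pointer.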
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef CAN_COMPILE_X86_ASM

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //CAN_COMPILE_X86_ASM

#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86

#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif

#ifdef CAN_COMPILE_X86_ASM

//X86 versions
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/
//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif

#endif //CAN_COMPILE_X86_ASM

// minor note: the HAVE_xyz macros are messed up after this point, so don't use them

// old global scaler, don't use for new code
// will use sws_flags from the command line
void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY,
                       int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
                       int srcW, int srcH, int dstW, int dstH){

    static SwsContext *context=NULL;
    int dstFormat;
    int flags=0;
    static int firstTime=1;
    int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};

    if(firstTime)
    {
        flags= SWS_PRINT_INFO;
        firstTime=0;
    }

    switch(dstbpp)
    {
        case 8 : dstFormat= IMGFMT_Y8;    break;
        case 12: dstFormat= IMGFMT_YV12;  break;
        case 15: dstFormat= IMGFMT_BGR15; break;
        case 16: dstFormat= IMGFMT_BGR16; break;
        case 24: dstFormat= IMGFMT_BGR24; break;
        case 32: dstFormat= IMGFMT_BGR32; break;
        default: return;
    }

    switch(sws_flags)
    {
        case 0: flags|= SWS_FAST_BILINEAR; break;
        case 1: flags|= SWS_BILINEAR; break;
        case 2: flags|= SWS_BICUBIC; break;
        case 3: flags|= SWS_X; break;
        default:flags|= SWS_BILINEAR; break;
    }

    if(!context) context=getSwsContext(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat, flags, NULL, NULL);

    swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
}
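
/* Illustrative sketch of the newer SwsContext API that the wrapper above sits on
   top of (example only; the buffer setup and the helper name are assumptions made
   for the example, not taken from existing code). Kept inside #if 0 so it is
   never compiled. */
#if 0
static void example_scale_frame(uint8_t *src[3], int srcStride[3],
                                uint8_t *dst[3], int dstStride[3],
                                int srcW, int srcH, int dstW, int dstH)
{
    // create a context once: YV12 input, BGR24 output, bicubic scaling
    SwsContext *ctx= getSwsContext(srcW, srcH, IMGFMT_YV12,
                                   dstW, dstH, IMGFMT_BGR24,
                                   SWS_BICUBIC | SWS_PRINT_INFO, NULL, NULL);
    if(ctx==NULL) return;

    // scale the whole picture as a single slice starting at line 0
    swScale(ctx, src, srcStride, 0, srcH, dst, dstStride);
}
#endif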

static inline void initFilter(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
                              int srcW, int dstW, int filterAlign, int one, int flags)
{
    int i;
    double filter[10000];
#ifdef ARCH_X86
    if(gCpuCaps.hasMMX)
        asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required, but it IS (even for non-MMX versions)
#endif

    if(ABS(xInc - 0x10000) <10) // unscaled
    {
        int i;
        *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normally
        for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;

        for(i=0; i<dstW; i++)
        {
            filter[i*(*filterSize)]=1;
            filterPos[i]=i;
        }

    }
    else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
    {
        int i;
        int xDstInSrc;
        if     (flags&SWS_BICUBIC) *filterSize= 4;
        else if(flags&SWS_X      ) *filterSize= 4;
        else                       *filterSize= 2;
//      printf("%d %d %d\n", filterSize, srcW, dstW);
        *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));

        xDstInSrc= xInc/2 - 0x8000;
        for(i=0; i<dstW; i++)
        {
            int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
            int j;

            filterPos[i]= xx;
            if((flags & SWS_BICUBIC) || (flags & SWS_X))
            {
                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
                double y1,y2,y3,y4;
                double A= -0.6;
                if(flags & SWS_BICUBIC){
                    // Equation is from VirtualDub
                    y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
                    y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
                    y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
                    y4 = ( + A*d*d - A*d*d*d);
                }else{
                    // cubic interpolation (derived it myself)
                    y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
                    y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
                    y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
                    y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
                }

//              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
                filter[i*(*filterSize) + 0]= y1;
                filter[i*(*filterSize) + 1]= y2;
                filter[i*(*filterSize) + 2]= y3;
                filter[i*(*filterSize) + 3]= y4;
//              printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
            }
            else
            {
                for(j=0; j<*filterSize; j++)
                {
                    double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
                    double coeff= 1.0 - d;
                    if(coeff<0) coeff=0;
//                  printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
                    filter[i*(*filterSize) + j]= coeff;
                    xx++;
                }
            }
            xDstInSrc+= xInc;
        }
    }
    else // downscale
    {
        int xDstInSrc;
        if(flags&SWS_BICUBIC)   *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
        else if(flags&SWS_X)    *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
        else                    *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
//      printf("%d %d %d\n", *filterSize, srcW, dstW);
        *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));

        xDstInSrc= xInc/2 - 0x8000;
        for(i=0; i<dstW; i++)
        {
            int xx= (int)((double)xDstInSrc/(double)(1<<16) - ((*filterSize)-1)*0.5 + 0.5);
            int j;
            filterPos[i]= xx;
            for(j=0; j<*filterSize; j++)
            {
                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
                double coeff;
                if((flags & SWS_BICUBIC) || (flags & SWS_X))
                {
                    double A= -0.75;
//                  d*=2;
                    // Equation is from VirtualDub
                    if(d<1.0)
                        coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
                    else if(d<2.0)
                        coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
                    else
                        coeff=0.0;
                }
/*              else if(flags & SWS_X)
                {
                }*/
                else
                {
                    coeff= 1.0 - d;
                    if(coeff<0) coeff=0;
                }
//              printf("%1.3f %d %d \n", coeff, (int)d, xDstInSrc);
                filter[i*(*filterSize) + j]= coeff;
                xx++;
            }
            xDstInSrc+= xInc;
        }
    }

    //fix borders
    for(i=0; i<dstW; i++)
    {
        int j;
        if(filterPos[i] < 0)
        {
            // move filter coefficients left to compensate for filterPos
            for(j=1; j<*filterSize; j++)
            {
                int left= MAX(j + filterPos[i], 0);
                filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
                filter[i*(*filterSize) + j]=0;
            }
            filterPos[i]= 0;
        }

        if(filterPos[i] + (*filterSize) > srcW)
        {
            int shift= filterPos[i] + (*filterSize) - srcW;
            // move filter coefficients right to compensate for filterPos
            for(j=(*filterSize)-2; j>=0; j--)
            {
                int right= MIN(j + shift, (*filterSize)-1);
                filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
                filter[i*(*filterSize) +j]=0;
            }
            filterPos[i]= srcW - (*filterSize);
        }
    }

    //FIXME try to align filterPos if possible / try to shift filterPos so the zeros
    //      end up at the end and can be skipped later

    //Normalize
    for(i=0; i<dstW; i++)
    {
        int j;
        double sum=0;
        double scale= one;
        for(j=0; j<*filterSize; j++)
        {
            sum+= filter[i*(*filterSize) + j];
        }
        scale/= sum;
        for(j=0; j<*filterSize; j++)
        {
            dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
        }
    }
}
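
/* Layout example (illustrative): for the unscaled case with filterAlign=1 and
   one=1<<14 (the horizontal luma setup in getSwsContext() without MMX), every
   output pixel i ends up with *filterSize==1, filterPos[i]==i and a single
   coefficient that the normalization step scales to one, i.e. dstFilter[i]==16384,
   so the scaler simply passes the input through. */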

#ifdef ARCH_X86
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
{
    uint8_t *fragment;
    int imm8OfPShufW1;
    int imm8OfPShufW2;
    int fragmentLength;

    int xpos, i;

    // create an optimized horizontal scaling routine

    //code fragment

    asm volatile(
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $8, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psrlw $9, %%mm3 \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%edi, %%eax) \n\t"

        "addl $8, %%eax \n\t"
        // End
        "9: \n\t"
//      "int $3 \n\t"
        "leal 0b, %0 \n\t"
        "leal 1b, %1 \n\t"
        "leal 2b, %2 \n\t"
        "decl %1 \n\t"
        "decl %2 \n\t"
        "subl %0, %1 \n\t"
        "subl %0, %2 \n\t"
        "leal 9b, %3 \n\t"
        "subl %0, %3 \n\t"
        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
         "=r" (fragmentLength)
    );

    xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers

    for(i=0; i<dstW/8; i++)
    {
        int xx=xpos>>16;

        if((i&3) == 0)
        {
            int a=0;
            int b=((xpos+xInc)>>16) - xx;
            int c=((xpos+xInc*2)>>16) - xx;
            int d=((xpos+xInc*3)>>16) - xx;

            memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);

            funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
            funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
                a | (b<<2) | (c<<4) | (d<<6);
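            /* Worked example (illustrative): for a 2x upscale, xInc==0x8000 and
               xpos==0 give a=0, b=0, c=1, d=1, so the pshufw immediate becomes
               a|(b<<2)|(c<<4)|(d<<6) == 0x50, which makes the code fragment pick
               source pixels 0,0,1,1 for the four output words. */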

            // if we don't need to read 8 bytes then don't :), reduces the chance of
            // crossing a cache line
            if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;

            funnyCode[fragmentLength*(i+4)/4]= RET;
        }
        xpos+=xInc;
    }
}
#endif // ARCH_X86

//FIXME remove
void SwScale_Init(){
}

static void globalInit(){
    // generating tables:
    int i;
    for(i=0; i<768; i++){
        int c= MIN(MAX(i-256, 0), 255);
        clip_table[i]=c;
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
    }

    for(i=0; i<768; i++)
    {
        int v= clip_table[i];
        clip_table16b[i]= v>>3;
        clip_table16g[i]= (v<<3)&0x07E0;
        clip_table16r[i]= (v<<8)&0xF800;
        clip_table15b[i]= v>>3;
        clip_table15g[i]= (v<<2)&0x03E0;
        clip_table15r[i]= (v<<7)&0x7C00;
    }

    cpuCaps= gCpuCaps;

#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed, fastest first
    if(gCpuCaps.hasMMX2)
        swScale= swScale_MMX2;
    else if(gCpuCaps.has3DNow)
        swScale= swScale_3DNow;
    else if(gCpuCaps.hasMMX)
        swScale= swScale_MMX;
    else
        swScale= swScale_C;

#else
    swScale= swScale_C;
    cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    swScale= swScale_MMX2;
    cpuCaps.has3DNow = 0;
#elif defined (HAVE_3DNOW)
    swScale= swScale_3DNow;
    cpuCaps.hasMMX2 = 0;
#elif defined (HAVE_MMX)
    swScale= swScale_MMX;
    cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
#else
    swScale= swScale_C;
    cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
#endif
#endif //!RUNTIME_CPUDETECT
}


SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
                          SwsFilter *srcFilter, SwsFilter *dstFilter){

    const int widthAlign= dstFormat==IMGFMT_YV12 ? 16 : 8;
    SwsContext *c;
    int i;
//  const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
//  const int over= dstFormat==IMGFMT_YV12 ? (((dstW+15)&(~15))) - dststride
//                : (((dstW+7)&(~7)))*bytespp - dststride;
    if(swScale==NULL) globalInit();

    /* sanity check */
    if(srcW<1 || srcH<1 || dstW<1 || dstH<1) return NULL;
    if(srcW>=SWS_MAX_SIZE || dstW>=SWS_MAX_SIZE || srcH>=SWS_MAX_SIZE || dstH>=SWS_MAX_SIZE)
    {
        fprintf(stderr, "size is too large, increase SWS_MAX_SIZE\n");
        return NULL;
    }

/* FIXME
    if(dstStride[0]%widthAlign !=0 )
    {
        if(flags & SWS_PRINT_INFO)
            fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
                   "SwScaler: ->cannot do aligned memory accesses anymore\n",
                   widthAlign);
    }
*/
    c= memalign(64, sizeof(SwsContext));

    c->srcW= srcW;
    c->srcH= srcH;
    c->dstW= dstW;
    c->dstH= dstH;
    c->lumXInc= ((srcW<<16) + (1<<15))/dstW;
    c->lumYInc= ((srcH<<16) + (1<<15))/dstH;
    c->flags= flags;
    c->dstFormat= dstFormat;
    c->srcFormat= srcFormat;

    if(cpuCaps.hasMMX2)
    {
        c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
        if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
        {
            if(flags&SWS_PRINT_INFO)
                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
        }
    }
    else
        c->canMMX2BeUsed=0;

    // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst,
    // but only for the FAST_BILINEAR mode, otherwise do correct scaling
    // n-2 is the last chrominance sample available
    // this is not perfect, but no one should notice the difference; the more correct variant
    // would be like the vertical one, but that would require some special code for the
    // first and last pixel
    if(flags&SWS_FAST_BILINEAR)
    {
        if(c->canMMX2BeUsed) c->lumXInc+= 20;
        // we don't use the x86 asm scaler if MMX is available
        else if(cpuCaps.hasMMX) c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
    }

    /* set chrXInc & chrDstW */
    if((flags&SWS_FULL_UV_IPOL) && dstFormat!=IMGFMT_YV12)
        c->chrXInc= c->lumXInc>>1, c->chrDstW= dstW;
    else
        c->chrXInc= c->lumXInc, c->chrDstW= (dstW+1)>>1;

    /* set chrYInc & chrDstH */
    if(dstFormat==IMGFMT_YV12) c->chrYInc= c->lumYInc, c->chrDstH= (dstH+1)>>1;
    else                       c->chrYInc= c->lumYInc>>1, c->chrDstH= dstH;

    /* precalculate horizontal scaler filter coefficients */
    {
        const int filterAlign= cpuCaps.hasMMX ? 4 : 1;

        initFilter(c->hLumFilter, c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                   srcW, dstW, filterAlign, 1<<14, flags);
        initFilter(c->hChrFilter, c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
                   (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags);

#ifdef ARCH_X86
// can't downscale !!!
        if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
        {
            initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
            initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
        }
#endif
    } // Init Horizontal stuff

    /* precalculate vertical scaler filter coefficients */
    initFilter(c->vLumFilter, c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
               srcH, dstH, 1, (1<<12)-4, flags);
    initFilter(c->vChrFilter, c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
               (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags);

    // calculate buffer sizes so that they won't run out while handling these damn slices
    c->vLumBufSize= c->vLumFilterSize;
    c->vChrBufSize= c->vChrFilterSize;
    for(i=0; i<dstH; i++)
    {
        int chrI= i*c->chrDstH / dstH;
        int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
                           ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
        nextSlice&= ~1; // slices start at even boundaries
        if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
            c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
        if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
            c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
    }

    // allocate pixbufs (we use dynamic allocation because otherwise we would need to
    // allocate several megabytes to handle all possible cases)
    for(i=0; i<c->vLumBufSize; i++)
        c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
    for(i=0; i<c->vChrBufSize; i++)
        c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);

    // try to avoid drawing green stuff between the right end and the stride end
    for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
    for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);

    ASSERT(c->chrDstH <= dstH)
    ASSERT(c->vLumFilterSize* dstH*4 <= SWS_MAX_SIZE*20)
    ASSERT(c->vChrFilterSize*c->chrDstH*4 <= SWS_MAX_SIZE*20)

    // pack filter data for mmx code
    if(cpuCaps.hasMMX)
    {
        for(i=0; i<c->vLumFilterSize*dstH; i++)
            c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
                c->vLumFilter[i];
        for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
            c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
                c->vChrFilter[i];
    }

    if(flags&SWS_PRINT_INFO)
    {
#ifdef DITHER1XBPP
        char *dither= cpuCaps.hasMMX ? " dithered" : "";
#endif
        if(flags&SWS_FAST_BILINEAR)
            fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
        else if(flags&SWS_BILINEAR)
            fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
        else if(flags&SWS_BICUBIC)
            fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
        else
            fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");

        if(dstFormat==IMGFMT_BGR15)
            fprintf(stderr, "with%s BGR15 output ", dither);
        else if(dstFormat==IMGFMT_BGR16)
            fprintf(stderr, "with%s BGR16 output ", dither);
        else if(dstFormat==IMGFMT_BGR24)
            fprintf(stderr, "with BGR24 output ");
        else if(dstFormat==IMGFMT_BGR32)
            fprintf(stderr, "with BGR32 output ");
        else if(dstFormat==IMGFMT_YV12)
            fprintf(stderr, "with YV12 output ");
        else
            fprintf(stderr, "without output ");

        if(cpuCaps.hasMMX2)
            fprintf(stderr, "using MMX2\n");
        else if(cpuCaps.has3DNow)
            fprintf(stderr, "using 3DNOW\n");
        else if(cpuCaps.hasMMX)
            fprintf(stderr, "using MMX\n");
        else
            fprintf(stderr, "using C\n");
    }

    if((flags & SWS_PRINT_INFO) && verbose)
    {
        if(cpuCaps.hasMMX)
        {
            if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
                printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
            else
            {
                if(c->hLumFilterSize==4)
                    printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
                else if(c->hLumFilterSize==8)
                    printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
                else
                    printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");

                if(c->hChrFilterSize==4)
                    printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
                else if(c->hChrFilterSize==8)
                    printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
                else
                    printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
            }
        }
        else
        {
#ifdef ARCH_X86
            printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
#else
            if(flags & SWS_FAST_BILINEAR)
                printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
            else
                printf("SwScaler: using C scaler for horizontal scaling\n");
#endif
        }
        if(dstFormat==IMGFMT_YV12)
        {
            if(c->vLumFilterSize==1)
                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", cpuCaps.hasMMX ? "MMX" : "C");
            else
                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", cpuCaps.hasMMX ? "MMX" : "C");
        }
        else
        {
            if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
                       "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
            else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
            else
                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
        }

        if(dstFormat==IMGFMT_BGR24)
            printf("SwScaler: using %s YV12->BGR24 Converter\n",
                   cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
        else
            printf("SwScaler: using %s YV12->BGR Converter\n", cpuCaps.hasMMX ? "MMX" : "C"); //FIXME print format

        printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
    }

    return c;
}

/**
 * returns a normalized gaussian curve used to filter stuff
 * quality=3 is high quality, lower is lower quality
 */
double *getGaussian(double variance, double quality){
    const int length= (int)(variance*quality + 0.5) | 1;
    int i;
    double *coeff= memalign(sizeof(double), length*sizeof(double));
    double middle= (length-1)*0.5;

    for(i=0; i<length; i++)
    {
        double dist= i-middle;
        coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
    }

    normalize(coeff, length, 1.0);
    return coeff;
}

void normalize(double *coeff, int length, double height){
    int i;
    double sum=0;
    double inv;

    for(i=0; i<length; i++)
        sum+= coeff[i];

    inv= height/sum;

    for(i=0; i<length; i++)
        coeff[i]*= inv; // scale so that the coefficients sum to height
}

double *conv(double *a, int aLength, double *b, int bLength){
    int length= aLength + bLength - 1;
    double *coeff= memalign(sizeof(double), length*sizeof(double));
    int i, j;

    for(i=0; i<length; i++) coeff[i]= 0.0;

    for(i=0; i<aLength; i++)
    {
        for(j=0; j<bLength; j++)
        {
            coeff[i+j]+= a[i]*b[j];
        }
    }

    return coeff;
}
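
/* Illustrative sketch of how the helpers above can be combined into a blur kernel
   (example only; the variance/quality values and the helper name are arbitrary
   choices for the example). getGaussian() builds a normalized kernel, conv()
   convolves two kernels, and normalize() rescales the result so its coefficients
   sum to a given height. Kept inside #if 0 so it is never compiled. */
#if 0
static double *example_build_blur_kernel(void)
{
    double *g1= getGaussian(1.0, 3.0); // length (int)(1.0*3.0+0.5)|1 == 3
    double *g2= getGaussian(2.0, 3.0); // length (int)(2.0*3.0+0.5)|1 == 7
    double *k = conv(g1, 3, g2, 7);    // convolution has length 3+7-1 == 9
    normalize(k, 9, 1.0);              // make the taps sum to 1.0
    return k;                          // g1/g2 leak here; a real caller would free them
}
#endif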

/*
double *sum(double *a, int aLength, double *b, int bLength){
    int length= MAX(aLength, bLength);
    double *coeff= memalign(sizeof(double), length*sizeof(double));
    int i;

    for(i=0; i<length; i++) coeff[i]= 0.0;

    for(i=0; i<aLength; i++) coeff[i]+= a[i];
}
*/