10l fixes by ("Debabrata Banerjee" <davatar at comcast dot net>)
[libav.git] / libavcodec / libpostproc / postprocess.c
CommitLineData
3057fa66 1/*
9858f773 2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3057fa66
A
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
b304569a
MN
19/**
20 * @file postprocess.c
21 * postprocessing.
22 */
23
3057fa66 24/*
3b58b885 25 C MMX MMX2 3DNow
3057fa66
A
26isVertDC Ec Ec
27isVertMinMaxOk Ec Ec
3b58b885 28doVertLowPass E e e
7f16f6e6 29doVertDefFilter Ec Ec e e
3057fa66 30isHorizDC Ec Ec
4e4dcbc5
MN
31isHorizMinMaxOk a E
32doHorizLowPass E e e
7f16f6e6 33doHorizDefFilter Ec Ec e e
2e212618 34deRing E e e*
3b58b885 35Vertical RKAlgo1 E a a
e5c30e06 36Horizontal RKAlgo1 a a
117e45b0
MN
37Vertical X1# a E E
38Horizontal X1# a E E
acced553
MN
39LinIpolDeinterlace e E E*
40CubicIpolDeinterlace a e e*
41LinBlendDeinterlace e E E*
9b1663fc 42MedianDeinterlace# E Ec Ec
be44a4d7 43TempDeNoiser# E e e
d5a1a995 44
117e45b0
MN
45* I don't have a 3DNow! CPU, so this is untested; nobody has reported that it doesn't work, so it seems to work
46# more or less self-invented filters, so the exactness isn't too meaningful
3057fa66 47E = Exact implementation
acced553 48e = almost exact implementation (slightly different rounding,...)
3057fa66
A
49a = alternative / approximate impl
50c = checked against the other implementations (-vo md5)
51*/
52
53/*
54TODO:
3057fa66 55reduce the time wasted on the mem transfer
3057fa66 56unroll stuff if instructions depend too much on the prior one
3057fa66 57move YScale thing to the end instead of fixing QP
13e00528 58write a faster and higher quality deblocking filter :)
d5a1a995
MN
59make the mainloop more flexible (variable number of blocks at once
60 (the if/else stuff per block is slowing things down)
9f45d04d 61compare the quality & speed of all filters
9f45d04d 62split this huge file
8405b3fd 63optimize c versions
117e45b0 64try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
3057fa66 65...
13e00528
A
66*/
67
a6be8111 68//Changelog: use the CVS log
3057fa66 69
9858f773 70#include "config.h"
3057fa66
A
71#include <inttypes.h>
72#include <stdio.h>
d5a1a995 73#include <stdlib.h>
911879d1 74#include <string.h>
dda87e9f
PL
75#ifdef HAVE_MALLOC_H
76#include <malloc.h>
77#endif
3057fa66 78//#undef HAVE_MMX2
13e00528 79//#define HAVE_3DNOW
3057fa66 80//#undef HAVE_MMX
cc9b0679 81//#undef ARCH_X86
7f16f6e6 82//#define DEBUG_BRIGHTNESS
bba9b16c 83#ifdef USE_FASTMEMCPY
0a87c409 84#include "fastmemcpy.h"
70d4f2da 85#endif
13e00528 86#include "postprocess.h"
c41d972d 87#include "postprocess_internal.h"
bba9b16c
MN
88
89#include "mangle.h" //FIXME should be supressed
3057fa66 90
ca390e72
ZK
91#ifndef HAVE_MEMALIGN
92#define memalign(a,b) malloc(b)
93#endif
94
e939e1c3
A
95#define MIN(a,b) ((a) > (b) ? (b) : (a))
96#define MAX(a,b) ((a) < (b) ? (b) : (a))
97#define ABS(a) ((a) > 0 ? (a) : (-(a)))
98#define SIGN(a) ((a) > 0 ? 1 : -1)
99
911879d1
MN
100#define GET_MODE_BUFFER_SIZE 500
101#define OPTIONS_ARRAY_SIZE 10
9c9e467d
MN
102#define BLOCK_SIZE 8
103#define TEMP_STRIDE 8
104//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
911879d1 105
3f1d4e96
DB
106#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
107# define attribute_used __attribute__((used))
108#else
109# define attribute_used
110#endif
111
cc9b0679 112#ifdef ARCH_X86
3f1d4e96
DB
113static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
114static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
115static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
116static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
117static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
118static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
119static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
b28daef8 120#endif
3057fa66 121
134eb1e5
MN
122
123static uint8_t clip_table[3*256];
124static uint8_t * const clip_tab= clip_table + 256;
125
4df8ca9d 126static const int verbose= 0;
45b4f285 127
3f1d4e96 128static const int attribute_used deringThreshold= 20;
3057fa66 129
9c9e467d 130
911879d1
MN
131static struct PPFilter filters[]=
132{
133 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
134 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
9c9e467d
MN
135/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
136 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
911879d1
MN
137 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
138 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
139 {"dr", "dering", 1, 5, 6, DERING},
140 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
43d52f76
MN
141 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
142 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
143 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
144 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
9c9e467d 145 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
134eb1e5 146 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
117e45b0 147 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
8aaac435 148 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
911879d1
MN
149 {NULL, NULL,0,0,0,0} //End Marker
150};
151
152static char *replaceTable[]=
153{
117e45b0
MN
154 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
155 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
156 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
157 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
911879d1
MN
158 NULL //End Marker
159};
160
3057fa66 161
9c9e467d 162#ifdef ARCH_X86
3057fa66
A
/**
 * Emit the x86 "prefetchnta" instruction for the address p.
 * Only a cache hint: no value is produced and nothing is written.
 */
static inline void prefetchnta(void *p)
{
	__asm__ __volatile__(
		"prefetchnta (%0)\n\t"
		: /* no outputs */
		: "r" (p)
	);
}
169
/**
 * Emit the x86 "prefetcht0" instruction for the address p.
 * Only a cache hint: no value is produced and nothing is written.
 */
static inline void prefetcht0(void *p)
{
	__asm__ __volatile__(
		"prefetcht0 (%0)\n\t"
		: /* no outputs */
		: "r" (p)
	);
}
176
/**
 * Emit the x86 "prefetcht1" instruction for the address p.
 * Only a cache hint: no value is produced and nothing is written.
 */
static inline void prefetcht1(void *p)
{
	__asm__ __volatile__(
		"prefetcht1 (%0)\n\t"
		: /* no outputs */
		: "r" (p)
	);
}
183
/**
 * Emit the x86 "prefetcht2" instruction for the address p.
 * Only a cache hint: no value is produced and nothing is written.
 */
static inline void prefetcht2(void *p)
{
	__asm__ __volatile__(
		"prefetcht2 (%0)\n\t"
		: /* no outputs */
		: "r" (p)
	);
}
9a722af7 190#endif
3057fa66 191
cc9b0679 192// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
3057fa66 193
cf5ec61d
MN
194/**
195 * Check if the given 8x8 Block is mostly "flat"
196 */
9c9e467d 197static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
cf5ec61d
MN
198{
199 int numEq= 0;
200 int y;
0426af31 201 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
ec487e5d 202 const int dcThreshold= dcOffset*2 + 1;
0426af31 203
cf5ec61d
MN
204 for(y=0; y<BLOCK_SIZE; y++)
205 {
9c9e467d
MN
206 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
207 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
208 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
209 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
210 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
212 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
213 src+= stride;
214 }
215 return numEq > c->ppMode.flatnessThreshold;
216}
217
218/**
219 * Check if the middle 8x8 Block in the given 8x16 block is flat
220 */
221static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
222 int numEq= 0;
223 int y;
0426af31 224 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
ec487e5d 225 const int dcThreshold= dcOffset*2 + 1;
0426af31 226
9c9e467d
MN
227 src+= stride*4; // src points to begin of the 8x8 Block
228 for(y=0; y<BLOCK_SIZE-1; y++)
229 {
230 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
231 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
232 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
233 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
cf5ec61d
MN
238 src+= stride;
239 }
9c9e467d 240 return numEq > c->ppMode.flatnessThreshold;
cf5ec61d
MN
241}
242
/**
 * Cheap horizontal range check over 8 rows.
 *
 * One pixel pair is sampled per row, cycling through the column pairs
 * (0,5), (2,7), (4,1), (6,3). The check fails as soon as one sampled
 * difference lies outside [-2*QP, 2*QP].
 *
 * @param src    first pixel of the first row
 * @param stride distance in bytes between two rows
 * @param QP     quantizer used to scale the allowed difference
 * @return 1 if every sampled pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	static const int firstCol[4] = {0, 2, 4, 6};
	static const int secondCol[4]= {5, 7, 1, 3};
	int row;

	for(row=0; row<8; row++, src+= stride)
	{
		const int k= row & 3;
		// the unsigned compare rejects both diff < -2*QP and diff > 2*QP
		if((unsigned)(src[firstCol[k]] - src[secondCol[k]] + 2*QP) > 4*QP)
			return 0;
	}
	return 1;
}
cf5ec61d 265
cb482d25
MN
266static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
267{
268#if 1
269#if 1
270 int x;
271 src+= stride*4;
272 for(x=0; x<BLOCK_SIZE; x+=4)
273 {
274 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
275 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
276 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
277 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
278 }
279#else
280 int x;
281 src+= stride*3;
282 for(x=0; x<BLOCK_SIZE; x++)
283 {
284 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
285 }
286#endif
287 return 1;
288#else
289 int x;
290 src+= stride*4;
291 for(x=0; x<BLOCK_SIZE; x++)
292 {
293 int min=255;
294 int max=0;
295 int y;
296 for(y=0; y<8; y++){
297 int v= src[x + y*stride];
298 if(v>max) max=v;
299 if(v<min) min=v;
300 }
301 if(max-min > 2*QP) return 0;
302 }
cf5ec61d 303 return 1;
cb482d25
MN
304#endif
305}
306
307static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
308 if( isVertDC_C(src, stride, c) ){
309 if( isVertMinMaxOk_C(src, stride, c->QP) )
310 return 1;
311 else
312 return 0;
313 }else{
314 return 2;
315 }
cf5ec61d
MN
316}
317
318static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
319{
320 int y;
321 for(y=0; y<BLOCK_SIZE; y++)
322 {
323 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
324
325 if(ABS(middleEnergy) < 8*QP)
326 {
327 const int q=(dst[3] - dst[4])/2;
328 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
329 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
330
331 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
332 d= MAX(d, 0);
333
334 d= (5*d + 32) >> 6;
335 d*= SIGN(-middleEnergy);
336
337 if(q>0)
338 {
339 d= d<0 ? 0 : d;
340 d= d>q ? q : d;
341 }
342 else
343 {
344 d= d>0 ? 0 : d;
345 d= d<q ? q : d;
346 }
347
348 dst[3]-= d;
349 dst[4]+= d;
350 }
351 dst+= stride;
352 }
353}
354
355/**
356 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
357 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
358 */
359static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
360{
361
362 int y;
363 for(y=0; y<BLOCK_SIZE; y++)
364 {
365 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
366 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
367
368 int sums[9];
369 sums[0] = first + dst[0];
370 sums[1] = dst[0] + dst[1];
371 sums[2] = dst[1] + dst[2];
372 sums[3] = dst[2] + dst[3];
373 sums[4] = dst[3] + dst[4];
374 sums[5] = dst[4] + dst[5];
375 sums[6] = dst[5] + dst[6];
376 sums[7] = dst[6] + dst[7];
377 sums[8] = dst[7] + last;
378
379 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
380 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
381 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
382 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
383 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
384 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
385 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
386 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
387
388 dst+= stride;
389 }
390}
391
4e4dcbc5 392/**
cc9b0679
MN
393 * Experimental Filter 1 (Horizontal)
394 * will not damage linear gradients
395 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
396 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
397 * MMX2 version does correct clipping C version doesnt
398 * not identical with the vertical one
4e4dcbc5 399 */
cc9b0679
MN
400static inline void horizX1Filter(uint8_t *src, int stride, int QP)
401{
117e45b0 402 int y;
cc9b0679
MN
403 static uint64_t *lut= NULL;
404 if(lut==NULL)
117e45b0 405 {
cc9b0679
MN
406 int i;
407 lut= (uint64_t*)memalign(8, 256*8);
408 for(i=0; i<256; i++)
117e45b0 409 {
cc9b0679 410 int v= i < 128 ? 2*i : 2*(i-256);
117e45b0 411/*
cc9b0679
MN
412//Simulate 112242211 9-Tap filter
413 uint64_t a= (v/16) & 0xFF;
414 uint64_t b= (v/8) & 0xFF;
415 uint64_t c= (v/4) & 0xFF;
416 uint64_t d= (3*v/8) & 0xFF;
117e45b0 417*/
cc9b0679
MN
418//Simulate piecewise linear interpolation
419 uint64_t a= (v/16) & 0xFF;
420 uint64_t b= (v*3/16) & 0xFF;
421 uint64_t c= (v*5/16) & 0xFF;
422 uint64_t d= (7*v/16) & 0xFF;
423 uint64_t A= (0x100 - a)&0xFF;
424 uint64_t B= (0x100 - b)&0xFF;
425 uint64_t C= (0x100 - c)&0xFF;
426 uint64_t D= (0x100 - c)&0xFF;
427
428 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
429 (D<<24) | (C<<16) | (B<<8) | (A);
430 //lut[i] = (v<<32) | (v<<24);
117e45b0
MN
431 }
432 }
cc9b0679
MN
433
434 for(y=0; y<BLOCK_SIZE; y++)
117e45b0 435 {
cc9b0679
MN
436 int a= src[1] - src[2];
437 int b= src[3] - src[4];
438 int c= src[5] - src[6];
439
440 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
441
442 if(d < QP)
117e45b0 443 {
cc9b0679
MN
444 int v = d * SIGN(-b);
445
446 src[1] +=v/8;
447 src[2] +=v/4;
448 src[3] +=3*v/8;
449 src[4] -=3*v/8;
450 src[5] -=v/4;
451 src[6] -=v/8;
452
117e45b0 453 }
cc9b0679 454 src+=stride;
117e45b0 455 }
cc9b0679
MN
456}
457
458
e89952aa 459//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
cc9b0679 460//Plain C versions
e89952aa
MN
461#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
462#define COMPILE_C
463#endif
464
9c9e467d 465#ifdef ARCH_X86
e89952aa
MN
466
467#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
468#define COMPILE_MMX
469#endif
470
471#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
472#define COMPILE_MMX2
473#endif
474
475#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
476#define COMPILE_3DNOW
477#endif
9c9e467d 478#endif //ARCH_X86
e89952aa
MN
479
480#undef HAVE_MMX
481#undef HAVE_MMX2
482#undef HAVE_3DNOW
483#undef ARCH_X86
484
485#ifdef COMPILE_C
cc9b0679
MN
486#undef HAVE_MMX
487#undef HAVE_MMX2
488#undef HAVE_3DNOW
489#undef ARCH_X86
490#define RENAME(a) a ## _C
491#include "postprocess_template.c"
e89952aa 492#endif
cc9b0679
MN
493
494//MMX versions
e89952aa 495#ifdef COMPILE_MMX
cc9b0679
MN
496#undef RENAME
497#define HAVE_MMX
498#undef HAVE_MMX2
499#undef HAVE_3DNOW
500#define ARCH_X86
501#define RENAME(a) a ## _MMX
502#include "postprocess_template.c"
e89952aa 503#endif
cc9b0679
MN
504
505//MMX2 versions
e89952aa 506#ifdef COMPILE_MMX2
cc9b0679
MN
507#undef RENAME
508#define HAVE_MMX
509#define HAVE_MMX2
510#undef HAVE_3DNOW
511#define ARCH_X86
512#define RENAME(a) a ## _MMX2
513#include "postprocess_template.c"
e89952aa 514#endif
cc9b0679
MN
515
516//3DNOW versions
e89952aa 517#ifdef COMPILE_3DNOW
cc9b0679
MN
518#undef RENAME
519#define HAVE_MMX
520#undef HAVE_MMX2
521#define HAVE_3DNOW
522#define ARCH_X86
523#define RENAME(a) a ## _3DNow
524#include "postprocess_template.c"
e89952aa 525#endif
cc9b0679
MN
526
527// minor note: the HAVE_xyz macros are messed up after this point, so don't use them
528
529static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
c41d972d 530 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
cc9b0679 531{
9c9e467d 532 PPContext *c= (PPContext *)vc;
c41d972d 533 PPMode *ppMode= (PPMode *)vm;
9c9e467d
MN
534 c->ppMode= *ppMode; //FIXME
535
cc9b0679
MN
536 // useing ifs here as they are faster than function pointers allthough the
537 // difference wouldnt be messureable here but its much better because
538 // someone might exchange the cpu whithout restarting mplayer ;)
e89952aa 539#ifdef RUNTIME_CPUDETECT
9c9e467d 540#ifdef ARCH_X86
cc9b0679 541 // ordered per speed fasterst first
fa6ea14e 542 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
9c9e467d 543 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
fa6ea14e 544 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
9c9e467d 545 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
fa6ea14e 546 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
9c9e467d 547 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
cc9b0679 548 else
9c9e467d 549 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
cc9b0679 550#else
9c9e467d 551 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
be44a4d7 552#endif
e89952aa
MN
553#else //RUNTIME_CPUDETECT
554#ifdef HAVE_MMX2
9c9e467d 555 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 556#elif defined (HAVE_3DNOW)
9c9e467d 557 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 558#elif defined (HAVE_MMX)
9c9e467d 559 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 560#else
9c9e467d 561 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa
MN
562#endif
563#endif //!RUNTIME_CPUDETECT
117e45b0
MN
564}
565
cc9b0679
MN
566//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
567// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
13e00528 568
911879d1 569/* -pp Command line Help
911879d1 570*/
4407a3c4 571char *pp_help=
b01be121 572"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
4b001a13 573"long form example:\n"
b01be121 574"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
4b001a13 575"short form example:\n"
b01be121 576"vb:a/hb:a/lb de,-vb\n"
4b001a13 577"more examples:\n"
1d9324fd 578"tn:64:128:256\n"
4b001a13
MN
579"Filters Options\n"
580"short long name short long option Description\n"
6423d073
MM
581"* * a autoq CPU power dependent enabler\n"
582" c chrom chrominance filtering enabled\n"
583" y nochrom chrominance filtering disabled\n"
584"hb hdeblock (2 threshold) horizontal deblocking filter\n"
68bf295e
MN
585" 1. difference factor: default=32, higher -> more deblocking\n"
586" 2. flatness threshold: default=39, lower -> more deblocking\n"
4b001a13 587" the h & v deblocking filters share these\n"
6423d073
MM
588" so you can't set different thresholds for h / v\n"
589"vb vdeblock (2 threshold) vertical deblocking filter\n"
590"h1 x1hdeblock experimental h deblock filter 1\n"
591"v1 x1vdeblock experimental v deblock filter 1\n"
592"dr dering deringing filter\n"
4b001a13
MN
593"al autolevels automatic brightness / contrast\n"
594" f fullyrange stretch luminance to (0..255)\n"
595"lb linblenddeint linear blend deinterlacer\n"
596"li linipoldeint linear interpolating deinterlace\n"
597"ci cubicipoldeint cubic interpolating deinterlacer\n"
598"md mediandeint median deinterlacer\n"
9c9e467d 599"fd ffmpegdeint ffmpeg deinterlacer\n"
4b001a13
MN
600"de default hb:a,vb:a,dr:a,al\n"
601"fa fast h1:a,v1:a,dr:a,al\n"
6423d073 602"tn tmpnoise (3 threshold) temporal noise reducer\n"
4b001a13 603" 1. <= 2. <= 3. larger -> stronger filtering\n"
6423d073 604"fq forceQuant <quantizer> force quantizer\n"
4b001a13 605;
911879d1 606
c41d972d 607pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
911879d1
MN
608{
609 char temp[GET_MODE_BUFFER_SIZE];
610 char *p= temp;
9c9e467d 611 char *filterDelimiters= ",/";
911879d1 612 char *optionDelimiters= ":";
c41d972d 613 struct PPMode *ppMode;
911879d1
MN
614 char *filterToken;
615
c41d972d
MN
616 ppMode= memalign(8, sizeof(PPMode));
617
618 ppMode->lumMode= 0;
619 ppMode->chromMode= 0;
620 ppMode->maxTmpNoise[0]= 700;
621 ppMode->maxTmpNoise[1]= 1500;
622 ppMode->maxTmpNoise[2]= 3000;
623 ppMode->maxAllowedY= 234;
624 ppMode->minAllowedY= 16;
68bf295e
MN
625 ppMode->baseDcDiff= 256/8;
626 ppMode->flatnessThreshold= 56-16-1;
c41d972d
MN
627 ppMode->maxClippedThreshold= 0.01;
628 ppMode->error=0;
df8d4d0e 629
911879d1
MN
630 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
631
162c9c2e 632 if(verbose>1) printf("pp: %s\n", name);
117e45b0 633
911879d1 634 for(;;){
911879d1 635 char *filterName;
326d40af 636 int q= 1000000; //PP_QUALITY_MAX;
911879d1
MN
637 int chrom=-1;
638 char *option;
639 char *options[OPTIONS_ARRAY_SIZE];
640 int i;
641 int filterNameOk=0;
642 int numOfUnknownOptions=0;
643 int enable=1; //does the user want us to enabled or disabled the filter
644
645 filterToken= strtok(p, filterDelimiters);
646 if(filterToken == NULL) break;
117e45b0 647 p+= strlen(filterToken) + 1; // p points to next filterToken
911879d1 648 filterName= strtok(filterToken, optionDelimiters);
162c9c2e 649 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
911879d1
MN
650
651 if(*filterName == '-')
652 {
653 enable=0;
654 filterName++;
655 }
117e45b0 656
911879d1
MN
657 for(;;){ //for all options
658 option= strtok(NULL, optionDelimiters);
659 if(option == NULL) break;
660
162c9c2e 661 if(verbose>1) printf("pp: option: %s\n", option);
911879d1
MN
662 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
663 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
664 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
665 else
666 {
667 options[numOfUnknownOptions] = option;
668 numOfUnknownOptions++;
911879d1
MN
669 }
670 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
671 }
117e45b0 672 options[numOfUnknownOptions] = NULL;
911879d1
MN
673
674 /* replace stuff from the replace Table */
675 for(i=0; replaceTable[2*i]!=NULL; i++)
676 {
677 if(!strcmp(replaceTable[2*i], filterName))
678 {
679 int newlen= strlen(replaceTable[2*i + 1]);
680 int plen;
681 int spaceLeft;
682
683 if(p==NULL) p= temp, *p=0; //last filter
684 else p--, *p=','; //not last filter
685
686 plen= strlen(p);
8cd91a44 687 spaceLeft= p - temp + plen;
911879d1
MN
688 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
689 {
c41d972d 690 ppMode->error++;
911879d1
MN
691 break;
692 }
693 memmove(p + newlen, p, plen+1);
694 memcpy(p, replaceTable[2*i + 1], newlen);
695 filterNameOk=1;
696 }
697 }
698
699 for(i=0; filters[i].shortName!=NULL; i++)
700 {
117e45b0 701// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
911879d1
MN
702 if( !strcmp(filters[i].longName, filterName)
703 || !strcmp(filters[i].shortName, filterName))
704 {
c41d972d
MN
705 ppMode->lumMode &= ~filters[i].mask;
706 ppMode->chromMode &= ~filters[i].mask;
911879d1
MN
707
708 filterNameOk=1;
709 if(!enable) break; // user wants to disable it
710
711 if(q >= filters[i].minLumQuality)
c41d972d 712 ppMode->lumMode|= filters[i].mask;
911879d1
MN
713 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
714 if(q >= filters[i].minChromQuality)
c41d972d 715 ppMode->chromMode|= filters[i].mask;
911879d1
MN
716
717 if(filters[i].mask == LEVEL_FIX)
718 {
719 int o;
c41d972d
MN
720 ppMode->minAllowedY= 16;
721 ppMode->maxAllowedY= 234;
911879d1 722 for(o=0; options[o]!=NULL; o++)
07f8991b 723 {
911879d1
MN
724 if( !strcmp(options[o],"fullyrange")
725 ||!strcmp(options[o],"f"))
726 {
c41d972d
MN
727 ppMode->minAllowedY= 0;
728 ppMode->maxAllowedY= 255;
911879d1
MN
729 numOfUnknownOptions--;
730 }
07f8991b 731 }
911879d1 732 }
117e45b0
MN
733 else if(filters[i].mask == TEMP_NOISE_FILTER)
734 {
735 int o;
736 int numOfNoises=0;
117e45b0
MN
737
738 for(o=0; options[o]!=NULL; o++)
739 {
740 char *tail;
c41d972d 741 ppMode->maxTmpNoise[numOfNoises]=
117e45b0
MN
742 strtol(options[o], &tail, 0);
743 if(tail!=options[o])
744 {
745 numOfNoises++;
746 numOfUnknownOptions--;
747 if(numOfNoises >= 3) break;
748 }
749 }
750 }
43d52f76
MN
751 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
752 {
753 int o;
754
755 for(o=0; options[o]!=NULL && o<2; o++)
756 {
757 char *tail;
758 int val= strtol(options[o], &tail, 0);
759 if(tail==options[o]) break;
760
761 numOfUnknownOptions--;
c41d972d
MN
762 if(o==0) ppMode->baseDcDiff= val;
763 else ppMode->flatnessThreshold= val;
43d52f76
MN
764 }
765 }
8aaac435
MN
766 else if(filters[i].mask == FORCE_QUANT)
767 {
768 int o;
c41d972d 769 ppMode->forcedQuant= 15;
8aaac435
MN
770
771 for(o=0; options[o]!=NULL && o<1; o++)
772 {
773 char *tail;
774 int val= strtol(options[o], &tail, 0);
775 if(tail==options[o]) break;
776
777 numOfUnknownOptions--;
c41d972d 778 ppMode->forcedQuant= val;
8aaac435
MN
779 }
780 }
911879d1
MN
781 }
782 }
c41d972d
MN
783 if(!filterNameOk) ppMode->error++;
784 ppMode->error += numOfUnknownOptions;
911879d1
MN
785 }
786
c41d972d
MN
787 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
788 if(ppMode->error)
789 {
790 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
791 free(ppMode);
792 return NULL;
793 }
911879d1
MN
794 return ppMode;
795}
796
c41d972d
MN
797void pp_free_mode(pp_mode_t *mode){
798 if(mode) free(mode);
799}
800
88c0bc7e
MN
801static void reallocAlign(void **p, int alignment, int size){
802 if(*p) free(*p);
803 *p= memalign(alignment, size);
804 memset(*p, 0, size);
805}
806
0426af31 807static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
ec487e5d
MN
808 int mbWidth = (width+15)>>4;
809 int mbHeight= (height+15)>>4;
88c0bc7e
MN
810 int i;
811
812 c->stride= stride;
0426af31 813 c->qpStride= qpStride;
9c9e467d 814
88c0bc7e
MN
815 reallocAlign((void **)&c->tempDst, 8, stride*24);
816 reallocAlign((void **)&c->tempSrc, 8, stride*24);
817 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
818 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
9c9e467d
MN
819 for(i=0; i<256; i++)
820 c->yHistogram[i]= width*height/64*15/256;
821
822 for(i=0; i<3; i++)
211c4920 823 {
9c9e467d 824 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
88c0bc7e
MN
825 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
826 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
211c4920 827 }
45b4f285 828
134eb1e5 829 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
0426af31
MN
830 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
831 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
88c0bc7e
MN
832 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
833}
834
4cfbf61b 835static void global_init(void){
134eb1e5
MN
836 int i;
837 memset(clip_table, 0, 256);
838 for(i=256; i<512; i++)
839 clip_table[i]= i;
840 memset(clip_table+512, 0, 256);
841}
842
88c0bc7e
MN
843pp_context_t *pp_get_context(int width, int height, int cpuCaps){
844 PPContext *c= memalign(32, sizeof(PPContext));
88c0bc7e 845 int stride= (width+15)&(~15); //assumed / will realloc if needed
0426af31 846 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
88c0bc7e 847
134eb1e5
MN
848 global_init();
849
88c0bc7e
MN
850 memset(c, 0, sizeof(PPContext));
851 c->cpuCaps= cpuCaps;
e9effafd
MN
852 if(cpuCaps&PP_FORMAT){
853 c->hChromaSubSample= cpuCaps&0x3;
854 c->vChromaSubSample= (cpuCaps>>4)&0x3;
855 }else{
856 c->hChromaSubSample= 1;
857 c->vChromaSubSample= 1;
858 }
88c0bc7e 859
0426af31 860 reallocBuffers(c, width, height, stride, qpStride);
88c0bc7e 861
9c9e467d 862 c->frameNum=-1;
45b4f285 863
9c9e467d 864 return c;
45b4f285
MN
865}
866
9cb54f43 867void pp_free_context(void *vc){
9c9e467d
MN
868 PPContext *c = (PPContext*)vc;
869 int i;
870
871 for(i=0; i<3; i++) free(c->tempBlured[i]);
872 for(i=0; i<3; i++) free(c->tempBluredPast[i]);
873
874 free(c->tempBlocks);
875 free(c->yHistogram);
876 free(c->tempDst);
877 free(c->tempSrc);
9c9e467d 878 free(c->deintTemp);
0426af31 879 free(c->stdQPTable);
ec487e5d 880 free(c->nonBQPTable);
88c0bc7e
MN
881 free(c->forcedQPTable);
882
883 memset(c, 0, sizeof(PPContext));
884
9c9e467d
MN
885 free(c);
886}
887
9cb54f43 888void pp_postprocess(uint8_t * src[3], int srcStride[3],
9c9e467d 889 uint8_t * dst[3], int dstStride[3],
ec487e5d 890 int width, int height,
9c9e467d 891 QP_STORE_T *QP_store, int QPStride,
c41d972d 892 pp_mode_t *vm, void *vc, int pict_type)
911879d1 893{
ec487e5d
MN
894 int mbWidth = (width+15)>>4;
895 int mbHeight= (height+15)>>4;
c41d972d 896 PPMode *mode = (PPMode*)vm;
ec487e5d 897 PPContext *c = (PPContext*)vc;
88c0bc7e 898 int minStride= MAX(srcStride[0], dstStride[0]);
0426af31
MN
899
900 if(c->stride < minStride || c->qpStride < QPStride)
901 reallocBuffers(c, width, height,
902 MAX(minStride, c->stride),
903 MAX(c->qpStride, QPStride));
9c9e467d 904
8aaac435 905 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
815cbfe7 906 {
8aaac435 907 int i;
88c0bc7e 908 QP_store= c->forcedQPTable;
9c9e467d 909 QPStride= 0;
8aaac435 910 if(mode->lumMode & FORCE_QUANT)
88c0bc7e 911 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
8aaac435 912 else
88c0bc7e 913 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
815cbfe7 914 }
0426af31
MN
915//printf("pict_type:%d\n", pict_type);
916
917 if(pict_type & PP_PICT_TYPE_QP2){
918 int i;
919 const int count= mbHeight * QPStride;
920 for(i=0; i<(count>>2); i++){
921 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
922 }
923 for(i<<=2; i<count; i++){
924 c->stdQPTable[i] = QP_store[i]>>1;
925 }
926 QP_store= c->stdQPTable;
927 }
928
ec487e5d
MN
929if(0){
930int x,y;
931for(y=0; y<mbHeight; y++){
932 for(x=0; x<mbWidth; x++){
933 printf("%2d ", QP_store[x + y*QPStride]);
934 }
935 printf("\n");
936}
937 printf("\n");
938}
51e19dcc 939
0426af31 940 if((pict_type&7)!=3)
ec487e5d 941 {
0426af31
MN
942 int i;
943 const int count= mbHeight * QPStride;
944 for(i=0; i<(count>>2); i++){
2e90b37c 945 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
0426af31
MN
946 }
947 for(i<<=2; i<count; i++){
2e90b37c 948 c->nonBQPTable[i] = QP_store[i] & 0x3F;
ec487e5d
MN
949 }
950 }
815cbfe7 951
df8d4d0e 952 if(verbose>2)
162c9c2e
MN
953 {
954 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
162c9c2e
MN
955 }
956
9c9e467d 957 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
b2a3fcb7 958 width, height, QP_store, QPStride, 0, mode, c);
911879d1 959
e9effafd
MN
960 width = (width )>>c->hChromaSubSample;
961 height = (height)>>c->vChromaSubSample;
911879d1 962
4e1349d4
MN
963 if(mode->chromMode)
964 {
9c9e467d 965 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
b2a3fcb7 966 width, height, QP_store, QPStride, 1, mode, c);
9c9e467d 967 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
b2a3fcb7 968 width, height, QP_store, QPStride, 2, mode, c);
4e1349d4 969 }
9c9e467d 970 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
4e1349d4 971 {
ec487e5d
MN
972 memcpy(dst[1], src[1], srcStride[1]*height);
973 memcpy(dst[2], src[2], srcStride[2]*height);
4e1349d4
MN
974 }
975 else
976 {
977 int y;
ec487e5d 978 for(y=0; y<height; y++)
4e1349d4 979 {
ec487e5d
MN
980 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
981 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
4e1349d4
MN
982 }
983 }
911879d1
MN
984}
985