altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa...
[libav.git] / libavcodec / libpostproc / postprocess.c
CommitLineData
3057fa66 1/*
9858f773 2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3057fa66 3
b0ac780a
MN
4 AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5
3057fa66
A
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19*/
20
b304569a
MN
21/**
22 * @file postprocess.c
23 * postprocessing.
24 */
25
3057fa66 26/*
b0ac780a
MN
27 C MMX MMX2 3DNow AltiVec
28isVertDC Ec Ec Ec
29isVertMinMaxOk Ec Ec Ec
30doVertLowPass E e e Ec
31doVertDefFilter Ec Ec e e Ec
3057fa66 32isHorizDC Ec Ec
4e4dcbc5
MN
33isHorizMinMaxOk a E
34doHorizLowPass E e e
7f16f6e6 35doHorizDefFilter Ec Ec e e
b0ac780a 36deRing E e e* Ecp
3b58b885 37Vertical RKAlgo1 E a a
e5c30e06 38Horizontal RKAlgo1 a a
117e45b0
MN
39Vertical X1# a E E
40Horizontal X1# a E E
acced553
MN
41LinIpolDeinterlace e E E*
42CubicIpolDeinterlace a e e*
43LinBlendDeinterlace e E E*
9b1663fc 44MedianDeinterlace# E Ec Ec
be44a4d7 45TempDeNoiser# E e e
d5a1a995 46
* I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
3057fa66 49E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
3057fa66
A
51a = alternative / approximate impl
52c = checked against the other implementations (-vo md5)
b0ac780a 53p = partially optimized, still some work to do
3057fa66
A
54*/
55
56/*
57TODO:
3057fa66 58reduce the time wasted on the mem transfer
3057fa66 59unroll stuff if instructions depend too much on the prior one
3057fa66 60move YScale thing to the end instead of fixing QP
13e00528 61write a faster and higher quality deblocking filter :)
d5a1a995
MN
62make the mainloop more flexible (variable number of blocks at once
63 (the if/else stuff per block is slowing things down)
9f45d04d 64compare the quality & speed of all filters
9f45d04d 65split this huge file
8405b3fd 66optimize c versions
117e45b0 67try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
3057fa66 68...
13e00528
A
69*/
70
a6be8111 71//Changelog: use the CVS log
3057fa66 72
9858f773 73#include "config.h"
3057fa66
A
74#include <inttypes.h>
75#include <stdio.h>
d5a1a995 76#include <stdlib.h>
911879d1 77#include <string.h>
dda87e9f
PL
78#ifdef HAVE_MALLOC_H
79#include <malloc.h>
80#endif
3057fa66 81//#undef HAVE_MMX2
13e00528 82//#define HAVE_3DNOW
3057fa66 83//#undef HAVE_MMX
cc9b0679 84//#undef ARCH_X86
7f16f6e6 85//#define DEBUG_BRIGHTNESS
bba9b16c 86#ifdef USE_FASTMEMCPY
0a87c409 87#include "fastmemcpy.h"
70d4f2da 88#endif
13e00528 89#include "postprocess.h"
c41d972d 90#include "postprocess_internal.h"
bba9b16c
MN
91
92#include "mangle.h" //FIXME should be supressed
3057fa66 93
ca390e72
ZK
94#ifndef HAVE_MEMALIGN
95#define memalign(a,b) malloc(b)
96#endif
97
/* Small arithmetic helpers.
 * They are plain macros: an argument may be evaluated more than once, so
 * never pass an expression with side effects. */
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))
#define ABS(a) (0 < (a) ? (a) : (-(a)))
/* Note: SIGN(0) is -1, there is no "zero" result. */
#define SIGN(a) (0 < (a) ? 1 : -1)
102
/* Limits of the mode-string parser: size of its scratch buffer and of the
 * per-filter option array. */
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10

/* Edge length of the square blocks the filters work on. */
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

/* attribute_used expands to GCC's "used" attribute on GCC newer than 3.0
 * and to nothing elsewhere. */
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))
#    define attribute_used __attribute__((used))
#else
#    define attribute_used
#endif
114
#ifdef ARCH_X86
/* 8-byte aligned packed constants (wNN: four 16-bit words, bNN: eight bytes).
 * NOTE(review): presumably referenced by name from the MMX inline assembly in
 * the template, which would be why they carry attribute_used -- confirm there. */
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
#endif /* ARCH_X86 */
3057fa66 124
134eb1e5
MN
125
126static uint8_t clip_table[3*256];
127static uint8_t * const clip_tab= clip_table + 256;
128
4df8ca9d 129static const int verbose= 0;
45b4f285 130
3f1d4e96 131static const int attribute_used deringThreshold= 20;
3057fa66 132
9c9e467d 133
911879d1
MN
134static struct PPFilter filters[]=
135{
136 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
137 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
9c9e467d
MN
138/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
139 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
911879d1
MN
140 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
141 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
142 {"dr", "dering", 1, 5, 6, DERING},
143 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
43d52f76
MN
144 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
145 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
146 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
147 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
9c9e467d 148 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
134eb1e5 149 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
117e45b0 150 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
8aaac435 151 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
911879d1
MN
152 {NULL, NULL,0,0,0,0} //End Marker
153};
154
/* Alias table of the mode-string parser, stored as consecutive
 * (alias, expansion) pairs and terminated by a single NULL.
 * An alias found in a mode string is replaced by its expansion, which is
 * then parsed like user input. */
static char *replaceTable[]=
{
    "default",
        "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
    "de",
        "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
    "fast",
        "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
    "fa",
        "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
    NULL //End Marker
};
163
3057fa66 164
#ifdef ARCH_X86
/* Thin wrappers around the x86 prefetch instructions; each one issues a
 * single prefetch for the address p. */

/** Issue "prefetchnta" for p. */
static inline void prefetchnta(void *p)
{
    __asm__ __volatile__( "prefetchnta (%0)\n\t" : : "r" (p) );
}

/** Issue "prefetcht0" for p. */
static inline void prefetcht0(void *p)
{
    __asm__ __volatile__( "prefetcht0 (%0)\n\t" : : "r" (p) );
}

/** Issue "prefetcht1" for p. */
static inline void prefetcht1(void *p)
{
    __asm__ __volatile__( "prefetcht1 (%0)\n\t" : : "r" (p) );
}

/** Issue "prefetcht2" for p. */
static inline void prefetcht2(void *p)
{
    __asm__ __volatile__( "prefetcht2 (%0)\n\t" : : "r" (p) );
}
#endif /* ARCH_X86 */
3057fa66 194
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
3057fa66 196
cf5ec61d
MN
197/**
198 * Check if the given 8x8 Block is mostly "flat"
199 */
b0ac780a 200static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
cf5ec61d
MN
201{
202 int numEq= 0;
203 int y;
0426af31 204 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
ec487e5d 205 const int dcThreshold= dcOffset*2 + 1;
0426af31 206
cf5ec61d
MN
207 for(y=0; y<BLOCK_SIZE; y++)
208 {
9c9e467d
MN
209 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
210 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
212 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
213 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
214 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
215 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
216 src+= stride;
217 }
218 return numEq > c->ppMode.flatnessThreshold;
219}
220
221/**
222 * Check if the middle 8x8 Block in the given 8x16 block is flat
223 */
224static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
225 int numEq= 0;
226 int y;
0426af31 227 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
ec487e5d 228 const int dcThreshold= dcOffset*2 + 1;
0426af31 229
9c9e467d
MN
230 src+= stride*4; // src points to begin of the 8x8 Block
231 for(y=0; y<BLOCK_SIZE-1; y++)
232 {
233 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
239 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
240 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
cf5ec61d
MN
241 src+= stride;
242 }
9c9e467d 243 return numEq > c->ppMode.flatnessThreshold;
cf5ec61d
MN
244}
245
/**
 * Cheap horizontal range test on an 8x8 block.
 * On every row one pair of pixels five columns apart is compared; the pair
 * moves from row to row (0/5, 2/7, 4/1, 6/3, then the pattern repeats).
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a pair may differ by at most 2*QP
 * @return 1 if every sampled pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const int colPair[4][2]= { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for(row=0; row<8; row++, src+= stride)
    {
        const int left = src[colPair[row & 3][0]];
        const int right= src[colPair[row & 3][1]];
        // one unsigned compare tests -2*QP <= left-right <= 2*QP
        if((unsigned)(left - right + 2*QP) > 4*QP) return 0;
    }
    return 1;
}
cf5ec61d 268
cb482d25
MN
269static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
270{
271#if 1
272#if 1
273 int x;
274 src+= stride*4;
275 for(x=0; x<BLOCK_SIZE; x+=4)
276 {
277 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
278 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
279 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
280 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
281 }
282#else
283 int x;
284 src+= stride*3;
285 for(x=0; x<BLOCK_SIZE; x++)
286 {
287 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
288 }
289#endif
290 return 1;
291#else
292 int x;
293 src+= stride*4;
294 for(x=0; x<BLOCK_SIZE; x++)
295 {
296 int min=255;
297 int max=0;
298 int y;
299 for(y=0; y<8; y++){
300 int v= src[x + y*stride];
301 if(v>max) max=v;
302 if(v<min) min=v;
303 }
304 if(max-min > 2*QP) return 0;
305 }
cf5ec61d 306 return 1;
cb482d25
MN
307#endif
308}
309
b0ac780a
MN
310static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
311 if( isHorizDC_C(src, stride, c) ){
312 if( isHorizMinMaxOk_C(src, stride, c->QP) )
313 return 1;
314 else
315 return 0;
316 }else{
317 return 2;
318 }
319}
320
cb482d25
MN
321static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
322 if( isVertDC_C(src, stride, c) ){
323 if( isVertMinMaxOk_C(src, stride, c->QP) )
324 return 1;
325 else
326 return 0;
327 }else{
328 return 2;
329 }
cf5ec61d
MN
330}
331
b0ac780a 332static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
cf5ec61d
MN
333{
334 int y;
335 for(y=0; y<BLOCK_SIZE; y++)
336 {
b0ac780a 337 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
cf5ec61d 338
b0ac780a 339 if(ABS(middleEnergy) < 8*c->QP)
cf5ec61d
MN
340 {
341 const int q=(dst[3] - dst[4])/2;
342 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
343 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
344
345 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
346 d= MAX(d, 0);
347
348 d= (5*d + 32) >> 6;
349 d*= SIGN(-middleEnergy);
350
351 if(q>0)
352 {
353 d= d<0 ? 0 : d;
354 d= d>q ? q : d;
355 }
356 else
357 {
358 d= d>0 ? 0 : d;
359 d= d<q ? q : d;
360 }
361
362 dst[3]-= d;
363 dst[4]+= d;
364 }
365 dst+= stride;
366 }
367}
368
369/**
370 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
371 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
372 */
b0ac780a 373static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
cf5ec61d
MN
374{
375
376 int y;
377 for(y=0; y<BLOCK_SIZE; y++)
378 {
b0ac780a
MN
379 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
380 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
cf5ec61d
MN
381
382 int sums[9];
383 sums[0] = first + dst[0];
384 sums[1] = dst[0] + dst[1];
385 sums[2] = dst[1] + dst[2];
386 sums[3] = dst[2] + dst[3];
387 sums[4] = dst[3] + dst[4];
388 sums[5] = dst[4] + dst[5];
389 sums[6] = dst[5] + dst[6];
390 sums[7] = dst[6] + dst[7];
391 sums[8] = dst[7] + last;
392
393 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
394 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
395 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
396 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
397 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
398 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
399 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
400 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
401
402 dst+= stride;
403 }
404}
405
4e4dcbc5 406/**
cc9b0679
MN
407 * Experimental Filter 1 (Horizontal)
408 * will not damage linear gradients
409 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
410 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
411 * MMX2 version does correct clipping C version doesnt
412 * not identical with the vertical one
4e4dcbc5 413 */
cc9b0679
MN
414static inline void horizX1Filter(uint8_t *src, int stride, int QP)
415{
117e45b0 416 int y;
cc9b0679
MN
417 static uint64_t *lut= NULL;
418 if(lut==NULL)
117e45b0 419 {
cc9b0679
MN
420 int i;
421 lut= (uint64_t*)memalign(8, 256*8);
422 for(i=0; i<256; i++)
117e45b0 423 {
cc9b0679 424 int v= i < 128 ? 2*i : 2*(i-256);
117e45b0 425/*
cc9b0679
MN
426//Simulate 112242211 9-Tap filter
427 uint64_t a= (v/16) & 0xFF;
428 uint64_t b= (v/8) & 0xFF;
429 uint64_t c= (v/4) & 0xFF;
430 uint64_t d= (3*v/8) & 0xFF;
117e45b0 431*/
cc9b0679
MN
432//Simulate piecewise linear interpolation
433 uint64_t a= (v/16) & 0xFF;
434 uint64_t b= (v*3/16) & 0xFF;
435 uint64_t c= (v*5/16) & 0xFF;
436 uint64_t d= (7*v/16) & 0xFF;
437 uint64_t A= (0x100 - a)&0xFF;
438 uint64_t B= (0x100 - b)&0xFF;
439 uint64_t C= (0x100 - c)&0xFF;
440 uint64_t D= (0x100 - c)&0xFF;
441
442 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
443 (D<<24) | (C<<16) | (B<<8) | (A);
444 //lut[i] = (v<<32) | (v<<24);
117e45b0
MN
445 }
446 }
cc9b0679
MN
447
448 for(y=0; y<BLOCK_SIZE; y++)
117e45b0 449 {
cc9b0679
MN
450 int a= src[1] - src[2];
451 int b= src[3] - src[4];
452 int c= src[5] - src[6];
453
454 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
455
456 if(d < QP)
117e45b0 457 {
cc9b0679
MN
458 int v = d * SIGN(-b);
459
460 src[1] +=v/8;
461 src[2] +=v/4;
462 src[3] +=3*v/8;
463 src[4] -=3*v/8;
464 src[5] -=v/4;
465 src[6] -=v/8;
466
117e45b0 467 }
cc9b0679 468 src+=stride;
117e45b0 469 }
cc9b0679
MN
470}
471
472
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one

/* Decide which variants of the template get compiled (COMPILE_xyz). */

//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

//AltiVec versions
#if defined (ARCH_POWERPC) && defined (HAVE_ALTIVEC)
#define COMPILE_ALTIVEC
#ifndef CONFIG_DARWIN
#warning "################################################################################"
#warning "WARNING: No gcc available as of today (2004-05-25) seems to be able to compile properly some of the code under non-Darwin PPC OSes. Some functions result in wrong results, while others simply won't compile (gcc explodes after allocating 1GiB+)."
#warning "################################################################################"
#endif //CONFIG_DARWIN
#endif //ARCH_POWERPC && HAVE_ALTIVEC

//x86 versions: everything with RUNTIME_CPUDETECT, else only the best one
#ifdef ARCH_X86

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif

#endif //ARCH_X86
/* Instantiate the template once per selected variant. Before every include
 * the feature macros are set to describe that variant and RENAME() is set to
 * append the matching suffix to the template's function names. */

#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC
#undef ARCH_X86

//Plain C versions (all feature macros are already undefined at this point)
#ifdef COMPILE_C
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

//AltiVec versions
#if defined (ARCH_POWERPC) && defined (COMPILE_ALTIVEC)
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif //ARCH_POWERPC && COMPILE_ALTIVEC

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// minor note: the HAVE_xyz macros only describe the variant that was
// instantiated last from here on, so don't rely on them for anything else
564
565static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
c41d972d 566 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
cc9b0679 567{
9c9e467d 568 PPContext *c= (PPContext *)vc;
c41d972d 569 PPMode *ppMode= (PPMode *)vm;
9c9e467d
MN
570 c->ppMode= *ppMode; //FIXME
571
cc9b0679
MN
572 // useing ifs here as they are faster than function pointers allthough the
573 // difference wouldnt be messureable here but its much better because
574 // someone might exchange the cpu whithout restarting mplayer ;)
e89952aa 575#ifdef RUNTIME_CPUDETECT
9c9e467d 576#ifdef ARCH_X86
cc9b0679 577 // ordered per speed fasterst first
fa6ea14e 578 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
9c9e467d 579 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
fa6ea14e 580 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
9c9e467d 581 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
fa6ea14e 582 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
9c9e467d 583 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
cc9b0679 584 else
9c9e467d 585 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
cc9b0679 586#else
b0ac780a
MN
587#ifdef ARCH_POWERPC
588#ifdef HAVE_ALTIVEC
589 else if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
590 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
591 else
592#endif
593#endif
9c9e467d 594 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
be44a4d7 595#endif
e89952aa
MN
596#else //RUNTIME_CPUDETECT
597#ifdef HAVE_MMX2
9c9e467d 598 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 599#elif defined (HAVE_3DNOW)
9c9e467d 600 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 601#elif defined (HAVE_MMX)
9c9e467d 602 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
b0ac780a
MN
603#elif defined (HAVE_ALTIVEC)
604 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa 605#else
9c9e467d 606 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
e89952aa
MN
607#endif
608#endif //!RUNTIME_CPUDETECT
117e45b0
MN
609}
610
cc9b0679
MN
611//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
612// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
13e00528 613
/* -pp Command line Help
 * Keep this text in sync with filters[] and replaceTable[]: filter names are
 * matched case-sensitively, so they must be spelled here exactly as there.
 */
char *pp_help=
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a,al,tn:a:150:200:400\n"
"fa fast h1:a,v1:a,dr:a,al,tn:a:150:200:400\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
;
911879d1 651
c41d972d 652pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
911879d1
MN
653{
654 char temp[GET_MODE_BUFFER_SIZE];
655 char *p= temp;
9c9e467d 656 char *filterDelimiters= ",/";
911879d1 657 char *optionDelimiters= ":";
c41d972d 658 struct PPMode *ppMode;
911879d1
MN
659 char *filterToken;
660
c41d972d
MN
661 ppMode= memalign(8, sizeof(PPMode));
662
663 ppMode->lumMode= 0;
664 ppMode->chromMode= 0;
665 ppMode->maxTmpNoise[0]= 700;
666 ppMode->maxTmpNoise[1]= 1500;
667 ppMode->maxTmpNoise[2]= 3000;
668 ppMode->maxAllowedY= 234;
669 ppMode->minAllowedY= 16;
68bf295e
MN
670 ppMode->baseDcDiff= 256/8;
671 ppMode->flatnessThreshold= 56-16-1;
c41d972d
MN
672 ppMode->maxClippedThreshold= 0.01;
673 ppMode->error=0;
df8d4d0e 674
911879d1
MN
675 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
676
162c9c2e 677 if(verbose>1) printf("pp: %s\n", name);
117e45b0 678
911879d1 679 for(;;){
911879d1 680 char *filterName;
326d40af 681 int q= 1000000; //PP_QUALITY_MAX;
911879d1
MN
682 int chrom=-1;
683 char *option;
684 char *options[OPTIONS_ARRAY_SIZE];
685 int i;
686 int filterNameOk=0;
687 int numOfUnknownOptions=0;
688 int enable=1; //does the user want us to enabled or disabled the filter
689
690 filterToken= strtok(p, filterDelimiters);
691 if(filterToken == NULL) break;
117e45b0 692 p+= strlen(filterToken) + 1; // p points to next filterToken
911879d1 693 filterName= strtok(filterToken, optionDelimiters);
162c9c2e 694 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
911879d1
MN
695
696 if(*filterName == '-')
697 {
698 enable=0;
699 filterName++;
700 }
117e45b0 701
911879d1
MN
702 for(;;){ //for all options
703 option= strtok(NULL, optionDelimiters);
704 if(option == NULL) break;
705
162c9c2e 706 if(verbose>1) printf("pp: option: %s\n", option);
911879d1
MN
707 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
708 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
709 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
710 else
711 {
712 options[numOfUnknownOptions] = option;
713 numOfUnknownOptions++;
911879d1
MN
714 }
715 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
716 }
117e45b0 717 options[numOfUnknownOptions] = NULL;
911879d1
MN
718
719 /* replace stuff from the replace Table */
720 for(i=0; replaceTable[2*i]!=NULL; i++)
721 {
722 if(!strcmp(replaceTable[2*i], filterName))
723 {
724 int newlen= strlen(replaceTable[2*i + 1]);
725 int plen;
726 int spaceLeft;
727
728 if(p==NULL) p= temp, *p=0; //last filter
729 else p--, *p=','; //not last filter
730
731 plen= strlen(p);
8cd91a44 732 spaceLeft= p - temp + plen;
911879d1
MN
733 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
734 {
c41d972d 735 ppMode->error++;
911879d1
MN
736 break;
737 }
738 memmove(p + newlen, p, plen+1);
739 memcpy(p, replaceTable[2*i + 1], newlen);
740 filterNameOk=1;
741 }
742 }
743
744 for(i=0; filters[i].shortName!=NULL; i++)
745 {
117e45b0 746// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
911879d1
MN
747 if( !strcmp(filters[i].longName, filterName)
748 || !strcmp(filters[i].shortName, filterName))
749 {
c41d972d
MN
750 ppMode->lumMode &= ~filters[i].mask;
751 ppMode->chromMode &= ~filters[i].mask;
911879d1
MN
752
753 filterNameOk=1;
754 if(!enable) break; // user wants to disable it
755
756 if(q >= filters[i].minLumQuality)
c41d972d 757 ppMode->lumMode|= filters[i].mask;
911879d1
MN
758 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
759 if(q >= filters[i].minChromQuality)
c41d972d 760 ppMode->chromMode|= filters[i].mask;
911879d1
MN
761
762 if(filters[i].mask == LEVEL_FIX)
763 {
764 int o;
c41d972d
MN
765 ppMode->minAllowedY= 16;
766 ppMode->maxAllowedY= 234;
911879d1 767 for(o=0; options[o]!=NULL; o++)
07f8991b 768 {
911879d1
MN
769 if( !strcmp(options[o],"fullyrange")
770 ||!strcmp(options[o],"f"))
771 {
c41d972d
MN
772 ppMode->minAllowedY= 0;
773 ppMode->maxAllowedY= 255;
911879d1
MN
774 numOfUnknownOptions--;
775 }
07f8991b 776 }
911879d1 777 }
117e45b0
MN
778 else if(filters[i].mask == TEMP_NOISE_FILTER)
779 {
780 int o;
781 int numOfNoises=0;
117e45b0
MN
782
783 for(o=0; options[o]!=NULL; o++)
784 {
785 char *tail;
c41d972d 786 ppMode->maxTmpNoise[numOfNoises]=
117e45b0
MN
787 strtol(options[o], &tail, 0);
788 if(tail!=options[o])
789 {
790 numOfNoises++;
791 numOfUnknownOptions--;
792 if(numOfNoises >= 3) break;
793 }
794 }
795 }
43d52f76
MN
796 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
797 {
798 int o;
799
800 for(o=0; options[o]!=NULL && o<2; o++)
801 {
802 char *tail;
803 int val= strtol(options[o], &tail, 0);
804 if(tail==options[o]) break;
805
806 numOfUnknownOptions--;
c41d972d
MN
807 if(o==0) ppMode->baseDcDiff= val;
808 else ppMode->flatnessThreshold= val;
43d52f76
MN
809 }
810 }
8aaac435
MN
811 else if(filters[i].mask == FORCE_QUANT)
812 {
813 int o;
c41d972d 814 ppMode->forcedQuant= 15;
8aaac435
MN
815
816 for(o=0; options[o]!=NULL && o<1; o++)
817 {
818 char *tail;
819 int val= strtol(options[o], &tail, 0);
820 if(tail==options[o]) break;
821
822 numOfUnknownOptions--;
c41d972d 823 ppMode->forcedQuant= val;
8aaac435
MN
824 }
825 }
911879d1
MN
826 }
827 }
c41d972d
MN
828 if(!filterNameOk) ppMode->error++;
829 ppMode->error += numOfUnknownOptions;
911879d1
MN
830 }
831
c41d972d
MN
832 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
833 if(ppMode->error)
834 {
835 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
836 free(ppMode);
837 return NULL;
838 }
911879d1
MN
839 return ppMode;
840}
841
c41d972d
MN
842void pp_free_mode(pp_mode_t *mode){
843 if(mode) free(mode);
844}
845
/**
 * Replace the buffer *p by a new, zero filled one.
 * The old contents are not preserved.
 * @param p         address of the buffer pointer; the old buffer (may be NULL)
 *                  is freed
 * @param alignment alignment requested from memalign()
 * @param size      size of the new buffer in bytes
 * On return *p is the new buffer, or NULL if size is negative or the
 * allocation failed.
 */
static void reallocAlign(void **p, int alignment, int size){
    free(*p);
    *p= NULL;

    if(size < 0) return; // would turn into a huge request

    *p= memalign(alignment, size);
    if(*p) memset(*p, 0, size);
}
851
0426af31 852static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
ec487e5d
MN
853 int mbWidth = (width+15)>>4;
854 int mbHeight= (height+15)>>4;
88c0bc7e
MN
855 int i;
856
857 c->stride= stride;
0426af31 858 c->qpStride= qpStride;
9c9e467d 859
88c0bc7e
MN
860 reallocAlign((void **)&c->tempDst, 8, stride*24);
861 reallocAlign((void **)&c->tempSrc, 8, stride*24);
862 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
863 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
9c9e467d
MN
864 for(i=0; i<256; i++)
865 c->yHistogram[i]= width*height/64*15/256;
866
867 for(i=0; i<3; i++)
211c4920 868 {
9c9e467d 869 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
88c0bc7e
MN
870 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
871 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
211c4920 872 }
45b4f285 873
134eb1e5 874 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
0426af31
MN
875 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
876 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
88c0bc7e
MN
877 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
878}
879
4cfbf61b 880static void global_init(void){
134eb1e5
MN
881 int i;
882 memset(clip_table, 0, 256);
883 for(i=256; i<512; i++)
884 clip_table[i]= i;
885 memset(clip_table+512, 0, 256);
886}
887
88c0bc7e
MN
888pp_context_t *pp_get_context(int width, int height, int cpuCaps){
889 PPContext *c= memalign(32, sizeof(PPContext));
88c0bc7e 890 int stride= (width+15)&(~15); //assumed / will realloc if needed
0426af31 891 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
88c0bc7e 892
134eb1e5
MN
893 global_init();
894
88c0bc7e
MN
895 memset(c, 0, sizeof(PPContext));
896 c->cpuCaps= cpuCaps;
e9effafd
MN
897 if(cpuCaps&PP_FORMAT){
898 c->hChromaSubSample= cpuCaps&0x3;
899 c->vChromaSubSample= (cpuCaps>>4)&0x3;
900 }else{
901 c->hChromaSubSample= 1;
902 c->vChromaSubSample= 1;
903 }
88c0bc7e 904
0426af31 905 reallocBuffers(c, width, height, stride, qpStride);
88c0bc7e 906
9c9e467d 907 c->frameNum=-1;
45b4f285 908
9c9e467d 909 return c;
45b4f285
MN
910}
911
9cb54f43 912void pp_free_context(void *vc){
9c9e467d
MN
913 PPContext *c = (PPContext*)vc;
914 int i;
915
916 for(i=0; i<3; i++) free(c->tempBlured[i]);
917 for(i=0; i<3; i++) free(c->tempBluredPast[i]);
918
919 free(c->tempBlocks);
920 free(c->yHistogram);
921 free(c->tempDst);
922 free(c->tempSrc);
9c9e467d 923 free(c->deintTemp);
0426af31 924 free(c->stdQPTable);
ec487e5d 925 free(c->nonBQPTable);
88c0bc7e
MN
926 free(c->forcedQPTable);
927
928 memset(c, 0, sizeof(PPContext));
929
9c9e467d
MN
930 free(c);
931}
932
9cb54f43 933void pp_postprocess(uint8_t * src[3], int srcStride[3],
9c9e467d 934 uint8_t * dst[3], int dstStride[3],
ec487e5d 935 int width, int height,
9c9e467d 936 QP_STORE_T *QP_store, int QPStride,
c41d972d 937 pp_mode_t *vm, void *vc, int pict_type)
911879d1 938{
ec487e5d
MN
939 int mbWidth = (width+15)>>4;
940 int mbHeight= (height+15)>>4;
c41d972d 941 PPMode *mode = (PPMode*)vm;
ec487e5d 942 PPContext *c = (PPContext*)vc;
88c0bc7e 943 int minStride= MAX(srcStride[0], dstStride[0]);
0426af31
MN
944
945 if(c->stride < minStride || c->qpStride < QPStride)
946 reallocBuffers(c, width, height,
947 MAX(minStride, c->stride),
948 MAX(c->qpStride, QPStride));
9c9e467d 949
8aaac435 950 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
815cbfe7 951 {
8aaac435 952 int i;
88c0bc7e 953 QP_store= c->forcedQPTable;
9c9e467d 954 QPStride= 0;
8aaac435 955 if(mode->lumMode & FORCE_QUANT)
88c0bc7e 956 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
8aaac435 957 else
88c0bc7e 958 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
815cbfe7 959 }
0426af31
MN
960//printf("pict_type:%d\n", pict_type);
961
962 if(pict_type & PP_PICT_TYPE_QP2){
963 int i;
964 const int count= mbHeight * QPStride;
965 for(i=0; i<(count>>2); i++){
966 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
967 }
968 for(i<<=2; i<count; i++){
969 c->stdQPTable[i] = QP_store[i]>>1;
970 }
971 QP_store= c->stdQPTable;
972 }
973
ec487e5d
MN
974if(0){
975int x,y;
976for(y=0; y<mbHeight; y++){
977 for(x=0; x<mbWidth; x++){
978 printf("%2d ", QP_store[x + y*QPStride]);
979 }
980 printf("\n");
981}
982 printf("\n");
983}
51e19dcc 984
0426af31 985 if((pict_type&7)!=3)
ec487e5d 986 {
0426af31
MN
987 int i;
988 const int count= mbHeight * QPStride;
989 for(i=0; i<(count>>2); i++){
2e90b37c 990 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
0426af31
MN
991 }
992 for(i<<=2; i<count; i++){
2e90b37c 993 c->nonBQPTable[i] = QP_store[i] & 0x3F;
ec487e5d
MN
994 }
995 }
815cbfe7 996
df8d4d0e 997 if(verbose>2)
162c9c2e
MN
998 {
999 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
162c9c2e
MN
1000 }
1001
9c9e467d 1002 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
b2a3fcb7 1003 width, height, QP_store, QPStride, 0, mode, c);
911879d1 1004
e9effafd
MN
1005 width = (width )>>c->hChromaSubSample;
1006 height = (height)>>c->vChromaSubSample;
911879d1 1007
4e1349d4
MN
1008 if(mode->chromMode)
1009 {
9c9e467d 1010 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
b2a3fcb7 1011 width, height, QP_store, QPStride, 1, mode, c);
9c9e467d 1012 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
b2a3fcb7 1013 width, height, QP_store, QPStride, 2, mode, c);
4e1349d4 1014 }
9c9e467d 1015 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
4e1349d4 1016 {
ec487e5d
MN
1017 memcpy(dst[1], src[1], srcStride[1]*height);
1018 memcpy(dst[2], src[2], srcStride[2]*height);
4e1349d4
MN
1019 }
1020 else
1021 {
1022 int y;
ec487e5d 1023 for(y=0; y<height; y++)
4e1349d4 1024 {
ec487e5d
MN
1025 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1026 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
4e1349d4
MN
1027 }
1028 }
911879d1
MN
1029}
1030