8e934deb1cf634bc039e8b5caac8f50fa0f2c11b
[libav.git] / libpostproc / postprocess.c
1 /*
2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3
4 AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 /**
22 * @file postprocess.c
23 * postprocessing.
24 */
25
26 /*
27 C MMX MMX2 3DNow AltiVec
28 isVertDC Ec Ec Ec
29 isVertMinMaxOk Ec Ec Ec
30 doVertLowPass E e e Ec
31 doVertDefFilter Ec Ec e e Ec
32 isHorizDC Ec Ec Ec
33 isHorizMinMaxOk a E Ec
34 doHorizLowPass E e e Ec
35 doHorizDefFilter Ec Ec e e Ec
36 do_a_deblock Ec E Ec E
37 deRing E e e* Ecp
38 Vertical RKAlgo1 E a a
39 Horizontal RKAlgo1 a a
40 Vertical X1# a E E
41 Horizontal X1# a E E
42 LinIpolDeinterlace e E E*
43 CubicIpolDeinterlace a e e*
44 LinBlendDeinterlace e E E*
45 MedianDeinterlace# E Ec Ec
46 TempDeNoiser# E e e Ec
47
48 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
49 # more or less self-invented filters, so the exactness isn't too meaningful
50 E = Exact implementation
51 e = almost exact implementation (slightly different rounding,...)
52 a = alternative / approximate impl
53 c = checked against the other implementations (-vo md5)
54 p = partially optimized, still some work to do
55 */
56
57 /*
58 TODO:
59 reduce the time wasted on the mem transfer
60 unroll stuff if instructions depend too much on the prior one
61 move YScale thing to the end instead of fixing QP
62 write a faster and higher quality deblocking filter :)
63 make the mainloop more flexible (variable number of blocks at once
64 (the if/else stuff per block is slowing things down)
65 compare the quality & speed of all filters
66 split this huge file
67 optimize c versions
68 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
69 ...
70 */
71
72 //Changelog: use the Subversion log
73
74 #include "config.h"
75 #include "avutil.h"
76 #include <inttypes.h>
77 #include <stdio.h>
78 #include <stdlib.h>
79 #include <string.h>
80 #ifdef HAVE_MALLOC_H
81 #include <malloc.h>
82 #endif
83 //#undef HAVE_MMX2
84 //#define HAVE_3DNOW
85 //#undef HAVE_MMX
86 //#undef ARCH_X86
87 //#define DEBUG_BRIGHTNESS
88 #ifdef USE_FASTMEMCPY
89 #include "libvo/fastmemcpy.h"
90 #endif
91 #include "postprocess.h"
92 #include "postprocess_internal.h"
93
94 #include "mangle.h" //FIXME should be supressed
95
96 #ifdef HAVE_ALTIVEC_H
97 #include <altivec.h>
98 #endif
99
100 #define MIN(a,b) ((a) > (b) ? (b) : (a))
101 #define MAX(a,b) ((a) < (b) ? (b) : (a))
102 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
103 #define SIGN(a) ((a) > 0 ? 1 : -1)
104
105 #define GET_MODE_BUFFER_SIZE 500
106 #define OPTIONS_ARRAY_SIZE 10
107 #define BLOCK_SIZE 8
108 #define TEMP_STRIDE 8
109 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
110
111 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
112 # define attribute_used __attribute__((used))
113 # define always_inline __attribute__((always_inline)) inline
114 #else
115 # define attribute_used
116 # define always_inline inline
117 #endif
118
119 #if defined(ARCH_X86) || defined(ARCH_X86_64)
120 static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
121 static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
122 static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
123 static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
124 static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
125 static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
126 static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
127 static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
128 #endif
129
130 static uint8_t clip_table[3*256];
131 static uint8_t * const clip_tab= clip_table + 256;
132
133 static const int verbose= 0;
134
135 static const int attribute_used deringThreshold= 20;
136
137
138 static struct PPFilter filters[]=
139 {
140 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
141 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
142 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
143 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
144 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
145 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
146 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
147 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
148 {"dr", "dering", 1, 5, 6, DERING},
149 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
150 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
151 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
152 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
153 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
154 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
155 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
156 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
157 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
158 {NULL, NULL,0,0,0,0} //End Marker
159 };
160
/**
 * Alias expansion table used by the mode-string parser.
 * Entries come in pairs: [2*i] is the alias, [2*i + 1] is the filter list
 * it is replaced with. A NULL alias ends the table.
 */
static const char *replaceTable[]=
{
    /* alias */  /* expansion */
    "default",   "hdeblock:a,vdeblock:a,dering:a",
    "de",        "hdeblock:a,vdeblock:a,dering:a",
    "fast",      "x1hdeblock:a,x1vdeblock:a,dering:a",
    "fa",        "x1hdeblock:a,x1vdeblock:a,dering:a",
    "ac",        "ha:a:128:7,va:a,dering:a",
    NULL /* end marker */
};
170
171
172 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/**
 * Execute the x86 `prefetchnta` instruction on the address p.
 * Only p itself is passed to the instruction; nothing is read back.
 */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(
        "prefetchnta (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
179
/**
 * Execute the x86 `prefetcht0` instruction on the address p.
 * Only p itself is passed to the instruction; nothing is read back.
 */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(
        "prefetcht0 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
186
/**
 * Execute the x86 `prefetcht1` instruction on the address p.
 * Only p itself is passed to the instruction; nothing is read back.
 */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(
        "prefetcht1 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
193
/**
 * Execute the x86 `prefetcht2` instruction on the address p.
 * Only p itself is passed to the instruction; nothing is read back.
 */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(
        "prefetcht2 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
200 #endif
201
202 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
203
204 /**
205 * Check if the given 8x8 Block is mostly "flat"
206 */
207 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
208 {
209 int numEq= 0;
210 int y;
211 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
212 const int dcThreshold= dcOffset*2 + 1;
213
214 for(y=0; y<BLOCK_SIZE; y++)
215 {
216 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
217 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
218 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
219 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
220 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
221 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
222 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
223 src+= stride;
224 }
225 return numEq > c->ppMode.flatnessThreshold;
226 }
227
228 /**
229 * Check if the middle 8x8 Block in the given 8x16 block is flat
230 */
231 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
232 int numEq= 0;
233 int y;
234 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
235 const int dcThreshold= dcOffset*2 + 1;
236
237 src+= stride*4; // src points to begin of the 8x8 Block
238 for(y=0; y<BLOCK_SIZE-1; y++)
239 {
240 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
241 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
242 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
243 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
244 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
245 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
246 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
247 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
248 src+= stride;
249 }
250 return numEq > c->ppMode.flatnessThreshold;
251 }
252
/**
 * Cheap horizontal range check on an 8-row block.
 *
 * One pair of samples is tested per row (the column pairs cycle through
 * (0,5), (2,7), (4,1), (6,3)); the check fails as soon as a pair differs by
 * more than 2*QP in either direction.
 *
 * @param src    first sample of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer used to scale the allowed difference
 * @return 1 if all tested pairs are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    /* {first column, second column} tested in rows 0..3 and again in rows 4..7 */
    static const uint8_t colPairs[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++, src += stride) {
        const uint8_t *pair = colPairs[row & 3];

        /* the unsigned compare tests -2*QP <= diff <= 2*QP in one step */
        if ((unsigned)(src[pair[0]] - src[pair[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
275
276 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
277 {
278 #if 1
279 #if 1
280 int x;
281 src+= stride*4;
282 for(x=0; x<BLOCK_SIZE; x+=4)
283 {
284 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
285 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
286 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
287 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
288 }
289 #else
290 int x;
291 src+= stride*3;
292 for(x=0; x<BLOCK_SIZE; x++)
293 {
294 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
295 }
296 #endif
297 return 1;
298 #else
299 int x;
300 src+= stride*4;
301 for(x=0; x<BLOCK_SIZE; x++)
302 {
303 int min=255;
304 int max=0;
305 int y;
306 for(y=0; y<8; y++){
307 int v= src[x + y*stride];
308 if(v>max) max=v;
309 if(v<min) min=v;
310 }
311 if(max-min > 2*QP) return 0;
312 }
313 return 1;
314 #endif
315 }
316
317 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
318 if( isHorizDC_C(src, stride, c) ){
319 if( isHorizMinMaxOk_C(src, stride, c->QP) )
320 return 1;
321 else
322 return 0;
323 }else{
324 return 2;
325 }
326 }
327
328 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
329 if( isVertDC_C(src, stride, c) ){
330 if( isVertMinMaxOk_C(src, stride, c->QP) )
331 return 1;
332 else
333 return 0;
334 }else{
335 return 2;
336 }
337 }
338
339 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
340 {
341 int y;
342 for(y=0; y<BLOCK_SIZE; y++)
343 {
344 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
345
346 if(ABS(middleEnergy) < 8*c->QP)
347 {
348 const int q=(dst[3] - dst[4])/2;
349 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
350 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
351
352 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
353 d= MAX(d, 0);
354
355 d= (5*d + 32) >> 6;
356 d*= SIGN(-middleEnergy);
357
358 if(q>0)
359 {
360 d= d<0 ? 0 : d;
361 d= d>q ? q : d;
362 }
363 else
364 {
365 d= d>0 ? 0 : d;
366 d= d<q ? q : d;
367 }
368
369 dst[3]-= d;
370 dst[4]+= d;
371 }
372 dst+= stride;
373 }
374 }
375
376 /**
377 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
378 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
379 */
380 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
381 {
382 int y;
383 for(y=0; y<BLOCK_SIZE; y++)
384 {
385 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
386 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
387
388 int sums[10];
389 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
390 sums[1] = sums[0] - first + dst[3];
391 sums[2] = sums[1] - first + dst[4];
392 sums[3] = sums[2] - first + dst[5];
393 sums[4] = sums[3] - first + dst[6];
394 sums[5] = sums[4] - dst[0] + dst[7];
395 sums[6] = sums[5] - dst[1] + last;
396 sums[7] = sums[6] - dst[2] + last;
397 sums[8] = sums[7] - dst[3] + last;
398 sums[9] = sums[8] - dst[4] + last;
399
400 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
401 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
402 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
403 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
404 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
405 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
406 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
407 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
408
409 dst+= stride;
410 }
411 }
412
413 /**
414 * Experimental Filter 1 (Horizontal)
415 * will not damage linear gradients
416 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
417 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
418 * MMX2 version does correct clipping C version doesnt
419 * not identical with the vertical one
420 */
421 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
422 {
423 int y;
424 static uint64_t *lut= NULL;
425 if(lut==NULL)
426 {
427 int i;
428 lut = av_malloc(256*8);
429 for(i=0; i<256; i++)
430 {
431 int v= i < 128 ? 2*i : 2*(i-256);
432 /*
433 //Simulate 112242211 9-Tap filter
434 uint64_t a= (v/16) & 0xFF;
435 uint64_t b= (v/8) & 0xFF;
436 uint64_t c= (v/4) & 0xFF;
437 uint64_t d= (3*v/8) & 0xFF;
438 */
439 //Simulate piecewise linear interpolation
440 uint64_t a= (v/16) & 0xFF;
441 uint64_t b= (v*3/16) & 0xFF;
442 uint64_t c= (v*5/16) & 0xFF;
443 uint64_t d= (7*v/16) & 0xFF;
444 uint64_t A= (0x100 - a)&0xFF;
445 uint64_t B= (0x100 - b)&0xFF;
446 uint64_t C= (0x100 - c)&0xFF;
447 uint64_t D= (0x100 - c)&0xFF;
448
449 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
450 (D<<24) | (C<<16) | (B<<8) | (A);
451 //lut[i] = (v<<32) | (v<<24);
452 }
453 }
454
455 for(y=0; y<BLOCK_SIZE; y++)
456 {
457 int a= src[1] - src[2];
458 int b= src[3] - src[4];
459 int c= src[5] - src[6];
460
461 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
462
463 if(d < QP)
464 {
465 int v = d * SIGN(-b);
466
467 src[1] +=v/8;
468 src[2] +=v/4;
469 src[3] +=3*v/8;
470 src[4] -=3*v/8;
471 src[5] -=v/4;
472 src[6] -=v/8;
473
474 }
475 src+=stride;
476 }
477 }
478
479 /**
480 * accurate deblock filter
481 */
482 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
483 int y;
484 const int QP= c->QP;
485 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
486 const int dcThreshold= dcOffset*2 + 1;
487 //START_TIMER
488 src+= step*4; // src points to begin of the 8x8 Block
489 for(y=0; y<8; y++){
490 int numEq= 0;
491
492 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
493 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
494 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
495 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
496 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
497 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
498 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
499 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
500 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
501 if(numEq > c->ppMode.flatnessThreshold){
502 int min, max, x;
503
504 if(src[0] > src[step]){
505 max= src[0];
506 min= src[step];
507 }else{
508 max= src[step];
509 min= src[0];
510 }
511 for(x=2; x<8; x+=2){
512 if(src[x*step] > src[(x+1)*step]){
513 if(src[x *step] > max) max= src[ x *step];
514 if(src[(x+1)*step] < min) min= src[(x+1)*step];
515 }else{
516 if(src[(x+1)*step] > max) max= src[(x+1)*step];
517 if(src[ x *step] < min) min= src[ x *step];
518 }
519 }
520 if(max-min < 2*QP){
521 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
522 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
523
524 int sums[10];
525 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
526 sums[1] = sums[0] - first + src[3*step];
527 sums[2] = sums[1] - first + src[4*step];
528 sums[3] = sums[2] - first + src[5*step];
529 sums[4] = sums[3] - first + src[6*step];
530 sums[5] = sums[4] - src[0*step] + src[7*step];
531 sums[6] = sums[5] - src[1*step] + last;
532 sums[7] = sums[6] - src[2*step] + last;
533 sums[8] = sums[7] - src[3*step] + last;
534 sums[9] = sums[8] - src[4*step] + last;
535
536 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
537 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
538 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
539 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
540 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
541 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
542 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
543 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
544 }
545 }else{
546 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
547
548 if(ABS(middleEnergy) < 8*QP)
549 {
550 const int q=(src[3*step] - src[4*step])/2;
551 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
552 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
553
554 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
555 d= MAX(d, 0);
556
557 d= (5*d + 32) >> 6;
558 d*= SIGN(-middleEnergy);
559
560 if(q>0)
561 {
562 d= d<0 ? 0 : d;
563 d= d>q ? q : d;
564 }
565 else
566 {
567 d= d>0 ? 0 : d;
568 d= d<q ? q : d;
569 }
570
571 src[3*step]-= d;
572 src[4*step]+= d;
573 }
574 }
575
576 src += stride;
577 }
578 /*if(step==16){
579 STOP_TIMER("step16")
580 }else{
581 STOP_TIMER("stepX")
582 }*/
583 }
584
585 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
586 //Plain C versions
587 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
588 #define COMPILE_C
589 #endif
590
591 #ifdef ARCH_POWERPC
592 #ifdef HAVE_ALTIVEC
593 #define COMPILE_ALTIVEC
594 #endif //HAVE_ALTIVEC
595 #endif //ARCH_POWERPC
596
597 #if defined(ARCH_X86) || defined(ARCH_X86_64)
598
599 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
600 #define COMPILE_MMX
601 #endif
602
603 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
604 #define COMPILE_MMX2
605 #endif
606
607 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
608 #define COMPILE_3DNOW
609 #endif
610 #endif //ARCH_X86
611
612 #undef HAVE_MMX
613 #undef HAVE_MMX2
614 #undef HAVE_3DNOW
615 #undef HAVE_ALTIVEC
616
617 #ifdef COMPILE_C
618 #undef HAVE_MMX
619 #undef HAVE_MMX2
620 #undef HAVE_3DNOW
621 #define RENAME(a) a ## _C
622 #include "postprocess_template.c"
623 #endif
624
625 #ifdef ARCH_POWERPC
626 #ifdef COMPILE_ALTIVEC
627 #undef RENAME
628 #define HAVE_ALTIVEC
629 #define RENAME(a) a ## _altivec
630 #include "postprocess_altivec_template.c"
631 #include "postprocess_template.c"
632 #endif
633 #endif //ARCH_POWERPC
634
635 //MMX versions
636 #ifdef COMPILE_MMX
637 #undef RENAME
638 #define HAVE_MMX
639 #undef HAVE_MMX2
640 #undef HAVE_3DNOW
641 #define RENAME(a) a ## _MMX
642 #include "postprocess_template.c"
643 #endif
644
645 //MMX2 versions
646 #ifdef COMPILE_MMX2
647 #undef RENAME
648 #define HAVE_MMX
649 #define HAVE_MMX2
650 #undef HAVE_3DNOW
651 #define RENAME(a) a ## _MMX2
652 #include "postprocess_template.c"
653 #endif
654
655 //3DNOW versions
656 #ifdef COMPILE_3DNOW
657 #undef RENAME
658 #define HAVE_MMX
659 #undef HAVE_MMX2
660 #define HAVE_3DNOW
661 #define RENAME(a) a ## _3DNow
662 #include "postprocess_template.c"
663 #endif
664
665 // minor note: the HAVE_xyz is messed up after that line so dont use it
666
667 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
668 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
669 {
670 PPContext *c= (PPContext *)vc;
671 PPMode *ppMode= (PPMode *)vm;
672 c->ppMode= *ppMode; //FIXME
673
674 // useing ifs here as they are faster than function pointers allthough the
675 // difference wouldnt be messureable here but its much better because
676 // someone might exchange the cpu whithout restarting mplayer ;)
677 #ifdef RUNTIME_CPUDETECT
678 #if defined(ARCH_X86) || defined(ARCH_X86_64)
679 // ordered per speed fasterst first
680 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
681 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
683 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
684 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
685 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
686 else
687 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
688 #else
689 #ifdef ARCH_POWERPC
690 #ifdef HAVE_ALTIVEC
691 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
692 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
693 else
694 #endif
695 #endif
696 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
697 #endif
698 #else //RUNTIME_CPUDETECT
699 #ifdef HAVE_MMX2
700 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
701 #elif defined (HAVE_3DNOW)
702 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
703 #elif defined (HAVE_MMX)
704 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
705 #elif defined (HAVE_ALTIVEC)
706 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
707 #else
708 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
709 #endif
710 #endif //!RUNTIME_CPUDETECT
711 }
712
713 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
714 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
715
716 /* -pp Command line Help
717 */
718 char *pp_help=
719 "Available postprocessing filters:\n"
720 "Filters Options\n"
721 "short long name short long option Description\n"
722 "* * a autoq CPU power dependent enabler\n"
723 " c chrom chrominance filtering enabled\n"
724 " y nochrom chrominance filtering disabled\n"
725 " n noluma luma filtering disabled\n"
726 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
727 " 1. difference factor: default=32, higher -> more deblocking\n"
728 " 2. flatness threshold: default=39, lower -> more deblocking\n"
729 " the h & v deblocking filters share these\n"
730 " so you can't set different thresholds for h / v\n"
731 "vb vdeblock (2 threshold) vertical deblocking filter\n"
732 "ha hadeblock (2 threshold) horizontal deblocking filter\n"
733 "va vadeblock (2 threshold) vertical deblocking filter\n"
734 "h1 x1hdeblock experimental h deblock filter 1\n"
735 "v1 x1vdeblock experimental v deblock filter 1\n"
736 "dr dering deringing filter\n"
737 "al autolevels automatic brightness / contrast\n"
738 " f fullyrange stretch luminance to (0..255)\n"
739 "lb linblenddeint linear blend deinterlacer\n"
740 "li linipoldeint linear interpolating deinterlace\n"
741 "ci cubicipoldeint cubic interpolating deinterlacer\n"
742 "md mediandeint median deinterlacer\n"
743 "fd ffmpegdeint ffmpeg deinterlacer\n"
744 "l5 lowpass5 FIR lowpass deinterlacer\n"
745 "de default hb:a,vb:a,dr:a\n"
746 "fa fast h1:a,v1:a,dr:a\n"
747 "ac ha:a:128:7,va:a,dr:a\n"
748 "tn tmpnoise (3 threshold) temporal noise reducer\n"
749 " 1. <= 2. <= 3. larger -> stronger filtering\n"
750 "fq forceQuant <quantizer> force quantizer\n"
751 "Usage:\n"
752 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
753 "long form example:\n"
754 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
755 "short form example:\n"
756 "vb:a/hb:a/lb de,-vb\n"
757 "more examples:\n"
758 "tn:64:128:256\n"
759 "\n"
760 ;
761
762 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
763 {
764 char temp[GET_MODE_BUFFER_SIZE];
765 char *p= temp;
766 const char *filterDelimiters= ",/";
767 const char *optionDelimiters= ":";
768 struct PPMode *ppMode;
769 char *filterToken;
770
771 ppMode= av_malloc(sizeof(PPMode));
772
773 ppMode->lumMode= 0;
774 ppMode->chromMode= 0;
775 ppMode->maxTmpNoise[0]= 700;
776 ppMode->maxTmpNoise[1]= 1500;
777 ppMode->maxTmpNoise[2]= 3000;
778 ppMode->maxAllowedY= 234;
779 ppMode->minAllowedY= 16;
780 ppMode->baseDcDiff= 256/8;
781 ppMode->flatnessThreshold= 56-16-1;
782 ppMode->maxClippedThreshold= 0.01;
783 ppMode->error=0;
784
785 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
786
787 if(verbose>1) printf("pp: %s\n", name);
788
789 for(;;){
790 char *filterName;
791 int q= 1000000; //PP_QUALITY_MAX;
792 int chrom=-1;
793 int luma=-1;
794 char *option;
795 char *options[OPTIONS_ARRAY_SIZE];
796 int i;
797 int filterNameOk=0;
798 int numOfUnknownOptions=0;
799 int enable=1; //does the user want us to enabled or disabled the filter
800
801 filterToken= strtok(p, filterDelimiters);
802 if(filterToken == NULL) break;
803 p+= strlen(filterToken) + 1; // p points to next filterToken
804 filterName= strtok(filterToken, optionDelimiters);
805 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
806
807 if(*filterName == '-')
808 {
809 enable=0;
810 filterName++;
811 }
812
813 for(;;){ //for all options
814 option= strtok(NULL, optionDelimiters);
815 if(option == NULL) break;
816
817 if(verbose>1) printf("pp: option: %s\n", option);
818 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
819 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
820 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
821 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
822 else
823 {
824 options[numOfUnknownOptions] = option;
825 numOfUnknownOptions++;
826 }
827 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
828 }
829 options[numOfUnknownOptions] = NULL;
830
831 /* replace stuff from the replace Table */
832 for(i=0; replaceTable[2*i]!=NULL; i++)
833 {
834 if(!strcmp(replaceTable[2*i], filterName))
835 {
836 int newlen= strlen(replaceTable[2*i + 1]);
837 int plen;
838 int spaceLeft;
839
840 if(p==NULL) p= temp, *p=0; //last filter
841 else p--, *p=','; //not last filter
842
843 plen= strlen(p);
844 spaceLeft= p - temp + plen;
845 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
846 {
847 ppMode->error++;
848 break;
849 }
850 memmove(p + newlen, p, plen+1);
851 memcpy(p, replaceTable[2*i + 1], newlen);
852 filterNameOk=1;
853 }
854 }
855
856 for(i=0; filters[i].shortName!=NULL; i++)
857 {
858 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
859 if( !strcmp(filters[i].longName, filterName)
860 || !strcmp(filters[i].shortName, filterName))
861 {
862 ppMode->lumMode &= ~filters[i].mask;
863 ppMode->chromMode &= ~filters[i].mask;
864
865 filterNameOk=1;
866 if(!enable) break; // user wants to disable it
867
868 if(q >= filters[i].minLumQuality && luma)
869 ppMode->lumMode|= filters[i].mask;
870 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
871 if(q >= filters[i].minChromQuality)
872 ppMode->chromMode|= filters[i].mask;
873
874 if(filters[i].mask == LEVEL_FIX)
875 {
876 int o;
877 ppMode->minAllowedY= 16;
878 ppMode->maxAllowedY= 234;
879 for(o=0; options[o]!=NULL; o++)
880 {
881 if( !strcmp(options[o],"fullyrange")
882 ||!strcmp(options[o],"f"))
883 {
884 ppMode->minAllowedY= 0;
885 ppMode->maxAllowedY= 255;
886 numOfUnknownOptions--;
887 }
888 }
889 }
890 else if(filters[i].mask == TEMP_NOISE_FILTER)
891 {
892 int o;
893 int numOfNoises=0;
894
895 for(o=0; options[o]!=NULL; o++)
896 {
897 char *tail;
898 ppMode->maxTmpNoise[numOfNoises]=
899 strtol(options[o], &tail, 0);
900 if(tail!=options[o])
901 {
902 numOfNoises++;
903 numOfUnknownOptions--;
904 if(numOfNoises >= 3) break;
905 }
906 }
907 }
908 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
909 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
910 {
911 int o;
912
913 for(o=0; options[o]!=NULL && o<2; o++)
914 {
915 char *tail;
916 int val= strtol(options[o], &tail, 0);
917 if(tail==options[o]) break;
918
919 numOfUnknownOptions--;
920 if(o==0) ppMode->baseDcDiff= val;
921 else ppMode->flatnessThreshold= val;
922 }
923 }
924 else if(filters[i].mask == FORCE_QUANT)
925 {
926 int o;
927 ppMode->forcedQuant= 15;
928
929 for(o=0; options[o]!=NULL && o<1; o++)
930 {
931 char *tail;
932 int val= strtol(options[o], &tail, 0);
933 if(tail==options[o]) break;
934
935 numOfUnknownOptions--;
936 ppMode->forcedQuant= val;
937 }
938 }
939 }
940 }
941 if(!filterNameOk) ppMode->error++;
942 ppMode->error += numOfUnknownOptions;
943 }
944
945 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
946 if(ppMode->error)
947 {
948 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
949 av_free(ppMode);
950 return NULL;
951 }
952 return ppMode;
953 }
954
955 void pp_free_mode(pp_mode_t *mode){
956 av_free(mode);
957 }
958
/**
 * Replace the buffer stored in *p with a fresh zero-filled one.
 *
 * The previous contents are NOT preserved: the old buffer is released and a
 * new one of `size` bytes is obtained from av_mallocz().
 *
 * @param p         address of the pointer slot holding the buffer
 * @param alignment currently unused by this function
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
    /* free the old buffer itself, not the slot that stores its address */
    av_free(*p);
    *p= av_mallocz(size);
}
963
964 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
965 int mbWidth = (width+15)>>4;
966 int mbHeight= (height+15)>>4;
967 int i;
968
969 c->stride= stride;
970 c->qpStride= qpStride;
971
972 reallocAlign((void **)&c->tempDst, 8, stride*24);
973 reallocAlign((void **)&c->tempSrc, 8, stride*24);
974 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
975 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
976 for(i=0; i<256; i++)
977 c->yHistogram[i]= width*height/64*15/256;
978
979 for(i=0; i<3; i++)
980 {
981 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
982 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
983 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
984 }
985
986 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
987 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
988 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
989 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
990 }
991
992 static void global_init(void){
993 int i;
994 memset(clip_table, 0, 256);
995 for(i=256; i<512; i++)
996 clip_table[i]= i;
997 memset(clip_table+512, 0, 256);
998 }
999
1000 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1001 PPContext *c= av_malloc(sizeof(PPContext));
1002 int stride= (width+15)&(~15); //assumed / will realloc if needed
1003 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1004
1005 global_init();
1006
1007 memset(c, 0, sizeof(PPContext));
1008 c->cpuCaps= cpuCaps;
1009 if(cpuCaps&PP_FORMAT){
1010 c->hChromaSubSample= cpuCaps&0x3;
1011 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1012 }else{
1013 c->hChromaSubSample= 1;
1014 c->vChromaSubSample= 1;
1015 }
1016
1017 reallocBuffers(c, width, height, stride, qpStride);
1018
1019 c->frameNum=-1;
1020
1021 return c;
1022 }
1023
1024 void pp_free_context(void *vc){
1025 PPContext *c = (PPContext*)vc;
1026 int i;
1027
1028 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1029 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1030
1031 av_free(c->tempBlocks);
1032 av_free(c->yHistogram);
1033 av_free(c->tempDst);
1034 av_free(c->tempSrc);
1035 av_free(c->deintTemp);
1036 av_free(c->stdQPTable);
1037 av_free(c->nonBQPTable);
1038 av_free(c->forcedQPTable);
1039
1040 memset(c, 0, sizeof(PPContext));
1041
1042 av_free(c);
1043 }
1044
1045 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1046 uint8_t * dst[3], int dstStride[3],
1047 int width, int height,
1048 QP_STORE_T *QP_store, int QPStride,
1049 pp_mode_t *vm, void *vc, int pict_type)
1050 {
1051 int mbWidth = (width+15)>>4;
1052 int mbHeight= (height+15)>>4;
1053 PPMode *mode = (PPMode*)vm;
1054 PPContext *c = (PPContext*)vc;
1055 int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
1056 int absQPStride = ABS(QPStride);
1057
1058 // c->stride and c->QPStride are always positive
1059 if(c->stride < minStride || c->qpStride < absQPStride)
1060 reallocBuffers(c, width, height,
1061 MAX(minStride, c->stride),
1062 MAX(c->qpStride, absQPStride));
1063
1064 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1065 {
1066 int i;
1067 QP_store= c->forcedQPTable;
1068 absQPStride = QPStride = 0;
1069 if(mode->lumMode & FORCE_QUANT)
1070 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1071 else
1072 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1073 }
1074 //printf("pict_type:%d\n", pict_type);
1075
1076 if(pict_type & PP_PICT_TYPE_QP2){
1077 int i;
1078 const int count= mbHeight * absQPStride;
1079 for(i=0; i<(count>>2); i++){
1080 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1081 }
1082 for(i<<=2; i<count; i++){
1083 c->stdQPTable[i] = QP_store[i]>>1;
1084 }
1085 QP_store= c->stdQPTable;
1086 QPStride= absQPStride;
1087 }
1088
1089 if(0){
1090 int x,y;
1091 for(y=0; y<mbHeight; y++){
1092 for(x=0; x<mbWidth; x++){
1093 printf("%2d ", QP_store[x + y*QPStride]);
1094 }
1095 printf("\n");
1096 }
1097 printf("\n");
1098 }
1099
1100 if((pict_type&7)!=3)
1101 {
1102 if (QPStride >= 0) {
1103 int i;
1104 const int count= mbHeight * QPStride;
1105 for(i=0; i<(count>>2); i++){
1106 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1107 }
1108 for(i<<=2; i<count; i++){
1109 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1110 }
1111 } else {
1112 int i,j;
1113 for(i=0; i<mbHeight; i++) {
1114 for(j=0; j<absQPStride; j++) {
1115 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1116 }
1117 }
1118 }
1119 }
1120
1121 if(verbose>2)
1122 {
1123 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1124 }
1125
1126 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1127 width, height, QP_store, QPStride, 0, mode, c);
1128
1129 width = (width )>>c->hChromaSubSample;
1130 height = (height)>>c->vChromaSubSample;
1131
1132 if(mode->chromMode)
1133 {
1134 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1135 width, height, QP_store, QPStride, 1, mode, c);
1136 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1137 width, height, QP_store, QPStride, 2, mode, c);
1138 }
1139 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1140 {
1141 linecpy(dst[1], src[1], height, srcStride[1]);
1142 linecpy(dst[2], src[2], height, srcStride[2]);
1143 }
1144 else
1145 {
1146 int y;
1147 for(y=0; y<height; y++)
1148 {
1149 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1150 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1151 }
1152 }
1153 }
1154