Remove gcc-workaround that is already present in libavutil/common.h.
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
51 # more or less self-invented filters, so the exactness isn't too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #ifdef USE_FASTMEMCPY
91 #include "libvo/fastmemcpy.h"
92 #endif
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
95
96 #include "mangle.h" //FIXME should be supressed
97
98 #ifdef HAVE_ALTIVEC_H
99 #include <altivec.h>
100 #endif
101
/*
 * Generic arithmetic helpers.
 * These are function-like macros: an argument may be evaluated more than
 * once, so never pass an expression that has side effects.
 */
#define MIN(a,b)  ((b) < (a) ? (b) : (a))
#define MAX(a,b)  ((b) > (a) ? (b) : (a))
#define ABS(a)    (0 < (a) ? (a) : -(a))
#define SIGN(a)   (0 < (a) ? 1 : -1)      /* note: yields -1 for zero as well */

/* Size of the scratch copy of the mode string parsed by
   pp_get_mode_by_name_and_quality(). */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter options[] array (one slot holds the NULL end marker). */
#define OPTIONS_ARRAY_SIZE 10
/* Loop bound used by the block filters below. */
#define BLOCK_SIZE 8
/* Not referenced in the visible part of this file. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
112
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * 8-byte aligned 64-bit constants. The wNN values repeat NN in every 16-bit
 * lane, the bNN values repeat NN in every byte.
 * Nothing in the visible part of this file refers to them by name;
 * attribute_used presumably keeps them alive for inline assembly in the
 * included templates -- TODO confirm.
 */
static uint64_t __attribute__((aligned(8))) attribute_used w05 = 0x0005000500050005ULL;
static uint64_t __attribute__((aligned(8))) attribute_used w04 = 0x0004000400040004ULL;
static uint64_t __attribute__((aligned(8))) attribute_used w20 = 0x0020002000200020ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b00 = 0x0000000000000000ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b01 = 0x0101010101010101ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b02 = 0x0202020202020202ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b08 = 0x0808080808080808ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b80 = 0x8080808080808080ULL;
#endif
123
124 static uint8_t clip_table[3*256];
125 static uint8_t * const clip_tab= clip_table + 256;
126
127 static const int verbose= 0;
128
129 static const int attribute_used deringThreshold= 20;
130
131
132 static struct PPFilter filters[]=
133 {
134 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
135 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
136 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
137 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
138 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
139 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
140 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
141 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
142 {"dr", "dering", 1, 5, 6, DERING},
143 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
144 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
145 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
146 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
147 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
148 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
149 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
150 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
151 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
152 {NULL, NULL,0,0,0,0} //End Marker
153 };
154
/*
 * Alias table used by pp_get_mode_by_name_and_quality(): entries come in
 * pairs, element 2*i is the alias and element 2*i+1 the filter string that
 * is substituted for it. A NULL alias terminates the table.
 */
static const char *replaceTable[]=
{
    "default", "hdeblock:a,vdeblock:a,dering:a",
    "de",      "hdeblock:a,vdeblock:a,dering:a",
    "fast",    "x1hdeblock:a,x1vdeblock:a,dering:a",
    "fa",      "x1hdeblock:a,x1vdeblock:a,dering:a",
    "ac",      "ha:a:128:7,va:a,dering:a",
    NULL //End Marker
};
164
165
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * Thin wrappers that each issue one x86 prefetch instruction on the address
 * p. The instruction mnemonic is identical to the function name, so the four
 * wrappers are generated from a single template.
 */
#define PP_DEFINE_PREFETCH(insn)                    \
static inline void insn(void *p)                    \
{                                                   \
    asm volatile( #insn " (%0)\n\t" : : "r" (p) );  \
}

PP_DEFINE_PREFETCH(prefetchnta)
PP_DEFINE_PREFETCH(prefetcht0)
PP_DEFINE_PREFETCH(prefetcht1)
PP_DEFINE_PREFETCH(prefetcht2)

#undef PP_DEFINE_PREFETCH
#endif
195
196 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
197
198 /**
199 * Check if the given 8x8 Block is mostly "flat"
200 */
201 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
202 {
203 int numEq= 0;
204 int y;
205 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
206 const int dcThreshold= dcOffset*2 + 1;
207
208 for(y=0; y<BLOCK_SIZE; y++)
209 {
210 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
212 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
213 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
214 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
215 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
216 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
217 src+= stride;
218 }
219 return numEq > c->ppMode.flatnessThreshold;
220 }
221
222 /**
223 * Check if the middle 8x8 Block in the given 8x16 block is flat
224 */
225 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
226 int numEq= 0;
227 int y;
228 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
229 const int dcThreshold= dcOffset*2 + 1;
230
231 src+= stride*4; // src points to begin of the 8x8 Block
232 for(y=0; y<BLOCK_SIZE-1; y++)
233 {
234 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
239 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
240 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
241 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
242 src+= stride;
243 }
244 return numEq > c->ppMode.flatnessThreshold;
245 }
246
/**
 * Sparse range check on an 8x8 block: one pixel pair is compared per row.
 * Rows use the column pairs (0,5), (2,7), (4,1), (6,3) in turn, twice over.
 * (A disabled alternative in the original compared columns 0 and 7 on every row.)
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a pair passes if its difference is within +-2*QP
 * @return 1 if all 8 sampled pairs pass, 0 as soon as one fails
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t colPair[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++, src += stride) {
        const int first  = src[colPair[row & 3][0]];
        const int second = src[colPair[row & 3][1]];

        if ((unsigned)(first - second + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
269
270 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
271 {
272 #if 1
273 #if 1
274 int x;
275 src+= stride*4;
276 for(x=0; x<BLOCK_SIZE; x+=4)
277 {
278 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
279 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
280 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
281 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
282 }
283 #else
284 int x;
285 src+= stride*3;
286 for(x=0; x<BLOCK_SIZE; x++)
287 {
288 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
289 }
290 #endif
291 return 1;
292 #else
293 int x;
294 src+= stride*4;
295 for(x=0; x<BLOCK_SIZE; x++)
296 {
297 int min=255;
298 int max=0;
299 int y;
300 for(y=0; y<8; y++){
301 int v= src[x + y*stride];
302 if(v>max) max=v;
303 if(v<min) min=v;
304 }
305 if(max-min > 2*QP) return 0;
306 }
307 return 1;
308 #endif
309 }
310
311 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
312 if( isHorizDC_C(src, stride, c) ){
313 if( isHorizMinMaxOk_C(src, stride, c->QP) )
314 return 1;
315 else
316 return 0;
317 }else{
318 return 2;
319 }
320 }
321
322 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
323 if( isVertDC_C(src, stride, c) ){
324 if( isVertMinMaxOk_C(src, stride, c->QP) )
325 return 1;
326 else
327 return 0;
328 }else{
329 return 2;
330 }
331 }
332
333 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
334 {
335 int y;
336 for(y=0; y<BLOCK_SIZE; y++)
337 {
338 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
339
340 if(ABS(middleEnergy) < 8*c->QP)
341 {
342 const int q=(dst[3] - dst[4])/2;
343 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
344 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
345
346 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
347 d= MAX(d, 0);
348
349 d= (5*d + 32) >> 6;
350 d*= SIGN(-middleEnergy);
351
352 if(q>0)
353 {
354 d= d<0 ? 0 : d;
355 d= d>q ? q : d;
356 }
357 else
358 {
359 d= d>0 ? 0 : d;
360 d= d<q ? q : d;
361 }
362
363 dst[3]-= d;
364 dst[4]+= d;
365 }
366 dst+= stride;
367 }
368 }
369
370 /**
371 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
372 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
373 */
374 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
375 {
376 int y;
377 for(y=0; y<BLOCK_SIZE; y++)
378 {
379 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
380 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
381
382 int sums[10];
383 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
384 sums[1] = sums[0] - first + dst[3];
385 sums[2] = sums[1] - first + dst[4];
386 sums[3] = sums[2] - first + dst[5];
387 sums[4] = sums[3] - first + dst[6];
388 sums[5] = sums[4] - dst[0] + dst[7];
389 sums[6] = sums[5] - dst[1] + last;
390 sums[7] = sums[6] - dst[2] + last;
391 sums[8] = sums[7] - dst[3] + last;
392 sums[9] = sums[8] - dst[4] + last;
393
394 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
395 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
396 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
397 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
398 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
399 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
400 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
401 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
402
403 dst+= stride;
404 }
405 }
406
407 /**
408 * Experimental Filter 1 (Horizontal)
409 * will not damage linear gradients
410 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
411 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
412 * MMX2 version does correct clipping C version doesnt
413 * not identical with the vertical one
414 */
415 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
416 {
417 int y;
418 static uint64_t *lut= NULL;
419 if(lut==NULL)
420 {
421 int i;
422 lut = av_malloc(256*8);
423 for(i=0; i<256; i++)
424 {
425 int v= i < 128 ? 2*i : 2*(i-256);
426 /*
427 //Simulate 112242211 9-Tap filter
428 uint64_t a= (v/16) & 0xFF;
429 uint64_t b= (v/8) & 0xFF;
430 uint64_t c= (v/4) & 0xFF;
431 uint64_t d= (3*v/8) & 0xFF;
432 */
433 //Simulate piecewise linear interpolation
434 uint64_t a= (v/16) & 0xFF;
435 uint64_t b= (v*3/16) & 0xFF;
436 uint64_t c= (v*5/16) & 0xFF;
437 uint64_t d= (7*v/16) & 0xFF;
438 uint64_t A= (0x100 - a)&0xFF;
439 uint64_t B= (0x100 - b)&0xFF;
440 uint64_t C= (0x100 - c)&0xFF;
441 uint64_t D= (0x100 - c)&0xFF;
442
443 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
444 (D<<24) | (C<<16) | (B<<8) | (A);
445 //lut[i] = (v<<32) | (v<<24);
446 }
447 }
448
449 for(y=0; y<BLOCK_SIZE; y++)
450 {
451 int a= src[1] - src[2];
452 int b= src[3] - src[4];
453 int c= src[5] - src[6];
454
455 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
456
457 if(d < QP)
458 {
459 int v = d * SIGN(-b);
460
461 src[1] +=v/8;
462 src[2] +=v/4;
463 src[3] +=3*v/8;
464 src[4] -=3*v/8;
465 src[5] -=v/4;
466 src[6] -=v/8;
467
468 }
469 src+=stride;
470 }
471 }
472
473 /**
474 * accurate deblock filter
475 */
476 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
477 int y;
478 const int QP= c->QP;
479 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
480 const int dcThreshold= dcOffset*2 + 1;
481 //START_TIMER
482 src+= step*4; // src points to begin of the 8x8 Block
483 for(y=0; y<8; y++){
484 int numEq= 0;
485
486 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
487 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
488 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
489 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
490 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
491 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
492 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
493 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
494 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
495 if(numEq > c->ppMode.flatnessThreshold){
496 int min, max, x;
497
498 if(src[0] > src[step]){
499 max= src[0];
500 min= src[step];
501 }else{
502 max= src[step];
503 min= src[0];
504 }
505 for(x=2; x<8; x+=2){
506 if(src[x*step] > src[(x+1)*step]){
507 if(src[x *step] > max) max= src[ x *step];
508 if(src[(x+1)*step] < min) min= src[(x+1)*step];
509 }else{
510 if(src[(x+1)*step] > max) max= src[(x+1)*step];
511 if(src[ x *step] < min) min= src[ x *step];
512 }
513 }
514 if(max-min < 2*QP){
515 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
516 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
517
518 int sums[10];
519 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
520 sums[1] = sums[0] - first + src[3*step];
521 sums[2] = sums[1] - first + src[4*step];
522 sums[3] = sums[2] - first + src[5*step];
523 sums[4] = sums[3] - first + src[6*step];
524 sums[5] = sums[4] - src[0*step] + src[7*step];
525 sums[6] = sums[5] - src[1*step] + last;
526 sums[7] = sums[6] - src[2*step] + last;
527 sums[8] = sums[7] - src[3*step] + last;
528 sums[9] = sums[8] - src[4*step] + last;
529
530 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
531 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
532 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
533 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
534 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
535 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
536 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
537 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
538 }
539 }else{
540 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
541
542 if(ABS(middleEnergy) < 8*QP)
543 {
544 const int q=(src[3*step] - src[4*step])/2;
545 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
546 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
547
548 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
549 d= MAX(d, 0);
550
551 d= (5*d + 32) >> 6;
552 d*= SIGN(-middleEnergy);
553
554 if(q>0)
555 {
556 d= d<0 ? 0 : d;
557 d= d>q ? q : d;
558 }
559 else
560 {
561 d= d>0 ? 0 : d;
562 d= d<q ? q : d;
563 }
564
565 src[3*step]-= d;
566 src[4*step]+= d;
567 }
568 }
569
570 src += stride;
571 }
572 /*if(step==16){
573 STOP_TIMER("step16")
574 }else{
575 STOP_TIMER("stepX")
576 }*/
577 }
578
579 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
580 //Plain C versions
581 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
582 #define COMPILE_C
583 #endif
584
585 #ifdef ARCH_POWERPC
586 #ifdef HAVE_ALTIVEC
587 #define COMPILE_ALTIVEC
588 #endif //HAVE_ALTIVEC
589 #endif //ARCH_POWERPC
590
591 #if defined(ARCH_X86) || defined(ARCH_X86_64)
592
593 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
594 #define COMPILE_MMX
595 #endif
596
597 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
598 #define COMPILE_MMX2
599 #endif
600
601 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
602 #define COMPILE_3DNOW
603 #endif
604 #endif //ARCH_X86
605
606 #undef HAVE_MMX
607 #undef HAVE_MMX2
608 #undef HAVE_3DNOW
609 #undef HAVE_ALTIVEC
610
611 #ifdef COMPILE_C
612 #undef HAVE_MMX
613 #undef HAVE_MMX2
614 #undef HAVE_3DNOW
615 #define RENAME(a) a ## _C
616 #include "postprocess_template.c"
617 #endif
618
619 #ifdef ARCH_POWERPC
620 #ifdef COMPILE_ALTIVEC
621 #undef RENAME
622 #define HAVE_ALTIVEC
623 #define RENAME(a) a ## _altivec
624 #include "postprocess_altivec_template.c"
625 #include "postprocess_template.c"
626 #endif
627 #endif //ARCH_POWERPC
628
629 //MMX versions
630 #ifdef COMPILE_MMX
631 #undef RENAME
632 #define HAVE_MMX
633 #undef HAVE_MMX2
634 #undef HAVE_3DNOW
635 #define RENAME(a) a ## _MMX
636 #include "postprocess_template.c"
637 #endif
638
639 //MMX2 versions
640 #ifdef COMPILE_MMX2
641 #undef RENAME
642 #define HAVE_MMX
643 #define HAVE_MMX2
644 #undef HAVE_3DNOW
645 #define RENAME(a) a ## _MMX2
646 #include "postprocess_template.c"
647 #endif
648
649 //3DNOW versions
650 #ifdef COMPILE_3DNOW
651 #undef RENAME
652 #define HAVE_MMX
653 #undef HAVE_MMX2
654 #define HAVE_3DNOW
655 #define RENAME(a) a ## _3DNow
656 #include "postprocess_template.c"
657 #endif
658
659 // Minor note: the HAVE_xyz macros have been redefined by the template includes above, so do not rely on their configure-time values below this line
660
661 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
662 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
663 {
664 PPContext *c= (PPContext *)vc;
665 PPMode *ppMode= (PPMode *)vm;
666 c->ppMode= *ppMode; //FIXME
667
668 // useing ifs here as they are faster than function pointers allthough the
669 // difference wouldnt be messureable here but its much better because
670 // someone might exchange the cpu whithout restarting mplayer ;)
671 #ifdef RUNTIME_CPUDETECT
672 #if defined(ARCH_X86) || defined(ARCH_X86_64)
673 // ordered per speed fasterst first
674 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
675 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
676 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
677 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
678 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
679 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
680 else
681 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 #else
683 #ifdef ARCH_POWERPC
684 #ifdef HAVE_ALTIVEC
685 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
686 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
687 else
688 #endif
689 #endif
690 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
691 #endif
692 #else //RUNTIME_CPUDETECT
693 #ifdef HAVE_MMX2
694 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
695 #elif defined (HAVE_3DNOW)
696 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
697 #elif defined (HAVE_MMX)
698 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
699 #elif defined (HAVE_ALTIVEC)
700 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
701 #else
702 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
703 #endif
704 #endif //!RUNTIME_CPUDETECT
705 }
706
707 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
708 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
709
/* -pp Command line Help
 *
 * The long names listed here must match the longName entries of filters[],
 * because pp_get_mode_by_name_and_quality() compares them with a
 * case-sensitive strcmp(). The entries for "ha", "va" and "fq" previously
 * advertised "hadeblock", "vadeblock" and "forceQuant", none of which the
 * parser accepts; they now show the accepted "ahdeblock", "avdeblock" and
 * "forcequant".
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
755
756 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
757 {
758 char temp[GET_MODE_BUFFER_SIZE];
759 char *p= temp;
760 const char *filterDelimiters= ",/";
761 const char *optionDelimiters= ":";
762 struct PPMode *ppMode;
763 char *filterToken;
764
765 ppMode= av_malloc(sizeof(PPMode));
766
767 ppMode->lumMode= 0;
768 ppMode->chromMode= 0;
769 ppMode->maxTmpNoise[0]= 700;
770 ppMode->maxTmpNoise[1]= 1500;
771 ppMode->maxTmpNoise[2]= 3000;
772 ppMode->maxAllowedY= 234;
773 ppMode->minAllowedY= 16;
774 ppMode->baseDcDiff= 256/8;
775 ppMode->flatnessThreshold= 56-16-1;
776 ppMode->maxClippedThreshold= 0.01;
777 ppMode->error=0;
778
779 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
780
781 if(verbose>1) printf("pp: %s\n", name);
782
783 for(;;){
784 char *filterName;
785 int q= 1000000; //PP_QUALITY_MAX;
786 int chrom=-1;
787 int luma=-1;
788 char *option;
789 char *options[OPTIONS_ARRAY_SIZE];
790 int i;
791 int filterNameOk=0;
792 int numOfUnknownOptions=0;
793 int enable=1; //does the user want us to enabled or disabled the filter
794
795 filterToken= strtok(p, filterDelimiters);
796 if(filterToken == NULL) break;
797 p+= strlen(filterToken) + 1; // p points to next filterToken
798 filterName= strtok(filterToken, optionDelimiters);
799 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
800
801 if(*filterName == '-')
802 {
803 enable=0;
804 filterName++;
805 }
806
807 for(;;){ //for all options
808 option= strtok(NULL, optionDelimiters);
809 if(option == NULL) break;
810
811 if(verbose>1) printf("pp: option: %s\n", option);
812 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
813 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
814 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
815 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
816 else
817 {
818 options[numOfUnknownOptions] = option;
819 numOfUnknownOptions++;
820 }
821 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
822 }
823 options[numOfUnknownOptions] = NULL;
824
825 /* replace stuff from the replace Table */
826 for(i=0; replaceTable[2*i]!=NULL; i++)
827 {
828 if(!strcmp(replaceTable[2*i], filterName))
829 {
830 int newlen= strlen(replaceTable[2*i + 1]);
831 int plen;
832 int spaceLeft;
833
834 if(p==NULL) p= temp, *p=0; //last filter
835 else p--, *p=','; //not last filter
836
837 plen= strlen(p);
838 spaceLeft= p - temp + plen;
839 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
840 {
841 ppMode->error++;
842 break;
843 }
844 memmove(p + newlen, p, plen+1);
845 memcpy(p, replaceTable[2*i + 1], newlen);
846 filterNameOk=1;
847 }
848 }
849
850 for(i=0; filters[i].shortName!=NULL; i++)
851 {
852 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
853 if( !strcmp(filters[i].longName, filterName)
854 || !strcmp(filters[i].shortName, filterName))
855 {
856 ppMode->lumMode &= ~filters[i].mask;
857 ppMode->chromMode &= ~filters[i].mask;
858
859 filterNameOk=1;
860 if(!enable) break; // user wants to disable it
861
862 if(q >= filters[i].minLumQuality && luma)
863 ppMode->lumMode|= filters[i].mask;
864 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
865 if(q >= filters[i].minChromQuality)
866 ppMode->chromMode|= filters[i].mask;
867
868 if(filters[i].mask == LEVEL_FIX)
869 {
870 int o;
871 ppMode->minAllowedY= 16;
872 ppMode->maxAllowedY= 234;
873 for(o=0; options[o]!=NULL; o++)
874 {
875 if( !strcmp(options[o],"fullyrange")
876 ||!strcmp(options[o],"f"))
877 {
878 ppMode->minAllowedY= 0;
879 ppMode->maxAllowedY= 255;
880 numOfUnknownOptions--;
881 }
882 }
883 }
884 else if(filters[i].mask == TEMP_NOISE_FILTER)
885 {
886 int o;
887 int numOfNoises=0;
888
889 for(o=0; options[o]!=NULL; o++)
890 {
891 char *tail;
892 ppMode->maxTmpNoise[numOfNoises]=
893 strtol(options[o], &tail, 0);
894 if(tail!=options[o])
895 {
896 numOfNoises++;
897 numOfUnknownOptions--;
898 if(numOfNoises >= 3) break;
899 }
900 }
901 }
902 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
903 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
904 {
905 int o;
906
907 for(o=0; options[o]!=NULL && o<2; o++)
908 {
909 char *tail;
910 int val= strtol(options[o], &tail, 0);
911 if(tail==options[o]) break;
912
913 numOfUnknownOptions--;
914 if(o==0) ppMode->baseDcDiff= val;
915 else ppMode->flatnessThreshold= val;
916 }
917 }
918 else if(filters[i].mask == FORCE_QUANT)
919 {
920 int o;
921 ppMode->forcedQuant= 15;
922
923 for(o=0; options[o]!=NULL && o<1; o++)
924 {
925 char *tail;
926 int val= strtol(options[o], &tail, 0);
927 if(tail==options[o]) break;
928
929 numOfUnknownOptions--;
930 ppMode->forcedQuant= val;
931 }
932 }
933 }
934 }
935 if(!filterNameOk) ppMode->error++;
936 ppMode->error += numOfUnknownOptions;
937 }
938
939 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
940 if(ppMode->error)
941 {
942 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
943 av_free(ppMode);
944 return NULL;
945 }
946 return ppMode;
947 }
948
949 void pp_free_mode(pp_mode_t *mode){
950 av_free(mode);
951 }
952
/**
 * Replace the buffer *p by a fresh, zero-filled one of size bytes.
 * The old buffer is freed first; its contents are not carried over.
 * @param alignment not used by this implementation
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p = fresh;
}
957
958 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
959 int mbWidth = (width+15)>>4;
960 int mbHeight= (height+15)>>4;
961 int i;
962
963 c->stride= stride;
964 c->qpStride= qpStride;
965
966 reallocAlign((void **)&c->tempDst, 8, stride*24);
967 reallocAlign((void **)&c->tempSrc, 8, stride*24);
968 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
969 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
970 for(i=0; i<256; i++)
971 c->yHistogram[i]= width*height/64*15/256;
972
973 for(i=0; i<3; i++)
974 {
975 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
976 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
977 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
978 }
979
980 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
981 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
982 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
983 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
984 }
985
986 static void global_init(void){
987 int i;
988 memset(clip_table, 0, 256);
989 for(i=256; i<512; i++)
990 clip_table[i]= i;
991 memset(clip_table+512, 0, 256);
992 }
993
994 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
995 PPContext *c= av_malloc(sizeof(PPContext));
996 int stride= (width+15)&(~15); //assumed / will realloc if needed
997 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
998
999 global_init();
1000
1001 memset(c, 0, sizeof(PPContext));
1002 c->cpuCaps= cpuCaps;
1003 if(cpuCaps&PP_FORMAT){
1004 c->hChromaSubSample= cpuCaps&0x3;
1005 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1006 }else{
1007 c->hChromaSubSample= 1;
1008 c->vChromaSubSample= 1;
1009 }
1010
1011 reallocBuffers(c, width, height, stride, qpStride);
1012
1013 c->frameNum=-1;
1014
1015 return c;
1016 }
1017
1018 void pp_free_context(void *vc){
1019 PPContext *c = (PPContext*)vc;
1020 int i;
1021
1022 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1023 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1024
1025 av_free(c->tempBlocks);
1026 av_free(c->yHistogram);
1027 av_free(c->tempDst);
1028 av_free(c->tempSrc);
1029 av_free(c->deintTemp);
1030 av_free(c->stdQPTable);
1031 av_free(c->nonBQPTable);
1032 av_free(c->forcedQPTable);
1033
1034 memset(c, 0, sizeof(PPContext));
1035
1036 av_free(c);
1037 }
1038
1039 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1040 uint8_t * dst[3], int dstStride[3],
1041 int width, int height,
1042 QP_STORE_T *QP_store, int QPStride,
1043 pp_mode_t *vm, void *vc, int pict_type)
1044 {
1045 int mbWidth = (width+15)>>4;
1046 int mbHeight= (height+15)>>4;
1047 PPMode *mode = (PPMode*)vm;
1048 PPContext *c = (PPContext*)vc;
1049 int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
1050 int absQPStride = ABS(QPStride);
1051
1052 // c->stride and c->QPStride are always positive
1053 if(c->stride < minStride || c->qpStride < absQPStride)
1054 reallocBuffers(c, width, height,
1055 MAX(minStride, c->stride),
1056 MAX(c->qpStride, absQPStride));
1057
1058 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1059 {
1060 int i;
1061 QP_store= c->forcedQPTable;
1062 absQPStride = QPStride = 0;
1063 if(mode->lumMode & FORCE_QUANT)
1064 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1065 else
1066 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1067 }
1068 //printf("pict_type:%d\n", pict_type);
1069
1070 if(pict_type & PP_PICT_TYPE_QP2){
1071 int i;
1072 const int count= mbHeight * absQPStride;
1073 for(i=0; i<(count>>2); i++){
1074 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1075 }
1076 for(i<<=2; i<count; i++){
1077 c->stdQPTable[i] = QP_store[i]>>1;
1078 }
1079 QP_store= c->stdQPTable;
1080 QPStride= absQPStride;
1081 }
1082
1083 if(0){
1084 int x,y;
1085 for(y=0; y<mbHeight; y++){
1086 for(x=0; x<mbWidth; x++){
1087 printf("%2d ", QP_store[x + y*QPStride]);
1088 }
1089 printf("\n");
1090 }
1091 printf("\n");
1092 }
1093
1094 if((pict_type&7)!=3)
1095 {
1096 if (QPStride >= 0) {
1097 int i;
1098 const int count= mbHeight * QPStride;
1099 for(i=0; i<(count>>2); i++){
1100 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1101 }
1102 for(i<<=2; i<count; i++){
1103 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1104 }
1105 } else {
1106 int i,j;
1107 for(i=0; i<mbHeight; i++) {
1108 for(j=0; j<absQPStride; j++) {
1109 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1110 }
1111 }
1112 }
1113 }
1114
1115 if(verbose>2)
1116 {
1117 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1118 }
1119
1120 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1121 width, height, QP_store, QPStride, 0, mode, c);
1122
1123 width = (width )>>c->hChromaSubSample;
1124 height = (height)>>c->vChromaSubSample;
1125
1126 if(mode->chromMode)
1127 {
1128 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1129 width, height, QP_store, QPStride, 1, mode, c);
1130 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1131 width, height, QP_store, QPStride, 2, mode, c);
1132 }
1133 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1134 {
1135 linecpy(dst[1], src[1], height, srcStride[1]);
1136 linecpy(dst[2], src[2], height, srcStride[2]);
1137 }
1138 else
1139 {
1140 int y;
1141 for(y=0; y<height; y++)
1142 {
1143 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1144 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1145 }
1146 }
1147 }
1148