2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
29 C MMX MMX2 3DNow AltiVec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
50 * I do not have a 3DNow CPU -> it is untested, but no one said it does not work, so it seems to work
51 # more or less self-invented filters, so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
74 //Changelog: use the Subversion log
89 //#define DEBUG_BRIGHTNESS
91 #include "libvo/fastmemcpy.h"
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
96 #include "mangle.h" //FIXME should be supressed
/* Function-like minimum / maximum helpers.
 * Each argument appears twice in the expansion, so an argument with side
 * effects (e.g. i++) may be evaluated more than once. */
102 #define MIN(a,b) ((a) > (b) ? (b) : (a))
103 #define MAX(a,b) ((a) < (b) ? (b) : (a))
105 #define GET_MODE_BUFFER_SIZE 500
106 #define OPTIONS_ARRAY_SIZE 10
108 #define TEMP_STRIDE 8
109 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
111 #if defined(ARCH_X86) || defined(ARCH_X86_64)
112 static uint64_t __attribute__((aligned(8))) attribute_used w05
= 0x0005000500050005LL
;
113 static uint64_t __attribute__((aligned(8))) attribute_used w04
= 0x0004000400040004LL
;
114 static uint64_t __attribute__((aligned(8))) attribute_used w20
= 0x0020002000200020LL
;
115 static uint64_t __attribute__((aligned(8))) attribute_used b00
= 0x0000000000000000LL
;
116 static uint64_t __attribute__((aligned(8))) attribute_used b01
= 0x0101010101010101LL
;
117 static uint64_t __attribute__((aligned(8))) attribute_used b02
= 0x0202020202020202LL
;
118 static uint64_t __attribute__((aligned(8))) attribute_used b08
= 0x0808080808080808LL
;
119 static uint64_t __attribute__((aligned(8))) attribute_used b80
= 0x8080808080808080LL
;
122 static uint8_t clip_table
[3*256];
123 static uint8_t * const clip_tab
= clip_table
+ 256;
125 static const int verbose
= 0;
127 static const int attribute_used deringThreshold
= 20;
130 static struct PPFilter filters
[]=
132 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK
},
133 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK
},
134 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
135 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
136 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER
},
137 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER
},
138 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK
},
139 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK
},
140 {"dr", "dering", 1, 5, 6, DERING
},
141 {"al", "autolevels", 0, 1, 2, LEVEL_FIX
},
142 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER
},
143 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER
},
144 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER
},
145 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER
},
146 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER
},
147 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER
},
148 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER
},
149 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT
},
150 {NULL
, NULL
,0,0,0,0} //End Marker
153 static const char *replaceTable
[]=
155 "default", "hdeblock:a,vdeblock:a,dering:a",
156 "de", "hdeblock:a,vdeblock:a,dering:a",
157 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
158 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
159 "ac", "ha:a:128:7,va:a,dering:a",
164 #if defined(ARCH_X86) || defined(ARCH_X86_64)
165 static inline void prefetchnta(void *p
)
167 asm volatile( "prefetchnta (%0)\n\t"
172 static inline void prefetcht0(void *p
)
174 asm volatile( "prefetcht0 (%0)\n\t"
179 static inline void prefetcht1(void *p
)
181 asm volatile( "prefetcht1 (%0)\n\t"
186 static inline void prefetcht2(void *p
)
188 asm volatile( "prefetcht2 (%0)\n\t"
194 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
197 * Check if the given 8x8 Block is mostly "flat"
/* Flatness test along the horizontal direction.
 *
 * Inside a loop that runs BLOCK_SIZE times, the seven neighbouring pairs
 * (src[k], src[k+1]) for k = 0..6 are examined and numEq is incremented for
 * every pair whose difference lies within +/- dcOffset.
 *
 * dcOffset is derived from the context: ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1.
 *
 * Returns non-zero when numEq is greater than c->ppMode.flatnessThreshold.
 *
 * NOTE(review): the declarations of numEq / y and whatever advances src
 * between iterations (presumably by stride, one image row per iteration) are
 * not visible in this excerpt -- confirm against the full file. */
199 static inline int isHorizDC_C(uint8_t src
[], int stride
, PPContext
*c
)
203 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
204 const int dcThreshold
= dcOffset
*2 + 1;
206 for(y
=0; y
<BLOCK_SIZE
; y
++)
/* Range test with a single comparison: because dcThreshold == 2*dcOffset + 1,
 * (unsigned)(a - b + dcOffset) < dcThreshold holds exactly when
 * -dcOffset <= a - b <= dcOffset (a negative sum wraps to a huge unsigned value). */
208 if(((unsigned)(src
[0] - src
[1] + dcOffset
)) < dcThreshold
) numEq
++;
209 if(((unsigned)(src
[1] - src
[2] + dcOffset
)) < dcThreshold
) numEq
++;
210 if(((unsigned)(src
[2] - src
[3] + dcOffset
)) < dcThreshold
) numEq
++;
211 if(((unsigned)(src
[3] - src
[4] + dcOffset
)) < dcThreshold
) numEq
++;
212 if(((unsigned)(src
[4] - src
[5] + dcOffset
)) < dcThreshold
) numEq
++;
213 if(((unsigned)(src
[5] - src
[6] + dcOffset
)) < dcThreshold
) numEq
++;
214 if(((unsigned)(src
[6] - src
[7] + dcOffset
)) < dcThreshold
) numEq
++;
/* The block counts as flat when enough neighbouring pairs were "equal". */
217 return numEq
> c
->ppMode
.flatnessThreshold
;
221 * Check if the middle 8x8 Block in the given 8x16 block is flat
223 static inline int isVertDC_C(uint8_t src
[], int stride
, PPContext
*c
){
226 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
227 const int dcThreshold
= dcOffset
*2 + 1;
229 src
+= stride
*4; // src points to begin of the 8x8 Block
230 for(y
=0; y
<BLOCK_SIZE
-1; y
++)
232 if(((unsigned)(src
[0] - src
[0+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
233 if(((unsigned)(src
[1] - src
[1+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
234 if(((unsigned)(src
[2] - src
[2+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
235 if(((unsigned)(src
[3] - src
[3+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
236 if(((unsigned)(src
[4] - src
[4+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
237 if(((unsigned)(src
[5] - src
[5+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
238 if(((unsigned)(src
[6] - src
[6+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
239 if(((unsigned)(src
[7] - src
[7+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
242 return numEq
> c
->ppMode
.flatnessThreshold
;
245 static inline int isHorizMinMaxOk_C(uint8_t src
[], int stride
, int QP
)
250 if((unsigned)(src
[0] - src
[5] + 2*QP
) > 4*QP
) return 0;
252 if((unsigned)(src
[2] - src
[7] + 2*QP
) > 4*QP
) return 0;
254 if((unsigned)(src
[4] - src
[1] + 2*QP
) > 4*QP
) return 0;
256 if((unsigned)(src
[6] - src
[3] + 2*QP
) > 4*QP
) return 0;
261 if((unsigned)(src
[0] - src
[7] + 2*QP
) > 4*QP
) return 0;
268 static inline int isVertMinMaxOk_C(uint8_t src
[], int stride
, int QP
)
274 for(x
=0; x
<BLOCK_SIZE
; x
+=4)
276 if((unsigned)(src
[ x
+ 0*stride
] - src
[ x
+ 5*stride
] + 2*QP
) > 4*QP
) return 0;
277 if((unsigned)(src
[1+x
+ 2*stride
] - src
[1+x
+ 7*stride
] + 2*QP
) > 4*QP
) return 0;
278 if((unsigned)(src
[2+x
+ 4*stride
] - src
[2+x
+ 1*stride
] + 2*QP
) > 4*QP
) return 0;
279 if((unsigned)(src
[3+x
+ 6*stride
] - src
[3+x
+ 3*stride
] + 2*QP
) > 4*QP
) return 0;
284 for(x
=0; x
<BLOCK_SIZE
; x
++)
286 if((unsigned)(src
[x
+ stride
] - src
[x
+ (stride
<<3)] + 2*QP
) > 4*QP
) return 0;
293 for(x
=0; x
<BLOCK_SIZE
; x
++)
299 int v
= src
[x
+ y
*stride
];
303 if(max
-min
> 2*QP
) return 0;
309 static inline int horizClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
310 if( isHorizDC_C(src
, stride
, c
) ){
311 if( isHorizMinMaxOk_C(src
, stride
, c
->QP
) )
320 static inline int vertClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
321 if( isVertDC_C(src
, stride
, c
) ){
322 if( isVertMinMaxOk_C(src
, stride
, c
->QP
) )
331 static inline void doHorizDefFilter_C(uint8_t dst
[], int stride
, PPContext
*c
)
334 for(y
=0; y
<BLOCK_SIZE
; y
++)
336 const int middleEnergy
= 5*(dst
[4] - dst
[3]) + 2*(dst
[2] - dst
[5]);
338 if(ABS(middleEnergy
) < 8*c
->QP
)
340 const int q
=(dst
[3] - dst
[4])/2;
341 const int leftEnergy
= 5*(dst
[2] - dst
[1]) + 2*(dst
[0] - dst
[3]);
342 const int rightEnergy
= 5*(dst
[6] - dst
[5]) + 2*(dst
[4] - dst
[7]);
344 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
348 d
*= SIGN(-middleEnergy
);
369 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
370 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
/* Horizontal low-pass filter, C version.
 *
 * For each of BLOCK_SIZE loop iterations the eight samples dst[0..7] are
 * replaced in place by a 9-tap weighted average with weights
 * (1,1,2,2,4,2,2,1,1)/16. Samples that would lie left of dst[0] or right of
 * dst[7] are substituted by the edge values `first` / `last`.
 *
 * Reads dst[-1] and dst[8], so one valid sample must exist on either side of
 * the eight filtered ones.
 *
 * NOTE(review): the declaration of sums[] / y and the per-iteration advance
 * of dst (presumably by stride) are not visible in this excerpt. */
372 static inline void doHorizLowPass_C(uint8_t dst
[], int stride
, PPContext
*c
)
375 for(y
=0; y
<BLOCK_SIZE
; y
++)
/* Edge values: use the outer neighbour only if it differs from the border
 * sample by less than c->QP; otherwise repeat the border sample itself. */
377 const int first
= ABS(dst
[-1] - dst
[0]) < c
->QP ? dst
[-1] : dst
[0];
378 const int last
= ABS(dst
[8] - dst
[7]) < c
->QP ? dst
[8] : dst
[7];
/* sums[k] is the sum of a 7-sample window covering positions k-4 .. k+2
 * (out-of-range positions replaced by first/last), plus a rounding term 4.
 * Each entry is obtained from the previous one by dropping the sample that
 * leaves the window and adding the one that enters it. */
381 sums
[0] = 4*first
+ dst
[0] + dst
[1] + dst
[2] + 4;
382 sums
[1] = sums
[0] - first
+ dst
[3];
383 sums
[2] = sums
[1] - first
+ dst
[4];
384 sums
[3] = sums
[2] - first
+ dst
[5];
385 sums
[4] = sums
[3] - first
+ dst
[6];
386 sums
[5] = sums
[4] - dst
[0] + dst
[7];
387 sums
[6] = sums
[5] - dst
[1] + last
;
388 sums
[7] = sums
[6] - dst
[2] + last
;
389 sums
[8] = sums
[7] - dst
[3] + last
;
390 sums
[9] = sums
[8] - dst
[4] + last
;
/* Two overlapping windows (k and k+2) plus twice the centre sample give the
 * tap weights 1,1,2,2,4,2,2,1,1; the two +4 terms add up to the rounding
 * constant 8 for the final division by 16. All sums were computed before
 * any dst[] element is overwritten. */
392 dst
[0]= (sums
[0] + sums
[2] + 2*dst
[0])>>4;
393 dst
[1]= (sums
[1] + sums
[3] + 2*dst
[1])>>4;
394 dst
[2]= (sums
[2] + sums
[4] + 2*dst
[2])>>4;
395 dst
[3]= (sums
[3] + sums
[5] + 2*dst
[3])>>4;
396 dst
[4]= (sums
[4] + sums
[6] + 2*dst
[4])>>4;
397 dst
[5]= (sums
[5] + sums
[7] + 2*dst
[5])>>4;
398 dst
[6]= (sums
[6] + sums
[8] + 2*dst
[6])>>4;
399 dst
[7]= (sums
[7] + sums
[9] + 2*dst
[7])>>4;
406 * Experimental Filter 1 (Horizontal)
407 * will not damage linear gradients
408 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
409 * can only smooth blocks at the expected locations (it cannot smooth them if they moved)
410 * MMX2 version does correct clipping, C version does not
411 * not identical with the vertical one
/* Experimental horizontal deblocking filter 1, C version.
 *
 * Visible behaviour:
 *  - a function-static lookup table `lut` of 256 64-bit entries is allocated
 *    with av_malloc(256*8) and filled from eight byte-sized coefficients;
 *  - inside a loop that runs BLOCK_SIZE times, three neighbour differences
 *    are taken around the samples src[1..6] and combined into a signed
 *    correction value v.
 *
 * NOTE(review): the guard around the table initialisation, the loop over i,
 * the code that applies v to the samples, and the advance of src are not
 * visible in this excerpt; neither is any use of `lut` after it is filled. */
413 static inline void horizX1Filter(uint8_t *src
, int stride
, int QP
)
416 static uint64_t *lut
= NULL
;
420 lut
= av_malloc(256*8);
/* Treat the table index as a signed byte (0..127 stay positive, 128..255 map
 * to -128..-1) and double it. */
423 int v
= i
< 128 ?
2*i
: 2*(i
-256);
/* Two alternative coefficient sets declare the same names a..d; presumably
 * only one is compiled, selected by preprocessor lines not shown here. */
425 //Simulate 112242211 9-Tap filter
426 uint64_t a= (v/16) & 0xFF;
427 uint64_t b= (v/8) & 0xFF;
428 uint64_t c= (v/4) & 0xFF;
429 uint64_t d= (3*v/8) & 0xFF;
431 //Simulate piecewise linear interpolation
432 uint64_t a
= (v
/16) & 0xFF;
433 uint64_t b
= (v
*3/16) & 0xFF;
434 uint64_t c
= (v
*5/16) & 0xFF;
435 uint64_t d
= (7*v
/16) & 0xFF;
/* A..D are the 8-bit two's-complement negations of the coefficients. */
436 uint64_t A
= (0x100 - a
)&0xFF;
437 uint64_t B
= (0x100 - b
)&0xFF;
438 uint64_t C
= (0x100 - c
)&0xFF;
/* NOTE(review): D is computed from c, not d, which breaks the A/a, B/b, C/c
 * pattern above -- looks like a copy-paste slip (expected 0x100 - d);
 * confirm before relying on the low half of the table entry. */
439 uint64_t D
= (0x100 - c
)&0xFF;
/* Pack the eight bytes into one entry: a,b,c,d in the upper four bytes
 * (most significant first), D,C,B,A in the lower four. */
441 lut
[i
] = (a
<<56) | (b
<<48) | (c
<<40) | (d
<<32) |
442 (D
<<24) | (C
<<16) | (B
<<8) | (A
);
443 //lut[i] = (v<<32) | (v<<24);
447 for(y
=0; y
<BLOCK_SIZE
; y
++)
/* Differences across three neighbouring sample pairs; b spans src[3]/src[4]. */
449 int a
= src
[1] - src
[2];
450 int b
= src
[3] - src
[4];
451 int c
= src
[5] - src
[6];
/* Magnitude of the middle step in excess of the average of the two outer
 * steps, clamped at zero. */
453 int d
= MAX(ABS(b
) - (ABS(a
) + ABS(c
))/2, 0);
/* Give the correction the sign opposite to the middle step b. */
457 int v
= d
* SIGN(-b
);
472 * accurate deblock filter
474 static always_inline
void do_a_deblock_C(uint8_t *src
, int step
, int stride
, PPContext
*c
){
477 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
478 const int dcThreshold
= dcOffset
*2 + 1;
480 src
+= step
*4; // src points to begin of the 8x8 Block
484 if(((unsigned)(src
[-1*step
] - src
[0*step
] + dcOffset
)) < dcThreshold
) numEq
++;
485 if(((unsigned)(src
[ 0*step
] - src
[1*step
] + dcOffset
)) < dcThreshold
) numEq
++;
486 if(((unsigned)(src
[ 1*step
] - src
[2*step
] + dcOffset
)) < dcThreshold
) numEq
++;
487 if(((unsigned)(src
[ 2*step
] - src
[3*step
] + dcOffset
)) < dcThreshold
) numEq
++;
488 if(((unsigned)(src
[ 3*step
] - src
[4*step
] + dcOffset
)) < dcThreshold
) numEq
++;
489 if(((unsigned)(src
[ 4*step
] - src
[5*step
] + dcOffset
)) < dcThreshold
) numEq
++;
490 if(((unsigned)(src
[ 5*step
] - src
[6*step
] + dcOffset
)) < dcThreshold
) numEq
++;
491 if(((unsigned)(src
[ 6*step
] - src
[7*step
] + dcOffset
)) < dcThreshold
) numEq
++;
492 if(((unsigned)(src
[ 7*step
] - src
[8*step
] + dcOffset
)) < dcThreshold
) numEq
++;
493 if(numEq
> c
->ppMode
.flatnessThreshold
){
496 if(src
[0] > src
[step
]){
504 if(src
[x
*step
] > src
[(x
+1)*step
]){
505 if(src
[x
*step
] > max
) max
= src
[ x
*step
];
506 if(src
[(x
+1)*step
] < min
) min
= src
[(x
+1)*step
];
508 if(src
[(x
+1)*step
] > max
) max
= src
[(x
+1)*step
];
509 if(src
[ x
*step
] < min
) min
= src
[ x
*step
];
513 const int first
= ABS(src
[-1*step
] - src
[0]) < QP ? src
[-1*step
] : src
[0];
514 const int last
= ABS(src
[8*step
] - src
[7*step
]) < QP ? src
[8*step
] : src
[7*step
];
517 sums
[0] = 4*first
+ src
[0*step
] + src
[1*step
] + src
[2*step
] + 4;
518 sums
[1] = sums
[0] - first
+ src
[3*step
];
519 sums
[2] = sums
[1] - first
+ src
[4*step
];
520 sums
[3] = sums
[2] - first
+ src
[5*step
];
521 sums
[4] = sums
[3] - first
+ src
[6*step
];
522 sums
[5] = sums
[4] - src
[0*step
] + src
[7*step
];
523 sums
[6] = sums
[5] - src
[1*step
] + last
;
524 sums
[7] = sums
[6] - src
[2*step
] + last
;
525 sums
[8] = sums
[7] - src
[3*step
] + last
;
526 sums
[9] = sums
[8] - src
[4*step
] + last
;
528 src
[0*step
]= (sums
[0] + sums
[2] + 2*src
[0*step
])>>4;
529 src
[1*step
]= (sums
[1] + sums
[3] + 2*src
[1*step
])>>4;
530 src
[2*step
]= (sums
[2] + sums
[4] + 2*src
[2*step
])>>4;
531 src
[3*step
]= (sums
[3] + sums
[5] + 2*src
[3*step
])>>4;
532 src
[4*step
]= (sums
[4] + sums
[6] + 2*src
[4*step
])>>4;
533 src
[5*step
]= (sums
[5] + sums
[7] + 2*src
[5*step
])>>4;
534 src
[6*step
]= (sums
[6] + sums
[8] + 2*src
[6*step
])>>4;
535 src
[7*step
]= (sums
[7] + sums
[9] + 2*src
[7*step
])>>4;
538 const int middleEnergy
= 5*(src
[4*step
] - src
[3*step
]) + 2*(src
[2*step
] - src
[5*step
]);
540 if(ABS(middleEnergy
) < 8*QP
)
542 const int q
=(src
[3*step
] - src
[4*step
])/2;
543 const int leftEnergy
= 5*(src
[2*step
] - src
[1*step
]) + 2*(src
[0*step
] - src
[3*step
]);
544 const int rightEnergy
= 5*(src
[6*step
] - src
[5*step
]) + 2*(src
[4*step
] - src
[7*step
]);
546 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
550 d
*= SIGN(-middleEnergy
);
577 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
579 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
585 #define COMPILE_ALTIVEC
586 #endif //HAVE_ALTIVEC
587 #endif //ARCH_POWERPC
589 #if defined(ARCH_X86) || defined(ARCH_X86_64)
591 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
595 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
599 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
600 #define COMPILE_3DNOW
613 #define RENAME(a) a ## _C
614 #include "postprocess_template.c"
618 #ifdef COMPILE_ALTIVEC
621 #define RENAME(a) a ## _altivec
622 #include "postprocess_altivec_template.c"
623 #include "postprocess_template.c"
625 #endif //ARCH_POWERPC
633 #define RENAME(a) a ## _MMX
634 #include "postprocess_template.c"
643 #define RENAME(a) a ## _MMX2
644 #include "postprocess_template.c"
653 #define RENAME(a) a ## _3DNow
654 #include "postprocess_template.c"
657 // minor note: the HAVE_xyz is messed up after that line, so do not use it
659 static inline void postProcess(uint8_t src
[], int srcStride
, uint8_t dst
[], int dstStride
, int width
, int height
,
660 QP_STORE_T QPs
[], int QPStride
, int isColor
, pp_mode_t
*vm
, pp_context_t
*vc
)
662 PPContext
*c
= (PPContext
*)vc
;
663 PPMode
*ppMode
= (PPMode
*)vm
;
664 c
->ppMode
= *ppMode
; //FIXME
666 // using ifs here as they are faster than function pointers, although the
667 // difference would not be measurable here; but it is much better because
668 // someone might exchange the CPU without restarting mplayer ;)
669 #ifdef RUNTIME_CPUDETECT
670 #if defined(ARCH_X86) || defined(ARCH_X86_64)
671 // ordered by speed, fastest first
672 if(c
->cpuCaps
& PP_CPU_CAPS_MMX2
)
673 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
674 else if(c
->cpuCaps
& PP_CPU_CAPS_3DNOW
)
675 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
676 else if(c
->cpuCaps
& PP_CPU_CAPS_MMX
)
677 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
679 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
683 if(c
->cpuCaps
& PP_CPU_CAPS_ALTIVEC
)
684 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
688 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
690 #else //RUNTIME_CPUDETECT
692 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
693 #elif defined (HAVE_3DNOW)
694 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
695 #elif defined (HAVE_MMX)
696 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
697 #elif defined (HAVE_ALTIVEC)
698 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
700 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
702 #endif //!RUNTIME_CPUDETECT
705 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
706 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
708 /* -pp Command line Help
711 "Available postprocessing filters:\n"
713 "short long name short long option Description\n"
714 "* * a autoq CPU power dependent enabler\n"
715 " c chrom chrominance filtering enabled\n"
716 " y nochrom chrominance filtering disabled\n"
717 " n noluma luma filtering disabled\n"
718 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
719 " 1. difference factor: default=32, higher -> more deblocking\n"
720 " 2. flatness threshold: default=39, lower -> more deblocking\n"
721 " the h & v deblocking filters share these\n"
722 " so you can't set different thresholds for h / v\n"
723 "vb vdeblock (2 threshold) vertical deblocking filter\n"
724 "ha hadeblock (2 threshold) horizontal deblocking filter\n"
725 "va vadeblock (2 threshold) vertical deblocking filter\n"
726 "h1 x1hdeblock experimental h deblock filter 1\n"
727 "v1 x1vdeblock experimental v deblock filter 1\n"
728 "dr dering deringing filter\n"
729 "al autolevels automatic brightness / contrast\n"
730 " f fullyrange stretch luminance to (0..255)\n"
731 "lb linblenddeint linear blend deinterlacer\n"
732 "li linipoldeint linear interpolating deinterlace\n"
733 "ci cubicipoldeint cubic interpolating deinterlacer\n"
734 "md mediandeint median deinterlacer\n"
735 "fd ffmpegdeint ffmpeg deinterlacer\n"
736 "l5 lowpass5 FIR lowpass deinterlacer\n"
737 "de default hb:a,vb:a,dr:a\n"
738 "fa fast h1:a,v1:a,dr:a\n"
739 "ac ha:a:128:7,va:a,dr:a\n"
740 "tn tmpnoise (3 threshold) temporal noise reducer\n"
741 " 1. <= 2. <= 3. larger -> stronger filtering\n"
742 "fq forceQuant <quantizer> force quantizer\n"
744 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
745 "long form example:\n"
746 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
747 "short form example:\n"
748 "vb:a/hb:a/lb de,-vb\n"
754 pp_mode_t
*pp_get_mode_by_name_and_quality(char *name
, int quality
)
756 char temp
[GET_MODE_BUFFER_SIZE
];
758 const char *filterDelimiters
= ",/";
759 const char *optionDelimiters
= ":";
760 struct PPMode
*ppMode
;
763 ppMode
= av_malloc(sizeof(PPMode
));
766 ppMode
->chromMode
= 0;
767 ppMode
->maxTmpNoise
[0]= 700;
768 ppMode
->maxTmpNoise
[1]= 1500;
769 ppMode
->maxTmpNoise
[2]= 3000;
770 ppMode
->maxAllowedY
= 234;
771 ppMode
->minAllowedY
= 16;
772 ppMode
->baseDcDiff
= 256/8;
773 ppMode
->flatnessThreshold
= 56-16-1;
774 ppMode
->maxClippedThreshold
= 0.01;
777 strncpy(temp
, name
, GET_MODE_BUFFER_SIZE
);
779 if(verbose
>1) printf("pp: %s\n", name
);
783 int q
= 1000000; //PP_QUALITY_MAX;
787 char *options
[OPTIONS_ARRAY_SIZE
];
790 int numOfUnknownOptions
=0;
791 int enable
=1; //does the user want us to enabled or disabled the filter
793 filterToken
= strtok(p
, filterDelimiters
);
794 if(filterToken
== NULL
) break;
795 p
+= strlen(filterToken
) + 1; // p points to next filterToken
796 filterName
= strtok(filterToken
, optionDelimiters
);
797 if(verbose
>1) printf("pp: %s::%s\n", filterToken
, filterName
);
799 if(*filterName
== '-')
805 for(;;){ //for all options
806 option
= strtok(NULL
, optionDelimiters
);
807 if(option
== NULL
) break;
809 if(verbose
>1) printf("pp: option: %s\n", option
);
810 if(!strcmp("autoq", option
) || !strcmp("a", option
)) q
= quality
;
811 else if(!strcmp("nochrom", option
) || !strcmp("y", option
)) chrom
=0;
812 else if(!strcmp("chrom", option
) || !strcmp("c", option
)) chrom
=1;
813 else if(!strcmp("noluma", option
) || !strcmp("n", option
)) luma
=0;
816 options
[numOfUnknownOptions
] = option
;
817 numOfUnknownOptions
++;
819 if(numOfUnknownOptions
>= OPTIONS_ARRAY_SIZE
-1) break;
821 options
[numOfUnknownOptions
] = NULL
;
823 /* replace stuff from the replace Table */
824 for(i
=0; replaceTable
[2*i
]!=NULL
; i
++)
826 if(!strcmp(replaceTable
[2*i
], filterName
))
828 int newlen
= strlen(replaceTable
[2*i
+ 1]);
832 if(p
==NULL
) p
= temp
, *p
=0; //last filter
833 else p
--, *p
=','; //not last filter
836 spaceLeft
= p
- temp
+ plen
;
837 if(spaceLeft
+ newlen
>= GET_MODE_BUFFER_SIZE
)
842 memmove(p
+ newlen
, p
, plen
+1);
843 memcpy(p
, replaceTable
[2*i
+ 1], newlen
);
848 for(i
=0; filters
[i
].shortName
!=NULL
; i
++)
850 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
851 if( !strcmp(filters
[i
].longName
, filterName
)
852 || !strcmp(filters
[i
].shortName
, filterName
))
854 ppMode
->lumMode
&= ~filters
[i
].mask
;
855 ppMode
->chromMode
&= ~filters
[i
].mask
;
858 if(!enable
) break; // user wants to disable it
860 if(q
>= filters
[i
].minLumQuality
&& luma
)
861 ppMode
->lumMode
|= filters
[i
].mask
;
862 if(chrom
==1 || (chrom
==-1 && filters
[i
].chromDefault
))
863 if(q
>= filters
[i
].minChromQuality
)
864 ppMode
->chromMode
|= filters
[i
].mask
;
866 if(filters
[i
].mask
== LEVEL_FIX
)
869 ppMode
->minAllowedY
= 16;
870 ppMode
->maxAllowedY
= 234;
871 for(o
=0; options
[o
]!=NULL
; o
++)
873 if( !strcmp(options
[o
],"fullyrange")
874 ||!strcmp(options
[o
],"f"))
876 ppMode
->minAllowedY
= 0;
877 ppMode
->maxAllowedY
= 255;
878 numOfUnknownOptions
--;
882 else if(filters
[i
].mask
== TEMP_NOISE_FILTER
)
887 for(o
=0; options
[o
]!=NULL
; o
++)
890 ppMode
->maxTmpNoise
[numOfNoises
]=
891 strtol(options
[o
], &tail
, 0);
895 numOfUnknownOptions
--;
896 if(numOfNoises
>= 3) break;
900 else if(filters
[i
].mask
== V_DEBLOCK
|| filters
[i
].mask
== H_DEBLOCK
901 || filters
[i
].mask
== V_A_DEBLOCK
|| filters
[i
].mask
== H_A_DEBLOCK
)
905 for(o
=0; options
[o
]!=NULL
&& o
<2; o
++)
908 int val
= strtol(options
[o
], &tail
, 0);
909 if(tail
==options
[o
]) break;
911 numOfUnknownOptions
--;
912 if(o
==0) ppMode
->baseDcDiff
= val
;
913 else ppMode
->flatnessThreshold
= val
;
916 else if(filters
[i
].mask
== FORCE_QUANT
)
919 ppMode
->forcedQuant
= 15;
921 for(o
=0; options
[o
]!=NULL
&& o
<1; o
++)
924 int val
= strtol(options
[o
], &tail
, 0);
925 if(tail
==options
[o
]) break;
927 numOfUnknownOptions
--;
928 ppMode
->forcedQuant
= val
;
933 if(!filterNameOk
) ppMode
->error
++;
934 ppMode
->error
+= numOfUnknownOptions
;
937 if(verbose
>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode
->lumMode
, ppMode
->chromMode
);
940 fprintf(stderr
, "%d errors in postprocess string \"%s\"\n", ppMode
->error
, name
);
947 void pp_free_mode(pp_mode_t
*mode
){
/* Stores in *p a fresh buffer of `size` bytes obtained from av_mallocz()
 * (zero-initialised allocation).
 *
 * The `alignment` argument is not referenced on any line visible here.
 * NOTE(review): what happens to the previous value of *p (the line between
 * the signature and the assignment is not shown) and whether the allocation
 * result is checked cannot be told from this excerpt -- confirm. */
951 static void reallocAlign(void **p
, int alignment
, int size
){
953 *p
= av_mallocz(size
);
956 static void reallocBuffers(PPContext
*c
, int width
, int height
, int stride
, int qpStride
){
957 int mbWidth
= (width
+15)>>4;
958 int mbHeight
= (height
+15)>>4;
962 c
->qpStride
= qpStride
;
964 reallocAlign((void **)&c
->tempDst
, 8, stride
*24);
965 reallocAlign((void **)&c
->tempSrc
, 8, stride
*24);
966 reallocAlign((void **)&c
->tempBlocks
, 8, 2*16*8);
967 reallocAlign((void **)&c
->yHistogram
, 8, 256*sizeof(uint64_t));
969 c
->yHistogram
[i
]= width
*height
/64*15/256;
973 //Note: the +17*1024 is just there so I do not have to worry about r/w over the end
974 reallocAlign((void **)&c
->tempBlured
[i
], 8, stride
*mbHeight
*16 + 17*1024);
975 reallocAlign((void **)&c
->tempBluredPast
[i
], 8, 256*((height
+7)&(~7))/2 + 17*1024);//FIXME size
978 reallocAlign((void **)&c
->deintTemp
, 8, 2*width
+32);
979 reallocAlign((void **)&c
->nonBQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
980 reallocAlign((void **)&c
->stdQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
981 reallocAlign((void **)&c
->forcedQPTable
, 8, mbWidth
*sizeof(QP_STORE_T
));
/* Fills the file-static clip_table (3*256 bytes; clip_tab points at its
 * middle third, clip_table + 256). */
984 static void global_init(void){
/* Lower third (bytes 0..255): all zero. */
986 memset(clip_table
, 0, 256);
/* Middle third (indices 256..511); the loop body is not visible here. */
987 for(i
=256; i
<512; i
++)
/* Upper third (bytes 512..767): all zero.
 * NOTE(review): if the table is meant to clamp to 0..255 through clip_tab,
 * one would expect 255 here rather than 0 -- confirm the intent. */
989 memset(clip_table
+512, 0, 256);
992 pp_context_t
*pp_get_context(int width
, int height
, int cpuCaps
){
993 PPContext
*c
= av_malloc(sizeof(PPContext
));
994 int stride
= (width
+15)&(~15); //assumed / will realloc if needed
995 int qpStride
= (width
+15)/16 + 2; //assumed / will realloc if needed
999 memset(c
, 0, sizeof(PPContext
));
1000 c
->cpuCaps
= cpuCaps
;
1001 if(cpuCaps
&PP_FORMAT
){
1002 c
->hChromaSubSample
= cpuCaps
&0x3;
1003 c
->vChromaSubSample
= (cpuCaps
>>4)&0x3;
1005 c
->hChromaSubSample
= 1;
1006 c
->vChromaSubSample
= 1;
1009 reallocBuffers(c
, width
, height
, stride
, qpStride
);
/* Releases the buffers owned by a postprocessing context.
 *
 * vc is cast to PPContext* and dereferenced without a visible NULL check.
 * Every buffer pointer handed to reallocAlign() in reallocBuffers() is passed
 * to av_free(), then the whole structure is overwritten with zeros.
 *
 * NOTE(review): whether the PPContext allocation itself is released happens
 * (if at all) on lines not visible in this excerpt. */
1016 void pp_free_context(void *vc
){
1017 PPContext
*c
= (PPContext
*)vc
;
/* Per-plane temporal-noise buffers (three entries each). */
1020 for(i
=0; i
<3; i
++) av_free(c
->tempBlured
[i
]);
1021 for(i
=0; i
<3; i
++) av_free(c
->tempBluredPast
[i
]);
/* Remaining scratch buffers and QP tables. */
1023 av_free(c
->tempBlocks
);
1024 av_free(c
->yHistogram
);
1025 av_free(c
->tempDst
);
1026 av_free(c
->tempSrc
);
1027 av_free(c
->deintTemp
);
1028 av_free(c
->stdQPTable
);
1029 av_free(c
->nonBQPTable
);
1030 av_free(c
->forcedQPTable
);
/* Clear every field so no stale pointer remains in the structure. */
1032 memset(c
, 0, sizeof(PPContext
));
1037 void pp_postprocess(uint8_t * src
[3], int srcStride
[3],
1038 uint8_t * dst
[3], int dstStride
[3],
1039 int width
, int height
,
1040 QP_STORE_T
*QP_store
, int QPStride
,
1041 pp_mode_t
*vm
, void *vc
, int pict_type
)
1043 int mbWidth
= (width
+15)>>4;
1044 int mbHeight
= (height
+15)>>4;
1045 PPMode
*mode
= (PPMode
*)vm
;
1046 PPContext
*c
= (PPContext
*)vc
;
1047 int minStride
= MAX(ABS(srcStride
[0]), ABS(dstStride
[0]));
1048 int absQPStride
= ABS(QPStride
);
1050 // c->stride and c->qpStride are always positive
1051 if(c
->stride
< minStride
|| c
->qpStride
< absQPStride
)
1052 reallocBuffers(c
, width
, height
,
1053 MAX(minStride
, c
->stride
),
1054 MAX(c
->qpStride
, absQPStride
));
1056 if(QP_store
==NULL
|| (mode
->lumMode
& FORCE_QUANT
))
1059 QP_store
= c
->forcedQPTable
;
1060 absQPStride
= QPStride
= 0;
1061 if(mode
->lumMode
& FORCE_QUANT
)
1062 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= mode
->forcedQuant
;
1064 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= 1;
1066 //printf("pict_type:%d\n", pict_type);
1068 if(pict_type
& PP_PICT_TYPE_QP2
){
1070 const int count
= mbHeight
* absQPStride
;
1071 for(i
=0; i
<(count
>>2); i
++){
1072 ((uint32_t*)c
->stdQPTable
)[i
] = (((uint32_t*)QP_store
)[i
]>>1) & 0x7F7F7F7F;
1074 for(i
<<=2; i
<count
; i
++){
1075 c
->stdQPTable
[i
] = QP_store
[i
]>>1;
1077 QP_store
= c
->stdQPTable
;
1078 QPStride
= absQPStride
;
1083 for(y
=0; y
<mbHeight
; y
++){
1084 for(x
=0; x
<mbWidth
; x
++){
1085 printf("%2d ", QP_store
[x
+ y
*QPStride
]);
1092 if((pict_type
&7)!=3)
1094 if (QPStride
>= 0) {
1096 const int count
= mbHeight
* QPStride
;
1097 for(i
=0; i
<(count
>>2); i
++){
1098 ((uint32_t*)c
->nonBQPTable
)[i
] = ((uint32_t*)QP_store
)[i
] & 0x3F3F3F3F;
1100 for(i
<<=2; i
<count
; i
++){
1101 c
->nonBQPTable
[i
] = QP_store
[i
] & 0x3F;
1105 for(i
=0; i
<mbHeight
; i
++) {
1106 for(j
=0; j
<absQPStride
; j
++) {
1107 c
->nonBQPTable
[i
*absQPStride
+j
] = QP_store
[i
*QPStride
+j
] & 0x3F;
1115 printf("using npp filters 0x%X/0x%X\n", mode
->lumMode
, mode
->chromMode
);
1118 postProcess(src
[0], srcStride
[0], dst
[0], dstStride
[0],
1119 width
, height
, QP_store
, QPStride
, 0, mode
, c
);
1121 width
= (width
)>>c
->hChromaSubSample
;
1122 height
= (height
)>>c
->vChromaSubSample
;
1126 postProcess(src
[1], srcStride
[1], dst
[1], dstStride
[1],
1127 width
, height
, QP_store
, QPStride
, 1, mode
, c
);
1128 postProcess(src
[2], srcStride
[2], dst
[2], dstStride
[2],
1129 width
, height
, QP_store
, QPStride
, 2, mode
, c
);
1131 else if(srcStride
[1] == dstStride
[1] && srcStride
[2] == dstStride
[2])
1133 linecpy(dst
[1], src
[1], height
, srcStride
[1]);
1134 linecpy(dst
[2], src
[2], height
, srcStride
[2]);
1139 for(y
=0; y
<height
; y
++)
1141 memcpy(&(dst
[1][y
*dstStride
[1]]), &(src
[1][y
*srcStride
[1]]), width
);
1142 memcpy(&(dst
[2][y
*dstStride
[2]]), &(src
[2][y
*srcStride
[2]]), width
);