2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
29 C MMX MMX2 3DNow AltiVec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
50 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
51 # more or less self-invented filters, so the exactness isn't too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the main loop more flexible (variable number of blocks at once;
66 the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
74 //Changelog: use the Subversion log
89 //#define DEBUG_BRIGHTNESS
91 #include "libvo/fastmemcpy.h"
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
96 #include "mangle.h" //FIXME should be supressed
102 #define MIN(a,b) ((a) > (b) ? (b) : (a))
103 #define MAX(a,b) ((a) < (b) ? (b) : (a))
104 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
105 #define SIGN(a) ((a) > 0 ? 1 : -1)
107 #define GET_MODE_BUFFER_SIZE 500
108 #define OPTIONS_ARRAY_SIZE 10
110 #define TEMP_STRIDE 8
111 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
113 #if defined(ARCH_X86) || defined(ARCH_X86_64)
114 static uint64_t __attribute__((aligned(8))) attribute_used w05
= 0x0005000500050005LL
;
115 static uint64_t __attribute__((aligned(8))) attribute_used w04
= 0x0004000400040004LL
;
116 static uint64_t __attribute__((aligned(8))) attribute_used w20
= 0x0020002000200020LL
;
117 static uint64_t __attribute__((aligned(8))) attribute_used b00
= 0x0000000000000000LL
;
118 static uint64_t __attribute__((aligned(8))) attribute_used b01
= 0x0101010101010101LL
;
119 static uint64_t __attribute__((aligned(8))) attribute_used b02
= 0x0202020202020202LL
;
120 static uint64_t __attribute__((aligned(8))) attribute_used b08
= 0x0808080808080808LL
;
121 static uint64_t __attribute__((aligned(8))) attribute_used b80
= 0x8080808080808080LL
;
124 static uint8_t clip_table
[3*256];
125 static uint8_t * const clip_tab
= clip_table
+ 256;
127 static const int verbose
= 0;
129 static const int attribute_used deringThreshold
= 20;
132 static struct PPFilter filters
[]=
134 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK
},
135 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK
},
136 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
137 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
138 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER
},
139 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER
},
140 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK
},
141 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK
},
142 {"dr", "dering", 1, 5, 6, DERING
},
143 {"al", "autolevels", 0, 1, 2, LEVEL_FIX
},
144 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER
},
145 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER
},
146 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER
},
147 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER
},
148 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER
},
149 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER
},
150 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER
},
151 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT
},
152 {NULL
, NULL
,0,0,0,0} //End Marker
155 static const char *replaceTable
[]=
157 "default", "hdeblock:a,vdeblock:a,dering:a",
158 "de", "hdeblock:a,vdeblock:a,dering:a",
159 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
160 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
161 "ac", "ha:a:128:7,va:a,dering:a",
166 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/**
 * Emits the x86 prefetchnta instruction through inline asm.
 * NOTE(review): the asm operand list is not part of this listing; %0 is
 * presumably bound to p - confirm against the full file.
 */
167 static inline void prefetchnta(void *p
)
169 asm volatile( "prefetchnta (%0)\n\t"
/**
 * Emits the x86 prefetcht0 instruction through inline asm.
 * NOTE(review): the asm operand list is not part of this listing; %0 is
 * presumably bound to p - confirm against the full file.
 */
174 static inline void prefetcht0(void *p
)
176 asm volatile( "prefetcht0 (%0)\n\t"
/**
 * Emits the x86 prefetcht1 instruction through inline asm.
 * NOTE(review): the asm operand list is not part of this listing; %0 is
 * presumably bound to p - confirm against the full file.
 */
181 static inline void prefetcht1(void *p
)
183 asm volatile( "prefetcht1 (%0)\n\t"
/**
 * Emits the x86 prefetcht2 instruction through inline asm.
 * NOTE(review): the asm operand list is not part of this listing; %0 is
 * presumably bound to p - confirm against the full file.
 */
188 static inline void prefetcht2(void *p
)
190 asm volatile( "prefetcht2 (%0)\n\t"
196 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
199 * Check if the given 8x8 Block is mostly "flat"
201 static inline int isHorizDC_C(uint8_t src
[], int stride
, PPContext
*c
)
205 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
206 const int dcThreshold
= dcOffset
*2 + 1;
208 for(y
=0; y
<BLOCK_SIZE
; y
++)
210 if(((unsigned)(src
[0] - src
[1] + dcOffset
)) < dcThreshold
) numEq
++;
211 if(((unsigned)(src
[1] - src
[2] + dcOffset
)) < dcThreshold
) numEq
++;
212 if(((unsigned)(src
[2] - src
[3] + dcOffset
)) < dcThreshold
) numEq
++;
213 if(((unsigned)(src
[3] - src
[4] + dcOffset
)) < dcThreshold
) numEq
++;
214 if(((unsigned)(src
[4] - src
[5] + dcOffset
)) < dcThreshold
) numEq
++;
215 if(((unsigned)(src
[5] - src
[6] + dcOffset
)) < dcThreshold
) numEq
++;
216 if(((unsigned)(src
[6] - src
[7] + dcOffset
)) < dcThreshold
) numEq
++;
219 return numEq
> c
->ppMode
.flatnessThreshold
;
223 * Check if the middle 8x8 Block in the given 8x16 block is flat
225 static inline int isVertDC_C(uint8_t src
[], int stride
, PPContext
*c
){
228 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
229 const int dcThreshold
= dcOffset
*2 + 1;
231 src
+= stride
*4; // src points to begin of the 8x8 Block
232 for(y
=0; y
<BLOCK_SIZE
-1; y
++)
234 if(((unsigned)(src
[0] - src
[0+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
235 if(((unsigned)(src
[1] - src
[1+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
236 if(((unsigned)(src
[2] - src
[2+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
237 if(((unsigned)(src
[3] - src
[3+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
238 if(((unsigned)(src
[4] - src
[4+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
239 if(((unsigned)(src
[5] - src
[5+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
240 if(((unsigned)(src
[6] - src
[6+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
241 if(((unsigned)(src
[7] - src
[7+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
244 return numEq
> c
->ppMode
.flatnessThreshold
;
/**
 * Range check on pixel pairs within a row of the block.
 * Every visible test returns 0 as soon as the difference of the two compared
 * pixels lies outside [-2*QP, 2*QP].
 *
 * The unsigned cast folds both bounds into a single comparison: for a
 * non-negative QP, (unsigned)(a - b + 2*QP) > 4*QP is true when
 * a - b < -2*QP (the sum wraps to a huge unsigned value) or a - b > 2*QP.
 *
 * NOTE(review): stride is not referenced in the visible lines, and neither the
 * row stepping nor the final return is part of this listing - confirm against
 * the full file.
 */
247 static inline int isHorizMinMaxOk_C(uint8_t src
[], int stride
, int QP
)
/* pairs (0,5), (2,7), (4,1), (6,3): each compares pixels five columns apart */
252 if((unsigned)(src
[0] - src
[5] + 2*QP
) > 4*QP
) return 0;
254 if((unsigned)(src
[2] - src
[7] + 2*QP
) > 4*QP
) return 0;
256 if((unsigned)(src
[4] - src
[1] + 2*QP
) > 4*QP
) return 0;
258 if((unsigned)(src
[6] - src
[3] + 2*QP
) > 4*QP
) return 0;
/* NOTE(review): this first-vs-last-column test looks like an alternative form
 * of the check above (possibly a different preprocessor branch) - confirm. */
263 if((unsigned)(src
[0] - src
[7] + 2*QP
) > 4*QP
) return 0;
270 static inline int isVertMinMaxOk_C(uint8_t src
[], int stride
, int QP
)
276 for(x
=0; x
<BLOCK_SIZE
; x
+=4)
278 if((unsigned)(src
[ x
+ 0*stride
] - src
[ x
+ 5*stride
] + 2*QP
) > 4*QP
) return 0;
279 if((unsigned)(src
[1+x
+ 2*stride
] - src
[1+x
+ 7*stride
] + 2*QP
) > 4*QP
) return 0;
280 if((unsigned)(src
[2+x
+ 4*stride
] - src
[2+x
+ 1*stride
] + 2*QP
) > 4*QP
) return 0;
281 if((unsigned)(src
[3+x
+ 6*stride
] - src
[3+x
+ 3*stride
] + 2*QP
) > 4*QP
) return 0;
286 for(x
=0; x
<BLOCK_SIZE
; x
++)
288 if((unsigned)(src
[x
+ stride
] - src
[x
+ (stride
<<3)] + 2*QP
) > 4*QP
) return 0;
295 for(x
=0; x
<BLOCK_SIZE
; x
++)
301 int v
= src
[x
+ y
*stride
];
305 if(max
-min
> 2*QP
) return 0;
/**
 * Classifies a block along the horizontal direction with the two C helpers:
 * isHorizDC_C() is evaluated first and, only when it reports true,
 * isHorizMinMaxOk_C() is evaluated with the quantizer c->QP.
 *
 * NOTE(review): the return statements are not part of this listing, so the
 * value produced for each outcome cannot be stated here - confirm against the
 * full file.
 */
311 static inline int horizClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
312 if( isHorizDC_C(src
, stride
, c
) ){
313 if( isHorizMinMaxOk_C(src
, stride
, c
->QP
) )
/**
 * Classifies a block along the vertical direction with the two C helpers:
 * isVertDC_C() is evaluated first and, only when it reports true,
 * isVertMinMaxOk_C() is evaluated with the quantizer c->QP.
 *
 * NOTE(review): the return statements are not part of this listing, so the
 * value produced for each outcome cannot be stated here - confirm against the
 * full file.
 */
322 static inline int vertClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
323 if( isVertDC_C(src
, stride
, c
) ){
324 if( isVertMinMaxOk_C(src
, stride
, c
->QP
) )
333 static inline void doHorizDefFilter_C(uint8_t dst
[], int stride
, PPContext
*c
)
336 for(y
=0; y
<BLOCK_SIZE
; y
++)
338 const int middleEnergy
= 5*(dst
[4] - dst
[3]) + 2*(dst
[2] - dst
[5]);
340 if(ABS(middleEnergy
) < 8*c
->QP
)
342 const int q
=(dst
[3] - dst
[4])/2;
343 const int leftEnergy
= 5*(dst
[2] - dst
[1]) + 2*(dst
[0] - dst
[3]);
344 const int rightEnergy
= 5*(dst
[6] - dst
[5]) + 2*(dst
[4] - dst
[7]);
346 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
350 d
*= SIGN(-middleEnergy
);
371 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
372 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
374 static inline void doHorizLowPass_C(uint8_t dst
[], int stride
, PPContext
*c
)
377 for(y
=0; y
<BLOCK_SIZE
; y
++)
379 const int first
= ABS(dst
[-1] - dst
[0]) < c
->QP ? dst
[-1] : dst
[0];
380 const int last
= ABS(dst
[8] - dst
[7]) < c
->QP ? dst
[8] : dst
[7];
383 sums
[0] = 4*first
+ dst
[0] + dst
[1] + dst
[2] + 4;
384 sums
[1] = sums
[0] - first
+ dst
[3];
385 sums
[2] = sums
[1] - first
+ dst
[4];
386 sums
[3] = sums
[2] - first
+ dst
[5];
387 sums
[4] = sums
[3] - first
+ dst
[6];
388 sums
[5] = sums
[4] - dst
[0] + dst
[7];
389 sums
[6] = sums
[5] - dst
[1] + last
;
390 sums
[7] = sums
[6] - dst
[2] + last
;
391 sums
[8] = sums
[7] - dst
[3] + last
;
392 sums
[9] = sums
[8] - dst
[4] + last
;
394 dst
[0]= (sums
[0] + sums
[2] + 2*dst
[0])>>4;
395 dst
[1]= (sums
[1] + sums
[3] + 2*dst
[1])>>4;
396 dst
[2]= (sums
[2] + sums
[4] + 2*dst
[2])>>4;
397 dst
[3]= (sums
[3] + sums
[5] + 2*dst
[3])>>4;
398 dst
[4]= (sums
[4] + sums
[6] + 2*dst
[4])>>4;
399 dst
[5]= (sums
[5] + sums
[7] + 2*dst
[5])>>4;
400 dst
[6]= (sums
[6] + sums
[8] + 2*dst
[6])>>4;
401 dst
[7]= (sums
[7] + sums
[9] + 2*dst
[7])>>4;
408 * Experimental Filter 1 (Horizontal)
409 * will not damage linear gradients
410 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
411 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
412 * MMX2 version does correct clipping, C version doesn't
413 * not identical with the vertical one
/**
 * C version of the experimental horizontal X1 deblocking filter.
 * The visible code has two parts:
 *  - a function-static 64-bit lookup table (lut) that is allocated with
 *    av_malloc(256*8) and filled with packed byte coefficients, and
 *  - a loop over BLOCK_SIZE rows that derives a signed correction v from the
 *    pixel steps at columns 1/2, 3/4 and 5/6.
 *
 * NOTE(review): the lines that guard the allocation, apply v to the pixels
 * and advance src are not part of this listing - confirm against the full
 * file. No read of lut is visible here either.
 */
415 static inline void horizX1Filter(uint8_t *src
, int stride
, int QP
)
418 static uint64_t *lut
= NULL
;
/* NOTE(review): no check of the av_malloc() result is visible before lut is
 * written; the static pointer is also shared by all callers. */
422 lut
= av_malloc(256*8);
/* Interprets i as a signed 8-bit value and doubles it. */
425 int v
= i
< 128 ?
2*i
: 2*(i
-256);
/* NOTE(review): a, b, c and d are defined twice in this listing (here and
 * again below); presumably this first set is disabled in the real file
 * (commented out or excluded by the preprocessor) - confirm. */
427 //Simulate 112242211 9-Tap filter
428 uint64_t a= (v/16) & 0xFF;
429 uint64_t b= (v/8) & 0xFF;
430 uint64_t c= (v/4) & 0xFF;
431 uint64_t d= (3*v/8) & 0xFF;
/* Weights 1/16, 3/16, 5/16 and 7/16 of v, each truncated to one byte. */
433 //Simulate piecewise linear interpolation
434 uint64_t a
= (v
/16) & 0xFF;
435 uint64_t b
= (v
*3/16) & 0xFF;
436 uint64_t c
= (v
*5/16) & 0xFF;
437 uint64_t d
= (7*v
/16) & 0xFF;
/* A..D are the byte-wise two's-complement negations of the weights. */
438 uint64_t A
= (0x100 - a
)&0xFF;
439 uint64_t B
= (0x100 - b
)&0xFF;
440 uint64_t C
= (0x100 - c
)&0xFF;
/* NOTE(review): D is computed from c, exactly like C above, while A, B and C
 * negate a, b and c respectively. This looks like a copy/paste slip for d -
 * confirm the intended operand. */
441 uint64_t D
= (0x100 - c
)&0xFF;
/* Packs the eight bytes as a,b,c,d,D,C,B,A from most to least significant. */
443 lut
[i
] = (a
<<56) | (b
<<48) | (c
<<40) | (d
<<32) |
444 (D
<<24) | (C
<<16) | (B
<<8) | (A
);
445 //lut[i] = (v<<32) | (v<<24);
449 for(y
=0; y
<BLOCK_SIZE
; y
++)
/* a, b, c: pixel steps across columns 1/2, 3/4 and 5/6 of the current row. */
451 int a
= src
[1] - src
[2];
452 int b
= src
[3] - src
[4];
453 int c
= src
[5] - src
[6];
/* Amount by which the middle step exceeds the mean of the two outer steps,
 * clamped at 0. */
455 int d
= MAX(ABS(b
) - (ABS(a
) + ABS(c
))/2, 0);
/* SIGN(x) is 1 for x > 0 and -1 otherwise (macro at the top of the file), so
 * v is d when b is negative and -d when b is zero or positive. */
459 int v
= d
* SIGN(-b
);
474 * accurate deblock filter
476 static always_inline
void do_a_deblock_C(uint8_t *src
, int step
, int stride
, PPContext
*c
){
479 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
480 const int dcThreshold
= dcOffset
*2 + 1;
482 src
+= step
*4; // src points to begin of the 8x8 Block
486 if(((unsigned)(src
[-1*step
] - src
[0*step
] + dcOffset
)) < dcThreshold
) numEq
++;
487 if(((unsigned)(src
[ 0*step
] - src
[1*step
] + dcOffset
)) < dcThreshold
) numEq
++;
488 if(((unsigned)(src
[ 1*step
] - src
[2*step
] + dcOffset
)) < dcThreshold
) numEq
++;
489 if(((unsigned)(src
[ 2*step
] - src
[3*step
] + dcOffset
)) < dcThreshold
) numEq
++;
490 if(((unsigned)(src
[ 3*step
] - src
[4*step
] + dcOffset
)) < dcThreshold
) numEq
++;
491 if(((unsigned)(src
[ 4*step
] - src
[5*step
] + dcOffset
)) < dcThreshold
) numEq
++;
492 if(((unsigned)(src
[ 5*step
] - src
[6*step
] + dcOffset
)) < dcThreshold
) numEq
++;
493 if(((unsigned)(src
[ 6*step
] - src
[7*step
] + dcOffset
)) < dcThreshold
) numEq
++;
494 if(((unsigned)(src
[ 7*step
] - src
[8*step
] + dcOffset
)) < dcThreshold
) numEq
++;
495 if(numEq
> c
->ppMode
.flatnessThreshold
){
498 if(src
[0] > src
[step
]){
506 if(src
[x
*step
] > src
[(x
+1)*step
]){
507 if(src
[x
*step
] > max
) max
= src
[ x
*step
];
508 if(src
[(x
+1)*step
] < min
) min
= src
[(x
+1)*step
];
510 if(src
[(x
+1)*step
] > max
) max
= src
[(x
+1)*step
];
511 if(src
[ x
*step
] < min
) min
= src
[ x
*step
];
515 const int first
= ABS(src
[-1*step
] - src
[0]) < QP ? src
[-1*step
] : src
[0];
516 const int last
= ABS(src
[8*step
] - src
[7*step
]) < QP ? src
[8*step
] : src
[7*step
];
519 sums
[0] = 4*first
+ src
[0*step
] + src
[1*step
] + src
[2*step
] + 4;
520 sums
[1] = sums
[0] - first
+ src
[3*step
];
521 sums
[2] = sums
[1] - first
+ src
[4*step
];
522 sums
[3] = sums
[2] - first
+ src
[5*step
];
523 sums
[4] = sums
[3] - first
+ src
[6*step
];
524 sums
[5] = sums
[4] - src
[0*step
] + src
[7*step
];
525 sums
[6] = sums
[5] - src
[1*step
] + last
;
526 sums
[7] = sums
[6] - src
[2*step
] + last
;
527 sums
[8] = sums
[7] - src
[3*step
] + last
;
528 sums
[9] = sums
[8] - src
[4*step
] + last
;
530 src
[0*step
]= (sums
[0] + sums
[2] + 2*src
[0*step
])>>4;
531 src
[1*step
]= (sums
[1] + sums
[3] + 2*src
[1*step
])>>4;
532 src
[2*step
]= (sums
[2] + sums
[4] + 2*src
[2*step
])>>4;
533 src
[3*step
]= (sums
[3] + sums
[5] + 2*src
[3*step
])>>4;
534 src
[4*step
]= (sums
[4] + sums
[6] + 2*src
[4*step
])>>4;
535 src
[5*step
]= (sums
[5] + sums
[7] + 2*src
[5*step
])>>4;
536 src
[6*step
]= (sums
[6] + sums
[8] + 2*src
[6*step
])>>4;
537 src
[7*step
]= (sums
[7] + sums
[9] + 2*src
[7*step
])>>4;
540 const int middleEnergy
= 5*(src
[4*step
] - src
[3*step
]) + 2*(src
[2*step
] - src
[5*step
]);
542 if(ABS(middleEnergy
) < 8*QP
)
544 const int q
=(src
[3*step
] - src
[4*step
])/2;
545 const int leftEnergy
= 5*(src
[2*step
] - src
[1*step
]) + 2*(src
[0*step
] - src
[3*step
]);
546 const int rightEnergy
= 5*(src
[6*step
] - src
[5*step
]) + 2*(src
[4*step
] - src
[7*step
]);
548 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
552 d
*= SIGN(-middleEnergy
);
579 //Note: we have C, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
581 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
587 #define COMPILE_ALTIVEC
588 #endif //HAVE_ALTIVEC
589 #endif //ARCH_POWERPC
591 #if defined(ARCH_X86) || defined(ARCH_X86_64)
593 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
597 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
601 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
602 #define COMPILE_3DNOW
615 #define RENAME(a) a ## _C
616 #include "postprocess_template.c"
620 #ifdef COMPILE_ALTIVEC
623 #define RENAME(a) a ## _altivec
624 #include "postprocess_altivec_template.c"
625 #include "postprocess_template.c"
627 #endif //ARCH_POWERPC
635 #define RENAME(a) a ## _MMX
636 #include "postprocess_template.c"
645 #define RENAME(a) a ## _MMX2
646 #include "postprocess_template.c"
655 #define RENAME(a) a ## _3DNow
656 #include "postprocess_template.c"
659 // minor note: the HAVE_xyz is messed up after that line so don't use it
661 static inline void postProcess(uint8_t src
[], int srcStride
, uint8_t dst
[], int dstStride
, int width
, int height
,
662 QP_STORE_T QPs
[], int QPStride
, int isColor
, pp_mode_t
*vm
, pp_context_t
*vc
)
664 PPContext
*c
= (PPContext
*)vc
;
665 PPMode
*ppMode
= (PPMode
*)vm
;
666 c
->ppMode
= *ppMode
; //FIXME
668 // using ifs here as they are faster than function pointers, although the
669 // difference wouldn't be measurable here, but it's much better because
670 // someone might exchange the CPU without restarting mplayer ;)
671 #ifdef RUNTIME_CPUDETECT
672 #if defined(ARCH_X86) || defined(ARCH_X86_64)
673 // ordered by speed, fastest first
674 if(c
->cpuCaps
& PP_CPU_CAPS_MMX2
)
675 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
676 else if(c
->cpuCaps
& PP_CPU_CAPS_3DNOW
)
677 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
678 else if(c
->cpuCaps
& PP_CPU_CAPS_MMX
)
679 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
681 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
685 if(c
->cpuCaps
& PP_CPU_CAPS_ALTIVEC
)
686 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
690 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
692 #else //RUNTIME_CPUDETECT
694 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
695 #elif defined (HAVE_3DNOW)
696 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
697 #elif defined (HAVE_MMX)
698 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
699 #elif defined (HAVE_ALTIVEC)
700 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
702 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
704 #endif //!RUNTIME_CPUDETECT
707 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
708 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
710 /* -pp Command line Help
713 "Available postprocessing filters:\n"
715 "short long name short long option Description\n"
716 "* * a autoq CPU power dependent enabler\n"
717 " c chrom chrominance filtering enabled\n"
718 " y nochrom chrominance filtering disabled\n"
719 " n noluma luma filtering disabled\n"
720 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
721 " 1. difference factor: default=32, higher -> more deblocking\n"
722 " 2. flatness threshold: default=39, lower -> more deblocking\n"
723 " the h & v deblocking filters share these\n"
724 " so you can't set different thresholds for h / v\n"
725 "vb vdeblock (2 threshold) vertical deblocking filter\n"
726 "ha hadeblock (2 threshold) horizontal deblocking filter\n"
727 "va vadeblock (2 threshold) vertical deblocking filter\n"
728 "h1 x1hdeblock experimental h deblock filter 1\n"
729 "v1 x1vdeblock experimental v deblock filter 1\n"
730 "dr dering deringing filter\n"
731 "al autolevels automatic brightness / contrast\n"
732 " f fullyrange stretch luminance to (0..255)\n"
733 "lb linblenddeint linear blend deinterlacer\n"
734 "li linipoldeint linear interpolating deinterlace\n"
735 "ci cubicipoldeint cubic interpolating deinterlacer\n"
736 "md mediandeint median deinterlacer\n"
737 "fd ffmpegdeint ffmpeg deinterlacer\n"
738 "l5 lowpass5 FIR lowpass deinterlacer\n"
739 "de default hb:a,vb:a,dr:a\n"
740 "fa fast h1:a,v1:a,dr:a\n"
741 "ac ha:a:128:7,va:a,dr:a\n"
742 "tn tmpnoise (3 threshold) temporal noise reducer\n"
743 " 1. <= 2. <= 3. larger -> stronger filtering\n"
744 "fq forceQuant <quantizer> force quantizer\n"
746 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
747 "long form example:\n"
748 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
749 "short form example:\n"
750 "vb:a/hb:a/lb de,-vb\n"
756 pp_mode_t
*pp_get_mode_by_name_and_quality(char *name
, int quality
)
758 char temp
[GET_MODE_BUFFER_SIZE
];
760 const char *filterDelimiters
= ",/";
761 const char *optionDelimiters
= ":";
762 struct PPMode
*ppMode
;
765 ppMode
= av_malloc(sizeof(PPMode
));
768 ppMode
->chromMode
= 0;
769 ppMode
->maxTmpNoise
[0]= 700;
770 ppMode
->maxTmpNoise
[1]= 1500;
771 ppMode
->maxTmpNoise
[2]= 3000;
772 ppMode
->maxAllowedY
= 234;
773 ppMode
->minAllowedY
= 16;
774 ppMode
->baseDcDiff
= 256/8;
775 ppMode
->flatnessThreshold
= 56-16-1;
776 ppMode
->maxClippedThreshold
= 0.01;
779 strncpy(temp
, name
, GET_MODE_BUFFER_SIZE
);
781 if(verbose
>1) printf("pp: %s\n", name
);
785 int q
= 1000000; //PP_QUALITY_MAX;
789 char *options
[OPTIONS_ARRAY_SIZE
];
792 int numOfUnknownOptions
=0;
793 int enable
=1; //does the user want us to enabled or disabled the filter
795 filterToken
= strtok(p
, filterDelimiters
);
796 if(filterToken
== NULL
) break;
797 p
+= strlen(filterToken
) + 1; // p points to next filterToken
798 filterName
= strtok(filterToken
, optionDelimiters
);
799 if(verbose
>1) printf("pp: %s::%s\n", filterToken
, filterName
);
801 if(*filterName
== '-')
807 for(;;){ //for all options
808 option
= strtok(NULL
, optionDelimiters
);
809 if(option
== NULL
) break;
811 if(verbose
>1) printf("pp: option: %s\n", option
);
812 if(!strcmp("autoq", option
) || !strcmp("a", option
)) q
= quality
;
813 else if(!strcmp("nochrom", option
) || !strcmp("y", option
)) chrom
=0;
814 else if(!strcmp("chrom", option
) || !strcmp("c", option
)) chrom
=1;
815 else if(!strcmp("noluma", option
) || !strcmp("n", option
)) luma
=0;
818 options
[numOfUnknownOptions
] = option
;
819 numOfUnknownOptions
++;
821 if(numOfUnknownOptions
>= OPTIONS_ARRAY_SIZE
-1) break;
823 options
[numOfUnknownOptions
] = NULL
;
825 /* replace stuff from the replace Table */
826 for(i
=0; replaceTable
[2*i
]!=NULL
; i
++)
828 if(!strcmp(replaceTable
[2*i
], filterName
))
830 int newlen
= strlen(replaceTable
[2*i
+ 1]);
834 if(p
==NULL
) p
= temp
, *p
=0; //last filter
835 else p
--, *p
=','; //not last filter
838 spaceLeft
= p
- temp
+ plen
;
839 if(spaceLeft
+ newlen
>= GET_MODE_BUFFER_SIZE
)
844 memmove(p
+ newlen
, p
, plen
+1);
845 memcpy(p
, replaceTable
[2*i
+ 1], newlen
);
850 for(i
=0; filters
[i
].shortName
!=NULL
; i
++)
852 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
853 if( !strcmp(filters
[i
].longName
, filterName
)
854 || !strcmp(filters
[i
].shortName
, filterName
))
856 ppMode
->lumMode
&= ~filters
[i
].mask
;
857 ppMode
->chromMode
&= ~filters
[i
].mask
;
860 if(!enable
) break; // user wants to disable it
862 if(q
>= filters
[i
].minLumQuality
&& luma
)
863 ppMode
->lumMode
|= filters
[i
].mask
;
864 if(chrom
==1 || (chrom
==-1 && filters
[i
].chromDefault
))
865 if(q
>= filters
[i
].minChromQuality
)
866 ppMode
->chromMode
|= filters
[i
].mask
;
868 if(filters
[i
].mask
== LEVEL_FIX
)
871 ppMode
->minAllowedY
= 16;
872 ppMode
->maxAllowedY
= 234;
873 for(o
=0; options
[o
]!=NULL
; o
++)
875 if( !strcmp(options
[o
],"fullyrange")
876 ||!strcmp(options
[o
],"f"))
878 ppMode
->minAllowedY
= 0;
879 ppMode
->maxAllowedY
= 255;
880 numOfUnknownOptions
--;
884 else if(filters
[i
].mask
== TEMP_NOISE_FILTER
)
889 for(o
=0; options
[o
]!=NULL
; o
++)
892 ppMode
->maxTmpNoise
[numOfNoises
]=
893 strtol(options
[o
], &tail
, 0);
897 numOfUnknownOptions
--;
898 if(numOfNoises
>= 3) break;
902 else if(filters
[i
].mask
== V_DEBLOCK
|| filters
[i
].mask
== H_DEBLOCK
903 || filters
[i
].mask
== V_A_DEBLOCK
|| filters
[i
].mask
== H_A_DEBLOCK
)
907 for(o
=0; options
[o
]!=NULL
&& o
<2; o
++)
910 int val
= strtol(options
[o
], &tail
, 0);
911 if(tail
==options
[o
]) break;
913 numOfUnknownOptions
--;
914 if(o
==0) ppMode
->baseDcDiff
= val
;
915 else ppMode
->flatnessThreshold
= val
;
918 else if(filters
[i
].mask
== FORCE_QUANT
)
921 ppMode
->forcedQuant
= 15;
923 for(o
=0; options
[o
]!=NULL
&& o
<1; o
++)
926 int val
= strtol(options
[o
], &tail
, 0);
927 if(tail
==options
[o
]) break;
929 numOfUnknownOptions
--;
930 ppMode
->forcedQuant
= val
;
935 if(!filterNameOk
) ppMode
->error
++;
936 ppMode
->error
+= numOfUnknownOptions
;
939 if(verbose
>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode
->lumMode
, ppMode
->chromMode
);
942 fprintf(stderr
, "%d errors in postprocess string \"%s\"\n", ppMode
->error
, name
);
949 void pp_free_mode(pp_mode_t
*mode
){
/**
 * Replaces the buffer referenced through p with a block of size bytes
 * obtained from av_mallocz().
 *
 * @param p         address of the pointer that receives the new buffer
 * @param alignment not referenced in the visible lines
 * @param size      number of bytes requested from av_mallocz()
 *
 * NOTE(review): a release of the previous *p is not visible in this listing
 * (one original line is missing between the signature and the allocation) -
 * confirm. The av_mallocz() result is stored without a visible failure check.
 */
953 static void reallocAlign(void **p
, int alignment
, int size
){
955 *p
= av_mallocz(size
);
958 static void reallocBuffers(PPContext
*c
, int width
, int height
, int stride
, int qpStride
){
959 int mbWidth
= (width
+15)>>4;
960 int mbHeight
= (height
+15)>>4;
964 c
->qpStride
= qpStride
;
966 reallocAlign((void **)&c
->tempDst
, 8, stride
*24);
967 reallocAlign((void **)&c
->tempSrc
, 8, stride
*24);
968 reallocAlign((void **)&c
->tempBlocks
, 8, 2*16*8);
969 reallocAlign((void **)&c
->yHistogram
, 8, 256*sizeof(uint64_t));
971 c
->yHistogram
[i
]= width
*height
/64*15/256;
975 //Note: the +17*1024 is just there so I don't have to worry about r/w over the end
976 reallocAlign((void **)&c
->tempBlured
[i
], 8, stride
*mbHeight
*16 + 17*1024);
977 reallocAlign((void **)&c
->tempBluredPast
[i
], 8, 256*((height
+7)&(~7))/2 + 17*1024);//FIXME size
980 reallocAlign((void **)&c
->deintTemp
, 8, 2*width
+32);
981 reallocAlign((void **)&c
->nonBQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
982 reallocAlign((void **)&c
->stdQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
983 reallocAlign((void **)&c
->forcedQPTable
, 8, mbWidth
*sizeof(QP_STORE_T
));
/**
 * Initializes the file-level clip_table (3*256 bytes; clip_tab points at its
 * middle third, i.e. clip_table + 256).
 * The first 256 bytes and the last 256 bytes are zero-filled; the loop covers
 * the middle range 256..511.
 *
 * NOTE(review): the loop body is not part of this listing - confirm what is
 * stored for the middle range.
 */
986 static void global_init(void){
/* entries reached through negative clip_tab indices become 0 */
988 memset(clip_table
, 0, 256);
989 for(i
=256; i
<512; i
++)
/* NOTE(review): the upper third is zero-filled as well, so clip_tab indices
 * 256..511 yield 0 rather than 255. If this table is meant to saturate
 * out-of-range values, 255 would be expected here - confirm the intent. */
991 memset(clip_table
+512, 0, 256);
/**
 * Allocates a PPContext, zeroes it, records the capability flags and the
 * chroma subsampling shifts, and sizes the work buffers via reallocBuffers().
 *
 * @param width   picture width; used to derive the initial stride guesses
 * @param height  picture height; forwarded to reallocBuffers()
 * @param cpuCaps flag word stored in c->cpuCaps; when PP_FORMAT is set it also
 *                carries the chroma subsampling shifts
 *
 * NOTE(review): the lines after reallocBuffers() (including the return) are
 * not part of this listing - confirm against the full file.
 */
994 pp_context_t
*pp_get_context(int width
, int height
, int cpuCaps
){
995 PPContext
*c
= av_malloc(sizeof(PPContext
));
/* width rounded up to a multiple of 16 */
996 int stride
= (width
+15)&(~15); //assumed / will realloc if needed
/* number of 16-pixel units across the width, plus 2 */
997 int qpStride
= (width
+15)/16 + 2; //assumed / will realloc if needed
/* NOTE(review): c is passed to memset() without a visible NULL check on the
 * av_malloc() result. */
1001 memset(c
, 0, sizeof(PPContext
));
1002 c
->cpuCaps
= cpuCaps
;
/* With PP_FORMAT set, bits 0-1 give the horizontal and bits 4-5 the vertical
 * chroma shift. */
1003 if(cpuCaps
&PP_FORMAT
){
1004 c
->hChromaSubSample
= cpuCaps
&0x3;
1005 c
->vChromaSubSample
= (cpuCaps
>>4)&0x3;
/* NOTE(review): presumably the else branch (its else line is not in this
 * listing): both shifts default to 1. */
1007 c
->hChromaSubSample
= 1;
1008 c
->vChromaSubSample
= 1;
1011 reallocBuffers(c
, width
, height
, stride
, qpStride
);
/**
 * Releases the buffers owned by a PPContext and then zeroes the structure.
 * The members freed here are the ones reallocBuffers() allocates: the three
 * tempBlured / tempBluredPast planes, tempBlocks, yHistogram, tempDst,
 * tempSrc, deintTemp and the three QP tables.
 *
 * @param vc context pointer, used as a PPContext
 *
 * NOTE(review): no NULL check on vc is visible, and the release of the
 * context structure itself is not part of this listing - confirm against the
 * full file.
 */
1018 void pp_free_context(void *vc
){
1019 PPContext
*c
= (PPContext
*)vc
;
/* per-plane temporal-noise buffers (3 planes each) */
1022 for(i
=0; i
<3; i
++) av_free(c
->tempBlured
[i
]);
1023 for(i
=0; i
<3; i
++) av_free(c
->tempBluredPast
[i
]);
1025 av_free(c
->tempBlocks
);
1026 av_free(c
->yHistogram
);
1027 av_free(c
->tempDst
);
1028 av_free(c
->tempSrc
);
1029 av_free(c
->deintTemp
);
1030 av_free(c
->stdQPTable
);
1031 av_free(c
->nonBQPTable
);
1032 av_free(c
->forcedQPTable
);
/* clears every member, so the freed pointers do not remain in the struct */
1034 memset(c
, 0, sizeof(PPContext
));
1039 void pp_postprocess(uint8_t * src
[3], int srcStride
[3],
1040 uint8_t * dst
[3], int dstStride
[3],
1041 int width
, int height
,
1042 QP_STORE_T
*QP_store
, int QPStride
,
1043 pp_mode_t
*vm
, void *vc
, int pict_type
)
1045 int mbWidth
= (width
+15)>>4;
1046 int mbHeight
= (height
+15)>>4;
1047 PPMode
*mode
= (PPMode
*)vm
;
1048 PPContext
*c
= (PPContext
*)vc
;
1049 int minStride
= MAX(ABS(srcStride
[0]), ABS(dstStride
[0]));
1050 int absQPStride
= ABS(QPStride
);
1052 // c->stride and c->QPStride are always positive
1053 if(c
->stride
< minStride
|| c
->qpStride
< absQPStride
)
1054 reallocBuffers(c
, width
, height
,
1055 MAX(minStride
, c
->stride
),
1056 MAX(c
->qpStride
, absQPStride
));
1058 if(QP_store
==NULL
|| (mode
->lumMode
& FORCE_QUANT
))
1061 QP_store
= c
->forcedQPTable
;
1062 absQPStride
= QPStride
= 0;
1063 if(mode
->lumMode
& FORCE_QUANT
)
1064 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= mode
->forcedQuant
;
1066 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= 1;
1068 //printf("pict_type:%d\n", pict_type);
1070 if(pict_type
& PP_PICT_TYPE_QP2
){
1072 const int count
= mbHeight
* absQPStride
;
1073 for(i
=0; i
<(count
>>2); i
++){
1074 ((uint32_t*)c
->stdQPTable
)[i
] = (((uint32_t*)QP_store
)[i
]>>1) & 0x7F7F7F7F;
1076 for(i
<<=2; i
<count
; i
++){
1077 c
->stdQPTable
[i
] = QP_store
[i
]>>1;
1079 QP_store
= c
->stdQPTable
;
1080 QPStride
= absQPStride
;
1085 for(y
=0; y
<mbHeight
; y
++){
1086 for(x
=0; x
<mbWidth
; x
++){
1087 printf("%2d ", QP_store
[x
+ y
*QPStride
]);
1094 if((pict_type
&7)!=3)
1096 if (QPStride
>= 0) {
1098 const int count
= mbHeight
* QPStride
;
1099 for(i
=0; i
<(count
>>2); i
++){
1100 ((uint32_t*)c
->nonBQPTable
)[i
] = ((uint32_t*)QP_store
)[i
] & 0x3F3F3F3F;
1102 for(i
<<=2; i
<count
; i
++){
1103 c
->nonBQPTable
[i
] = QP_store
[i
] & 0x3F;
1107 for(i
=0; i
<mbHeight
; i
++) {
1108 for(j
=0; j
<absQPStride
; j
++) {
1109 c
->nonBQPTable
[i
*absQPStride
+j
] = QP_store
[i
*QPStride
+j
] & 0x3F;
1117 printf("using npp filters 0x%X/0x%X\n", mode
->lumMode
, mode
->chromMode
);
1120 postProcess(src
[0], srcStride
[0], dst
[0], dstStride
[0],
1121 width
, height
, QP_store
, QPStride
, 0, mode
, c
);
1123 width
= (width
)>>c
->hChromaSubSample
;
1124 height
= (height
)>>c
->vChromaSubSample
;
1128 postProcess(src
[1], srcStride
[1], dst
[1], dstStride
[1],
1129 width
, height
, QP_store
, QPStride
, 1, mode
, c
);
1130 postProcess(src
[2], srcStride
[2], dst
[2], dstStride
[2],
1131 width
, height
, QP_store
, QPStride
, 2, mode
, c
);
1133 else if(srcStride
[1] == dstStride
[1] && srcStride
[2] == dstStride
[2])
1135 linecpy(dst
[1], src
[1], height
, srcStride
[1]);
1136 linecpy(dst
[2], src
[2], height
, srcStride
[2]);
1141 for(y
=0; y
<height
; y
++)
1143 memcpy(&(dst
[1][y
*dstStride
[1]]), &(src
[1][y
*srcStride
[1]]), width
);
1144 memcpy(&(dst
[2][y
*dstStride
[2]]), &(src
[2][y
*srcStride
[2]]), width
);