4890bf841a89fc2769671468aa53b04ac991c082
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * i do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92
93 #ifdef HAVE_ALTIVEC_H
94 #include <altivec.h>
95 #endif
96
97 #define GET_MODE_BUFFER_SIZE 500
98 #define OPTIONS_ARRAY_SIZE 10
99 #define BLOCK_SIZE 8
100 #define TEMP_STRIDE 8
101 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
102
103 #if defined(ARCH_X86)
104 static DECLARE_ALIGNED(8, uint64_t attribute_used, w05)= 0x0005000500050005LL;
105 static DECLARE_ALIGNED(8, uint64_t attribute_used, w04)= 0x0004000400040004LL;
106 static DECLARE_ALIGNED(8, uint64_t attribute_used, w20)= 0x0020002000200020LL;
107 static DECLARE_ALIGNED(8, uint64_t attribute_used, b00)= 0x0000000000000000LL;
108 static DECLARE_ALIGNED(8, uint64_t attribute_used, b01)= 0x0101010101010101LL;
109 static DECLARE_ALIGNED(8, uint64_t attribute_used, b02)= 0x0202020202020202LL;
110 static DECLARE_ALIGNED(8, uint64_t attribute_used, b08)= 0x0808080808080808LL;
111 static DECLARE_ALIGNED(8, uint64_t attribute_used, b80)= 0x8080808080808080LL;
112 #endif
113
114 static const int attribute_used deringThreshold= 20;
115
116
117 static struct PPFilter filters[]=
118 {
119 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
120 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
121 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
122 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
123 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
124 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
125 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
126 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
127 {"dr", "dering", 1, 5, 6, DERING},
128 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
129 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
130 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
131 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
132 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
133 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
134 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
135 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
136 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
137 {NULL, NULL,0,0,0,0} //End Marker
138 };
139
/*
 * Alias table used by the mode-string parser: entries come in pairs, the
 * even index holds an alias name and the following odd index holds the
 * filter list that is spliced into the mode string in its place.
 * A NULL at an even index terminates the table.
 */
static const char *replaceTable[]=
{
    /* alias        expansion */
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL /* end marker */
};
149
150
#if defined(ARCH_X86)
/*
 * Thin wrappers around the x86 prefetch instructions. Each generated function
 * is named after the instruction it emits and issues that instruction on the
 * address p; the four definitions only differ in the mnemonic, so they are
 * stamped out from one helper macro.
 */
#define PP_DEFINE_PREFETCH(insn)                \
static inline void insn(void *p)                \
{                                               \
    asm volatile( #insn " (%0)\n\t"             \
        : : "r" (p)                             \
    );                                          \
}

PP_DEFINE_PREFETCH(prefetchnta)

PP_DEFINE_PREFETCH(prefetcht0)

PP_DEFINE_PREFETCH(prefetcht1)

PP_DEFINE_PREFETCH(prefetcht2)

#undef PP_DEFINE_PREFETCH
#endif
180
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
182
183 /**
184 * Check if the given 8x8 Block is mostly "flat"
185 */
186 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
187 {
188 int numEq= 0;
189 int y;
190 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
191 const int dcThreshold= dcOffset*2 + 1;
192
193 for(y=0; y<BLOCK_SIZE; y++)
194 {
195 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
196 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
197 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
198 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
199 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
200 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
201 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
202 src+= stride;
203 }
204 return numEq > c->ppMode.flatnessThreshold;
205 }
206
207 /**
208 * Check if the middle 8x8 Block in the given 8x16 block is flat
209 */
210 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
211 int numEq= 0;
212 int y;
213 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
214 const int dcThreshold= dcOffset*2 + 1;
215
216 src+= stride*4; // src points to begin of the 8x8 Block
217 for(y=0; y<BLOCK_SIZE-1; y++)
218 {
219 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
220 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
221 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
222 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
223 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
224 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
225 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
226 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
227 src+= stride;
228 }
229 return numEq > c->ppMode.flatnessThreshold;
230 }
231
/**
 * Cheap horizontal range check on an 8x8 block.
 * Instead of scanning every pixel, one pair of columns is sampled per row
 * (columns 0/5, 2/7, 4/1, 6/3, repeating every four rows); the check fails as
 * soon as one sampled pair differs by more than 2*QP in either direction.
 * @param src    top left pixel of the block
 * @param stride distance between two rows in bytes
 * @param QP     quantizer controlling the allowed difference
 * @return 1 if all sampled differences are within +-2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const int samplePair[4][2]= { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for(row=0; row<8; row++)
    {
        const uint8_t *line= src + row*stride;
        const int *pair= samplePair[row & 3];

        // unsigned compare folds the lower and the upper bound into one test
        if((unsigned)(line[pair[0]] - line[pair[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
254
255 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
256 {
257 #if 1
258 #if 1
259 int x;
260 src+= stride*4;
261 for(x=0; x<BLOCK_SIZE; x+=4)
262 {
263 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
264 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
265 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
266 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
267 }
268 #else
269 int x;
270 src+= stride*3;
271 for(x=0; x<BLOCK_SIZE; x++)
272 {
273 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
274 }
275 #endif
276 return 1;
277 #else
278 int x;
279 src+= stride*4;
280 for(x=0; x<BLOCK_SIZE; x++)
281 {
282 int min=255;
283 int max=0;
284 int y;
285 for(y=0; y<8; y++){
286 int v= src[x + y*stride];
287 if(v>max) max=v;
288 if(v<min) min=v;
289 }
290 if(max-min > 2*QP) return 0;
291 }
292 return 1;
293 #endif
294 }
295
296 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
297 if( isHorizDC_C(src, stride, c) ){
298 if( isHorizMinMaxOk_C(src, stride, c->QP) )
299 return 1;
300 else
301 return 0;
302 }else{
303 return 2;
304 }
305 }
306
307 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
308 if( isVertDC_C(src, stride, c) ){
309 if( isVertMinMaxOk_C(src, stride, c->QP) )
310 return 1;
311 else
312 return 0;
313 }else{
314 return 2;
315 }
316 }
317
318 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
319 {
320 int y;
321 for(y=0; y<BLOCK_SIZE; y++)
322 {
323 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
324
325 if(FFABS(middleEnergy) < 8*c->QP)
326 {
327 const int q=(dst[3] - dst[4])/2;
328 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
329 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
330
331 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
332 d= FFMAX(d, 0);
333
334 d= (5*d + 32) >> 6;
335 d*= FFSIGN(-middleEnergy);
336
337 if(q>0)
338 {
339 d= d<0 ? 0 : d;
340 d= d>q ? q : d;
341 }
342 else
343 {
344 d= d>0 ? 0 : d;
345 d= d<q ? q : d;
346 }
347
348 dst[3]-= d;
349 dst[4]+= d;
350 }
351 dst+= stride;
352 }
353 }
354
355 /**
356 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
357 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
358 */
359 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
360 {
361 int y;
362 for(y=0; y<BLOCK_SIZE; y++)
363 {
364 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
365 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
366
367 int sums[10];
368 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
369 sums[1] = sums[0] - first + dst[3];
370 sums[2] = sums[1] - first + dst[4];
371 sums[3] = sums[2] - first + dst[5];
372 sums[4] = sums[3] - first + dst[6];
373 sums[5] = sums[4] - dst[0] + dst[7];
374 sums[6] = sums[5] - dst[1] + last;
375 sums[7] = sums[6] - dst[2] + last;
376 sums[8] = sums[7] - dst[3] + last;
377 sums[9] = sums[8] - dst[4] + last;
378
379 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
380 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
381 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
382 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
383 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
384 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
385 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
386 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
387
388 dst+= stride;
389 }
390 }
391
392 /**
393 * Experimental Filter 1 (Horizontal)
394 * will not damage linear gradients
395 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
396 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
397 * MMX2 version does correct clipping C version does not
398 * not identical with the vertical one
399 */
400 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
401 {
402 int y;
403 static uint64_t *lut= NULL;
404 if(lut==NULL)
405 {
406 int i;
407 lut = av_malloc(256*8);
408 for(i=0; i<256; i++)
409 {
410 int v= i < 128 ? 2*i : 2*(i-256);
411 /*
412 //Simulate 112242211 9-Tap filter
413 uint64_t a= (v/16) & 0xFF;
414 uint64_t b= (v/8) & 0xFF;
415 uint64_t c= (v/4) & 0xFF;
416 uint64_t d= (3*v/8) & 0xFF;
417 */
418 //Simulate piecewise linear interpolation
419 uint64_t a= (v/16) & 0xFF;
420 uint64_t b= (v*3/16) & 0xFF;
421 uint64_t c= (v*5/16) & 0xFF;
422 uint64_t d= (7*v/16) & 0xFF;
423 uint64_t A= (0x100 - a)&0xFF;
424 uint64_t B= (0x100 - b)&0xFF;
425 uint64_t C= (0x100 - c)&0xFF;
426 uint64_t D= (0x100 - c)&0xFF;
427
428 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
429 (D<<24) | (C<<16) | (B<<8) | (A);
430 //lut[i] = (v<<32) | (v<<24);
431 }
432 }
433
434 for(y=0; y<BLOCK_SIZE; y++)
435 {
436 int a= src[1] - src[2];
437 int b= src[3] - src[4];
438 int c= src[5] - src[6];
439
440 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
441
442 if(d < QP)
443 {
444 int v = d * FFSIGN(-b);
445
446 src[1] +=v/8;
447 src[2] +=v/4;
448 src[3] +=3*v/8;
449 src[4] -=3*v/8;
450 src[5] -=v/4;
451 src[6] -=v/8;
452
453 }
454 src+=stride;
455 }
456 }
457
458 /**
459 * accurate deblock filter
460 */
461 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
462 int y;
463 const int QP= c->QP;
464 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
465 const int dcThreshold= dcOffset*2 + 1;
466 //START_TIMER
467 src+= step*4; // src points to begin of the 8x8 Block
468 for(y=0; y<8; y++){
469 int numEq= 0;
470
471 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
472 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
473 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
474 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
475 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
476 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
477 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
478 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
479 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
480 if(numEq > c->ppMode.flatnessThreshold){
481 int min, max, x;
482
483 if(src[0] > src[step]){
484 max= src[0];
485 min= src[step];
486 }else{
487 max= src[step];
488 min= src[0];
489 }
490 for(x=2; x<8; x+=2){
491 if(src[x*step] > src[(x+1)*step]){
492 if(src[x *step] > max) max= src[ x *step];
493 if(src[(x+1)*step] < min) min= src[(x+1)*step];
494 }else{
495 if(src[(x+1)*step] > max) max= src[(x+1)*step];
496 if(src[ x *step] < min) min= src[ x *step];
497 }
498 }
499 if(max-min < 2*QP){
500 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
501 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
502
503 int sums[10];
504 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
505 sums[1] = sums[0] - first + src[3*step];
506 sums[2] = sums[1] - first + src[4*step];
507 sums[3] = sums[2] - first + src[5*step];
508 sums[4] = sums[3] - first + src[6*step];
509 sums[5] = sums[4] - src[0*step] + src[7*step];
510 sums[6] = sums[5] - src[1*step] + last;
511 sums[7] = sums[6] - src[2*step] + last;
512 sums[8] = sums[7] - src[3*step] + last;
513 sums[9] = sums[8] - src[4*step] + last;
514
515 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
516 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
517 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
518 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
519 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
520 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
521 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
522 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
523 }
524 }else{
525 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
526
527 if(FFABS(middleEnergy) < 8*QP)
528 {
529 const int q=(src[3*step] - src[4*step])/2;
530 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
531 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
532
533 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
534 d= FFMAX(d, 0);
535
536 d= (5*d + 32) >> 6;
537 d*= FFSIGN(-middleEnergy);
538
539 if(q>0)
540 {
541 d= d<0 ? 0 : d;
542 d= d>q ? q : d;
543 }
544 else
545 {
546 d= d>0 ? 0 : d;
547 d= d<q ? q : d;
548 }
549
550 src[3*step]-= d;
551 src[4*step]+= d;
552 }
553 }
554
555 src += stride;
556 }
557 /*if(step==16){
558 STOP_TIMER("step16")
559 }else{
560 STOP_TIMER("stepX")
561 }*/
562 }
563
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
/*
 * Template instantiation.
 * The first part decides which variants have to be compiled (COMPILE_xyz),
 * the second part includes postprocess_template.c once per selected variant.
 * Before each inclusion the HAVE_xyz macros are set to describe that variant
 * and RENAME() is defined to append the variant suffix to a name.
 * With RUNTIME_CPUDETECT every x86 variant and the C variant are compiled.
 */
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC

#if defined(ARCH_X86)

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif /* defined(ARCH_X86) */

// from here on the HAVE_xyz macros describe the variant being instantiated,
// not the build configuration
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC

#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif
#endif //ARCH_POWERPC

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// minor note: the HAVE_xyz macros only reflect the variant instantiated last
// from this line on, so do not rely on them for anything else.
645
646 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
647 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
648 {
649 PPContext *c= (PPContext *)vc;
650 PPMode *ppMode= (PPMode *)vm;
651 c->ppMode= *ppMode; //FIXME
652
653 // Using ifs here as they are faster than function pointers although the
654 // difference would not be measureable here but it is much better because
655 // someone might exchange the CPU whithout restarting MPlayer ;)
656 #ifdef RUNTIME_CPUDETECT
657 #if defined(ARCH_X86)
658 // ordered per speed fasterst first
659 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
660 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
661 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
662 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
663 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
664 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
665 else
666 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
667 #else
668 #ifdef ARCH_POWERPC
669 #ifdef HAVE_ALTIVEC
670 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
671 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
672 else
673 #endif
674 #endif
675 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
676 #endif
677 #else //RUNTIME_CPUDETECT
678 #ifdef HAVE_MMX2
679 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
680 #elif defined (HAVE_3DNOW)
681 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 #elif defined (HAVE_MMX)
683 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
684 #elif defined (HAVE_ALTIVEC)
685 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
686 #else
687 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
688 #endif
689 #endif //!RUNTIME_CPUDETECT
690 }
691
692 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
693 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
694
/* -pp Command line Help
 * The names listed here must match the shortName / longName entries of the
 * filters[] table, which the parser compares case sensitively; the entries
 * for ha, va and fq previously advertised long names ("hadeblock",
 * "vadeblock", "forceQuant") that the parser rejects.
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
744
745 pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
746 {
747 char temp[GET_MODE_BUFFER_SIZE];
748 char *p= temp;
749 static const char filterDelimiters[] = ",/";
750 static const char optionDelimiters[] = ":";
751 struct PPMode *ppMode;
752 char *filterToken;
753
754 ppMode= av_malloc(sizeof(PPMode));
755
756 ppMode->lumMode= 0;
757 ppMode->chromMode= 0;
758 ppMode->maxTmpNoise[0]= 700;
759 ppMode->maxTmpNoise[1]= 1500;
760 ppMode->maxTmpNoise[2]= 3000;
761 ppMode->maxAllowedY= 234;
762 ppMode->minAllowedY= 16;
763 ppMode->baseDcDiff= 256/8;
764 ppMode->flatnessThreshold= 56-16-1;
765 ppMode->maxClippedThreshold= 0.01;
766 ppMode->error=0;
767
768 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
769
770 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
771
772 for(;;){
773 char *filterName;
774 int q= 1000000; //PP_QUALITY_MAX;
775 int chrom=-1;
776 int luma=-1;
777 char *option;
778 char *options[OPTIONS_ARRAY_SIZE];
779 int i;
780 int filterNameOk=0;
781 int numOfUnknownOptions=0;
782 int enable=1; //does the user want us to enabled or disabled the filter
783
784 filterToken= strtok(p, filterDelimiters);
785 if(filterToken == NULL) break;
786 p+= strlen(filterToken) + 1; // p points to next filterToken
787 filterName= strtok(filterToken, optionDelimiters);
788 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
789
790 if(*filterName == '-')
791 {
792 enable=0;
793 filterName++;
794 }
795
796 for(;;){ //for all options
797 option= strtok(NULL, optionDelimiters);
798 if(option == NULL) break;
799
800 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
801 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
802 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
803 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
804 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
805 else
806 {
807 options[numOfUnknownOptions] = option;
808 numOfUnknownOptions++;
809 }
810 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
811 }
812 options[numOfUnknownOptions] = NULL;
813
814 /* replace stuff from the replace Table */
815 for(i=0; replaceTable[2*i]!=NULL; i++)
816 {
817 if(!strcmp(replaceTable[2*i], filterName))
818 {
819 int newlen= strlen(replaceTable[2*i + 1]);
820 int plen;
821 int spaceLeft;
822
823 if(p==NULL) p= temp, *p=0; //last filter
824 else p--, *p=','; //not last filter
825
826 plen= strlen(p);
827 spaceLeft= p - temp + plen;
828 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
829 {
830 ppMode->error++;
831 break;
832 }
833 memmove(p + newlen, p, plen+1);
834 memcpy(p, replaceTable[2*i + 1], newlen);
835 filterNameOk=1;
836 }
837 }
838
839 for(i=0; filters[i].shortName!=NULL; i++)
840 {
841 if( !strcmp(filters[i].longName, filterName)
842 || !strcmp(filters[i].shortName, filterName))
843 {
844 ppMode->lumMode &= ~filters[i].mask;
845 ppMode->chromMode &= ~filters[i].mask;
846
847 filterNameOk=1;
848 if(!enable) break; // user wants to disable it
849
850 if(q >= filters[i].minLumQuality && luma)
851 ppMode->lumMode|= filters[i].mask;
852 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
853 if(q >= filters[i].minChromQuality)
854 ppMode->chromMode|= filters[i].mask;
855
856 if(filters[i].mask == LEVEL_FIX)
857 {
858 int o;
859 ppMode->minAllowedY= 16;
860 ppMode->maxAllowedY= 234;
861 for(o=0; options[o]!=NULL; o++)
862 {
863 if( !strcmp(options[o],"fullyrange")
864 ||!strcmp(options[o],"f"))
865 {
866 ppMode->minAllowedY= 0;
867 ppMode->maxAllowedY= 255;
868 numOfUnknownOptions--;
869 }
870 }
871 }
872 else if(filters[i].mask == TEMP_NOISE_FILTER)
873 {
874 int o;
875 int numOfNoises=0;
876
877 for(o=0; options[o]!=NULL; o++)
878 {
879 char *tail;
880 ppMode->maxTmpNoise[numOfNoises]=
881 strtol(options[o], &tail, 0);
882 if(tail!=options[o])
883 {
884 numOfNoises++;
885 numOfUnknownOptions--;
886 if(numOfNoises >= 3) break;
887 }
888 }
889 }
890 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
891 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
892 {
893 int o;
894
895 for(o=0; options[o]!=NULL && o<2; o++)
896 {
897 char *tail;
898 int val= strtol(options[o], &tail, 0);
899 if(tail==options[o]) break;
900
901 numOfUnknownOptions--;
902 if(o==0) ppMode->baseDcDiff= val;
903 else ppMode->flatnessThreshold= val;
904 }
905 }
906 else if(filters[i].mask == FORCE_QUANT)
907 {
908 int o;
909 ppMode->forcedQuant= 15;
910
911 for(o=0; options[o]!=NULL && o<1; o++)
912 {
913 char *tail;
914 int val= strtol(options[o], &tail, 0);
915 if(tail==options[o]) break;
916
917 numOfUnknownOptions--;
918 ppMode->forcedQuant= val;
919 }
920 }
921 }
922 }
923 if(!filterNameOk) ppMode->error++;
924 ppMode->error += numOfUnknownOptions;
925 }
926
927 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
928 if(ppMode->error)
929 {
930 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
931 av_free(ppMode);
932 return NULL;
933 }
934 return ppMode;
935 }
936
937 void pp_free_mode(pp_mode_t *mode){
938 av_free(mode);
939 }
940
/**
 * Replace the buffer *p by a new zero-filled one of size bytes.
 * The old buffer is freed first and its contents are not preserved.
 * The alignment argument is not used; the buffer is simply whatever
 * av_mallocz() returns.
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh= av_mallocz(size);
    *p= fresh;
}
945
946 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
947 int mbWidth = (width+15)>>4;
948 int mbHeight= (height+15)>>4;
949 int i;
950
951 c->stride= stride;
952 c->qpStride= qpStride;
953
954 reallocAlign((void **)&c->tempDst, 8, stride*24);
955 reallocAlign((void **)&c->tempSrc, 8, stride*24);
956 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
957 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
958 for(i=0; i<256; i++)
959 c->yHistogram[i]= width*height/64*15/256;
960
961 for(i=0; i<3; i++)
962 {
963 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
964 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
965 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
966 }
967
968 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
969 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
970 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
971 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
972 }
973
974 static const char * context_to_name(void * ptr) {
975 return "postproc";
976 }
977
978 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
979
980 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
981 PPContext *c= av_malloc(sizeof(PPContext));
982 int stride= (width+15)&(~15); //assumed / will realloc if needed
983 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
984
985 memset(c, 0, sizeof(PPContext));
986 c->av_class = &av_codec_context_class;
987 c->cpuCaps= cpuCaps;
988 if(cpuCaps&PP_FORMAT){
989 c->hChromaSubSample= cpuCaps&0x3;
990 c->vChromaSubSample= (cpuCaps>>4)&0x3;
991 }else{
992 c->hChromaSubSample= 1;
993 c->vChromaSubSample= 1;
994 }
995
996 reallocBuffers(c, width, height, stride, qpStride);
997
998 c->frameNum=-1;
999
1000 return c;
1001 }
1002
1003 void pp_free_context(void *vc){
1004 PPContext *c = (PPContext*)vc;
1005 int i;
1006
1007 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1008 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1009
1010 av_free(c->tempBlocks);
1011 av_free(c->yHistogram);
1012 av_free(c->tempDst);
1013 av_free(c->tempSrc);
1014 av_free(c->deintTemp);
1015 av_free(c->stdQPTable);
1016 av_free(c->nonBQPTable);
1017 av_free(c->forcedQPTable);
1018
1019 memset(c, 0, sizeof(PPContext));
1020
1021 av_free(c);
1022 }
1023
1024 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1025 uint8_t * dst[3], int dstStride[3],
1026 int width, int height,
1027 QP_STORE_T *QP_store, int QPStride,
1028 pp_mode_t *vm, void *vc, int pict_type)
1029 {
1030 int mbWidth = (width+15)>>4;
1031 int mbHeight= (height+15)>>4;
1032 PPMode *mode = (PPMode*)vm;
1033 PPContext *c = (PPContext*)vc;
1034 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1035 int absQPStride = FFABS(QPStride);
1036
1037 // c->stride and c->QPStride are always positive
1038 if(c->stride < minStride || c->qpStride < absQPStride)
1039 reallocBuffers(c, width, height,
1040 FFMAX(minStride, c->stride),
1041 FFMAX(c->qpStride, absQPStride));
1042
1043 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1044 {
1045 int i;
1046 QP_store= c->forcedQPTable;
1047 absQPStride = QPStride = 0;
1048 if(mode->lumMode & FORCE_QUANT)
1049 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1050 else
1051 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1052 }
1053
1054 if(pict_type & PP_PICT_TYPE_QP2){
1055 int i;
1056 const int count= mbHeight * absQPStride;
1057 for(i=0; i<(count>>2); i++){
1058 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1059 }
1060 for(i<<=2; i<count; i++){
1061 c->stdQPTable[i] = QP_store[i]>>1;
1062 }
1063 QP_store= c->stdQPTable;
1064 QPStride= absQPStride;
1065 }
1066
1067 if(0){
1068 int x,y;
1069 for(y=0; y<mbHeight; y++){
1070 for(x=0; x<mbWidth; x++){
1071 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1072 }
1073 av_log(c, AV_LOG_INFO, "\n");
1074 }
1075 av_log(c, AV_LOG_INFO, "\n");
1076 }
1077
1078 if((pict_type&7)!=3)
1079 {
1080 if (QPStride >= 0) {
1081 int i;
1082 const int count= mbHeight * QPStride;
1083 for(i=0; i<(count>>2); i++){
1084 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1085 }
1086 for(i<<=2; i<count; i++){
1087 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1088 }
1089 } else {
1090 int i,j;
1091 for(i=0; i<mbHeight; i++) {
1092 for(j=0; j<absQPStride; j++) {
1093 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1094 }
1095 }
1096 }
1097 }
1098
1099 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1100 mode->lumMode, mode->chromMode);
1101
1102 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1103 width, height, QP_store, QPStride, 0, mode, c);
1104
1105 width = (width )>>c->hChromaSubSample;
1106 height = (height)>>c->vChromaSubSample;
1107
1108 if(mode->chromMode)
1109 {
1110 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1111 width, height, QP_store, QPStride, 1, mode, c);
1112 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1113 width, height, QP_store, QPStride, 2, mode, c);
1114 }
1115 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1116 {
1117 linecpy(dst[1], src[1], height, srcStride[1]);
1118 linecpy(dst[2], src[2], height, srcStride[2]);
1119 }
1120 else
1121 {
1122 int y;
1123 for(y=0; y<height; y++)
1124 {
1125 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1126 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1127 }
1128 }
1129 }
1130