Change libpostproc to use the FFMIN/FFMAX macros from libavutil.
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
51 # more or less self-invented filters, so the exactness isn't too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #ifdef USE_FASTMEMCPY
91 #include "libvo/fastmemcpy.h"
92 #endif
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
95
96 #include "mangle.h" //FIXME should be supressed
97
98 #ifdef HAVE_ALTIVEC_H
99 #include <altivec.h>
100 #endif
101
102 #define GET_MODE_BUFFER_SIZE 500
103 #define OPTIONS_ARRAY_SIZE 10
104 #define BLOCK_SIZE 8
105 #define TEMP_STRIDE 8
106 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
107
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * 64-bit packed constants, 8-byte aligned.  The "wNN" values repeat one
 * 16-bit word four times, the "bNN" values repeat one byte eight times.
 * They are not read by any C code visible here; attribute_used keeps the
 * compiler from discarding them (presumably they are referenced from the
 * inline asm of the included templates -- TODO confirm).
 */
static uint64_t __attribute__((aligned(8))) attribute_used w05 = 0x0005000500050005ULL;
static uint64_t __attribute__((aligned(8))) attribute_used w04 = 0x0004000400040004ULL;
static uint64_t __attribute__((aligned(8))) attribute_used w20 = 0x0020002000200020ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b00 = 0x0000000000000000ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b01 = 0x0101010101010101ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b02 = 0x0202020202020202ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b08 = 0x0808080808080808ULL;
static uint64_t __attribute__((aligned(8))) attribute_used b80 = 0x8080808080808080ULL;
#endif
118
119 static uint8_t clip_table[3*256];
120 static uint8_t * const clip_tab= clip_table + 256;
121
122 static const int verbose= 0;
123
124 static const int attribute_used deringThreshold= 20;
125
126
127 static struct PPFilter filters[]=
128 {
129 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
130 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
131 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
132 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
133 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
134 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
135 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
136 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
137 {"dr", "dering", 1, 5, 6, DERING},
138 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
139 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
140 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
141 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
142 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
143 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
144 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
145 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
146 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
147 {NULL, NULL,0,0,0,0} //End Marker
148 };
149
/*
 * Alias expansion table, stored as consecutive (alias, expansion) string
 * pairs and terminated by a single NULL.  When a filter token equals an
 * alias, pp_get_mode_by_name_and_quality() splices the expansion into its
 * working copy of the mode string so it is parsed like user input.
 */
static const char *replaceTable[]=
{
    "default", "hdeblock:a,vdeblock:a,dering:a",
    "de",      "hdeblock:a,vdeblock:a,dering:a",
    "fast",    "x1hdeblock:a,x1vdeblock:a,dering:a",
    "fa",      "x1hdeblock:a,x1vdeblock:a,dering:a",
    "ac",      "ha:a:128:7,va:a,dering:a",
    NULL //End Marker
};
159
160
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * Thin wrappers around the x86 prefetch instructions.  Each one emits a
 * single instruction whose memory operand is the address held in p.
 */

/** Emit "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(
        "prefetchnta (%0)\n\t"
        :
        : "r" (p)
    );
}

/** Emit "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(
        "prefetcht0 (%0)\n\t"
        :
        : "r" (p)
    );
}

/** Emit "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(
        "prefetcht1 (%0)\n\t"
        :
        : "r" (p)
    );
}

/** Emit "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(
        "prefetcht2 (%0)\n\t"
        :
        : "r" (p)
    );
}
#endif
190
191 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
192
193 /**
194 * Check if the given 8x8 Block is mostly "flat"
195 */
196 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
197 {
198 int numEq= 0;
199 int y;
200 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
201 const int dcThreshold= dcOffset*2 + 1;
202
203 for(y=0; y<BLOCK_SIZE; y++)
204 {
205 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
206 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
207 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
208 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
209 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
210 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
212 src+= stride;
213 }
214 return numEq > c->ppMode.flatnessThreshold;
215 }
216
217 /**
218 * Check if the middle 8x8 Block in the given 8x16 block is flat
219 */
220 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
221 int numEq= 0;
222 int y;
223 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
224 const int dcThreshold= dcOffset*2 + 1;
225
226 src+= stride*4; // src points to begin of the 8x8 Block
227 for(y=0; y<BLOCK_SIZE-1; y++)
228 {
229 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
230 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
231 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
232 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
233 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
237 src+= stride;
238 }
239 return numEq > c->ppMode.flatnessThreshold;
240 }
241
/**
 * Cheap horizontal range test on an 8x8 block.
 *
 * Instead of computing a true min/max, one pixel pair is probed per row:
 * columns (0,5), (2,7), (4,1), (6,3) on rows 0..3 and the same pattern again
 * on rows 4..7.  The block fails as soon as a probed pair differs by more
 * than 2*QP in either direction.
 *
 * @param src    first pixel of the block
 * @param stride distance in bytes between consecutive rows
 * @param QP     quantizer that scales the allowed difference
 * @return 1 if every probed pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t probe[4][2]= { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const int bias= 2*QP;
    const int limit= 4*QP;
    int row;

    for(row=0; row<8; row++)
    {
        const uint8_t *line= src + row*stride;
        const int a= line[ probe[row&3][0] ];
        const int b= line[ probe[row&3][1] ];

        // unsigned compare folds the two-sided check |a-b| <= 2*QP into one test
        if((unsigned)(a - b + bias) > limit) return 0;
    }
    return 1;
}
264
265 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
266 {
267 #if 1
268 #if 1
269 int x;
270 src+= stride*4;
271 for(x=0; x<BLOCK_SIZE; x+=4)
272 {
273 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
274 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
275 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
276 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
277 }
278 #else
279 int x;
280 src+= stride*3;
281 for(x=0; x<BLOCK_SIZE; x++)
282 {
283 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
284 }
285 #endif
286 return 1;
287 #else
288 int x;
289 src+= stride*4;
290 for(x=0; x<BLOCK_SIZE; x++)
291 {
292 int min=255;
293 int max=0;
294 int y;
295 for(y=0; y<8; y++){
296 int v= src[x + y*stride];
297 if(v>max) max=v;
298 if(v<min) min=v;
299 }
300 if(max-min > 2*QP) return 0;
301 }
302 return 1;
303 #endif
304 }
305
306 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
307 if( isHorizDC_C(src, stride, c) ){
308 if( isHorizMinMaxOk_C(src, stride, c->QP) )
309 return 1;
310 else
311 return 0;
312 }else{
313 return 2;
314 }
315 }
316
317 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
318 if( isVertDC_C(src, stride, c) ){
319 if( isVertMinMaxOk_C(src, stride, c->QP) )
320 return 1;
321 else
322 return 0;
323 }else{
324 return 2;
325 }
326 }
327
328 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
329 {
330 int y;
331 for(y=0; y<BLOCK_SIZE; y++)
332 {
333 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
334
335 if(ABS(middleEnergy) < 8*c->QP)
336 {
337 const int q=(dst[3] - dst[4])/2;
338 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
339 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
340
341 int d= ABS(middleEnergy) - FFMIN( ABS(leftEnergy), ABS(rightEnergy) );
342 d= FFMAX(d, 0);
343
344 d= (5*d + 32) >> 6;
345 d*= SIGN(-middleEnergy);
346
347 if(q>0)
348 {
349 d= d<0 ? 0 : d;
350 d= d>q ? q : d;
351 }
352 else
353 {
354 d= d>0 ? 0 : d;
355 d= d<q ? q : d;
356 }
357
358 dst[3]-= d;
359 dst[4]+= d;
360 }
361 dst+= stride;
362 }
363 }
364
365 /**
366 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
367 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
368 */
369 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
370 {
371 int y;
372 for(y=0; y<BLOCK_SIZE; y++)
373 {
374 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
375 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
376
377 int sums[10];
378 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
379 sums[1] = sums[0] - first + dst[3];
380 sums[2] = sums[1] - first + dst[4];
381 sums[3] = sums[2] - first + dst[5];
382 sums[4] = sums[3] - first + dst[6];
383 sums[5] = sums[4] - dst[0] + dst[7];
384 sums[6] = sums[5] - dst[1] + last;
385 sums[7] = sums[6] - dst[2] + last;
386 sums[8] = sums[7] - dst[3] + last;
387 sums[9] = sums[8] - dst[4] + last;
388
389 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
390 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
391 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
392 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
393 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
394 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
395 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
396 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
397
398 dst+= stride;
399 }
400 }
401
402 /**
403 * Experimental Filter 1 (Horizontal)
404 * will not damage linear gradients
405 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
406 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
407 * MMX2 version does correct clipping C version doesnt
408 * not identical with the vertical one
409 */
410 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
411 {
412 int y;
413 static uint64_t *lut= NULL;
414 if(lut==NULL)
415 {
416 int i;
417 lut = av_malloc(256*8);
418 for(i=0; i<256; i++)
419 {
420 int v= i < 128 ? 2*i : 2*(i-256);
421 /*
422 //Simulate 112242211 9-Tap filter
423 uint64_t a= (v/16) & 0xFF;
424 uint64_t b= (v/8) & 0xFF;
425 uint64_t c= (v/4) & 0xFF;
426 uint64_t d= (3*v/8) & 0xFF;
427 */
428 //Simulate piecewise linear interpolation
429 uint64_t a= (v/16) & 0xFF;
430 uint64_t b= (v*3/16) & 0xFF;
431 uint64_t c= (v*5/16) & 0xFF;
432 uint64_t d= (7*v/16) & 0xFF;
433 uint64_t A= (0x100 - a)&0xFF;
434 uint64_t B= (0x100 - b)&0xFF;
435 uint64_t C= (0x100 - c)&0xFF;
436 uint64_t D= (0x100 - c)&0xFF;
437
438 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
439 (D<<24) | (C<<16) | (B<<8) | (A);
440 //lut[i] = (v<<32) | (v<<24);
441 }
442 }
443
444 for(y=0; y<BLOCK_SIZE; y++)
445 {
446 int a= src[1] - src[2];
447 int b= src[3] - src[4];
448 int c= src[5] - src[6];
449
450 int d= FFMAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
451
452 if(d < QP)
453 {
454 int v = d * SIGN(-b);
455
456 src[1] +=v/8;
457 src[2] +=v/4;
458 src[3] +=3*v/8;
459 src[4] -=3*v/8;
460 src[5] -=v/4;
461 src[6] -=v/8;
462
463 }
464 src+=stride;
465 }
466 }
467
468 /**
469 * accurate deblock filter
470 */
471 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
472 int y;
473 const int QP= c->QP;
474 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
475 const int dcThreshold= dcOffset*2 + 1;
476 //START_TIMER
477 src+= step*4; // src points to begin of the 8x8 Block
478 for(y=0; y<8; y++){
479 int numEq= 0;
480
481 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
482 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
483 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
484 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
485 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
486 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
487 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
488 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
489 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
490 if(numEq > c->ppMode.flatnessThreshold){
491 int min, max, x;
492
493 if(src[0] > src[step]){
494 max= src[0];
495 min= src[step];
496 }else{
497 max= src[step];
498 min= src[0];
499 }
500 for(x=2; x<8; x+=2){
501 if(src[x*step] > src[(x+1)*step]){
502 if(src[x *step] > max) max= src[ x *step];
503 if(src[(x+1)*step] < min) min= src[(x+1)*step];
504 }else{
505 if(src[(x+1)*step] > max) max= src[(x+1)*step];
506 if(src[ x *step] < min) min= src[ x *step];
507 }
508 }
509 if(max-min < 2*QP){
510 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
511 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
512
513 int sums[10];
514 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
515 sums[1] = sums[0] - first + src[3*step];
516 sums[2] = sums[1] - first + src[4*step];
517 sums[3] = sums[2] - first + src[5*step];
518 sums[4] = sums[3] - first + src[6*step];
519 sums[5] = sums[4] - src[0*step] + src[7*step];
520 sums[6] = sums[5] - src[1*step] + last;
521 sums[7] = sums[6] - src[2*step] + last;
522 sums[8] = sums[7] - src[3*step] + last;
523 sums[9] = sums[8] - src[4*step] + last;
524
525 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
526 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
527 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
528 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
529 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
530 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
531 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
532 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
533 }
534 }else{
535 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
536
537 if(ABS(middleEnergy) < 8*QP)
538 {
539 const int q=(src[3*step] - src[4*step])/2;
540 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
541 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
542
543 int d= ABS(middleEnergy) - FFMIN( ABS(leftEnergy), ABS(rightEnergy) );
544 d= FFMAX(d, 0);
545
546 d= (5*d + 32) >> 6;
547 d*= SIGN(-middleEnergy);
548
549 if(q>0)
550 {
551 d= d<0 ? 0 : d;
552 d= d>q ? q : d;
553 }
554 else
555 {
556 d= d>0 ? 0 : d;
557 d= d<q ? q : d;
558 }
559
560 src[3*step]-= d;
561 src[4*step]+= d;
562 }
563 }
564
565 src += stride;
566 }
567 /*if(step==16){
568 STOP_TIMER("step16")
569 }else{
570 STOP_TIMER("stepX")
571 }*/
572 }
573
574 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
575 //Plain C versions
576 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
577 #define COMPILE_C
578 #endif
579
580 #ifdef ARCH_POWERPC
581 #ifdef HAVE_ALTIVEC
582 #define COMPILE_ALTIVEC
583 #endif //HAVE_ALTIVEC
584 #endif //ARCH_POWERPC
585
586 #if defined(ARCH_X86) || defined(ARCH_X86_64)
587
588 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
589 #define COMPILE_MMX
590 #endif
591
592 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
593 #define COMPILE_MMX2
594 #endif
595
596 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
597 #define COMPILE_3DNOW
598 #endif
599 #endif //ARCH_X86
600
601 #undef HAVE_MMX
602 #undef HAVE_MMX2
603 #undef HAVE_3DNOW
604 #undef HAVE_ALTIVEC
605
606 #ifdef COMPILE_C
607 #undef HAVE_MMX
608 #undef HAVE_MMX2
609 #undef HAVE_3DNOW
610 #define RENAME(a) a ## _C
611 #include "postprocess_template.c"
612 #endif
613
614 #ifdef ARCH_POWERPC
615 #ifdef COMPILE_ALTIVEC
616 #undef RENAME
617 #define HAVE_ALTIVEC
618 #define RENAME(a) a ## _altivec
619 #include "postprocess_altivec_template.c"
620 #include "postprocess_template.c"
621 #endif
622 #endif //ARCH_POWERPC
623
624 //MMX versions
625 #ifdef COMPILE_MMX
626 #undef RENAME
627 #define HAVE_MMX
628 #undef HAVE_MMX2
629 #undef HAVE_3DNOW
630 #define RENAME(a) a ## _MMX
631 #include "postprocess_template.c"
632 #endif
633
634 //MMX2 versions
635 #ifdef COMPILE_MMX2
636 #undef RENAME
637 #define HAVE_MMX
638 #define HAVE_MMX2
639 #undef HAVE_3DNOW
640 #define RENAME(a) a ## _MMX2
641 #include "postprocess_template.c"
642 #endif
643
644 //3DNOW versions
645 #ifdef COMPILE_3DNOW
646 #undef RENAME
647 #define HAVE_MMX
648 #undef HAVE_MMX2
649 #define HAVE_3DNOW
650 #define RENAME(a) a ## _3DNow
651 #include "postprocess_template.c"
652 #endif
653
654 // minor note: the HAVE_xyz is messed up after that line so dont use it
655
656 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
657 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
658 {
659 PPContext *c= (PPContext *)vc;
660 PPMode *ppMode= (PPMode *)vm;
661 c->ppMode= *ppMode; //FIXME
662
663 // useing ifs here as they are faster than function pointers allthough the
664 // difference wouldnt be messureable here but its much better because
665 // someone might exchange the cpu whithout restarting mplayer ;)
666 #ifdef RUNTIME_CPUDETECT
667 #if defined(ARCH_X86) || defined(ARCH_X86_64)
668 // ordered per speed fasterst first
669 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
670 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
671 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
672 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
673 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
674 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
675 else
676 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
677 #else
678 #ifdef ARCH_POWERPC
679 #ifdef HAVE_ALTIVEC
680 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
681 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 else
683 #endif
684 #endif
685 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
686 #endif
687 #else //RUNTIME_CPUDETECT
688 #ifdef HAVE_MMX2
689 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
690 #elif defined (HAVE_3DNOW)
691 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
692 #elif defined (HAVE_MMX)
693 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
694 #elif defined (HAVE_ALTIVEC)
695 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
696 #else
697 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
698 #endif
699 #endif //!RUNTIME_CPUDETECT
700 }
701
702 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
703 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
704
/* -pp Command line Help
 *
 * The long filter names listed here must be spelled exactly as in the
 * filters[] table, because the parser matches them with strcmp(): the
 * accepted names are "ahdeblock", "avdeblock" and "forcequant".
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
750
751 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
752 {
753 char temp[GET_MODE_BUFFER_SIZE];
754 char *p= temp;
755 const char *filterDelimiters= ",/";
756 const char *optionDelimiters= ":";
757 struct PPMode *ppMode;
758 char *filterToken;
759
760 ppMode= av_malloc(sizeof(PPMode));
761
762 ppMode->lumMode= 0;
763 ppMode->chromMode= 0;
764 ppMode->maxTmpNoise[0]= 700;
765 ppMode->maxTmpNoise[1]= 1500;
766 ppMode->maxTmpNoise[2]= 3000;
767 ppMode->maxAllowedY= 234;
768 ppMode->minAllowedY= 16;
769 ppMode->baseDcDiff= 256/8;
770 ppMode->flatnessThreshold= 56-16-1;
771 ppMode->maxClippedThreshold= 0.01;
772 ppMode->error=0;
773
774 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
775
776 if(verbose>1) printf("pp: %s\n", name);
777
778 for(;;){
779 char *filterName;
780 int q= 1000000; //PP_QUALITY_MAX;
781 int chrom=-1;
782 int luma=-1;
783 char *option;
784 char *options[OPTIONS_ARRAY_SIZE];
785 int i;
786 int filterNameOk=0;
787 int numOfUnknownOptions=0;
788 int enable=1; //does the user want us to enabled or disabled the filter
789
790 filterToken= strtok(p, filterDelimiters);
791 if(filterToken == NULL) break;
792 p+= strlen(filterToken) + 1; // p points to next filterToken
793 filterName= strtok(filterToken, optionDelimiters);
794 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
795
796 if(*filterName == '-')
797 {
798 enable=0;
799 filterName++;
800 }
801
802 for(;;){ //for all options
803 option= strtok(NULL, optionDelimiters);
804 if(option == NULL) break;
805
806 if(verbose>1) printf("pp: option: %s\n", option);
807 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
808 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
809 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
810 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
811 else
812 {
813 options[numOfUnknownOptions] = option;
814 numOfUnknownOptions++;
815 }
816 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
817 }
818 options[numOfUnknownOptions] = NULL;
819
820 /* replace stuff from the replace Table */
821 for(i=0; replaceTable[2*i]!=NULL; i++)
822 {
823 if(!strcmp(replaceTable[2*i], filterName))
824 {
825 int newlen= strlen(replaceTable[2*i + 1]);
826 int plen;
827 int spaceLeft;
828
829 if(p==NULL) p= temp, *p=0; //last filter
830 else p--, *p=','; //not last filter
831
832 plen= strlen(p);
833 spaceLeft= p - temp + plen;
834 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
835 {
836 ppMode->error++;
837 break;
838 }
839 memmove(p + newlen, p, plen+1);
840 memcpy(p, replaceTable[2*i + 1], newlen);
841 filterNameOk=1;
842 }
843 }
844
845 for(i=0; filters[i].shortName!=NULL; i++)
846 {
847 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
848 if( !strcmp(filters[i].longName, filterName)
849 || !strcmp(filters[i].shortName, filterName))
850 {
851 ppMode->lumMode &= ~filters[i].mask;
852 ppMode->chromMode &= ~filters[i].mask;
853
854 filterNameOk=1;
855 if(!enable) break; // user wants to disable it
856
857 if(q >= filters[i].minLumQuality && luma)
858 ppMode->lumMode|= filters[i].mask;
859 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
860 if(q >= filters[i].minChromQuality)
861 ppMode->chromMode|= filters[i].mask;
862
863 if(filters[i].mask == LEVEL_FIX)
864 {
865 int o;
866 ppMode->minAllowedY= 16;
867 ppMode->maxAllowedY= 234;
868 for(o=0; options[o]!=NULL; o++)
869 {
870 if( !strcmp(options[o],"fullyrange")
871 ||!strcmp(options[o],"f"))
872 {
873 ppMode->minAllowedY= 0;
874 ppMode->maxAllowedY= 255;
875 numOfUnknownOptions--;
876 }
877 }
878 }
879 else if(filters[i].mask == TEMP_NOISE_FILTER)
880 {
881 int o;
882 int numOfNoises=0;
883
884 for(o=0; options[o]!=NULL; o++)
885 {
886 char *tail;
887 ppMode->maxTmpNoise[numOfNoises]=
888 strtol(options[o], &tail, 0);
889 if(tail!=options[o])
890 {
891 numOfNoises++;
892 numOfUnknownOptions--;
893 if(numOfNoises >= 3) break;
894 }
895 }
896 }
897 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
898 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
899 {
900 int o;
901
902 for(o=0; options[o]!=NULL && o<2; o++)
903 {
904 char *tail;
905 int val= strtol(options[o], &tail, 0);
906 if(tail==options[o]) break;
907
908 numOfUnknownOptions--;
909 if(o==0) ppMode->baseDcDiff= val;
910 else ppMode->flatnessThreshold= val;
911 }
912 }
913 else if(filters[i].mask == FORCE_QUANT)
914 {
915 int o;
916 ppMode->forcedQuant= 15;
917
918 for(o=0; options[o]!=NULL && o<1; o++)
919 {
920 char *tail;
921 int val= strtol(options[o], &tail, 0);
922 if(tail==options[o]) break;
923
924 numOfUnknownOptions--;
925 ppMode->forcedQuant= val;
926 }
927 }
928 }
929 }
930 if(!filterNameOk) ppMode->error++;
931 ppMode->error += numOfUnknownOptions;
932 }
933
934 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
935 if(ppMode->error)
936 {
937 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
938 av_free(ppMode);
939 return NULL;
940 }
941 return ppMode;
942 }
943
944 void pp_free_mode(pp_mode_t *mode){
945 av_free(mode);
946 }
947
/**
 * Replace the buffer *p by a fresh zero-filled one of the given size.
 * The old contents are discarded, not copied.
 *
 * @param p         address of the buffer pointer; the old buffer is freed
 * @param alignment currently ignored -- the new buffer comes straight from av_mallocz()
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *old= *p;

    av_free(old);
    *p= av_mallocz(size);
}
952
953 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
954 int mbWidth = (width+15)>>4;
955 int mbHeight= (height+15)>>4;
956 int i;
957
958 c->stride= stride;
959 c->qpStride= qpStride;
960
961 reallocAlign((void **)&c->tempDst, 8, stride*24);
962 reallocAlign((void **)&c->tempSrc, 8, stride*24);
963 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
964 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
965 for(i=0; i<256; i++)
966 c->yHistogram[i]= width*height/64*15/256;
967
968 for(i=0; i<3; i++)
969 {
970 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
971 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
972 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
973 }
974
975 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
976 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
977 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
978 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
979 }
980
981 static void global_init(void){
982 int i;
983 memset(clip_table, 0, 256);
984 for(i=256; i<512; i++)
985 clip_table[i]= i;
986 memset(clip_table+512, 0, 256);
987 }
988
989 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
990 PPContext *c= av_malloc(sizeof(PPContext));
991 int stride= (width+15)&(~15); //assumed / will realloc if needed
992 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
993
994 global_init();
995
996 memset(c, 0, sizeof(PPContext));
997 c->cpuCaps= cpuCaps;
998 if(cpuCaps&PP_FORMAT){
999 c->hChromaSubSample= cpuCaps&0x3;
1000 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1001 }else{
1002 c->hChromaSubSample= 1;
1003 c->vChromaSubSample= 1;
1004 }
1005
1006 reallocBuffers(c, width, height, stride, qpStride);
1007
1008 c->frameNum=-1;
1009
1010 return c;
1011 }
1012
1013 void pp_free_context(void *vc){
1014 PPContext *c = (PPContext*)vc;
1015 int i;
1016
1017 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1018 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1019
1020 av_free(c->tempBlocks);
1021 av_free(c->yHistogram);
1022 av_free(c->tempDst);
1023 av_free(c->tempSrc);
1024 av_free(c->deintTemp);
1025 av_free(c->stdQPTable);
1026 av_free(c->nonBQPTable);
1027 av_free(c->forcedQPTable);
1028
1029 memset(c, 0, sizeof(PPContext));
1030
1031 av_free(c);
1032 }
1033
1034 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1035 uint8_t * dst[3], int dstStride[3],
1036 int width, int height,
1037 QP_STORE_T *QP_store, int QPStride,
1038 pp_mode_t *vm, void *vc, int pict_type)
1039 {
1040 int mbWidth = (width+15)>>4;
1041 int mbHeight= (height+15)>>4;
1042 PPMode *mode = (PPMode*)vm;
1043 PPContext *c = (PPContext*)vc;
1044 int minStride= FFMAX(ABS(srcStride[0]), ABS(dstStride[0]));
1045 int absQPStride = ABS(QPStride);
1046
1047 // c->stride and c->QPStride are always positive
1048 if(c->stride < minStride || c->qpStride < absQPStride)
1049 reallocBuffers(c, width, height,
1050 FFMAX(minStride, c->stride),
1051 FFMAX(c->qpStride, absQPStride));
1052
1053 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1054 {
1055 int i;
1056 QP_store= c->forcedQPTable;
1057 absQPStride = QPStride = 0;
1058 if(mode->lumMode & FORCE_QUANT)
1059 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1060 else
1061 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1062 }
1063 //printf("pict_type:%d\n", pict_type);
1064
1065 if(pict_type & PP_PICT_TYPE_QP2){
1066 int i;
1067 const int count= mbHeight * absQPStride;
1068 for(i=0; i<(count>>2); i++){
1069 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1070 }
1071 for(i<<=2; i<count; i++){
1072 c->stdQPTable[i] = QP_store[i]>>1;
1073 }
1074 QP_store= c->stdQPTable;
1075 QPStride= absQPStride;
1076 }
1077
1078 if(0){
1079 int x,y;
1080 for(y=0; y<mbHeight; y++){
1081 for(x=0; x<mbWidth; x++){
1082 printf("%2d ", QP_store[x + y*QPStride]);
1083 }
1084 printf("\n");
1085 }
1086 printf("\n");
1087 }
1088
1089 if((pict_type&7)!=3)
1090 {
1091 if (QPStride >= 0) {
1092 int i;
1093 const int count= mbHeight * QPStride;
1094 for(i=0; i<(count>>2); i++){
1095 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1096 }
1097 for(i<<=2; i<count; i++){
1098 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1099 }
1100 } else {
1101 int i,j;
1102 for(i=0; i<mbHeight; i++) {
1103 for(j=0; j<absQPStride; j++) {
1104 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1105 }
1106 }
1107 }
1108 }
1109
1110 if(verbose>2)
1111 {
1112 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1113 }
1114
1115 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1116 width, height, QP_store, QPStride, 0, mode, c);
1117
1118 width = (width )>>c->hChromaSubSample;
1119 height = (height)>>c->vChromaSubSample;
1120
1121 if(mode->chromMode)
1122 {
1123 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1124 width, height, QP_store, QPStride, 1, mode, c);
1125 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1126 width, height, QP_store, QPStride, 2, mode, c);
1127 }
1128 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1129 {
1130 linecpy(dst[1], src[1], height, srcStride[1]);
1131 linecpy(dst[2], src[2], height, srcStride[2]);
1132 }
1133 else
1134 {
1135 int y;
1136 for(y=0; y<height; y++)
1137 {
1138 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1139 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1140 }
1141 }
1142 }
1143