dabe9e1c670244873a06f62851007f48ec57b0b2
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
* I don't have a 3DNow! CPU, so it is untested, but no one has said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #ifdef USE_FASTMEMCPY
91 #include "libvo/fastmemcpy.h"
92 #endif
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
95
96 #include "mangle.h" //FIXME should be supressed
97
98 #ifdef HAVE_ALTIVEC_H
99 #include <altivec.h>
100 #endif
101
/*
 * Generic helpers. These are function-like macros: an argument may be
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
/* Note: SIGN(0) evaluates to -1, not 0. */
#define SIGN(a) ((a) > 0 ? 1 : -1)

/* Size of the scratch buffer in which a mode string is tokenized. */
#define GET_MODE_BUFFER_SIZE 500
/* Number of slots in the per-filter option list (the last used slot holds NULL). */
#define OPTIONS_ARRAY_SIZE 10
/* Edge length in pixels of the blocks the filters in this file iterate over. */
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
110 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
111
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * 8-byte aligned 64-bit constants: wNN repeats a 16-bit value four times,
 * bNN repeats a byte value eight times.
 * The C code visible in this file never references them; attribute_used
 * keeps them from being discarded.
 * NOTE(review): presumably they are referenced by name from the inline asm
 * in the included templates -- confirm.
 */
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
#endif
122
/*
 * Lookup table filled in by global_init(). clip_tab points 256 entries into
 * clip_table, so it can be indexed with values from -256 to 511.
 */
static uint8_t clip_table[3*256];
static uint8_t * const clip_tab= clip_table + 256;

/* Compile-time verbosity level; the diagnostic printf()s are guarded by it. */
static const int verbose= 0;

/* Not referenced in this file; attribute_used keeps it from being discarded. */
static const int attribute_used deringThreshold= 20;
129
130
/*
 * Table of all filters known to the mode-string parser
 * (pp_get_mode_by_name_and_quality()), terminated by an all-NULL entry.
 * The parser matches a filter name against shortName and longName and then
 * uses chromDefault, minLumQuality, minChromQuality and mask of the entry.
 * NOTE(review): the three integers are presumably chromDefault,
 * minLumQuality and minChromQuality in that order -- confirm against the
 * declaration of struct PPFilter.
 */
static struct PPFilter filters[]=
{
    {"hb", "hdeblock",       1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",       1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",     1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",     1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",     1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",     1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",      1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",      1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",         1, 5, 6, DERING},
    {"al", "autolevels",     0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",  1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",   1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",    1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",    1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",       1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",       1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",     1, 0, 0, FORCE_QUANT},
    {NULL, NULL,0,0,0,0} //End Marker
};
153
/*
 * Alias table used by the mode-string parser. Entries come in pairs:
 * replaceTable[2*i] is the alias name and replaceTable[2*i + 1] is the
 * filter string that is spliced into the mode string in its place.
 * The list ends with a single NULL.
 */
static const char *replaceTable[]=
{
    "default", "hdeblock:a,vdeblock:a,dering:a",
    "de",      "hdeblock:a,vdeblock:a,dering:a",
    "fast",    "x1hdeblock:a,x1vdeblock:a,dering:a",
    "fa",      "x1hdeblock:a,x1vdeblock:a,dering:a",
    "ac",      "ha:a:128:7,va:a,dering:a",
    NULL //End Marker
};
163
164
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/*
 * Thin wrappers around the x86 prefetch instructions. Each function emits
 * the instruction it is named after with p as the address operand.
 * None of them is called from the code visible in this file.
 */

/** Emit "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
    asm volatile( "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

/** Emit "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
    asm volatile( "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

/** Emit "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
    asm volatile( "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

/** Emit "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
    asm volatile( "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#endif
194
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
196
197 /**
198 * Check if the given 8x8 Block is mostly "flat"
199 */
200 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
201 {
202 int numEq= 0;
203 int y;
204 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
205 const int dcThreshold= dcOffset*2 + 1;
206
207 for(y=0; y<BLOCK_SIZE; y++)
208 {
209 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
210 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
212 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
213 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
214 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
215 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
216 src+= stride;
217 }
218 return numEq > c->ppMode.flatnessThreshold;
219 }
220
221 /**
222 * Check if the middle 8x8 Block in the given 8x16 block is flat
223 */
224 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
225 int numEq= 0;
226 int y;
227 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
228 const int dcThreshold= dcOffset*2 + 1;
229
230 src+= stride*4; // src points to begin of the 8x8 Block
231 for(y=0; y<BLOCK_SIZE-1; y++)
232 {
233 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
239 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
240 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
241 src+= stride;
242 }
243 return numEq > c->ppMode.flatnessThreshold;
244 }
245
/**
 * Cheap horizontal range test on an 8x8 block.
 *
 * Instead of computing the true minimum and maximum, one pair of pixels five
 * columns apart is sampled in each of the 8 rows, cycling through the column
 * pairs (0,5), (2,7), (4,1) and (6,3). The block passes when every sampled
 * difference lies within +-2*QP.
 *
 * @param src    first pixel of the block
 * @param stride offset between two consecutive rows
 * @param QP     quantizer used as tolerance
 * @return 1 if all sampled pairs are within tolerance, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const int colPair[4][2]= { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for(row=0; row<8; row++, src+= stride)
    {
        const int *cp= colPair[row & 3];

        /* unsigned compare: also rejects differences below -2*QP */
        if((unsigned)(src[cp[0]] - src[cp[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
268
269 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
270 {
271 #if 1
272 #if 1
273 int x;
274 src+= stride*4;
275 for(x=0; x<BLOCK_SIZE; x+=4)
276 {
277 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
278 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
279 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
280 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
281 }
282 #else
283 int x;
284 src+= stride*3;
285 for(x=0; x<BLOCK_SIZE; x++)
286 {
287 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
288 }
289 #endif
290 return 1;
291 #else
292 int x;
293 src+= stride*4;
294 for(x=0; x<BLOCK_SIZE; x++)
295 {
296 int min=255;
297 int max=0;
298 int y;
299 for(y=0; y<8; y++){
300 int v= src[x + y*stride];
301 if(v>max) max=v;
302 if(v<min) min=v;
303 }
304 if(max-min > 2*QP) return 0;
305 }
306 return 1;
307 #endif
308 }
309
310 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
311 if( isHorizDC_C(src, stride, c) ){
312 if( isHorizMinMaxOk_C(src, stride, c->QP) )
313 return 1;
314 else
315 return 0;
316 }else{
317 return 2;
318 }
319 }
320
321 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
322 if( isVertDC_C(src, stride, c) ){
323 if( isVertMinMaxOk_C(src, stride, c->QP) )
324 return 1;
325 else
326 return 0;
327 }else{
328 return 2;
329 }
330 }
331
332 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
333 {
334 int y;
335 for(y=0; y<BLOCK_SIZE; y++)
336 {
337 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
338
339 if(ABS(middleEnergy) < 8*c->QP)
340 {
341 const int q=(dst[3] - dst[4])/2;
342 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
343 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
344
345 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
346 d= MAX(d, 0);
347
348 d= (5*d + 32) >> 6;
349 d*= SIGN(-middleEnergy);
350
351 if(q>0)
352 {
353 d= d<0 ? 0 : d;
354 d= d>q ? q : d;
355 }
356 else
357 {
358 d= d>0 ? 0 : d;
359 d= d<q ? q : d;
360 }
361
362 dst[3]-= d;
363 dst[4]+= d;
364 }
365 dst+= stride;
366 }
367 }
368
369 /**
370 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
371 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
372 */
373 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
374 {
375 int y;
376 for(y=0; y<BLOCK_SIZE; y++)
377 {
378 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
379 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
380
381 int sums[10];
382 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
383 sums[1] = sums[0] - first + dst[3];
384 sums[2] = sums[1] - first + dst[4];
385 sums[3] = sums[2] - first + dst[5];
386 sums[4] = sums[3] - first + dst[6];
387 sums[5] = sums[4] - dst[0] + dst[7];
388 sums[6] = sums[5] - dst[1] + last;
389 sums[7] = sums[6] - dst[2] + last;
390 sums[8] = sums[7] - dst[3] + last;
391 sums[9] = sums[8] - dst[4] + last;
392
393 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
394 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
395 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
396 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
397 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
398 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
399 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
400 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
401
402 dst+= stride;
403 }
404 }
405
406 /**
407 * Experimental Filter 1 (Horizontal)
408 * will not damage linear gradients
409 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
410 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
411 * MMX2 version does correct clipping C version doesnt
412 * not identical with the vertical one
413 */
414 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
415 {
416 int y;
417 static uint64_t *lut= NULL;
418 if(lut==NULL)
419 {
420 int i;
421 lut = av_malloc(256*8);
422 for(i=0; i<256; i++)
423 {
424 int v= i < 128 ? 2*i : 2*(i-256);
425 /*
426 //Simulate 112242211 9-Tap filter
427 uint64_t a= (v/16) & 0xFF;
428 uint64_t b= (v/8) & 0xFF;
429 uint64_t c= (v/4) & 0xFF;
430 uint64_t d= (3*v/8) & 0xFF;
431 */
432 //Simulate piecewise linear interpolation
433 uint64_t a= (v/16) & 0xFF;
434 uint64_t b= (v*3/16) & 0xFF;
435 uint64_t c= (v*5/16) & 0xFF;
436 uint64_t d= (7*v/16) & 0xFF;
437 uint64_t A= (0x100 - a)&0xFF;
438 uint64_t B= (0x100 - b)&0xFF;
439 uint64_t C= (0x100 - c)&0xFF;
440 uint64_t D= (0x100 - c)&0xFF;
441
442 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
443 (D<<24) | (C<<16) | (B<<8) | (A);
444 //lut[i] = (v<<32) | (v<<24);
445 }
446 }
447
448 for(y=0; y<BLOCK_SIZE; y++)
449 {
450 int a= src[1] - src[2];
451 int b= src[3] - src[4];
452 int c= src[5] - src[6];
453
454 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
455
456 if(d < QP)
457 {
458 int v = d * SIGN(-b);
459
460 src[1] +=v/8;
461 src[2] +=v/4;
462 src[3] +=3*v/8;
463 src[4] -=3*v/8;
464 src[5] -=v/4;
465 src[6] -=v/8;
466
467 }
468 src+=stride;
469 }
470 }
471
472 /**
473 * accurate deblock filter
474 */
475 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
476 int y;
477 const int QP= c->QP;
478 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
479 const int dcThreshold= dcOffset*2 + 1;
480 //START_TIMER
481 src+= step*4; // src points to begin of the 8x8 Block
482 for(y=0; y<8; y++){
483 int numEq= 0;
484
485 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
486 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
487 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
488 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
489 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
490 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
491 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
492 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
493 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
494 if(numEq > c->ppMode.flatnessThreshold){
495 int min, max, x;
496
497 if(src[0] > src[step]){
498 max= src[0];
499 min= src[step];
500 }else{
501 max= src[step];
502 min= src[0];
503 }
504 for(x=2; x<8; x+=2){
505 if(src[x*step] > src[(x+1)*step]){
506 if(src[x *step] > max) max= src[ x *step];
507 if(src[(x+1)*step] < min) min= src[(x+1)*step];
508 }else{
509 if(src[(x+1)*step] > max) max= src[(x+1)*step];
510 if(src[ x *step] < min) min= src[ x *step];
511 }
512 }
513 if(max-min < 2*QP){
514 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
515 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
516
517 int sums[10];
518 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
519 sums[1] = sums[0] - first + src[3*step];
520 sums[2] = sums[1] - first + src[4*step];
521 sums[3] = sums[2] - first + src[5*step];
522 sums[4] = sums[3] - first + src[6*step];
523 sums[5] = sums[4] - src[0*step] + src[7*step];
524 sums[6] = sums[5] - src[1*step] + last;
525 sums[7] = sums[6] - src[2*step] + last;
526 sums[8] = sums[7] - src[3*step] + last;
527 sums[9] = sums[8] - src[4*step] + last;
528
529 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
530 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
531 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
532 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
533 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
534 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
535 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
536 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
537 }
538 }else{
539 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
540
541 if(ABS(middleEnergy) < 8*QP)
542 {
543 const int q=(src[3*step] - src[4*step])/2;
544 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
545 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
546
547 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
548 d= MAX(d, 0);
549
550 d= (5*d + 32) >> 6;
551 d*= SIGN(-middleEnergy);
552
553 if(q>0)
554 {
555 d= d<0 ? 0 : d;
556 d= d>q ? q : d;
557 }
558 else
559 {
560 d= d>0 ? 0 : d;
561 d= d<q ? q : d;
562 }
563
564 src[3*step]-= d;
565 src[4*step]+= d;
566 }
567 }
568
569 src += stride;
570 }
571 /*if(step==16){
572 STOP_TIMER("step16")
573 }else{
574 STOP_TIMER("stepX")
575 }*/
576 }
577
/*
 * Template instantiation.
 *
 * postprocess_template.c is included once per enabled CPU flavour. Before
 * each inclusion RENAME() is redefined so that the symbols it is applied to
 * get a flavour-specific suffix (_C, _altivec, _MMX, _MMX2, _3DNow), and the
 * HAVE_* macros are set to describe that flavour.
 *
 * Note: there are C, MMX, MMX2 and 3DNow versions; there is no combined
 * 3DNow+MMX2 one.
 */

/* Step 1: decide which flavours to compile (COMPILE_* macros). */

/* Plain C: when MMX is not available at build time, or when the CPU is detected at run time. */
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif /* HAVE_ALTIVEC */
#endif /* ARCH_POWERPC */

#if defined(ARCH_X86) || defined(ARCH_X86_64)

/* Without run-time detection at most one of the three x86 flavours is selected. */
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif /* ARCH_X86 */

/* Step 2: drop the build-time HAVE_* values; each section below sets its own. */
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC

/* C versions */
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

/* AltiVec versions */
#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif
#endif /* ARCH_POWERPC */

/* MMX versions */
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

/* MMX2 versions */
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

/* 3DNow versions */
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

/*
 * Minor note: from here on the HAVE_xyz macros no longer hold their
 * build-time values; they hold whatever the sections above last set.
 * Do not rely on them.
 */
659
/**
 * Post-process one plane by forwarding to the flavour-specific postProcess_*()
 * generated from the templates above.
 *
 * The mode is first copied into the context (c->ppMode), then exactly one
 * implementation is called with the unchanged plane arguments.
 *
 * @param src, srcStride  source plane and its stride
 * @param dst, dstStride  destination plane and its stride
 * @param width, height   plane dimensions
 * @param QPs, QPStride   quantizer table and its stride
 * @param isColor         forwarded unchanged; pp_postprocess() passes 0 for
 *                        plane 0 and 1 / 2 for planes 1 / 2
 * @param vm              mode, must point to a PPMode
 * @param vc              context, must point to a PPContext
 */
static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
{
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // Using ifs here as they are faster than function pointers, although the
    // difference wouldn't be measurable here; but it's much better because
    // someone might exchange the CPU without restarting mplayer ;)
#ifdef RUNTIME_CPUDETECT
#if defined(ARCH_X86) || defined(ARCH_X86_64)
    // ordered by speed, fastest first
    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
        postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
#endif
#endif
    // fallback; also the else-branch of the AltiVec test above when that is compiled in
    postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#else //RUNTIME_CPUDETECT
    // No run-time detection: the flavour is fixed at compile time. The HAVE_*
    // macros tested here are the ones left behind by the template sections.
#ifdef HAVE_MMX2
    postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_3DNOW)
    postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_MMX)
    postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_ALTIVEC)
    postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
    postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#endif //!RUNTIME_CPUDETECT
}
705
706 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
707 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
708
/**
 * -pp command line help.
 *
 * Human-readable description of the filter names and options understood by
 * pp_get_mode_by_name_and_quality(). The long names listed here must match
 * the longName entries of the filters[] table, because the parser compares
 * names with strcmp(): "ahdeblock", "avdeblock" and "forcequant" are the
 * spellings it accepts.
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
754
755 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
756 {
757 char temp[GET_MODE_BUFFER_SIZE];
758 char *p= temp;
759 const char *filterDelimiters= ",/";
760 const char *optionDelimiters= ":";
761 struct PPMode *ppMode;
762 char *filterToken;
763
764 ppMode= av_malloc(sizeof(PPMode));
765
766 ppMode->lumMode= 0;
767 ppMode->chromMode= 0;
768 ppMode->maxTmpNoise[0]= 700;
769 ppMode->maxTmpNoise[1]= 1500;
770 ppMode->maxTmpNoise[2]= 3000;
771 ppMode->maxAllowedY= 234;
772 ppMode->minAllowedY= 16;
773 ppMode->baseDcDiff= 256/8;
774 ppMode->flatnessThreshold= 56-16-1;
775 ppMode->maxClippedThreshold= 0.01;
776 ppMode->error=0;
777
778 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
779
780 if(verbose>1) printf("pp: %s\n", name);
781
782 for(;;){
783 char *filterName;
784 int q= 1000000; //PP_QUALITY_MAX;
785 int chrom=-1;
786 int luma=-1;
787 char *option;
788 char *options[OPTIONS_ARRAY_SIZE];
789 int i;
790 int filterNameOk=0;
791 int numOfUnknownOptions=0;
792 int enable=1; //does the user want us to enabled or disabled the filter
793
794 filterToken= strtok(p, filterDelimiters);
795 if(filterToken == NULL) break;
796 p+= strlen(filterToken) + 1; // p points to next filterToken
797 filterName= strtok(filterToken, optionDelimiters);
798 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
799
800 if(*filterName == '-')
801 {
802 enable=0;
803 filterName++;
804 }
805
806 for(;;){ //for all options
807 option= strtok(NULL, optionDelimiters);
808 if(option == NULL) break;
809
810 if(verbose>1) printf("pp: option: %s\n", option);
811 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
812 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
813 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
814 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
815 else
816 {
817 options[numOfUnknownOptions] = option;
818 numOfUnknownOptions++;
819 }
820 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
821 }
822 options[numOfUnknownOptions] = NULL;
823
824 /* replace stuff from the replace Table */
825 for(i=0; replaceTable[2*i]!=NULL; i++)
826 {
827 if(!strcmp(replaceTable[2*i], filterName))
828 {
829 int newlen= strlen(replaceTable[2*i + 1]);
830 int plen;
831 int spaceLeft;
832
833 if(p==NULL) p= temp, *p=0; //last filter
834 else p--, *p=','; //not last filter
835
836 plen= strlen(p);
837 spaceLeft= p - temp + plen;
838 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
839 {
840 ppMode->error++;
841 break;
842 }
843 memmove(p + newlen, p, plen+1);
844 memcpy(p, replaceTable[2*i + 1], newlen);
845 filterNameOk=1;
846 }
847 }
848
849 for(i=0; filters[i].shortName!=NULL; i++)
850 {
851 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
852 if( !strcmp(filters[i].longName, filterName)
853 || !strcmp(filters[i].shortName, filterName))
854 {
855 ppMode->lumMode &= ~filters[i].mask;
856 ppMode->chromMode &= ~filters[i].mask;
857
858 filterNameOk=1;
859 if(!enable) break; // user wants to disable it
860
861 if(q >= filters[i].minLumQuality && luma)
862 ppMode->lumMode|= filters[i].mask;
863 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
864 if(q >= filters[i].minChromQuality)
865 ppMode->chromMode|= filters[i].mask;
866
867 if(filters[i].mask == LEVEL_FIX)
868 {
869 int o;
870 ppMode->minAllowedY= 16;
871 ppMode->maxAllowedY= 234;
872 for(o=0; options[o]!=NULL; o++)
873 {
874 if( !strcmp(options[o],"fullyrange")
875 ||!strcmp(options[o],"f"))
876 {
877 ppMode->minAllowedY= 0;
878 ppMode->maxAllowedY= 255;
879 numOfUnknownOptions--;
880 }
881 }
882 }
883 else if(filters[i].mask == TEMP_NOISE_FILTER)
884 {
885 int o;
886 int numOfNoises=0;
887
888 for(o=0; options[o]!=NULL; o++)
889 {
890 char *tail;
891 ppMode->maxTmpNoise[numOfNoises]=
892 strtol(options[o], &tail, 0);
893 if(tail!=options[o])
894 {
895 numOfNoises++;
896 numOfUnknownOptions--;
897 if(numOfNoises >= 3) break;
898 }
899 }
900 }
901 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
902 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
903 {
904 int o;
905
906 for(o=0; options[o]!=NULL && o<2; o++)
907 {
908 char *tail;
909 int val= strtol(options[o], &tail, 0);
910 if(tail==options[o]) break;
911
912 numOfUnknownOptions--;
913 if(o==0) ppMode->baseDcDiff= val;
914 else ppMode->flatnessThreshold= val;
915 }
916 }
917 else if(filters[i].mask == FORCE_QUANT)
918 {
919 int o;
920 ppMode->forcedQuant= 15;
921
922 for(o=0; options[o]!=NULL && o<1; o++)
923 {
924 char *tail;
925 int val= strtol(options[o], &tail, 0);
926 if(tail==options[o]) break;
927
928 numOfUnknownOptions--;
929 ppMode->forcedQuant= val;
930 }
931 }
932 }
933 }
934 if(!filterNameOk) ppMode->error++;
935 ppMode->error += numOfUnknownOptions;
936 }
937
938 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
939 if(ppMode->error)
940 {
941 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
942 av_free(ppMode);
943 return NULL;
944 }
945 return ppMode;
946 }
947
948 void pp_free_mode(pp_mode_t *mode){
949 av_free(mode);
950 }
951
/**
 * Replace the buffer *p by a new, zero-filled buffer of size bytes.
 *
 * The old buffer is freed first; its contents are not preserved.
 *
 * @param p         address of the buffer pointer; receives the result of
 *                  av_mallocz()
 * @param alignment unused
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh= av_mallocz(size);
    *p= fresh;
}
956
957 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
958 int mbWidth = (width+15)>>4;
959 int mbHeight= (height+15)>>4;
960 int i;
961
962 c->stride= stride;
963 c->qpStride= qpStride;
964
965 reallocAlign((void **)&c->tempDst, 8, stride*24);
966 reallocAlign((void **)&c->tempSrc, 8, stride*24);
967 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
968 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
969 for(i=0; i<256; i++)
970 c->yHistogram[i]= width*height/64*15/256;
971
972 for(i=0; i<3; i++)
973 {
974 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
975 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
976 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
977 }
978
979 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
980 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
981 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
982 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
983 }
984
985 static void global_init(void){
986 int i;
987 memset(clip_table, 0, 256);
988 for(i=256; i<512; i++)
989 clip_table[i]= i;
990 memset(clip_table+512, 0, 256);
991 }
992
993 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
994 PPContext *c= av_malloc(sizeof(PPContext));
995 int stride= (width+15)&(~15); //assumed / will realloc if needed
996 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
997
998 global_init();
999
1000 memset(c, 0, sizeof(PPContext));
1001 c->cpuCaps= cpuCaps;
1002 if(cpuCaps&PP_FORMAT){
1003 c->hChromaSubSample= cpuCaps&0x3;
1004 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1005 }else{
1006 c->hChromaSubSample= 1;
1007 c->vChromaSubSample= 1;
1008 }
1009
1010 reallocBuffers(c, width, height, stride, qpStride);
1011
1012 c->frameNum=-1;
1013
1014 return c;
1015 }
1016
1017 void pp_free_context(void *vc){
1018 PPContext *c = (PPContext*)vc;
1019 int i;
1020
1021 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1022 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1023
1024 av_free(c->tempBlocks);
1025 av_free(c->yHistogram);
1026 av_free(c->tempDst);
1027 av_free(c->tempSrc);
1028 av_free(c->deintTemp);
1029 av_free(c->stdQPTable);
1030 av_free(c->nonBQPTable);
1031 av_free(c->forcedQPTable);
1032
1033 memset(c, 0, sizeof(PPContext));
1034
1035 av_free(c);
1036 }
1037
/**
 * Post-process one picture.
 *
 * Plane 0 is always passed through postProcess(). Planes 1 and 2 are passed
 * through postProcess() if mode->chromMode is non-zero and are copied from
 * src to dst otherwise.
 *
 * @param src, srcStride  the three source planes and their strides
 * @param dst, dstStride  the three destination planes and their strides
 * @param width, height   dimensions of plane 0; planes 1 and 2 are taken to be
 *                        smaller by the chroma subsampling shifts stored in
 *                        the context
 * @param QP_store        quantizer table, or NULL to use a constant quantizer
 * @param QPStride        row stride of QP_store; may be negative
 * @param vm              mode, must point to a PPMode
 * @param vc              context, must point to a PPContext
 * @param pict_type       picture type, optionally ORed with PP_PICT_TYPE_QP2
 */
void pp_postprocess(uint8_t * src[3], int srcStride[3],
                 uint8_t * dst[3], int dstStride[3],
                 int width, int height,
                 QP_STORE_T *QP_store, int QPStride,
                 pp_mode_t *vm, void *vc, int pict_type)
{
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    PPMode *mode = (PPMode*)vm;
    PPContext *c = (PPContext*)vc;
    int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
    int absQPStride = ABS(QPStride);

    // c->stride and c->qpStride are always positive
    // grow the work buffers if the caller's strides exceed what they were sized for
    if(c->stride < minStride || c->qpStride < absQPStride)
        reallocBuffers(c, width, height,
                       MAX(minStride, c->stride),
                       MAX(c->qpStride, absQPStride));

    // No quantizer table, or quantizer forced by the mode: use a single row
    // (stride 0) holding either the forced value or 1.
    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
    {
        int i;
        QP_store= c->forcedQPTable;
        absQPStride = QPStride = 0;
        if(mode->lumMode & FORCE_QUANT)
            for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
        else
            for(i=0; i<mbWidth; i++) QP_store[i]= 1;
    }

    // PP_PICT_TYPE_QP2: halve every quantizer into c->stdQPTable and use that
    // table from here on. Four entries are handled per 32-bit word (the mask
    // clears the bit shifted in from the neighbouring byte), the remaining
    // entries one by one.
    if(pict_type & PP_PICT_TYPE_QP2){
        int i;
        const int count= mbHeight * absQPStride;
        for(i=0; i<(count>>2); i++){
            ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
        }
        for(i<<=2; i<count; i++){
            c->stdQPTable[i] = QP_store[i]>>1;
        }
        QP_store= c->stdQPTable;
        QPStride= absQPStride;
    }

    // disabled debug dump of the quantizer table
    if(0){
        int x,y;
        for(y=0; y<mbHeight; y++){
            for(x=0; x<mbWidth; x++){
                printf("%2d ", QP_store[x + y*QPStride]);
            }
            printf("\n");
        }
        printf("\n");
    }

    // Unless the picture type is 3, remember the quantizers (lower 6 bits) in
    // c->nonBQPTable.
    // NOTE(review): type 3 is presumably a B picture, given the table name -- confirm.
    if((pict_type&7)!=3)
    {
        if (QPStride >= 0) {
            int i;
            const int count= mbHeight * QPStride;
            for(i=0; i<(count>>2); i++){
                ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
            }
            for(i<<=2; i<count; i++){
                c->nonBQPTable[i] = QP_store[i] & 0x3F;
            }
        } else {
            // negative stride: copy row by row into a table with positive stride
            int i,j;
            for(i=0; i<mbHeight; i++) {
                for(j=0; j<absQPStride; j++) {
                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
                }
            }
        }
    }

    if(verbose>2)
    {
        printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
    }

    // plane 0
    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, mode, c);

    // dimensions of planes 1 and 2
    width  = (width )>>c->hChromaSubSample;
    height = (height)>>c->vChromaSubSample;

    if(mode->chromMode)
    {
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, mode, c);
    }
    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
    {
        // no filtering requested and identical strides: copy via linecpy()
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    }
    else
    {
        // no filtering requested, different strides: copy line by line
        int y;
        for(y=0; y<height; y++)
        {
            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
        }
    }
}
1147