typo fix by (Marcin 'Morgoth' Kurek <morgoth6 at box43 dot pl>)
[libav.git] / libavcodec / libpostproc / postprocess.c
1 /*
2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3
4 AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21 /**
22 * @file postprocess.c
23 * postprocessing.
24 */
25
26 /*
27 C MMX MMX2 3DNow AltiVec
28 isVertDC Ec Ec Ec
29 isVertMinMaxOk Ec Ec Ec
30 doVertLowPass E e e Ec
31 doVertDefFilter Ec Ec e e Ec
32 isHorizDC Ec Ec Ec
33 isHorizMinMaxOk a E Ec
34 doHorizLowPass E e e Ec
35 doHorizDefFilter Ec Ec e e Ec
36 do_a_deblock Ec E Ec E
37 deRing E e e* Ecp
38 Vertical RKAlgo1 E a a
39 Horizontal RKAlgo1 a a
40 Vertical X1# a E E
41 Horizontal X1# a E E
42 LinIpolDeinterlace e E E*
43 CubicIpolDeinterlace a e e*
44 LinBlendDeinterlace e E E*
45 MedianDeinterlace# E Ec Ec
46 TempDeNoiser# E e e Ec
47
48 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
49 # more or less self-invented filters, so the exactness isn't too meaningful
50 E = Exact implementation
51 e = almost exact implementation (slightly different rounding,...)
52 a = alternative / approximate impl
53 c = checked against the other implementations (-vo md5)
54 p = partially optimized, still some work to do
55 */
56
57 /*
58 TODO:
59 reduce the time wasted on the mem transfer
60 unroll stuff if instructions depend too much on the prior one
61 move YScale thing to the end instead of fixing QP
62 write a faster and higher quality deblocking filter :)
63 make the mainloop more flexible (variable number of blocks at once
64 (the if/else stuff per block is slowing things down)
65 compare the quality & speed of all filters
66 split this huge file
67 optimize c versions
68 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
69 ...
70 */
71
72 //Changelog: use the CVS log
73
74 #include "config.h"
75 #include <inttypes.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 #ifdef HAVE_MALLOC_H
80 #include <malloc.h>
81 #endif
82 //#undef HAVE_MMX2
83 //#define HAVE_3DNOW
84 //#undef HAVE_MMX
85 //#undef ARCH_X86
86 //#define DEBUG_BRIGHTNESS
87 #ifdef USE_FASTMEMCPY
88 #include "fastmemcpy.h"
89 #endif
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92
93 #include "mangle.h" //FIXME should be supressed
94
95 #ifdef HAVE_ALTIVEC_H
96 #include <altivec.h>
97 #endif
98
/* Without a native memalign() fall back to malloc(); the alignment
   argument is silently dropped in that case. */
#ifndef HAVE_MEMALIGN
#define memalign(align,size) malloc(size)
#endif

/* Arithmetic helpers. These are macros: each argument may be evaluated
   more than once, so never pass an expression with side effects. */
#define MIN(x,y) ((y) < (x) ? (y) : (x))
#define MAX(x,y) ((y) > (x) ? (y) : (x))
#define ABS(x) ((x) > 0 ? (x) : (-(x)))
#define SIGN(x) ((x) > 0 ? 1 : -1) /* note: SIGN(0) evaluates to -1 */

/* Sizes used by the mode parser and the block filters. */
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

/* attribute_used / always_inline expand to GCC attributes on GCC newer
   than 3.0 and degrade to nothing / plain inline elsewhere. */
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#    define attribute_used __attribute__((used))
#    define always_inline __attribute__((always_inline)) inline
#else
#    define attribute_used
#    define always_inline inline
#endif
121
122 #ifdef ARCH_X86
123 static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
124 static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
125 static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
126 static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
127 static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
128 static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
129 static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
130 static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
131 #endif
132
133 static uint8_t clip_table[3*256];
134 static uint8_t * const clip_tab= clip_table + 256;
135
136 static const int verbose= 0;
137
138 static const int attribute_used deringThreshold= 20;
139
140
141 static struct PPFilter filters[]=
142 {
143 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
144 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
145 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
146 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
147 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
148 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
149 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
150 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
151 {"dr", "dering", 1, 5, 6, DERING},
152 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
153 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
154 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
155 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
156 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
157 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
158 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
159 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
160 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
161 {NULL, NULL,0,0,0,0} //End Marker
162 };
163
164 static char *replaceTable[]=
165 {
166 "default", "hdeblock:a,vdeblock:a,dering:a",
167 "de", "hdeblock:a,vdeblock:a,dering:a",
168 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
169 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
170 "ac", "ha:a:128:7,va:a,dering:a",
171 NULL //End Marker
172 };
173
174
#ifdef ARCH_X86
/* Thin wrappers, one per x86 prefetch instruction; each issues the
   instruction named in its asm string on the address p. */

static inline void prefetchnta(void *p)
{
    asm volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

static inline void prefetcht0(void *p)
{
    asm volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

static inline void prefetcht1(void *p)
{
    asm volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

static inline void prefetcht2(void *p)
{
    asm volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
204
205 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
206
207 /**
208 * Check if the given 8x8 Block is mostly "flat"
209 */
210 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
211 {
212 int numEq= 0;
213 int y;
214 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
215 const int dcThreshold= dcOffset*2 + 1;
216
217 for(y=0; y<BLOCK_SIZE; y++)
218 {
219 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
220 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
221 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
222 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
223 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
224 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
225 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
226 src+= stride;
227 }
228 return numEq > c->ppMode.flatnessThreshold;
229 }
230
231 /**
232 * Check if the middle 8x8 Block in the given 8x16 block is flat
233 */
234 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
235 int numEq= 0;
236 int y;
237 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
238 const int dcThreshold= dcOffset*2 + 1;
239
240 src+= stride*4; // src points to begin of the 8x8 Block
241 for(y=0; y<BLOCK_SIZE-1; y++)
242 {
243 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
244 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
245 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
246 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
247 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
248 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
249 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
250 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
251 src+= stride;
252 }
253 return numEq > c->ppMode.flatnessThreshold;
254 }
255
/**
 * Cheap range test for an 8x8 block: on every row one pair of pixels that
 * are 5 (or 3) columns apart is compared, and the block is rejected as soon
 * as a pair differs by more than 2*QP.
 * The sampled column pairs repeat every four rows:
 * (0,5), (2,7), (4,1), (6,3).
 * @return 1 if all sampled pairs are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t pairs[4][2]= { {0,5}, {2,7}, {4,1}, {6,3} };
    int row;

    for(row=0; row<8; row++)
    {
        const uint8_t *line= src + row*stride;
        const int lhs= line[ pairs[row&3][0] ];
        const int rhs= line[ pairs[row&3][1] ];

        // unsigned compare: true when the difference is outside [-2*QP, 2*QP]
        if((unsigned)(lhs - rhs + 2*QP) > 4*QP) return 0;
    }
    return 1;
}
278
/**
 * Vertical counterpart of the cheap range test. The examined 8x8 block
 * starts 4 lines below src. In every column one pair of pixels from two
 * different lines is compared and the block is rejected as soon as a pair
 * differs by more than 2*QP.
 * The sampled line pairs repeat every four columns:
 * (0,5), (2,7), (4,1), (6,3).
 * @return 1 if all sampled pairs are within range, 0 otherwise
 */
static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t linePairs[4][2]= { {0,5}, {2,7}, {4,1}, {6,3} };
    int col;

    src+= stride*4;
    for(col=0; col<BLOCK_SIZE; col++)
    {
        const int upper= src[col + linePairs[col&3][0]*stride];
        const int lower= src[col + linePairs[col&3][1]*stride];

        // unsigned compare: true when the difference is outside [-2*QP, 2*QP]
        if((unsigned)(upper - lower + 2*QP) > 4*QP) return 0;
    }
    return 1;
}
319
320 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
321 if( isHorizDC_C(src, stride, c) ){
322 if( isHorizMinMaxOk_C(src, stride, c->QP) )
323 return 1;
324 else
325 return 0;
326 }else{
327 return 2;
328 }
329 }
330
331 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
332 if( isVertDC_C(src, stride, c) ){
333 if( isVertMinMaxOk_C(src, stride, c->QP) )
334 return 1;
335 else
336 return 0;
337 }else{
338 return 2;
339 }
340 }
341
342 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
343 {
344 int y;
345 for(y=0; y<BLOCK_SIZE; y++)
346 {
347 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
348
349 if(ABS(middleEnergy) < 8*c->QP)
350 {
351 const int q=(dst[3] - dst[4])/2;
352 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
353 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
354
355 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
356 d= MAX(d, 0);
357
358 d= (5*d + 32) >> 6;
359 d*= SIGN(-middleEnergy);
360
361 if(q>0)
362 {
363 d= d<0 ? 0 : d;
364 d= d>q ? q : d;
365 }
366 else
367 {
368 d= d>0 ? 0 : d;
369 d= d<q ? q : d;
370 }
371
372 dst[3]-= d;
373 dst[4]+= d;
374 }
375 dst+= stride;
376 }
377 }
378
379 /**
380 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
381 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
382 */
383 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
384 {
385 int y;
386 for(y=0; y<BLOCK_SIZE; y++)
387 {
388 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
389 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
390
391 int sums[10];
392 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
393 sums[1] = sums[0] - first + dst[3];
394 sums[2] = sums[1] - first + dst[4];
395 sums[3] = sums[2] - first + dst[5];
396 sums[4] = sums[3] - first + dst[6];
397 sums[5] = sums[4] - dst[0] + dst[7];
398 sums[6] = sums[5] - dst[1] + last;
399 sums[7] = sums[6] - dst[2] + last;
400 sums[8] = sums[7] - dst[3] + last;
401 sums[9] = sums[8] - dst[4] + last;
402
403 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
404 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
405 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
406 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
407 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
408 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
409 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
410 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
411
412 dst+= stride;
413 }
414 }
415
/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping, the C version doesn't
 * not identical with the vertical one
 *
 * For each of the 8 rows the step across the columns 3|4 boundary is
 * compared with the steps at columns 1|2 and 5|6; if the excess is below QP
 * it is spread over the columns 1..6 with weights 1/8, 1/4 and 3/8.
 *
 * An earlier revision lazily built a 256-entry lookup table here that was
 * never read. It has been removed: it needed an unchecked allocation that
 * was never freed, and its initialisation was not thread safe.
 *
 * @param src    first pixel of the 8x8 block, modified in place
 * @param stride distance in bytes between two rows
 * @param QP     threshold; rows whose boundary excess is >= QP are left untouched
 */
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
    int y;

    for(y=0; y<BLOCK_SIZE; y++)
    {
        const int a= src[1] - src[2];
        const int b= src[3] - src[4];
        const int c= src[5] - src[6];

        // how much the middle step exceeds the average of its neighbours
        const int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

        if(d < QP)
        {
            const int v= d * SIGN(-b);

            // no clipping: results wrap around in uint8_t
            src[1] +=v/8;
            src[2] +=v/4;
            src[3] +=3*v/8;
            src[4] -=3*v/8;
            src[5] -=v/4;
            src[6] -=v/8;
        }
        src+=stride;
    }
}
481
482 /**
483 * accurate deblock filter
484 */
485 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
486 int y;
487 const int QP= c->QP;
488 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
489 const int dcThreshold= dcOffset*2 + 1;
490 //START_TIMER
491 src+= step*4; // src points to begin of the 8x8 Block
492 for(y=0; y<8; y++){
493 int numEq= 0;
494
495 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
496 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
497 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
498 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
499 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
500 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
501 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
502 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
503 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
504 if(numEq > c->ppMode.flatnessThreshold){
505 int min, max, x;
506
507 if(src[0] > src[step]){
508 max= src[0];
509 min= src[step];
510 }else{
511 max= src[step];
512 min= src[0];
513 }
514 for(x=2; x<8; x+=2){
515 if(src[x*step] > src[(x+1)*step]){
516 if(src[x *step] > max) max= src[ x *step];
517 if(src[(x+1)*step] < min) min= src[(x+1)*step];
518 }else{
519 if(src[(x+1)*step] > max) max= src[(x+1)*step];
520 if(src[ x *step] < min) min= src[ x *step];
521 }
522 }
523 if(max-min < 2*QP){
524 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
525 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
526
527 int sums[10];
528 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
529 sums[1] = sums[0] - first + src[3*step];
530 sums[2] = sums[1] - first + src[4*step];
531 sums[3] = sums[2] - first + src[5*step];
532 sums[4] = sums[3] - first + src[6*step];
533 sums[5] = sums[4] - src[0*step] + src[7*step];
534 sums[6] = sums[5] - src[1*step] + last;
535 sums[7] = sums[6] - src[2*step] + last;
536 sums[8] = sums[7] - src[3*step] + last;
537 sums[9] = sums[8] - src[4*step] + last;
538
539 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
540 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
541 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
542 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
543 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
544 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
545 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
546 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
547 }
548 }else{
549 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
550
551 if(ABS(middleEnergy) < 8*QP)
552 {
553 const int q=(src[3*step] - src[4*step])/2;
554 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
555 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
556
557 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
558 d= MAX(d, 0);
559
560 d= (5*d + 32) >> 6;
561 d*= SIGN(-middleEnergy);
562
563 if(q>0)
564 {
565 d= d<0 ? 0 : d;
566 d= d>q ? q : d;
567 }
568 else
569 {
570 d= d>0 ? 0 : d;
571 d= d<q ? q : d;
572 }
573
574 src[3*step]-= d;
575 src[4*step]+= d;
576 }
577 }
578
579 src += stride;
580 }
581 /*if(step==16){
582 STOP_TIMER("step16")
583 }else{
584 STOP_TIMER("stepX")
585 }*/
586 }
587
588 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
589 //Plain C versions
590 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
591 #define COMPILE_C
592 #endif
593
594 #ifdef ARCH_POWERPC
595 #ifdef HAVE_ALTIVEC
596 #define COMPILE_ALTIVEC
597 #endif //HAVE_ALTIVEC
598 #endif //ARCH_POWERPC
599
600 #ifdef ARCH_X86
601
602 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
603 #define COMPILE_MMX
604 #endif
605
606 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
607 #define COMPILE_MMX2
608 #endif
609
610 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
611 #define COMPILE_3DNOW
612 #endif
613 #endif //ARCH_X86
614
615 #undef HAVE_MMX
616 #undef HAVE_MMX2
617 #undef HAVE_3DNOW
618 #undef HAVE_ALTIVEC
619 #undef ARCH_X86
620
621 #ifdef COMPILE_C
622 #undef HAVE_MMX
623 #undef HAVE_MMX2
624 #undef HAVE_3DNOW
625 #undef ARCH_X86
626 #define RENAME(a) a ## _C
627 #include "postprocess_template.c"
628 #endif
629
630 #ifdef ARCH_POWERPC
631 #ifdef COMPILE_ALTIVEC
632 #undef RENAME
633 #define HAVE_ALTIVEC
634 #define RENAME(a) a ## _altivec
635 #include "postprocess_altivec_template.c"
636 #include "postprocess_template.c"
637 #endif
638 #endif //ARCH_POWERPC
639
640 //MMX versions
641 #ifdef COMPILE_MMX
642 #undef RENAME
643 #define HAVE_MMX
644 #undef HAVE_MMX2
645 #undef HAVE_3DNOW
646 #define ARCH_X86
647 #define RENAME(a) a ## _MMX
648 #include "postprocess_template.c"
649 #endif
650
651 //MMX2 versions
652 #ifdef COMPILE_MMX2
653 #undef RENAME
654 #define HAVE_MMX
655 #define HAVE_MMX2
656 #undef HAVE_3DNOW
657 #define ARCH_X86
658 #define RENAME(a) a ## _MMX2
659 #include "postprocess_template.c"
660 #endif
661
662 //3DNOW versions
663 #ifdef COMPILE_3DNOW
664 #undef RENAME
665 #define HAVE_MMX
666 #undef HAVE_MMX2
667 #define HAVE_3DNOW
668 #define ARCH_X86
669 #define RENAME(a) a ## _3DNow
670 #include "postprocess_template.c"
671 #endif
672
673 // minor note: the HAVE_xyz is messed up after that line so dont use it
674
675 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
676 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
677 {
678 PPContext *c= (PPContext *)vc;
679 PPMode *ppMode= (PPMode *)vm;
680 c->ppMode= *ppMode; //FIXME
681
682 // useing ifs here as they are faster than function pointers allthough the
683 // difference wouldnt be messureable here but its much better because
684 // someone might exchange the cpu whithout restarting mplayer ;)
685 #ifdef RUNTIME_CPUDETECT
686 #ifdef ARCH_X86
687 // ordered per speed fasterst first
688 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
689 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
690 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
691 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
692 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
693 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
694 else
695 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
696 #else
697 #ifdef ARCH_POWERPC
698 #ifdef HAVE_ALTIVEC
699 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
700 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
701 else
702 #endif
703 #endif
704 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
705 #endif
706 #else //RUNTIME_CPUDETECT
707 #ifdef HAVE_MMX2
708 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
709 #elif defined (HAVE_3DNOW)
710 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
711 #elif defined (HAVE_MMX)
712 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
713 #elif defined (HAVE_ALTIVEC)
714 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
715 #else
716 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
717 #endif
718 #endif //!RUNTIME_CPUDETECT
719 }
720
721 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
722 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
723
/* -pp Command line Help
 * The long names printed here must match the names the mode parser accepts
 * (table `filters`); the comparison is case sensitive.
 */
char *pp_help=
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
;
763
764 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
765 {
766 char temp[GET_MODE_BUFFER_SIZE];
767 char *p= temp;
768 char *filterDelimiters= ",/";
769 char *optionDelimiters= ":";
770 struct PPMode *ppMode;
771 char *filterToken;
772
773 ppMode= memalign(8, sizeof(PPMode));
774
775 ppMode->lumMode= 0;
776 ppMode->chromMode= 0;
777 ppMode->maxTmpNoise[0]= 700;
778 ppMode->maxTmpNoise[1]= 1500;
779 ppMode->maxTmpNoise[2]= 3000;
780 ppMode->maxAllowedY= 234;
781 ppMode->minAllowedY= 16;
782 ppMode->baseDcDiff= 256/8;
783 ppMode->flatnessThreshold= 56-16-1;
784 ppMode->maxClippedThreshold= 0.01;
785 ppMode->error=0;
786
787 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
788
789 if(verbose>1) printf("pp: %s\n", name);
790
791 for(;;){
792 char *filterName;
793 int q= 1000000; //PP_QUALITY_MAX;
794 int chrom=-1;
795 char *option;
796 char *options[OPTIONS_ARRAY_SIZE];
797 int i;
798 int filterNameOk=0;
799 int numOfUnknownOptions=0;
800 int enable=1; //does the user want us to enabled or disabled the filter
801
802 filterToken= strtok(p, filterDelimiters);
803 if(filterToken == NULL) break;
804 p+= strlen(filterToken) + 1; // p points to next filterToken
805 filterName= strtok(filterToken, optionDelimiters);
806 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
807
808 if(*filterName == '-')
809 {
810 enable=0;
811 filterName++;
812 }
813
814 for(;;){ //for all options
815 option= strtok(NULL, optionDelimiters);
816 if(option == NULL) break;
817
818 if(verbose>1) printf("pp: option: %s\n", option);
819 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
820 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
821 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
822 else
823 {
824 options[numOfUnknownOptions] = option;
825 numOfUnknownOptions++;
826 }
827 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
828 }
829 options[numOfUnknownOptions] = NULL;
830
831 /* replace stuff from the replace Table */
832 for(i=0; replaceTable[2*i]!=NULL; i++)
833 {
834 if(!strcmp(replaceTable[2*i], filterName))
835 {
836 int newlen= strlen(replaceTable[2*i + 1]);
837 int plen;
838 int spaceLeft;
839
840 if(p==NULL) p= temp, *p=0; //last filter
841 else p--, *p=','; //not last filter
842
843 plen= strlen(p);
844 spaceLeft= p - temp + plen;
845 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
846 {
847 ppMode->error++;
848 break;
849 }
850 memmove(p + newlen, p, plen+1);
851 memcpy(p, replaceTable[2*i + 1], newlen);
852 filterNameOk=1;
853 }
854 }
855
856 for(i=0; filters[i].shortName!=NULL; i++)
857 {
858 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
859 if( !strcmp(filters[i].longName, filterName)
860 || !strcmp(filters[i].shortName, filterName))
861 {
862 ppMode->lumMode &= ~filters[i].mask;
863 ppMode->chromMode &= ~filters[i].mask;
864
865 filterNameOk=1;
866 if(!enable) break; // user wants to disable it
867
868 if(q >= filters[i].minLumQuality)
869 ppMode->lumMode|= filters[i].mask;
870 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
871 if(q >= filters[i].minChromQuality)
872 ppMode->chromMode|= filters[i].mask;
873
874 if(filters[i].mask == LEVEL_FIX)
875 {
876 int o;
877 ppMode->minAllowedY= 16;
878 ppMode->maxAllowedY= 234;
879 for(o=0; options[o]!=NULL; o++)
880 {
881 if( !strcmp(options[o],"fullyrange")
882 ||!strcmp(options[o],"f"))
883 {
884 ppMode->minAllowedY= 0;
885 ppMode->maxAllowedY= 255;
886 numOfUnknownOptions--;
887 }
888 }
889 }
890 else if(filters[i].mask == TEMP_NOISE_FILTER)
891 {
892 int o;
893 int numOfNoises=0;
894
895 for(o=0; options[o]!=NULL; o++)
896 {
897 char *tail;
898 ppMode->maxTmpNoise[numOfNoises]=
899 strtol(options[o], &tail, 0);
900 if(tail!=options[o])
901 {
902 numOfNoises++;
903 numOfUnknownOptions--;
904 if(numOfNoises >= 3) break;
905 }
906 }
907 }
908 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
909 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
910 {
911 int o;
912
913 for(o=0; options[o]!=NULL && o<2; o++)
914 {
915 char *tail;
916 int val= strtol(options[o], &tail, 0);
917 if(tail==options[o]) break;
918
919 numOfUnknownOptions--;
920 if(o==0) ppMode->baseDcDiff= val;
921 else ppMode->flatnessThreshold= val;
922 }
923 }
924 else if(filters[i].mask == FORCE_QUANT)
925 {
926 int o;
927 ppMode->forcedQuant= 15;
928
929 for(o=0; options[o]!=NULL && o<1; o++)
930 {
931 char *tail;
932 int val= strtol(options[o], &tail, 0);
933 if(tail==options[o]) break;
934
935 numOfUnknownOptions--;
936 ppMode->forcedQuant= val;
937 }
938 }
939 }
940 }
941 if(!filterNameOk) ppMode->error++;
942 ppMode->error += numOfUnknownOptions;
943 }
944
945 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
946 if(ppMode->error)
947 {
948 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
949 free(ppMode);
950 return NULL;
951 }
952 return ppMode;
953 }
954
955 void pp_free_mode(pp_mode_t *mode){
956 if(mode) free(mode);
957 }
958
/**
 * Replace the buffer *p with a freshly allocated, zero-filled one.
 * The old buffer (if any) is freed; its contents are NOT preserved.
 * On allocation failure *p is set to NULL.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment alignment passed to memalign()
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
    free(*p); // free(NULL) is a no-op
    *p= memalign(alignment, size);
    if(*p) memset(*p, 0, size);
}
964
965 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
966 int mbWidth = (width+15)>>4;
967 int mbHeight= (height+15)>>4;
968 int i;
969
970 c->stride= stride;
971 c->qpStride= qpStride;
972
973 reallocAlign((void **)&c->tempDst, 8, stride*24);
974 reallocAlign((void **)&c->tempSrc, 8, stride*24);
975 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
976 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
977 for(i=0; i<256; i++)
978 c->yHistogram[i]= width*height/64*15/256;
979
980 for(i=0; i<3; i++)
981 {
982 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
983 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
984 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
985 }
986
987 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
988 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
989 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
990 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
991 }
992
993 static void global_init(void){
994 int i;
995 memset(clip_table, 0, 256);
996 for(i=256; i<512; i++)
997 clip_table[i]= i;
998 memset(clip_table+512, 0, 256);
999 }
1000
1001 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1002 PPContext *c= memalign(32, sizeof(PPContext));
1003 int stride= (width+15)&(~15); //assumed / will realloc if needed
1004 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1005
1006 global_init();
1007
1008 memset(c, 0, sizeof(PPContext));
1009 c->cpuCaps= cpuCaps;
1010 if(cpuCaps&PP_FORMAT){
1011 c->hChromaSubSample= cpuCaps&0x3;
1012 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1013 }else{
1014 c->hChromaSubSample= 1;
1015 c->vChromaSubSample= 1;
1016 }
1017
1018 reallocBuffers(c, width, height, stride, qpStride);
1019
1020 c->frameNum=-1;
1021
1022 return c;
1023 }
1024
1025 void pp_free_context(void *vc){
1026 PPContext *c = (PPContext*)vc;
1027 int i;
1028
1029 for(i=0; i<3; i++) free(c->tempBlured[i]);
1030 for(i=0; i<3; i++) free(c->tempBluredPast[i]);
1031
1032 free(c->tempBlocks);
1033 free(c->yHistogram);
1034 free(c->tempDst);
1035 free(c->tempSrc);
1036 free(c->deintTemp);
1037 free(c->stdQPTable);
1038 free(c->nonBQPTable);
1039 free(c->forcedQPTable);
1040
1041 memset(c, 0, sizeof(PPContext));
1042
1043 free(c);
1044 }
1045
1046 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1047 uint8_t * dst[3], int dstStride[3],
1048 int width, int height,
1049 QP_STORE_T *QP_store, int QPStride,
1050 pp_mode_t *vm, void *vc, int pict_type)
1051 {
1052 int mbWidth = (width+15)>>4;
1053 int mbHeight= (height+15)>>4;
1054 PPMode *mode = (PPMode*)vm;
1055 PPContext *c = (PPContext*)vc;
1056 int minStride= MAX(srcStride[0], dstStride[0]);
1057
1058 if(c->stride < minStride || c->qpStride < QPStride)
1059 reallocBuffers(c, width, height,
1060 MAX(minStride, c->stride),
1061 MAX(c->qpStride, QPStride));
1062
1063 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1064 {
1065 int i;
1066 QP_store= c->forcedQPTable;
1067 QPStride= 0;
1068 if(mode->lumMode & FORCE_QUANT)
1069 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1070 else
1071 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1072 }
1073 //printf("pict_type:%d\n", pict_type);
1074
1075 if(pict_type & PP_PICT_TYPE_QP2){
1076 int i;
1077 const int count= mbHeight * QPStride;
1078 for(i=0; i<(count>>2); i++){
1079 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1080 }
1081 for(i<<=2; i<count; i++){
1082 c->stdQPTable[i] = QP_store[i]>>1;
1083 }
1084 QP_store= c->stdQPTable;
1085 }
1086
1087 if(0){
1088 int x,y;
1089 for(y=0; y<mbHeight; y++){
1090 for(x=0; x<mbWidth; x++){
1091 printf("%2d ", QP_store[x + y*QPStride]);
1092 }
1093 printf("\n");
1094 }
1095 printf("\n");
1096 }
1097
1098 if((pict_type&7)!=3)
1099 {
1100 int i;
1101 const int count= mbHeight * QPStride;
1102 for(i=0; i<(count>>2); i++){
1103 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1104 }
1105 for(i<<=2; i<count; i++){
1106 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1107 }
1108 }
1109
1110 if(verbose>2)
1111 {
1112 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1113 }
1114
1115 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1116 width, height, QP_store, QPStride, 0, mode, c);
1117
1118 width = (width )>>c->hChromaSubSample;
1119 height = (height)>>c->vChromaSubSample;
1120
1121 if(mode->chromMode)
1122 {
1123 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1124 width, height, QP_store, QPStride, 1, mode, c);
1125 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1126 width, height, QP_store, QPStride, 2, mode, c);
1127 }
1128 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1129 {
1130 memcpy(dst[1], src[1], srcStride[1]*height);
1131 memcpy(dst[2], src[2], srcStride[2]*height);
1132 }
1133 else
1134 {
1135 int y;
1136 for(y=0; y<height; y++)
1137 {
1138 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1139 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1140 }
1141 }
1142 }
1143