misc spelling fixes
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * i do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92
93 #include "mangle.h" //FIXME should be supressed
94
95 #ifdef HAVE_ALTIVEC_H
96 #include <altivec.h>
97 #endif
98
99 #define GET_MODE_BUFFER_SIZE 500
100 #define OPTIONS_ARRAY_SIZE 10
101 #define BLOCK_SIZE 8
102 #define TEMP_STRIDE 8
103 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
104
105 #if defined(ARCH_X86)
/* Packed 64-bit constants, declared 8-byte aligned via DECLARE_ALIGNED.
 * wNN holds the 16-bit word 0x00NN repeated four times,
 * bNN holds the byte 0xNN repeated eight times.
 * They are not referenced by name in this file's C code; presumably they are
 * used from the inline-asm code in the included templates (hence
 * attribute_used) -- TODO confirm against postprocess_template.c. */
106 static DECLARE_ALIGNED(8, uint64_t attribute_used, w05)= 0x0005000500050005LL;
107 static DECLARE_ALIGNED(8, uint64_t attribute_used, w04)= 0x0004000400040004LL;
108 static DECLARE_ALIGNED(8, uint64_t attribute_used, w20)= 0x0020002000200020LL;
109 static DECLARE_ALIGNED(8, uint64_t attribute_used, b00)= 0x0000000000000000LL;
110 static DECLARE_ALIGNED(8, uint64_t attribute_used, b01)= 0x0101010101010101LL;
111 static DECLARE_ALIGNED(8, uint64_t attribute_used, b02)= 0x0202020202020202LL;
112 static DECLARE_ALIGNED(8, uint64_t attribute_used, b08)= 0x0808080808080808LL;
113 static DECLARE_ALIGNED(8, uint64_t attribute_used, b80)= 0x8080808080808080LL;
114 #endif
115
/* 768-entry lookup table filled by global_init(); clip_tab points at its
 * middle third so that it can be indexed with values from -256 to 511. */
116 static uint8_t clip_table[3*256];
117 static uint8_t * const clip_tab= clip_table + 256;
118
/* Threshold constant for the dering filter; not referenced in this file's own
 * C code, presumably used by the included templates -- TODO confirm. */
119 static const int attribute_used deringThreshold= 20;
120
121
/* Table of all known filters, searched by pp_get_mode_by_name_and_quality().
 * Each entry starts with the short and the long filter name and ends with the
 * mode bit mask of the filter. The three integers in between are presumably
 * chromDefault, minLumQuality and minChromQuality, in that order (struct
 * PPFilter is declared elsewhere -- TODO confirm).
 * The list is terminated by an entry whose names are NULL. */
122 static struct PPFilter filters[]=
123 {
124 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
125 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
126 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
127 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
128 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
129 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
130 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
131 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
132 {"dr", "dering", 1, 5, 6, DERING},
133 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
134 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
135 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
136 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
137 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
138 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
139 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
140 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
141 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
142 {NULL, NULL,0,0,0,0} //End Marker
143 };
144
/* Alias table used by pp_get_mode_by_name_and_quality(): entries come in
 * pairs, an alias name at even indices followed by the filter string that is
 * substituted for it at the next odd index. A single NULL terminates the
 * list. */
145 static const char *replaceTable[]=
146 {
147 "default", "hdeblock:a,vdeblock:a,dering:a",
148 "de", "hdeblock:a,vdeblock:a,dering:a",
149 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
150 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
151 "ac", "ha:a:128:7,va:a,dering:a",
152 NULL //End Marker
153 };
154
155
156 #if defined(ARCH_X86)
157 static inline void prefetchnta(void *p)
158 {
159 asm volatile( "prefetchnta (%0)\n\t"
160 : : "r" (p)
161 );
162 }
163
164 static inline void prefetcht0(void *p)
165 {
166 asm volatile( "prefetcht0 (%0)\n\t"
167 : : "r" (p)
168 );
169 }
170
171 static inline void prefetcht1(void *p)
172 {
173 asm volatile( "prefetcht1 (%0)\n\t"
174 : : "r" (p)
175 );
176 }
177
178 static inline void prefetcht2(void *p)
179 {
180 asm volatile( "prefetcht2 (%0)\n\t"
181 : : "r" (p)
182 );
183 }
184 #endif
185
186 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
187
188 /**
189 * Check if the given 8x8 Block is mostly "flat"
190 */
191 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
192 {
193 int numEq= 0;
194 int y;
195 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
196 const int dcThreshold= dcOffset*2 + 1;
197
198 for(y=0; y<BLOCK_SIZE; y++)
199 {
200 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
201 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
202 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
203 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
204 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
205 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
206 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
207 src+= stride;
208 }
209 return numEq > c->ppMode.flatnessThreshold;
210 }
211
212 /**
213 * Check if the middle 8x8 Block in the given 8x16 block is flat
214 */
215 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
216 int numEq= 0;
217 int y;
218 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
219 const int dcThreshold= dcOffset*2 + 1;
220
221 src+= stride*4; // src points to begin of the 8x8 Block
222 for(y=0; y<BLOCK_SIZE-1; y++)
223 {
224 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
225 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
226 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
227 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
228 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
229 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
230 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
231 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
232 src+= stride;
233 }
234 return numEq > c->ppMode.flatnessThreshold;
235 }
236
/**
 * Cheap range test on an 8x8 block (horizontal variant).
 *
 * One pixel pair is sampled in each of the 8 rows; the column pattern
 * (0,5) (2,7) (4,1) (6,3) repeats every 4 rows.
 *
 * @param src    first pixel of the block
 * @param stride offset between two consecutive rows
 * @param QP     quantizer; for QP >= 0 a pair fails when the two pixels
 *               differ by more than 2*QP
 * @return 0 as soon as one sampled pair fails, 1 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t colA[4]= { 0, 2, 4, 6 };
    static const uint8_t colB[4]= { 5, 7, 1, 3 };
    int row;

    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;
        const int k= row & 3;

        if((unsigned)(line[colA[k]] - line[colB[k]] + 2*QP) > 4*QP) return 0;
    }
    return 1;
}
259
260 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
261 {
262 #if 1
263 #if 1
264 int x;
265 src+= stride*4;
266 for(x=0; x<BLOCK_SIZE; x+=4)
267 {
268 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
269 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
270 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
271 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
272 }
273 #else
274 int x;
275 src+= stride*3;
276 for(x=0; x<BLOCK_SIZE; x++)
277 {
278 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
279 }
280 #endif
281 return 1;
282 #else
283 int x;
284 src+= stride*4;
285 for(x=0; x<BLOCK_SIZE; x++)
286 {
287 int min=255;
288 int max=0;
289 int y;
290 for(y=0; y<8; y++){
291 int v= src[x + y*stride];
292 if(v>max) max=v;
293 if(v<min) min=v;
294 }
295 if(max-min > 2*QP) return 0;
296 }
297 return 1;
298 #endif
299 }
300
301 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
302 if( isHorizDC_C(src, stride, c) ){
303 if( isHorizMinMaxOk_C(src, stride, c->QP) )
304 return 1;
305 else
306 return 0;
307 }else{
308 return 2;
309 }
310 }
311
312 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
313 if( isVertDC_C(src, stride, c) ){
314 if( isVertMinMaxOk_C(src, stride, c->QP) )
315 return 1;
316 else
317 return 0;
318 }else{
319 return 2;
320 }
321 }
322
323 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
324 {
325 int y;
326 for(y=0; y<BLOCK_SIZE; y++)
327 {
328 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
329
330 if(FFABS(middleEnergy) < 8*c->QP)
331 {
332 const int q=(dst[3] - dst[4])/2;
333 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
334 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
335
336 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
337 d= FFMAX(d, 0);
338
339 d= (5*d + 32) >> 6;
340 d*= FFSIGN(-middleEnergy);
341
342 if(q>0)
343 {
344 d= d<0 ? 0 : d;
345 d= d>q ? q : d;
346 }
347 else
348 {
349 d= d>0 ? 0 : d;
350 d= d<q ? q : d;
351 }
352
353 dst[3]-= d;
354 dst[4]+= d;
355 }
356 dst+= stride;
357 }
358 }
359
360 /**
361 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
362 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
363 */
364 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
365 {
366 int y;
367 for(y=0; y<BLOCK_SIZE; y++)
368 {
369 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
370 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
371
372 int sums[10];
373 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
374 sums[1] = sums[0] - first + dst[3];
375 sums[2] = sums[1] - first + dst[4];
376 sums[3] = sums[2] - first + dst[5];
377 sums[4] = sums[3] - first + dst[6];
378 sums[5] = sums[4] - dst[0] + dst[7];
379 sums[6] = sums[5] - dst[1] + last;
380 sums[7] = sums[6] - dst[2] + last;
381 sums[8] = sums[7] - dst[3] + last;
382 sums[9] = sums[8] - dst[4] + last;
383
384 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
385 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
386 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
387 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
388 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
389 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
390 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
391 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
392
393 dst+= stride;
394 }
395 }
396
397 /**
398 * Experimental Filter 1 (Horizontal)
399 * will not damage linear gradients
400 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
401 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
402 * MMX2 version does correct clipping C version does not
403 * not identical with the vertical one
404 */
405 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
406 {
407 int y;
408 static uint64_t *lut= NULL;
409 if(lut==NULL)
410 {
411 int i;
412 lut = av_malloc(256*8);
413 for(i=0; i<256; i++)
414 {
415 int v= i < 128 ? 2*i : 2*(i-256);
416 /*
417 //Simulate 112242211 9-Tap filter
418 uint64_t a= (v/16) & 0xFF;
419 uint64_t b= (v/8) & 0xFF;
420 uint64_t c= (v/4) & 0xFF;
421 uint64_t d= (3*v/8) & 0xFF;
422 */
423 //Simulate piecewise linear interpolation
424 uint64_t a= (v/16) & 0xFF;
425 uint64_t b= (v*3/16) & 0xFF;
426 uint64_t c= (v*5/16) & 0xFF;
427 uint64_t d= (7*v/16) & 0xFF;
428 uint64_t A= (0x100 - a)&0xFF;
429 uint64_t B= (0x100 - b)&0xFF;
430 uint64_t C= (0x100 - c)&0xFF;
431 uint64_t D= (0x100 - c)&0xFF;
432
433 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
434 (D<<24) | (C<<16) | (B<<8) | (A);
435 //lut[i] = (v<<32) | (v<<24);
436 }
437 }
438
439 for(y=0; y<BLOCK_SIZE; y++)
440 {
441 int a= src[1] - src[2];
442 int b= src[3] - src[4];
443 int c= src[5] - src[6];
444
445 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
446
447 if(d < QP)
448 {
449 int v = d * FFSIGN(-b);
450
451 src[1] +=v/8;
452 src[2] +=v/4;
453 src[3] +=3*v/8;
454 src[4] -=3*v/8;
455 src[5] -=v/4;
456 src[6] -=v/8;
457
458 }
459 src+=stride;
460 }
461 }
462
463 /**
464 * accurate deblock filter
465 */
466 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
467 int y;
468 const int QP= c->QP;
469 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
470 const int dcThreshold= dcOffset*2 + 1;
471 //START_TIMER
472 src+= step*4; // src points to begin of the 8x8 Block
473 for(y=0; y<8; y++){
474 int numEq= 0;
475
476 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
477 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
478 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
479 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
480 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
481 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
482 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
483 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
484 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
485 if(numEq > c->ppMode.flatnessThreshold){
486 int min, max, x;
487
488 if(src[0] > src[step]){
489 max= src[0];
490 min= src[step];
491 }else{
492 max= src[step];
493 min= src[0];
494 }
495 for(x=2; x<8; x+=2){
496 if(src[x*step] > src[(x+1)*step]){
497 if(src[x *step] > max) max= src[ x *step];
498 if(src[(x+1)*step] < min) min= src[(x+1)*step];
499 }else{
500 if(src[(x+1)*step] > max) max= src[(x+1)*step];
501 if(src[ x *step] < min) min= src[ x *step];
502 }
503 }
504 if(max-min < 2*QP){
505 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
506 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
507
508 int sums[10];
509 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
510 sums[1] = sums[0] - first + src[3*step];
511 sums[2] = sums[1] - first + src[4*step];
512 sums[3] = sums[2] - first + src[5*step];
513 sums[4] = sums[3] - first + src[6*step];
514 sums[5] = sums[4] - src[0*step] + src[7*step];
515 sums[6] = sums[5] - src[1*step] + last;
516 sums[7] = sums[6] - src[2*step] + last;
517 sums[8] = sums[7] - src[3*step] + last;
518 sums[9] = sums[8] - src[4*step] + last;
519
520 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
521 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
522 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
523 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
524 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
525 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
526 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
527 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
528 }
529 }else{
530 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
531
532 if(FFABS(middleEnergy) < 8*QP)
533 {
534 const int q=(src[3*step] - src[4*step])/2;
535 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
536 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
537
538 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
539 d= FFMAX(d, 0);
540
541 d= (5*d + 32) >> 6;
542 d*= FFSIGN(-middleEnergy);
543
544 if(q>0)
545 {
546 d= d<0 ? 0 : d;
547 d= d>q ? q : d;
548 }
549 else
550 {
551 d= d>0 ? 0 : d;
552 d= d<q ? q : d;
553 }
554
555 src[3*step]-= d;
556 src[4*step]+= d;
557 }
558 }
559
560 src += stride;
561 }
562 /*if(step==16){
563 STOP_TIMER("step16")
564 }else{
565 STOP_TIMER("stepX")
566 }*/
567 }
568
569 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
570 //Plain C versions
571 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
572 #define COMPILE_C
573 #endif
574
575 #ifdef ARCH_POWERPC
576 #ifdef HAVE_ALTIVEC
577 #define COMPILE_ALTIVEC
578 #endif //HAVE_ALTIVEC
579 #endif //ARCH_POWERPC
580
581 #if defined(ARCH_X86)
582
583 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
584 #define COMPILE_MMX
585 #endif
586
587 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
588 #define COMPILE_MMX2
589 #endif
590
591 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
592 #define COMPILE_3DNOW
593 #endif
594 #endif /* defined(ARCH_X86) */
595
596 #undef HAVE_MMX
597 #undef HAVE_MMX2
598 #undef HAVE_3DNOW
599 #undef HAVE_ALTIVEC
600
601 #ifdef COMPILE_C
602 #undef HAVE_MMX
603 #undef HAVE_MMX2
604 #undef HAVE_3DNOW
605 #define RENAME(a) a ## _C
606 #include "postprocess_template.c"
607 #endif
608
609 #ifdef ARCH_POWERPC
610 #ifdef COMPILE_ALTIVEC
611 #undef RENAME
612 #define HAVE_ALTIVEC
613 #define RENAME(a) a ## _altivec
614 #include "postprocess_altivec_template.c"
615 #include "postprocess_template.c"
616 #endif
617 #endif //ARCH_POWERPC
618
619 //MMX versions
620 #ifdef COMPILE_MMX
621 #undef RENAME
622 #define HAVE_MMX
623 #undef HAVE_MMX2
624 #undef HAVE_3DNOW
625 #define RENAME(a) a ## _MMX
626 #include "postprocess_template.c"
627 #endif
628
629 //MMX2 versions
630 #ifdef COMPILE_MMX2
631 #undef RENAME
632 #define HAVE_MMX
633 #define HAVE_MMX2
634 #undef HAVE_3DNOW
635 #define RENAME(a) a ## _MMX2
636 #include "postprocess_template.c"
637 #endif
638
639 //3DNOW versions
640 #ifdef COMPILE_3DNOW
641 #undef RENAME
642 #define HAVE_MMX
643 #undef HAVE_MMX2
644 #define HAVE_3DNOW
645 #define RENAME(a) a ## _3DNow
646 #include "postprocess_template.c"
647 #endif
648
649 // minor note: the HAVE_xyz is messed up after that line so do not use it.
650
651 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
652 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
653 {
654 PPContext *c= (PPContext *)vc;
655 PPMode *ppMode= (PPMode *)vm;
656 c->ppMode= *ppMode; //FIXME
657
658 // Using ifs here as they are faster than function pointers although the
659 // difference would not be measureable here but it is much better because
660 // someone might exchange the CPU whithout restarting MPlayer ;)
661 #ifdef RUNTIME_CPUDETECT
662 #if defined(ARCH_X86)
663 // ordered per speed fasterst first
664 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
665 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
666 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
667 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
668 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
669 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
670 else
671 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
672 #else
673 #ifdef ARCH_POWERPC
674 #ifdef HAVE_ALTIVEC
675 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
676 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
677 else
678 #endif
679 #endif
680 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
681 #endif
682 #else //RUNTIME_CPUDETECT
683 #ifdef HAVE_MMX2
684 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
685 #elif defined (HAVE_3DNOW)
686 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
687 #elif defined (HAVE_MMX)
688 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
689 #elif defined (HAVE_ALTIVEC)
690 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
691 #else
692 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
693 #endif
694 #endif //!RUNTIME_CPUDETECT
695 }
696
697 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
698 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
699
/* -pp Command line Help
 *
 * The filter and option names listed here must match what
 * pp_get_mode_by_name_and_quality() accepts: the names in filters[] and
 * replaceTable[] are compared with strcmp(), i.e. case-sensitively.
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
745
746 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
747 {
748 char temp[GET_MODE_BUFFER_SIZE];
749 char *p= temp;
750 const char *filterDelimiters= ",/";
751 const char *optionDelimiters= ":";
752 struct PPMode *ppMode;
753 char *filterToken;
754
755 ppMode= av_malloc(sizeof(PPMode));
756
757 ppMode->lumMode= 0;
758 ppMode->chromMode= 0;
759 ppMode->maxTmpNoise[0]= 700;
760 ppMode->maxTmpNoise[1]= 1500;
761 ppMode->maxTmpNoise[2]= 3000;
762 ppMode->maxAllowedY= 234;
763 ppMode->minAllowedY= 16;
764 ppMode->baseDcDiff= 256/8;
765 ppMode->flatnessThreshold= 56-16-1;
766 ppMode->maxClippedThreshold= 0.01;
767 ppMode->error=0;
768
769 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
770
771 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
772
773 for(;;){
774 char *filterName;
775 int q= 1000000; //PP_QUALITY_MAX;
776 int chrom=-1;
777 int luma=-1;
778 char *option;
779 char *options[OPTIONS_ARRAY_SIZE];
780 int i;
781 int filterNameOk=0;
782 int numOfUnknownOptions=0;
783 int enable=1; //does the user want us to enabled or disabled the filter
784
785 filterToken= strtok(p, filterDelimiters);
786 if(filterToken == NULL) break;
787 p+= strlen(filterToken) + 1; // p points to next filterToken
788 filterName= strtok(filterToken, optionDelimiters);
789 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
790
791 if(*filterName == '-')
792 {
793 enable=0;
794 filterName++;
795 }
796
797 for(;;){ //for all options
798 option= strtok(NULL, optionDelimiters);
799 if(option == NULL) break;
800
801 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
802 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
803 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
804 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
805 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
806 else
807 {
808 options[numOfUnknownOptions] = option;
809 numOfUnknownOptions++;
810 }
811 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
812 }
813 options[numOfUnknownOptions] = NULL;
814
815 /* replace stuff from the replace Table */
816 for(i=0; replaceTable[2*i]!=NULL; i++)
817 {
818 if(!strcmp(replaceTable[2*i], filterName))
819 {
820 int newlen= strlen(replaceTable[2*i + 1]);
821 int plen;
822 int spaceLeft;
823
824 if(p==NULL) p= temp, *p=0; //last filter
825 else p--, *p=','; //not last filter
826
827 plen= strlen(p);
828 spaceLeft= p - temp + plen;
829 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
830 {
831 ppMode->error++;
832 break;
833 }
834 memmove(p + newlen, p, plen+1);
835 memcpy(p, replaceTable[2*i + 1], newlen);
836 filterNameOk=1;
837 }
838 }
839
840 for(i=0; filters[i].shortName!=NULL; i++)
841 {
842 if( !strcmp(filters[i].longName, filterName)
843 || !strcmp(filters[i].shortName, filterName))
844 {
845 ppMode->lumMode &= ~filters[i].mask;
846 ppMode->chromMode &= ~filters[i].mask;
847
848 filterNameOk=1;
849 if(!enable) break; // user wants to disable it
850
851 if(q >= filters[i].minLumQuality && luma)
852 ppMode->lumMode|= filters[i].mask;
853 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
854 if(q >= filters[i].minChromQuality)
855 ppMode->chromMode|= filters[i].mask;
856
857 if(filters[i].mask == LEVEL_FIX)
858 {
859 int o;
860 ppMode->minAllowedY= 16;
861 ppMode->maxAllowedY= 234;
862 for(o=0; options[o]!=NULL; o++)
863 {
864 if( !strcmp(options[o],"fullyrange")
865 ||!strcmp(options[o],"f"))
866 {
867 ppMode->minAllowedY= 0;
868 ppMode->maxAllowedY= 255;
869 numOfUnknownOptions--;
870 }
871 }
872 }
873 else if(filters[i].mask == TEMP_NOISE_FILTER)
874 {
875 int o;
876 int numOfNoises=0;
877
878 for(o=0; options[o]!=NULL; o++)
879 {
880 char *tail;
881 ppMode->maxTmpNoise[numOfNoises]=
882 strtol(options[o], &tail, 0);
883 if(tail!=options[o])
884 {
885 numOfNoises++;
886 numOfUnknownOptions--;
887 if(numOfNoises >= 3) break;
888 }
889 }
890 }
891 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
892 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
893 {
894 int o;
895
896 for(o=0; options[o]!=NULL && o<2; o++)
897 {
898 char *tail;
899 int val= strtol(options[o], &tail, 0);
900 if(tail==options[o]) break;
901
902 numOfUnknownOptions--;
903 if(o==0) ppMode->baseDcDiff= val;
904 else ppMode->flatnessThreshold= val;
905 }
906 }
907 else if(filters[i].mask == FORCE_QUANT)
908 {
909 int o;
910 ppMode->forcedQuant= 15;
911
912 for(o=0; options[o]!=NULL && o<1; o++)
913 {
914 char *tail;
915 int val= strtol(options[o], &tail, 0);
916 if(tail==options[o]) break;
917
918 numOfUnknownOptions--;
919 ppMode->forcedQuant= val;
920 }
921 }
922 }
923 }
924 if(!filterNameOk) ppMode->error++;
925 ppMode->error += numOfUnknownOptions;
926 }
927
928 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
929 if(ppMode->error)
930 {
931 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
932 av_free(ppMode);
933 return NULL;
934 }
935 return ppMode;
936 }
937
938 void pp_free_mode(pp_mode_t *mode){
939 av_free(mode);
940 }
941
/**
 * Replace the buffer *p by a new, zero-filled one of the given size.
 * The old contents are not preserved. The result of av_mallocz() is stored
 * without being checked, so *p is whatever av_mallocz() returned.
 *
 * @param p         address of the buffer pointer
 * @param alignment unused
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
    void *fresh;

    av_free(*p);
    fresh= av_mallocz(size);
    *p= fresh;
}
946
947 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
948 int mbWidth = (width+15)>>4;
949 int mbHeight= (height+15)>>4;
950 int i;
951
952 c->stride= stride;
953 c->qpStride= qpStride;
954
955 reallocAlign((void **)&c->tempDst, 8, stride*24);
956 reallocAlign((void **)&c->tempSrc, 8, stride*24);
957 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
958 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
959 for(i=0; i<256; i++)
960 c->yHistogram[i]= width*height/64*15/256;
961
962 for(i=0; i<3; i++)
963 {
964 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
965 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
966 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
967 }
968
969 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
970 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
971 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
972 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
973 }
974
975 static void global_init(void){
976 int i;
977 memset(clip_table, 0, 256);
978 for(i=256; i<512; i++)
979 clip_table[i]= i;
980 memset(clip_table+512, 0, 256);
981 }
982
/**
 * Name callback stored in av_codec_context_class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char * context_to_name(void * ptr)
{
    (void)ptr;
    return "postproc";
}
986
/* AVClass installed as c->av_class by pp_get_context(); initialized with the
 * class name "Postproc", the context_to_name() callback and a NULL third member. */
987 static AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
988
989 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
990 PPContext *c= av_malloc(sizeof(PPContext));
991 int stride= (width+15)&(~15); //assumed / will realloc if needed
992 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
993
994 global_init();
995
996 memset(c, 0, sizeof(PPContext));
997 c->av_class = &av_codec_context_class;
998 c->cpuCaps= cpuCaps;
999 if(cpuCaps&PP_FORMAT){
1000 c->hChromaSubSample= cpuCaps&0x3;
1001 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1002 }else{
1003 c->hChromaSubSample= 1;
1004 c->vChromaSubSample= 1;
1005 }
1006
1007 reallocBuffers(c, width, height, stride, qpStride);
1008
1009 c->frameNum=-1;
1010
1011 return c;
1012 }
1013
1014 void pp_free_context(void *vc){
1015 PPContext *c = (PPContext*)vc;
1016 int i;
1017
1018 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1019 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1020
1021 av_free(c->tempBlocks);
1022 av_free(c->yHistogram);
1023 av_free(c->tempDst);
1024 av_free(c->tempSrc);
1025 av_free(c->deintTemp);
1026 av_free(c->stdQPTable);
1027 av_free(c->nonBQPTable);
1028 av_free(c->forcedQPTable);
1029
1030 memset(c, 0, sizeof(PPContext));
1031
1032 av_free(c);
1033 }
1034
1035 void pp_postprocess(uint8_t * src[3], int srcStride[3],
1036 uint8_t * dst[3], int dstStride[3],
1037 int width, int height,
1038 QP_STORE_T *QP_store, int QPStride,
1039 pp_mode_t *vm, void *vc, int pict_type)
1040 {
1041 int mbWidth = (width+15)>>4;
1042 int mbHeight= (height+15)>>4;
1043 PPMode *mode = (PPMode*)vm;
1044 PPContext *c = (PPContext*)vc;
1045 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1046 int absQPStride = FFABS(QPStride);
1047
1048 // c->stride and c->QPStride are always positive
1049 if(c->stride < minStride || c->qpStride < absQPStride)
1050 reallocBuffers(c, width, height,
1051 FFMAX(minStride, c->stride),
1052 FFMAX(c->qpStride, absQPStride));
1053
1054 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1055 {
1056 int i;
1057 QP_store= c->forcedQPTable;
1058 absQPStride = QPStride = 0;
1059 if(mode->lumMode & FORCE_QUANT)
1060 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1061 else
1062 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1063 }
1064
1065 if(pict_type & PP_PICT_TYPE_QP2){
1066 int i;
1067 const int count= mbHeight * absQPStride;
1068 for(i=0; i<(count>>2); i++){
1069 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1070 }
1071 for(i<<=2; i<count; i++){
1072 c->stdQPTable[i] = QP_store[i]>>1;
1073 }
1074 QP_store= c->stdQPTable;
1075 QPStride= absQPStride;
1076 }
1077
1078 if(0){
1079 int x,y;
1080 for(y=0; y<mbHeight; y++){
1081 for(x=0; x<mbWidth; x++){
1082 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1083 }
1084 av_log(c, AV_LOG_INFO, "\n");
1085 }
1086 av_log(c, AV_LOG_INFO, "\n");
1087 }
1088
1089 if((pict_type&7)!=3)
1090 {
1091 if (QPStride >= 0) {
1092 int i;
1093 const int count= mbHeight * QPStride;
1094 for(i=0; i<(count>>2); i++){
1095 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1096 }
1097 for(i<<=2; i<count; i++){
1098 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1099 }
1100 } else {
1101 int i,j;
1102 for(i=0; i<mbHeight; i++) {
1103 for(j=0; j<absQPStride; j++) {
1104 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1105 }
1106 }
1107 }
1108 }
1109
1110 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1111 mode->lumMode, mode->chromMode);
1112
1113 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1114 width, height, QP_store, QPStride, 0, mode, c);
1115
1116 width = (width )>>c->hChromaSubSample;
1117 height = (height)>>c->vChromaSubSample;
1118
1119 if(mode->chromMode)
1120 {
1121 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1122 width, height, QP_store, QPStride, 1, mode, c);
1123 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1124 width, height, QP_store, QPStride, 2, mode, c);
1125 }
1126 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1127 {
1128 linecpy(dst[1], src[1], height, srcStride[1]);
1129 linecpy(dst[2], src[2], height, srcStride[2]);
1130 }
1131 else
1132 {
1133 int y;
1134 for(y=0; y<height; y++)
1135 {
1136 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1137 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1138 }
1139 }
1140 }
1141