cosmetics: Fix indentation to be 4 spaces and consistently place {}.
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92
93 #ifdef HAVE_ALTIVEC_H
94 #include <altivec.h>
95 #endif
96
/* Size in bytes of the scratch copy of the filter string made while parsing a mode. */
#define GET_MODE_BUFFER_SIZE  500
/* Capacity of the per-filter array collecting options the generic parser did not recognise. */
#define OPTIONS_ARRAY_SIZE    10
/* Edge length, in pixels, of the square blocks the C filters iterate over. */
#define BLOCK_SIZE            8
/* NOTE(review): not referenced in this part of the file; presumably used by the included templates - confirm. */
#define TEMP_STRIDE           8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
102
103 #if defined(ARCH_X86)
104 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
105 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
106 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
107 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
108 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
109 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
110 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
111 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
112 #endif
113
114 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
115
116
117 static struct PPFilter filters[]=
118 {
119 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
120 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
121 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
122 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
123 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
124 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
125 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
126 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
127 {"dr", "dering", 1, 5, 6, DERING},
128 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
129 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
130 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
131 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
132 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
133 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
134 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
135 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
136 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
137 {NULL, NULL,0,0,0,0} //End Marker
138 };
139
/*
 * Aliases expanded by the mode parser: entry 2*i is the alias name and entry
 * 2*i+1 the filter string substituted for it. A NULL alias name ends the list.
 */
static const char *replaceTable[] = {
    /* alias      expansion */
    "default",    "hb:a,vb:a,dr:a",
    "de",         "hb:a,vb:a,dr:a",
    "fast",       "h1:a,v1:a,dr:a",
    "fa",         "h1:a,v1:a,dr:a",
    "ac",         "ha:a:128:7,va:a,dr:a",
    NULL // End Marker
};
149
150
151 #if defined(ARCH_X86)
/** Emit the x86 "prefetchnta" instruction for the memory at p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile ("prefetchnta (%0)\n\t" : /* no outputs */ : "r" (p));
}
158
/** Emit the x86 "prefetcht0" instruction for the memory at p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile ("prefetcht0 (%0)\n\t" : /* no outputs */ : "r" (p));
}
165
/** Emit the x86 "prefetcht1" instruction for the memory at p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile ("prefetcht1 (%0)\n\t" : /* no outputs */ : "r" (p));
}
172
/** Emit the x86 "prefetcht2" instruction for the memory at p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile ("prefetcht2 (%0)\n\t" : /* no outputs */ : "r" (p));
}
179 #endif
180
181 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
182
183 /**
184 * Check if the given 8x8 Block is mostly "flat"
185 */
186 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
187 {
188 int numEq= 0;
189 int y;
190 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
191 const int dcThreshold= dcOffset*2 + 1;
192
193 for(y=0; y<BLOCK_SIZE; y++){
194 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
195 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
196 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
197 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
198 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
199 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
200 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
201 src+= stride;
202 }
203 return numEq > c->ppMode.flatnessThreshold;
204 }
205
206 /**
207 * Check if the middle 8x8 Block in the given 8x16 block is flat
208 */
209 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
210 {
211 int numEq= 0;
212 int y;
213 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
214 const int dcThreshold= dcOffset*2 + 1;
215
216 src+= stride*4; // src points to begin of the 8x8 Block
217 for(y=0; y<BLOCK_SIZE-1; y++){
218 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
219 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
220 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
221 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
222 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
223 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
224 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
225 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
226 src+= stride;
227 }
228 return numEq > c->ppMode.flatnessThreshold;
229 }
230
/**
 * Sparse range check on an 8-row block.
 *
 * One pixel pair is sampled per row, cycling through the column pairs
 * (0,5), (2,7), (4,1), (6,3). The check fails as soon as one sampled
 * difference, offset by 2*QP, is above 4*QP when compared as unsigned.
 *
 * @param src    first pixel of the first row
 * @param stride distance between two rows, in bytes
 * @param QP     quantizer used to scale the allowed difference
 * @return 1 if all sampled pairs pass, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t columns[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++, src += stride) {
        const uint8_t *pair = columns[row & 3];

        if ((unsigned)(src[pair[0]] - src[pair[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
253
254 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
255 {
256 #if 1
257 #if 1
258 int x;
259 src+= stride*4;
260 for(x=0; x<BLOCK_SIZE; x+=4){
261 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
262 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
263 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
264 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
265 }
266 #else
267 int x;
268 src+= stride*3;
269 for(x=0; x<BLOCK_SIZE; x++){
270 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
271 }
272 #endif
273 return 1;
274 #else
275 int x;
276 src+= stride*4;
277 for(x=0; x<BLOCK_SIZE; x++){
278 int min=255;
279 int max=0;
280 int y;
281 for(y=0; y<8; y++){
282 int v= src[x + y*stride];
283 if(v>max) max=v;
284 if(v<min) min=v;
285 }
286 if(max-min > 2*QP) return 0;
287 }
288 return 1;
289 #endif
290 }
291
292 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
293 {
294 if( isHorizDC_C(src, stride, c) ){
295 if( isHorizMinMaxOk_C(src, stride, c->QP) )
296 return 1;
297 else
298 return 0;
299 }else{
300 return 2;
301 }
302 }
303
304 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
305 {
306 if( isVertDC_C(src, stride, c) ){
307 if( isVertMinMaxOk_C(src, stride, c->QP) )
308 return 1;
309 else
310 return 0;
311 }else{
312 return 2;
313 }
314 }
315
316 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
317 {
318 int y;
319 for(y=0; y<BLOCK_SIZE; y++){
320 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
321
322 if(FFABS(middleEnergy) < 8*c->QP){
323 const int q=(dst[3] - dst[4])/2;
324 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
325 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
326
327 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
328 d= FFMAX(d, 0);
329
330 d= (5*d + 32) >> 6;
331 d*= FFSIGN(-middleEnergy);
332
333 if(q>0)
334 {
335 d= d<0 ? 0 : d;
336 d= d>q ? q : d;
337 }
338 else
339 {
340 d= d>0 ? 0 : d;
341 d= d<q ? q : d;
342 }
343
344 dst[3]-= d;
345 dst[4]+= d;
346 }
347 dst+= stride;
348 }
349 }
350
351 /**
352 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
353 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
354 */
355 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
356 {
357 int y;
358 for(y=0; y<BLOCK_SIZE; y++){
359 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
360 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
361
362 int sums[10];
363 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
364 sums[1] = sums[0] - first + dst[3];
365 sums[2] = sums[1] - first + dst[4];
366 sums[3] = sums[2] - first + dst[5];
367 sums[4] = sums[3] - first + dst[6];
368 sums[5] = sums[4] - dst[0] + dst[7];
369 sums[6] = sums[5] - dst[1] + last;
370 sums[7] = sums[6] - dst[2] + last;
371 sums[8] = sums[7] - dst[3] + last;
372 sums[9] = sums[8] - dst[4] + last;
373
374 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
375 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
376 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
377 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
378 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
379 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
380 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
381 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
382
383 dst+= stride;
384 }
385 }
386
387 /**
388 * Experimental Filter 1 (Horizontal)
389 * will not damage linear gradients
390 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
391 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
392 * MMX2 version does correct clipping C version does not
393 * not identical with the vertical one
394 */
395 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
396 {
397 int y;
398 static uint64_t *lut= NULL;
399 if(lut==NULL)
400 {
401 int i;
402 lut = av_malloc(256*8);
403 for(i=0; i<256; i++)
404 {
405 int v= i < 128 ? 2*i : 2*(i-256);
406 /*
407 //Simulate 112242211 9-Tap filter
408 uint64_t a= (v/16) & 0xFF;
409 uint64_t b= (v/8) & 0xFF;
410 uint64_t c= (v/4) & 0xFF;
411 uint64_t d= (3*v/8) & 0xFF;
412 */
413 //Simulate piecewise linear interpolation
414 uint64_t a= (v/16) & 0xFF;
415 uint64_t b= (v*3/16) & 0xFF;
416 uint64_t c= (v*5/16) & 0xFF;
417 uint64_t d= (7*v/16) & 0xFF;
418 uint64_t A= (0x100 - a)&0xFF;
419 uint64_t B= (0x100 - b)&0xFF;
420 uint64_t C= (0x100 - c)&0xFF;
421 uint64_t D= (0x100 - c)&0xFF;
422
423 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
424 (D<<24) | (C<<16) | (B<<8) | (A);
425 //lut[i] = (v<<32) | (v<<24);
426 }
427 }
428
429 for(y=0; y<BLOCK_SIZE; y++){
430 int a= src[1] - src[2];
431 int b= src[3] - src[4];
432 int c= src[5] - src[6];
433
434 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
435
436 if(d < QP){
437 int v = d * FFSIGN(-b);
438
439 src[1] +=v/8;
440 src[2] +=v/4;
441 src[3] +=3*v/8;
442 src[4] -=3*v/8;
443 src[5] -=v/4;
444 src[6] -=v/8;
445 }
446 src+=stride;
447 }
448 }
449
450 /**
451 * accurate deblock filter
452 */
453 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
454 int y;
455 const int QP= c->QP;
456 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
457 const int dcThreshold= dcOffset*2 + 1;
458 //START_TIMER
459 src+= step*4; // src points to begin of the 8x8 Block
460 for(y=0; y<8; y++){
461 int numEq= 0;
462
463 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
464 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
465 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
466 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
467 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
468 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
469 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
470 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
471 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
472 if(numEq > c->ppMode.flatnessThreshold){
473 int min, max, x;
474
475 if(src[0] > src[step]){
476 max= src[0];
477 min= src[step];
478 }else{
479 max= src[step];
480 min= src[0];
481 }
482 for(x=2; x<8; x+=2){
483 if(src[x*step] > src[(x+1)*step]){
484 if(src[x *step] > max) max= src[ x *step];
485 if(src[(x+1)*step] < min) min= src[(x+1)*step];
486 }else{
487 if(src[(x+1)*step] > max) max= src[(x+1)*step];
488 if(src[ x *step] < min) min= src[ x *step];
489 }
490 }
491 if(max-min < 2*QP){
492 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
493 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
494
495 int sums[10];
496 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
497 sums[1] = sums[0] - first + src[3*step];
498 sums[2] = sums[1] - first + src[4*step];
499 sums[3] = sums[2] - first + src[5*step];
500 sums[4] = sums[3] - first + src[6*step];
501 sums[5] = sums[4] - src[0*step] + src[7*step];
502 sums[6] = sums[5] - src[1*step] + last;
503 sums[7] = sums[6] - src[2*step] + last;
504 sums[8] = sums[7] - src[3*step] + last;
505 sums[9] = sums[8] - src[4*step] + last;
506
507 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
508 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
509 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
510 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
511 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
512 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
513 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
514 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
515 }
516 }else{
517 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
518
519 if(FFABS(middleEnergy) < 8*QP){
520 const int q=(src[3*step] - src[4*step])/2;
521 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
522 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
523
524 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
525 d= FFMAX(d, 0);
526
527 d= (5*d + 32) >> 6;
528 d*= FFSIGN(-middleEnergy);
529
530 if(q>0){
531 d= d<0 ? 0 : d;
532 d= d>q ? q : d;
533 }else{
534 d= d>0 ? 0 : d;
535 d= d<q ? q : d;
536 }
537
538 src[3*step]-= d;
539 src[4*step]+= d;
540 }
541 }
542
543 src += stride;
544 }
545 /*if(step==16){
546 STOP_TIMER("step16")
547 }else{
548 STOP_TIMER("stepX")
549 }*/
550 }
551
552 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
553 //Plain C versions
554 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
555 #define COMPILE_C
556 #endif
557
558 #ifdef HAVE_ALTIVEC
559 #define COMPILE_ALTIVEC
560 #endif //HAVE_ALTIVEC
561
562 #if defined(ARCH_X86)
563
564 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
565 #define COMPILE_MMX
566 #endif
567
568 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
569 #define COMPILE_MMX2
570 #endif
571
572 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
573 #define COMPILE_3DNOW
574 #endif
575 #endif /* defined(ARCH_X86) */
576
577 #undef HAVE_MMX
578 #undef HAVE_MMX2
579 #undef HAVE_3DNOW
580 #undef HAVE_ALTIVEC
581
582 #ifdef COMPILE_C
583 #undef HAVE_MMX
584 #undef HAVE_MMX2
585 #undef HAVE_3DNOW
586 #define RENAME(a) a ## _C
587 #include "postprocess_template.c"
588 #endif
589
590 #ifdef COMPILE_ALTIVEC
591 #undef RENAME
592 #define HAVE_ALTIVEC
593 #define RENAME(a) a ## _altivec
594 #include "postprocess_altivec_template.c"
595 #include "postprocess_template.c"
596 #endif
597
598 //MMX versions
599 #ifdef COMPILE_MMX
600 #undef RENAME
601 #define HAVE_MMX
602 #undef HAVE_MMX2
603 #undef HAVE_3DNOW
604 #define RENAME(a) a ## _MMX
605 #include "postprocess_template.c"
606 #endif
607
608 //MMX2 versions
609 #ifdef COMPILE_MMX2
610 #undef RENAME
611 #define HAVE_MMX
612 #define HAVE_MMX2
613 #undef HAVE_3DNOW
614 #define RENAME(a) a ## _MMX2
615 #include "postprocess_template.c"
616 #endif
617
618 //3DNOW versions
619 #ifdef COMPILE_3DNOW
620 #undef RENAME
621 #define HAVE_MMX
622 #undef HAVE_MMX2
623 #define HAVE_3DNOW
624 #define RENAME(a) a ## _3DNow
625 #include "postprocess_template.c"
626 #endif
627
628 // minor note: the HAVE_xyz is messed up after that line so do not use it.
629
630 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
631 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
632 {
633 PPContext *c= (PPContext *)vc;
634 PPMode *ppMode= (PPMode *)vm;
635 c->ppMode= *ppMode; //FIXME
636
637 // Using ifs here as they are faster than function pointers although the
638 // difference would not be measurable here but it is much better because
639 // someone might exchange the CPU whithout restarting MPlayer ;)
640 #ifdef RUNTIME_CPUDETECT
641 #if defined(ARCH_X86)
642 // ordered per speed fastest first
643 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
644 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
645 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
646 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
647 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
648 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
649 else
650 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
651 #else
652 #ifdef HAVE_ALTIVEC
653 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
654 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
655 else
656 #endif
657 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
658 #endif
659 #else //RUNTIME_CPUDETECT
660 #ifdef HAVE_MMX2
661 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
662 #elif defined (HAVE_3DNOW)
663 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
664 #elif defined (HAVE_MMX)
665 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
666 #elif defined (HAVE_ALTIVEC)
667 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
668 #else
669 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
670 #endif
671 #endif //!RUNTIME_CPUDETECT
672 }
673
674 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
675 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
676
/* -pp Command line Help
 *
 * The long filter names listed here must be the ones the parser accepts,
 * i.e. the long names in the filters[] table, which are compared case
 * sensitively: "ahdeblock", "avdeblock" and "forcequant".
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
726
727 pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
728 {
729 char temp[GET_MODE_BUFFER_SIZE];
730 char *p= temp;
731 static const char filterDelimiters[] = ",/";
732 static const char optionDelimiters[] = ":";
733 struct PPMode *ppMode;
734 char *filterToken;
735
736 ppMode= av_malloc(sizeof(PPMode));
737
738 ppMode->lumMode= 0;
739 ppMode->chromMode= 0;
740 ppMode->maxTmpNoise[0]= 700;
741 ppMode->maxTmpNoise[1]= 1500;
742 ppMode->maxTmpNoise[2]= 3000;
743 ppMode->maxAllowedY= 234;
744 ppMode->minAllowedY= 16;
745 ppMode->baseDcDiff= 256/8;
746 ppMode->flatnessThreshold= 56-16-1;
747 ppMode->maxClippedThreshold= 0.01;
748 ppMode->error=0;
749
750 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
751
752 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
753
754 for(;;){
755 char *filterName;
756 int q= 1000000; //PP_QUALITY_MAX;
757 int chrom=-1;
758 int luma=-1;
759 char *option;
760 char *options[OPTIONS_ARRAY_SIZE];
761 int i;
762 int filterNameOk=0;
763 int numOfUnknownOptions=0;
764 int enable=1; //does the user want us to enabled or disabled the filter
765
766 filterToken= strtok(p, filterDelimiters);
767 if(filterToken == NULL) break;
768 p+= strlen(filterToken) + 1; // p points to next filterToken
769 filterName= strtok(filterToken, optionDelimiters);
770 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
771
772 if(*filterName == '-'){
773 enable=0;
774 filterName++;
775 }
776
777 for(;;){ //for all options
778 option= strtok(NULL, optionDelimiters);
779 if(option == NULL) break;
780
781 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
782 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
783 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
784 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
785 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
786 else{
787 options[numOfUnknownOptions] = option;
788 numOfUnknownOptions++;
789 }
790 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
791 }
792 options[numOfUnknownOptions] = NULL;
793
794 /* replace stuff from the replace Table */
795 for(i=0; replaceTable[2*i]!=NULL; i++){
796 if(!strcmp(replaceTable[2*i], filterName)){
797 int newlen= strlen(replaceTable[2*i + 1]);
798 int plen;
799 int spaceLeft;
800
801 if(p==NULL) p= temp, *p=0; //last filter
802 else p--, *p=','; //not last filter
803
804 plen= strlen(p);
805 spaceLeft= p - temp + plen;
806 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE){
807 ppMode->error++;
808 break;
809 }
810 memmove(p + newlen, p, plen+1);
811 memcpy(p, replaceTable[2*i + 1], newlen);
812 filterNameOk=1;
813 }
814 }
815
816 for(i=0; filters[i].shortName!=NULL; i++){
817 if( !strcmp(filters[i].longName, filterName)
818 || !strcmp(filters[i].shortName, filterName)){
819 ppMode->lumMode &= ~filters[i].mask;
820 ppMode->chromMode &= ~filters[i].mask;
821
822 filterNameOk=1;
823 if(!enable) break; // user wants to disable it
824
825 if(q >= filters[i].minLumQuality && luma)
826 ppMode->lumMode|= filters[i].mask;
827 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
828 if(q >= filters[i].minChromQuality)
829 ppMode->chromMode|= filters[i].mask;
830
831 if(filters[i].mask == LEVEL_FIX){
832 int o;
833 ppMode->minAllowedY= 16;
834 ppMode->maxAllowedY= 234;
835 for(o=0; options[o]!=NULL; o++){
836 if( !strcmp(options[o],"fullyrange")
837 ||!strcmp(options[o],"f")){
838 ppMode->minAllowedY= 0;
839 ppMode->maxAllowedY= 255;
840 numOfUnknownOptions--;
841 }
842 }
843 }
844 else if(filters[i].mask == TEMP_NOISE_FILTER)
845 {
846 int o;
847 int numOfNoises=0;
848
849 for(o=0; options[o]!=NULL; o++){
850 char *tail;
851 ppMode->maxTmpNoise[numOfNoises]=
852 strtol(options[o], &tail, 0);
853 if(tail!=options[o]){
854 numOfNoises++;
855 numOfUnknownOptions--;
856 if(numOfNoises >= 3) break;
857 }
858 }
859 }
860 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
861 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
862 int o;
863
864 for(o=0; options[o]!=NULL && o<2; o++){
865 char *tail;
866 int val= strtol(options[o], &tail, 0);
867 if(tail==options[o]) break;
868
869 numOfUnknownOptions--;
870 if(o==0) ppMode->baseDcDiff= val;
871 else ppMode->flatnessThreshold= val;
872 }
873 }
874 else if(filters[i].mask == FORCE_QUANT){
875 int o;
876 ppMode->forcedQuant= 15;
877
878 for(o=0; options[o]!=NULL && o<1; o++){
879 char *tail;
880 int val= strtol(options[o], &tail, 0);
881 if(tail==options[o]) break;
882
883 numOfUnknownOptions--;
884 ppMode->forcedQuant= val;
885 }
886 }
887 }
888 }
889 if(!filterNameOk) ppMode->error++;
890 ppMode->error += numOfUnknownOptions;
891 }
892
893 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
894 if(ppMode->error){
895 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
896 av_free(ppMode);
897 return NULL;
898 }
899 return ppMode;
900 }
901
902 void pp_free_mode(pp_mode_t *mode){
903 av_free(mode);
904 }
905
/**
 * Replace the buffer *p by a new, zeroed buffer of the given size.
 * The old buffer is freed first; its contents are not preserved.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment not used by this implementation
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
910
911 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
912 int mbWidth = (width+15)>>4;
913 int mbHeight= (height+15)>>4;
914 int i;
915
916 c->stride= stride;
917 c->qpStride= qpStride;
918
919 reallocAlign((void **)&c->tempDst, 8, stride*24);
920 reallocAlign((void **)&c->tempSrc, 8, stride*24);
921 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
922 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
923 for(i=0; i<256; i++)
924 c->yHistogram[i]= width*height/64*15/256;
925
926 for(i=0; i<3; i++){
927 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
928 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
929 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
930 }
931
932 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
933 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
934 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
935 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
936 }
937
/**
 * Name callback stored in av_codec_context_class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    (void)ptr;
    return "postproc";
}
941
942 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
943
944 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
945 PPContext *c= av_malloc(sizeof(PPContext));
946 int stride= (width+15)&(~15); //assumed / will realloc if needed
947 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
948
949 memset(c, 0, sizeof(PPContext));
950 c->av_class = &av_codec_context_class;
951 c->cpuCaps= cpuCaps;
952 if(cpuCaps&PP_FORMAT){
953 c->hChromaSubSample= cpuCaps&0x3;
954 c->vChromaSubSample= (cpuCaps>>4)&0x3;
955 }else{
956 c->hChromaSubSample= 1;
957 c->vChromaSubSample= 1;
958 }
959
960 reallocBuffers(c, width, height, stride, qpStride);
961
962 c->frameNum=-1;
963
964 return c;
965 }
966
967 void pp_free_context(void *vc){
968 PPContext *c = (PPContext*)vc;
969 int i;
970
971 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
972 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
973
974 av_free(c->tempBlocks);
975 av_free(c->yHistogram);
976 av_free(c->tempDst);
977 av_free(c->tempSrc);
978 av_free(c->deintTemp);
979 av_free(c->stdQPTable);
980 av_free(c->nonBQPTable);
981 av_free(c->forcedQPTable);
982
983 memset(c, 0, sizeof(PPContext));
984
985 av_free(c);
986 }
987
988 void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
989 uint8_t * dst[3], const int dstStride[3],
990 int width, int height,
991 const QP_STORE_T *QP_store, int QPStride,
992 pp_mode_t *vm, void *vc, int pict_type)
993 {
994 int mbWidth = (width+15)>>4;
995 int mbHeight= (height+15)>>4;
996 PPMode *mode = (PPMode*)vm;
997 PPContext *c = (PPContext*)vc;
998 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
999 int absQPStride = FFABS(QPStride);
1000
1001 // c->stride and c->QPStride are always positive
1002 if(c->stride < minStride || c->qpStride < absQPStride)
1003 reallocBuffers(c, width, height,
1004 FFMAX(minStride, c->stride),
1005 FFMAX(c->qpStride, absQPStride));
1006
1007 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
1008 int i;
1009 QP_store= c->forcedQPTable;
1010 absQPStride = QPStride = 0;
1011 if(mode->lumMode & FORCE_QUANT)
1012 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1013 else
1014 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1015 }
1016
1017 if(pict_type & PP_PICT_TYPE_QP2){
1018 int i;
1019 const int count= mbHeight * absQPStride;
1020 for(i=0; i<(count>>2); i++){
1021 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1022 }
1023 for(i<<=2; i<count; i++){
1024 c->stdQPTable[i] = QP_store[i]>>1;
1025 }
1026 QP_store= c->stdQPTable;
1027 QPStride= absQPStride;
1028 }
1029
1030 if(0){
1031 int x,y;
1032 for(y=0; y<mbHeight; y++){
1033 for(x=0; x<mbWidth; x++){
1034 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1035 }
1036 av_log(c, AV_LOG_INFO, "\n");
1037 }
1038 av_log(c, AV_LOG_INFO, "\n");
1039 }
1040
1041 if((pict_type&7)!=3){
1042 if (QPStride >= 0){
1043 int i;
1044 const int count= mbHeight * QPStride;
1045 for(i=0; i<(count>>2); i++){
1046 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1047 }
1048 for(i<<=2; i<count; i++){
1049 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1050 }
1051 } else {
1052 int i,j;
1053 for(i=0; i<mbHeight; i++) {
1054 for(j=0; j<absQPStride; j++) {
1055 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1056 }
1057 }
1058 }
1059 }
1060
1061 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1062 mode->lumMode, mode->chromMode);
1063
1064 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1065 width, height, QP_store, QPStride, 0, mode, c);
1066
1067 width = (width )>>c->hChromaSubSample;
1068 height = (height)>>c->vChromaSubSample;
1069
1070 if(mode->chromMode){
1071 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1072 width, height, QP_store, QPStride, 1, mode, c);
1073 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1074 width, height, QP_store, QPStride, 2, mode, c);
1075 }
1076 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1077 linecpy(dst[1], src[1], height, srcStride[1]);
1078 linecpy(dst[2], src[2], height, srcStride[2]);
1079 }else{
1080 int y;
1081 for(y=0; y<height; y++){
1082 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1083 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1084 }
1085 }
1086 }
1087