Use common define for x86_32 and x86_64.
[libav.git] / libpostproc / postprocess.c
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file postprocess.c
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
51 # more or less self-invented filters, so the exactness isn't too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the mainloop more flexible (variable number of blocks at once
66 (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #ifdef USE_FASTMEMCPY
91 #include "libvo/fastmemcpy.h"
92 #endif
93 #include "postprocess.h"
94 #include "postprocess_internal.h"
95
96 #include "mangle.h" //FIXME should be supressed
97
98 #ifdef HAVE_ALTIVEC_H
99 #include <altivec.h>
100 #endif
101
102 #define GET_MODE_BUFFER_SIZE 500
103 #define OPTIONS_ARRAY_SIZE 10
104 #define BLOCK_SIZE 8
105 #define TEMP_STRIDE 8
106 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
107
/*
 * 64-bit packed constants, only compiled on x86.
 * Going by their values: wNN repeats the 16-bit word 0x00NN four times and
 * bNN repeats the byte 0xNN eight times.
 * NOTE(review): nothing in this file reads them directly; presumably they are
 * referenced by name from the inline asm in postprocess_template.c (which
 * would explain attribute_used and the 8-byte alignment) -- confirm.
 */
#if defined(ARCH_X86)
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
#endif
118
/* Byte lookup table of 3*256 entries, filled in by global_init(). */
static uint8_t clip_table[3*256];
/* Points at the middle third of clip_table, so indices -256..511 are valid. */
static uint8_t * const clip_tab= clip_table + 256;

/* NOTE(review): not read in this file; presumably used by the dering code in
   the included templates -- confirm. */
static const int attribute_used deringThreshold= 20;
123
124
/*
 * Table of the filters that can be named in a postprocessing string.
 * pp_get_mode_by_name_and_quality() compares each user token against both
 * name strings of an entry and then reads its chromDefault, minLumQuality,
 * minChromQuality and mask fields. The list ends at the all-NULL/zero entry.
 * NOTE(review): the initializer column order is presumably
 * shortName, longName, chromDefault, minLumQuality, minChromQuality, mask --
 * confirm against struct PPFilter in postprocess_internal.h.
 */
static struct PPFilter filters[]=
{
    {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering", 1, 5, 6, DERING},
    {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
    {NULL, NULL,0,0,0,0} //End Marker
};
147
/*
 * Alias table laid out as consecutive (alias, expansion) string pairs and
 * terminated by a single NULL. When a filter name equals an alias,
 * pp_get_mode_by_name_and_quality() splices the expansion into the string
 * that is still to be parsed.
 */
static const char *replaceTable[]=
{
    "default", "hdeblock:a,vdeblock:a,dering:a",
    "de", "hdeblock:a,vdeblock:a,dering:a",
    "fast", "x1hdeblock:a,x1vdeblock:a,dering:a",
    "fa", "x1hdeblock:a,x1vdeblock:a,dering:a",
    "ac", "ha:a:128:7,va:a,dering:a",
    NULL //End Marker
};
157
158
/*
 * Thin wrappers around the x86 prefetch instructions (x86 builds only).
 * Each one emits a single instruction taking the address p in a register;
 * there are no outputs and no declared clobbers.
 */
#if defined(ARCH_X86)
/* Emit "prefetchnta (p)". */
static inline void prefetchnta(void *p)
{
    asm volatile( "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht0 (p)". */
static inline void prefetcht0(void *p)
{
    asm volatile( "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht1 (p)". */
static inline void prefetcht1(void *p)
{
    asm volatile( "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht2 (p)". */
static inline void prefetcht2(void *p)
{
    asm volatile( "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#endif
188
189 // The horizontal Functions exist only in C cuz the MMX code is faster with vertical filters and transposing
190
191 /**
192 * Check if the given 8x8 Block is mostly "flat"
193 */
194 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
195 {
196 int numEq= 0;
197 int y;
198 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
199 const int dcThreshold= dcOffset*2 + 1;
200
201 for(y=0; y<BLOCK_SIZE; y++)
202 {
203 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
204 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
205 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
206 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
207 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
208 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
209 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
210 src+= stride;
211 }
212 return numEq > c->ppMode.flatnessThreshold;
213 }
214
215 /**
216 * Check if the middle 8x8 Block in the given 8x16 block is flat
217 */
218 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
219 int numEq= 0;
220 int y;
221 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
222 const int dcThreshold= dcOffset*2 + 1;
223
224 src+= stride*4; // src points to begin of the 8x8 Block
225 for(y=0; y<BLOCK_SIZE-1; y++)
226 {
227 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
228 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
229 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
230 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
231 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
232 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
233 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
235 src+= stride;
236 }
237 return numEq > c->ppMode.flatnessThreshold;
238 }
239
/**
 * Cheap horizontal range check on an 8x8 block.
 *
 * One pair of pixels is sampled per row; the sampled columns cycle through
 * (0,5), (2,7), (4,1), (6,3) every four rows.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a pair passes while its difference is in [-2*QP, 2*QP]
 * @return 1 if every sampled pair passes, 0 as soon as one does not
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const int firstCol[4]=  {0, 2, 4, 6};
    static const int secondCol[4]= {5, 7, 1, 3};
    int row;

    for(row=0; row<8; row++)
    {
        const uint8_t *line= src + row*stride;
        const int k= row & 3;

        // unsigned compare rejects both too-large and too-negative differences
        if((unsigned)(line[firstCol[k]] - line[secondCol[k]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
262
263 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
264 {
265 #if 1
266 #if 1
267 int x;
268 src+= stride*4;
269 for(x=0; x<BLOCK_SIZE; x+=4)
270 {
271 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
272 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
273 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
274 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
275 }
276 #else
277 int x;
278 src+= stride*3;
279 for(x=0; x<BLOCK_SIZE; x++)
280 {
281 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
282 }
283 #endif
284 return 1;
285 #else
286 int x;
287 src+= stride*4;
288 for(x=0; x<BLOCK_SIZE; x++)
289 {
290 int min=255;
291 int max=0;
292 int y;
293 for(y=0; y<8; y++){
294 int v= src[x + y*stride];
295 if(v>max) max=v;
296 if(v<min) min=v;
297 }
298 if(max-min > 2*QP) return 0;
299 }
300 return 1;
301 #endif
302 }
303
304 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
305 if( isHorizDC_C(src, stride, c) ){
306 if( isHorizMinMaxOk_C(src, stride, c->QP) )
307 return 1;
308 else
309 return 0;
310 }else{
311 return 2;
312 }
313 }
314
315 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
316 if( isVertDC_C(src, stride, c) ){
317 if( isVertMinMaxOk_C(src, stride, c->QP) )
318 return 1;
319 else
320 return 0;
321 }else{
322 return 2;
323 }
324 }
325
326 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
327 {
328 int y;
329 for(y=0; y<BLOCK_SIZE; y++)
330 {
331 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
332
333 if(FFABS(middleEnergy) < 8*c->QP)
334 {
335 const int q=(dst[3] - dst[4])/2;
336 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
337 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
338
339 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
340 d= FFMAX(d, 0);
341
342 d= (5*d + 32) >> 6;
343 d*= FFSIGN(-middleEnergy);
344
345 if(q>0)
346 {
347 d= d<0 ? 0 : d;
348 d= d>q ? q : d;
349 }
350 else
351 {
352 d= d>0 ? 0 : d;
353 d= d<q ? q : d;
354 }
355
356 dst[3]-= d;
357 dst[4]+= d;
358 }
359 dst+= stride;
360 }
361 }
362
363 /**
364 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
365 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
366 */
367 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
368 {
369 int y;
370 for(y=0; y<BLOCK_SIZE; y++)
371 {
372 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
373 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
374
375 int sums[10];
376 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
377 sums[1] = sums[0] - first + dst[3];
378 sums[2] = sums[1] - first + dst[4];
379 sums[3] = sums[2] - first + dst[5];
380 sums[4] = sums[3] - first + dst[6];
381 sums[5] = sums[4] - dst[0] + dst[7];
382 sums[6] = sums[5] - dst[1] + last;
383 sums[7] = sums[6] - dst[2] + last;
384 sums[8] = sums[7] - dst[3] + last;
385 sums[9] = sums[8] - dst[4] + last;
386
387 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
388 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
389 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
390 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
391 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
392 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
393 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
394 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
395
396 dst+= stride;
397 }
398 }
399
400 /**
401 * Experimental Filter 1 (Horizontal)
402 * will not damage linear gradients
403 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
404 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
405 * MMX2 version does correct clipping C version doesnt
406 * not identical with the vertical one
407 */
408 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
409 {
410 int y;
411 static uint64_t *lut= NULL;
412 if(lut==NULL)
413 {
414 int i;
415 lut = av_malloc(256*8);
416 for(i=0; i<256; i++)
417 {
418 int v= i < 128 ? 2*i : 2*(i-256);
419 /*
420 //Simulate 112242211 9-Tap filter
421 uint64_t a= (v/16) & 0xFF;
422 uint64_t b= (v/8) & 0xFF;
423 uint64_t c= (v/4) & 0xFF;
424 uint64_t d= (3*v/8) & 0xFF;
425 */
426 //Simulate piecewise linear interpolation
427 uint64_t a= (v/16) & 0xFF;
428 uint64_t b= (v*3/16) & 0xFF;
429 uint64_t c= (v*5/16) & 0xFF;
430 uint64_t d= (7*v/16) & 0xFF;
431 uint64_t A= (0x100 - a)&0xFF;
432 uint64_t B= (0x100 - b)&0xFF;
433 uint64_t C= (0x100 - c)&0xFF;
434 uint64_t D= (0x100 - c)&0xFF;
435
436 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
437 (D<<24) | (C<<16) | (B<<8) | (A);
438 //lut[i] = (v<<32) | (v<<24);
439 }
440 }
441
442 for(y=0; y<BLOCK_SIZE; y++)
443 {
444 int a= src[1] - src[2];
445 int b= src[3] - src[4];
446 int c= src[5] - src[6];
447
448 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
449
450 if(d < QP)
451 {
452 int v = d * FFSIGN(-b);
453
454 src[1] +=v/8;
455 src[2] +=v/4;
456 src[3] +=3*v/8;
457 src[4] -=3*v/8;
458 src[5] -=v/4;
459 src[6] -=v/8;
460
461 }
462 src+=stride;
463 }
464 }
465
466 /**
467 * accurate deblock filter
468 */
469 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
470 int y;
471 const int QP= c->QP;
472 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
473 const int dcThreshold= dcOffset*2 + 1;
474 //START_TIMER
475 src+= step*4; // src points to begin of the 8x8 Block
476 for(y=0; y<8; y++){
477 int numEq= 0;
478
479 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
480 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
481 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
482 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
483 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
484 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
485 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
486 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
487 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
488 if(numEq > c->ppMode.flatnessThreshold){
489 int min, max, x;
490
491 if(src[0] > src[step]){
492 max= src[0];
493 min= src[step];
494 }else{
495 max= src[step];
496 min= src[0];
497 }
498 for(x=2; x<8; x+=2){
499 if(src[x*step] > src[(x+1)*step]){
500 if(src[x *step] > max) max= src[ x *step];
501 if(src[(x+1)*step] < min) min= src[(x+1)*step];
502 }else{
503 if(src[(x+1)*step] > max) max= src[(x+1)*step];
504 if(src[ x *step] < min) min= src[ x *step];
505 }
506 }
507 if(max-min < 2*QP){
508 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
509 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
510
511 int sums[10];
512 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
513 sums[1] = sums[0] - first + src[3*step];
514 sums[2] = sums[1] - first + src[4*step];
515 sums[3] = sums[2] - first + src[5*step];
516 sums[4] = sums[3] - first + src[6*step];
517 sums[5] = sums[4] - src[0*step] + src[7*step];
518 sums[6] = sums[5] - src[1*step] + last;
519 sums[7] = sums[6] - src[2*step] + last;
520 sums[8] = sums[7] - src[3*step] + last;
521 sums[9] = sums[8] - src[4*step] + last;
522
523 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
524 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
525 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
526 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
527 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
528 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
529 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
530 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
531 }
532 }else{
533 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
534
535 if(FFABS(middleEnergy) < 8*QP)
536 {
537 const int q=(src[3*step] - src[4*step])/2;
538 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
539 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
540
541 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
542 d= FFMAX(d, 0);
543
544 d= (5*d + 32) >> 6;
545 d*= FFSIGN(-middleEnergy);
546
547 if(q>0)
548 {
549 d= d<0 ? 0 : d;
550 d= d>q ? q : d;
551 }
552 else
553 {
554 d= d>0 ? 0 : d;
555 d= d<q ? q : d;
556 }
557
558 src[3*step]-= d;
559 src[4*step]+= d;
560 }
561 }
562
563 src += stride;
564 }
565 /*if(step==16){
566 STOP_TIMER("step16")
567 }else{
568 STOP_TIMER("stepX")
569 }*/
570 }
571
// Note: there are C, MMX, MMX2 and 3DNow versions; there is no combined 3DNow+MMX2 one.
// This section decides which variants to build (COMPILE_*) and then includes
// postprocess_template.c once per variant, with RENAME() giving every
// function a per-variant suffix and HAVE_* selecting the code paths.

// Plain C version: needed when MMX is not a compile-time given, or when the
// variant is picked at run time.
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC

#if defined(ARCH_X86)

// Without run-time detection exactly one x86 variant is selected:
// MMX2 if available, else 3DNow, else plain MMX.
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif /* defined(ARCH_X86) */

// From here on the HAVE_* macros no longer describe the build configuration;
// they are redefined per template inclusion below.
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC

// C version
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

// AltiVec version
#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif
#endif //ARCH_POWERPC

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// Minor note: past this line the HAVE_xyz macros only reflect the last
// template inclusion above, so do not treat them as configuration flags.
653
/**
 * Run the postprocessing filters on one plane by forwarding to the
 * variant-specific postProcess_XXX() generated from the template.
 *
 * The mode passed in vm is copied into the context first; all other
 * arguments are handed through unchanged.
 *
 * @param vm PPMode to use (copied into c->ppMode)
 * @param vc PPContext to operate on
 */
static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
    QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
{
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // Using ifs here as they are faster than function pointers, although the
    // difference wouldn't be measurable here; but it's much better because
    // someone might exchange the CPU without restarting mplayer ;)
#ifdef RUNTIME_CPUDETECT
#if defined(ARCH_X86)
    // ordered by speed, fastest first
    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
        postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
#endif
#endif
        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#else //RUNTIME_CPUDETECT
    // Exactly one variant was compiled. The HAVE_* macros tested here carry
    // whatever the template-instantiation section above last defined, which
    // is how the single compiled variant is identified.
#ifdef HAVE_MMX2
    postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_3DNOW)
    postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_MMX)
    postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif defined (HAVE_ALTIVEC)
    postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
    postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#endif //!RUNTIME_CPUDETECT
}
699
700 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
701 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
702
/* -pp Command line Help
 *
 * The long names listed here must match the long names in filters[]:
 * the parser compares them with a case-sensitive strcmp(). The entries for
 * "ha", "va" and "fq" previously advertised "hadeblock", "vadeblock" and
 * "forceQuant", none of which the parser accepts; they now show the
 * accepted spellings "ahdeblock", "avdeblock" and "forcequant".
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
748
749 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
750 {
751 char temp[GET_MODE_BUFFER_SIZE];
752 char *p= temp;
753 const char *filterDelimiters= ",/";
754 const char *optionDelimiters= ":";
755 struct PPMode *ppMode;
756 char *filterToken;
757
758 ppMode= av_malloc(sizeof(PPMode));
759
760 ppMode->lumMode= 0;
761 ppMode->chromMode= 0;
762 ppMode->maxTmpNoise[0]= 700;
763 ppMode->maxTmpNoise[1]= 1500;
764 ppMode->maxTmpNoise[2]= 3000;
765 ppMode->maxAllowedY= 234;
766 ppMode->minAllowedY= 16;
767 ppMode->baseDcDiff= 256/8;
768 ppMode->flatnessThreshold= 56-16-1;
769 ppMode->maxClippedThreshold= 0.01;
770 ppMode->error=0;
771
772 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
773
774 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
775
776 for(;;){
777 char *filterName;
778 int q= 1000000; //PP_QUALITY_MAX;
779 int chrom=-1;
780 int luma=-1;
781 char *option;
782 char *options[OPTIONS_ARRAY_SIZE];
783 int i;
784 int filterNameOk=0;
785 int numOfUnknownOptions=0;
786 int enable=1; //does the user want us to enabled or disabled the filter
787
788 filterToken= strtok(p, filterDelimiters);
789 if(filterToken == NULL) break;
790 p+= strlen(filterToken) + 1; // p points to next filterToken
791 filterName= strtok(filterToken, optionDelimiters);
792 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
793
794 if(*filterName == '-')
795 {
796 enable=0;
797 filterName++;
798 }
799
800 for(;;){ //for all options
801 option= strtok(NULL, optionDelimiters);
802 if(option == NULL) break;
803
804 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
805 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
806 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
807 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
808 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
809 else
810 {
811 options[numOfUnknownOptions] = option;
812 numOfUnknownOptions++;
813 }
814 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
815 }
816 options[numOfUnknownOptions] = NULL;
817
818 /* replace stuff from the replace Table */
819 for(i=0; replaceTable[2*i]!=NULL; i++)
820 {
821 if(!strcmp(replaceTable[2*i], filterName))
822 {
823 int newlen= strlen(replaceTable[2*i + 1]);
824 int plen;
825 int spaceLeft;
826
827 if(p==NULL) p= temp, *p=0; //last filter
828 else p--, *p=','; //not last filter
829
830 plen= strlen(p);
831 spaceLeft= p - temp + plen;
832 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
833 {
834 ppMode->error++;
835 break;
836 }
837 memmove(p + newlen, p, plen+1);
838 memcpy(p, replaceTable[2*i + 1], newlen);
839 filterNameOk=1;
840 }
841 }
842
843 for(i=0; filters[i].shortName!=NULL; i++)
844 {
845 if( !strcmp(filters[i].longName, filterName)
846 || !strcmp(filters[i].shortName, filterName))
847 {
848 ppMode->lumMode &= ~filters[i].mask;
849 ppMode->chromMode &= ~filters[i].mask;
850
851 filterNameOk=1;
852 if(!enable) break; // user wants to disable it
853
854 if(q >= filters[i].minLumQuality && luma)
855 ppMode->lumMode|= filters[i].mask;
856 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
857 if(q >= filters[i].minChromQuality)
858 ppMode->chromMode|= filters[i].mask;
859
860 if(filters[i].mask == LEVEL_FIX)
861 {
862 int o;
863 ppMode->minAllowedY= 16;
864 ppMode->maxAllowedY= 234;
865 for(o=0; options[o]!=NULL; o++)
866 {
867 if( !strcmp(options[o],"fullyrange")
868 ||!strcmp(options[o],"f"))
869 {
870 ppMode->minAllowedY= 0;
871 ppMode->maxAllowedY= 255;
872 numOfUnknownOptions--;
873 }
874 }
875 }
876 else if(filters[i].mask == TEMP_NOISE_FILTER)
877 {
878 int o;
879 int numOfNoises=0;
880
881 for(o=0; options[o]!=NULL; o++)
882 {
883 char *tail;
884 ppMode->maxTmpNoise[numOfNoises]=
885 strtol(options[o], &tail, 0);
886 if(tail!=options[o])
887 {
888 numOfNoises++;
889 numOfUnknownOptions--;
890 if(numOfNoises >= 3) break;
891 }
892 }
893 }
894 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
895 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
896 {
897 int o;
898
899 for(o=0; options[o]!=NULL && o<2; o++)
900 {
901 char *tail;
902 int val= strtol(options[o], &tail, 0);
903 if(tail==options[o]) break;
904
905 numOfUnknownOptions--;
906 if(o==0) ppMode->baseDcDiff= val;
907 else ppMode->flatnessThreshold= val;
908 }
909 }
910 else if(filters[i].mask == FORCE_QUANT)
911 {
912 int o;
913 ppMode->forcedQuant= 15;
914
915 for(o=0; options[o]!=NULL && o<1; o++)
916 {
917 char *tail;
918 int val= strtol(options[o], &tail, 0);
919 if(tail==options[o]) break;
920
921 numOfUnknownOptions--;
922 ppMode->forcedQuant= val;
923 }
924 }
925 }
926 }
927 if(!filterNameOk) ppMode->error++;
928 ppMode->error += numOfUnknownOptions;
929 }
930
931 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
932 if(ppMode->error)
933 {
934 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
935 av_free(ppMode);
936 return NULL;
937 }
938 return ppMode;
939 }
940
941 void pp_free_mode(pp_mode_t *mode){
942 av_free(mode);
943 }
944
/**
 * Replace the buffer at *p by a fresh zero-filled one of the given size.
 * The old contents are not preserved.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment unused; the allocation comes from av_mallocz() as is
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
    void *fresh;

    av_free(*p);
    fresh= av_mallocz(size);
    *p= fresh;
}
949
950 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
951 int mbWidth = (width+15)>>4;
952 int mbHeight= (height+15)>>4;
953 int i;
954
955 c->stride= stride;
956 c->qpStride= qpStride;
957
958 reallocAlign((void **)&c->tempDst, 8, stride*24);
959 reallocAlign((void **)&c->tempSrc, 8, stride*24);
960 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
961 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
962 for(i=0; i<256; i++)
963 c->yHistogram[i]= width*height/64*15/256;
964
965 for(i=0; i<3; i++)
966 {
967 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
968 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
969 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
970 }
971
972 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
973 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
974 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
975 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
976 }
977
978 static void global_init(void){
979 int i;
980 memset(clip_table, 0, 256);
981 for(i=256; i<512; i++)
982 clip_table[i]= i;
983 memset(clip_table+512, 0, 256);
984 }
985
/**
 * Name callback for the postprocessing context class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char * context_to_name(void * ptr) {
    const char *label= "postproc";

    (void)ptr;
    return label;
}
989
/* Class descriptor that pp_get_context() stores in each context's av_class
   field; it carries the class name "Postproc" and the context_to_name()
   callback. NOTE(review): presumably consulted by av_log() when a PPContext
   is passed as its first argument -- confirm. */
static AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
991
992 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
993 PPContext *c= av_malloc(sizeof(PPContext));
994 int stride= (width+15)&(~15); //assumed / will realloc if needed
995 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
996
997 global_init();
998
999 memset(c, 0, sizeof(PPContext));
1000 c->av_class = &av_codec_context_class;
1001 c->cpuCaps= cpuCaps;
1002 if(cpuCaps&PP_FORMAT){
1003 c->hChromaSubSample= cpuCaps&0x3;
1004 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1005 }else{
1006 c->hChromaSubSample= 1;
1007 c->vChromaSubSample= 1;
1008 }
1009
1010 reallocBuffers(c, width, height, stride, qpStride);
1011
1012 c->frameNum=-1;
1013
1014 return c;
1015 }
1016
1017 void pp_free_context(void *vc){
1018 PPContext *c = (PPContext*)vc;
1019 int i;
1020
1021 for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1022 for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1023
1024 av_free(c->tempBlocks);
1025 av_free(c->yHistogram);
1026 av_free(c->tempDst);
1027 av_free(c->tempSrc);
1028 av_free(c->deintTemp);
1029 av_free(c->stdQPTable);
1030 av_free(c->nonBQPTable);
1031 av_free(c->forcedQPTable);
1032
1033 memset(c, 0, sizeof(PPContext));
1034
1035 av_free(c);
1036 }
1037
/**
 * Postprocess one picture consisting of three planes.
 *
 * Plane 0 is always filtered. Planes 1 and 2 are filtered when
 * mode->chromMode is non-zero and copied from src to dst otherwise.
 *
 * @param src, srcStride  source planes and their line sizes
 * @param dst, dstStride  destination planes and their line sizes
 * @param width, height   size of plane 0; planes 1/2 use the size shifted
 *                        right by the context's chroma subsample values
 * @param QP_store        per-macroblock quantizers, or NULL for none
 * @param QPStride        line size of QP_store; may be negative
 * @param vm              mode from pp_get_mode_by_name_and_quality()
 * @param vc              context from pp_get_context()
 * @param pict_type       picture type; the PP_PICT_TYPE_QP2 flag asks for
 *                        the quantizers to be halved before use
 */
void pp_postprocess(uint8_t * src[3], int srcStride[3],
    uint8_t * dst[3], int dstStride[3],
    int width, int height,
    QP_STORE_T *QP_store, int QPStride,
    pp_mode_t *vm, void *vc, int pict_type)
{
    int mbWidth = (width+15)>>4;  // size in 16x16 units, rounded up
    int mbHeight= (height+15)>>4;
    PPMode *mode = (PPMode*)vm;
    PPContext *c = (PPContext*)vc;
    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int absQPStride = FFABS(QPStride);

    // c->stride and c->QPStride are always positive
    // grow the work buffers if this picture needs larger strides than before
    if(c->stride < minStride || c->qpStride < absQPStride)
        reallocBuffers(c, width, height,
            FFMAX(minStride, c->stride),
            FFMAX(c->qpStride, absQPStride));

    // No quantizers supplied, or a forced one requested: use a single row
    // (stride 0) filled with the forced value, or with 1 if nothing is forced.
    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
    {
        int i;
        QP_store= c->forcedQPTable;
        absQPStride = QPStride = 0;
        if(mode->lumMode & FORCE_QUANT)
            for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
        else
            for(i=0; i<mbWidth; i++) QP_store[i]= 1;
    }

    // Halve every quantizer into stdQPTable: four entries at a time through
    // 32-bit words (the 0x7F mask drops the bit shifted in from the
    // neighbouring byte), then the remaining entries one by one.
    if(pict_type & PP_PICT_TYPE_QP2){
        int i;
        const int count= mbHeight * absQPStride;
        for(i=0; i<(count>>2); i++){
            ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
        }
        for(i<<=2; i<count; i++){
            c->stdQPTable[i] = QP_store[i]>>1;
        }
        QP_store= c->stdQPTable;
        QPStride= absQPStride;
    }

    // disabled debug dump of the quantizer table
    if(0){
        int x,y;
        for(y=0; y<mbHeight; y++){
            for(x=0; x<mbWidth; x++){
                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
            }
            av_log(c, AV_LOG_INFO, "\n");
        }
        av_log(c, AV_LOG_INFO, "\n");
    }

    // For every picture type except 3, remember the quantizers (low 6 bits)
    // in nonBQPTable.
    // NOTE(review): type 3 is presumably a B-frame, going by the table's
    // name -- confirm.
    if((pict_type&7)!=3)
    {
        if (QPStride >= 0) {
            // contiguous table: four entries at a time, then the tail
            int i;
            const int count= mbHeight * QPStride;
            for(i=0; i<(count>>2); i++){
                ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
            }
            for(i<<=2; i<count; i++){
                c->nonBQPTable[i] = QP_store[i] & 0x3F;
            }
        } else {
            // negative stride: copy row by row into a table with positive stride
            int i,j;
            for(i=0; i<mbHeight; i++) {
                for(j=0; j<absQPStride; j++) {
                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
                }
            }
        }
    }

    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
        mode->lumMode, mode->chromMode);

    // plane 0
    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
        width, height, QP_store, QPStride, 0, mode, c);

    // planes 1 and 2 use the subsampled size
    width = (width )>>c->hChromaSubSample;
    height = (height)>>c->vChromaSubSample;

    if(mode->chromMode)
    {
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
            width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
            width, height, QP_store, QPStride, 2, mode, c);
    }
    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
    {
        // no filtering and identical layout: copy each plane in one go
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    }
    else
    {
        // no filtering, different layout: copy line by line
        int y;
        for(y=0; y<height; y++)
        {
            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
        }
    }
}
1144