a few warning fixes (missing #include's)
[libav.git] / postproc / postprocess_template.c
CommitLineData
3057fa66
A
1/*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
3057fa66
A
24doVertDefFilter Ec Ec Ec
25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
e5c30e06 28doHorizDefFilter Ec Ec Ec
e0f8ffae 29deRing e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
3b58b885
MN
32Vertical X1 a E E
33Horizontal X1 a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
a6be8111 37MedianDeinterlace Ec Ec
d5a1a995 38
3057fa66 39
13e00528 40* I don't have a 3DNow CPU -> it's untested
3057fa66 41E = Exact implementation
acced553 42e = almost exact implementation (slightly different rounding, ...)
3057fa66
A
43a = alternative / approximate impl
44c = checked against the other implementations (-vo md5)
45*/
46
47/*
48TODO:
9f45d04d 49verify that everything works as it should (how?)
3057fa66
A
50reduce the time wasted on the mem transfer
51implement dering
13e00528 52implement everything in C at least (done at the moment but ...)
3057fa66
A
53unroll stuff if instructions depend too much on the prior one
54we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55move YScale thing to the end instead of fixing QP
13e00528 56write a faster and higher quality deblocking filter :)
d5a1a995
MN
57do something about the speed of the horizontal filters
58make the mainloop more flexible (variable number of blocks at once
59 (the if/else stuff per block is slowing things down)
9f45d04d 60compare the quality & speed of all filters
9f45d04d 61split this huge file
3b58b885 62fix warnings (unused vars, ...)
a6be8111 63noise reduction filters
e5c30e06 64border remover
3057fa66
A
65...
66
67Notes:
13e00528
A
68*/
69
a6be8111 70//Changelog: use the CVS log
3057fa66
A
71
72#include <inttypes.h>
73#include <stdio.h>
d5a1a995 74#include <stdlib.h>
911879d1 75#include <string.h>
3057fa66 76#include "../config.h"
dda87e9f
PL
77#ifdef HAVE_MALLOC_H
78#include <malloc.h>
79#endif
3057fa66 80//#undef HAVE_MMX2
13e00528 81//#define HAVE_3DNOW
3057fa66 82//#undef HAVE_MMX
13e00528 83#include "postprocess.h"
3057fa66 84
/*
 * Small integer helpers.
 * All of them are plain macros, so an argument may be evaluated more than
 * once: do not pass expressions with side effects.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
/* Note: SIGN(0) evaluates to -1, there is no zero case. */
#define SIGN(a) ((a) > 0 ? 1 : -1)

/*
 * PAVGB(a,b) expands to one line of inline-asm text that averages packed
 * bytes: "pavgb" when HAVE_MMX2 is defined, "pavgusb" when only HAVE_3DNOW
 * is defined. It is left undefined when neither is available.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
98
99
/*
 * 64-bit constants and scratch slots. The inline-asm code in this file
 * refers to several of them by symbol name (e.g. "movq b7E, %%mm7"), so the
 * names must not be changed.
 *
 * Naming scheme:
 *   wXX        the 16-bit hex word XX repeated four times
 *   bmDDDDDDDD byte mask, one digit per byte (most significant first),
 *              1 = 0xFF, 0 = 0x00
 *   bXX        the hex byte XX repeated eight times
 */
static uint64_t packedYOffset=	0x0000000000000000ULL;
static uint64_t packedYScale=	0x0100010001000100ULL;
static uint64_t w05=		0x0005000500050005ULL;
static uint64_t w20=		0x0020002000200020ULL;
static uint64_t w1400=		0x1400140014001400ULL;
static uint64_t bm00000001=	0x00000000000000FFULL;
static uint64_t bm00010000=	0x000000FF00000000ULL;
static uint64_t bm00001000=	0x00000000FF000000ULL;
static uint64_t bm10000000=	0xFF00000000000000ULL;
static uint64_t bm10000001=	0xFF000000000000FFULL;
static uint64_t bm11000011=	0xFFFF00000000FFFFULL;
static uint64_t bm00000011=	0x000000000000FFFFULL;
static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00ULL;
static uint64_t bm11000000=	0xFFFF000000000000ULL;
static uint64_t bm00011000=	0x000000FFFF000000ULL;
static uint64_t bm00110011=	0x0000FFFF0000FFFFULL;
static uint64_t bm11001100=	0xFFFF0000FFFF0000ULL;
static uint64_t b00=		0x0000000000000000ULL;
static uint64_t b01=		0x0101010101010101ULL;
static uint64_t b02=		0x0202020202020202ULL;
static uint64_t b0F=		0x0F0F0F0F0F0F0F0FULL;
static uint64_t b04=		0x0404040404040404ULL;
static uint64_t b08=		0x0808080808080808ULL;
static uint64_t bFF=		0xFFFFFFFFFFFFFFFFULL;
static uint64_t b20=		0x2020202020202020ULL;
static uint64_t b80=		0x8080808080808080ULL;
static uint64_t b7E=		0x7E7E7E7E7E7E7E7EULL;
static uint64_t b7C=		0x7C7C7C7C7C7C7C7CULL;
static uint64_t b3F=		0x3F3F3F3F3F3F3F3FULL;

/* scratch slots */
static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;

/* read by the asm filters as "QP,..., QP"; assigned outside this section */
static uint64_t pQPb=0;
static uint64_t pQPb2=0;

static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
3057fa66
A
138
/*
 * Flatness thresholds. isVertDC() reports a block as flat when its count of
 * nearly equal vertical neighbours exceeds vFlatnessThreshold;
 * hFlatnessThreshold is presumably the horizontal counterpart (its user is
 * not in this part of the file).
 */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;
3057fa66 147
911879d1
MN
148static struct PPFilter filters[]=
149{
150 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
151 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
152 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
153 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
154 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
155 {"dr", "dering", 1, 5, 6, DERING},
156 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
157 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
158 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
159 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
160 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
161 {NULL, NULL,0,0,0,0} //End Marker
162};
163
/*
 * Shortcut names and their expansions, stored as consecutive
 * (name, expansion) string pairs and terminated by a single NULL.
 * "de" expands like "default", "fa" like "fast".
 */
static char *replaceTable[]=
{
	"default",	"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"de",		"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"fast",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	"fa",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	NULL //End Marker
};
172
e5c30e06
MN
173static inline void unusedVariableWarningFixer()
174{
175if(
176 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
177 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
178 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
179 + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
180 + temp5 + pQPb== 0) b00=0;
181}
182
#ifdef TIMING
/**
 * Execute the x86 "rdtsc" instruction and return its result.
 * The "=A" constraint asks GCC for the edx:eax register pair, which is where
 * rdtsc leaves its 64-bit counter on 32-bit x86.
 * Only compiled when TIMING is defined.
 */
static inline long long rdtsc(void)
{
	long long l;
	asm volatile(	"rdtsc\n\t"
		: "=A" (l)
	);
	return l;
}
#endif
3057fa66 194
#ifdef HAVE_MMX2
/*
 * Thin wrappers around the four x86 prefetch instructions. Each one issues
 * the instruction of the same name on the address p and has no other effect
 * visible to C code. Only compiled when HAVE_MMX2 is defined.
 */

/** Issue "prefetchnta" on the address p. */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht0" on the address p. */
static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht1" on the address p. */
static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht2" on the address p. */
static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
3057fa66
A
224
225//FIXME? |255-0| = 1 (shouldnt be a problem ...)
226/**
acced553 227 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 228 */
d5a1a995 229static inline int isVertDC(uint8_t src[], int stride){
3057fa66 230 int numEq= 0;
e5c30e06 231#ifndef HAVE_MMX
d5a1a995 232 int y;
e5c30e06 233#endif
acced553 234 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 235#ifdef HAVE_MMX
37da00fc
MN
236asm volatile(
237 "leal (%1, %2), %%eax \n\t"
238 "leal (%%eax, %2, 4), %%ebx \n\t"
239// 0 1 2 3 4 5 6 7 8 9
240// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
241 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
242 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
243 "movq (%1), %%mm0 \n\t"
37da00fc 244 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
245 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
246 "paddb %%mm7, %%mm0 \n\t"
247 "pcmpgtb %%mm6, %%mm0 \n\t"
248
37da00fc 249 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
250 "psubb %%mm2, %%mm1 \n\t"
251 "paddb %%mm7, %%mm1 \n\t"
252 "pcmpgtb %%mm6, %%mm1 \n\t"
253 "paddb %%mm1, %%mm0 \n\t"
254
37da00fc 255 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
256 "psubb %%mm1, %%mm2 \n\t"
257 "paddb %%mm7, %%mm2 \n\t"
258 "pcmpgtb %%mm6, %%mm2 \n\t"
259 "paddb %%mm2, %%mm0 \n\t"
260
37da00fc 261 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
262 "psubb %%mm2, %%mm1 \n\t"
263 "paddb %%mm7, %%mm1 \n\t"
264 "pcmpgtb %%mm6, %%mm1 \n\t"
265 "paddb %%mm1, %%mm0 \n\t"
266
37da00fc 267 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
268 "psubb %%mm1, %%mm2 \n\t"
269 "paddb %%mm7, %%mm2 \n\t"
270 "pcmpgtb %%mm6, %%mm2 \n\t"
271 "paddb %%mm2, %%mm0 \n\t"
272
37da00fc 273 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
274 "psubb %%mm2, %%mm1 \n\t"
275 "paddb %%mm7, %%mm1 \n\t"
276 "pcmpgtb %%mm6, %%mm1 \n\t"
277 "paddb %%mm1, %%mm0 \n\t"
278
37da00fc 279 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
280 "psubb %%mm1, %%mm2 \n\t"
281 "paddb %%mm7, %%mm2 \n\t"
282 "pcmpgtb %%mm6, %%mm2 \n\t"
283 "paddb %%mm2, %%mm0 \n\t"
284
285 " \n\t"
286 "movq %%mm0, %%mm1 \n\t"
287 "psrlw $8, %%mm0 \n\t"
288 "paddb %%mm1, %%mm0 \n\t"
e5c30e06
MN
289#ifdef HAVE_MMX2
290 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
291 "paddb %%mm1, %%mm0 \n\t"
292 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
293#else
3057fa66
A
294 "movq %%mm0, %%mm1 \n\t"
295 "psrlq $16, %%mm0 \n\t"
296 "paddb %%mm1, %%mm0 \n\t"
297 "movq %%mm0, %%mm1 \n\t"
298 "psrlq $32, %%mm0 \n\t"
e5c30e06 299#endif
3057fa66 300 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
301 "movd %%mm0, %0 \n\t"
302 : "=r" (numEq)
303 : "r" (src), "r" (stride)
4e4dcbc5 304 : "%eax", "%ebx"
3057fa66 305 );
3057fa66 306
37da00fc 307 numEq= (256 - numEq) &0xFF;
3057fa66
A
308
309#else
d5a1a995 310 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
311 {
312 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
313 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
314 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
315 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
316 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
317 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
318 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
319 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
320 src+= stride;
321 }
322#endif
323/* if(abs(numEq - asmEq) > 0)
324 {
325 printf("\nasm:%d c:%d\n", asmEq, numEq);
326 for(int y=0; y<8; y++)
327 {
328 for(int x=0; x<8; x++)
329 {
330 printf("%d ", temp[x + y*stride]);
331 }
332 printf("\n");
333 }
334 }
335*/
d5a1a995
MN
336// for(int i=0; i<numEq/8; i++) src[i]=255;
337 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
338}
339
d5a1a995 340static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
341{
342#ifdef HAVE_MMX
343 int isOk;
acced553 344 src+= stride*3;
3057fa66
A
345 asm volatile(
346// "int $3 \n\t"
347 "movq (%1, %2), %%mm0 \n\t"
348 "movq (%1, %2, 8), %%mm1 \n\t"
349 "movq %%mm0, %%mm2 \n\t"
350 "psubusb %%mm1, %%mm0 \n\t"
351 "psubusb %%mm2, %%mm1 \n\t"
352 "por %%mm1, %%mm0 \n\t" // ABS Diff
353
354 "movq pQPb, %%mm7 \n\t" // QP,..., QP
355 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
356 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
357 "pcmpeqd b00, %%mm0 \n\t"
358 "psrlq $16, %%mm0 \n\t"
359 "pcmpeqd bFF, %%mm0 \n\t"
360// "movd %%mm0, (%1, %2, 4)\n\t"
361 "movd %%mm0, %0 \n\t"
362 : "=r" (isOk)
363 : "r" (src), "r" (stride)
364 );
ac0b0b2f 365 return isOk;
3057fa66
A
366#else
367
d5a1a995
MN
368 int isOk2= 1;
369 int x;
acced553 370 src+= stride*3;
d5a1a995 371 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 372 {
d5a1a995 373 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
374 }
375/* if(isOk && !isOk2 || !isOk && isOk2)
376 {
377 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
378 for(int y=0; y<9; y++)
379 {
380 for(int x=0; x<8; x++)
381 {
382 printf("%d ", src[x + y*stride]);
383 }
384 printf("\n");
385 }
386 } */
387
388 return isOk2;
389#endif
390
391}
392
393/**
acced553 394 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 395 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
396 */
397static inline void doVertLowPass(uint8_t *src, int stride, int QP)
398{
13e00528 399#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 400 src+= stride*3;
3057fa66
A
401 asm volatile( //"movv %0 %1 %2\n\t"
402 "pushl %0 \n\t"
403 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
404
405 "movq (%0), %%mm6 \n\t"
406 "movq (%0, %1), %%mm5 \n\t"
407 "movq %%mm5, %%mm1 \n\t"
408 "movq %%mm6, %%mm2 \n\t"
409 "psubusb %%mm6, %%mm5 \n\t"
410 "psubusb %%mm1, %%mm2 \n\t"
411 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
412 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
413 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
414
415 "pand %%mm2, %%mm6 \n\t"
416 "pandn %%mm1, %%mm2 \n\t"
417 "por %%mm2, %%mm6 \n\t"// First Line to Filter
418
419 "movq (%0, %1, 8), %%mm5 \n\t"
420 "leal (%0, %1, 4), %%eax \n\t"
421 "leal (%0, %1, 8), %%ebx \n\t"
422 "subl %1, %%ebx \n\t"
423 "addl %1, %0 \n\t" // %0 points to line 1 not 0
424 "movq (%0, %1, 8), %%mm7 \n\t"
425 "movq %%mm5, %%mm1 \n\t"
426 "movq %%mm7, %%mm2 \n\t"
427 "psubusb %%mm7, %%mm5 \n\t"
428 "psubusb %%mm1, %%mm2 \n\t"
429 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
430 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
431 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
432
433 "pand %%mm2, %%mm7 \n\t"
434 "pandn %%mm1, %%mm2 \n\t"
435 "por %%mm2, %%mm7 \n\t" // First Line to Filter
436
437
438 // 1 2 3 4 5 6 7 8
439 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
440 // 6 4 2 2 1 1
441 // 6 4 4 2
442 // 6 8 2
acced553 443
3057fa66
A
444 "movq (%0, %1), %%mm0 \n\t" // 1
445 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
446 PAVGB(%%mm6, %%mm0) //1 1 /2
447 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
448
449 "movq (%0, %1, 4), %%mm2 \n\t" // 1
450 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
451 PAVGB((%%eax), %%mm2) // 11 /2
452 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
453 "movq %%mm2, %%mm3 \n\t" // 211 /4
454 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
455 PAVGB(%%mm4, %%mm3) // 4 211 /8
456 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
457 "movq %%mm3, (%0) \n\t" // X
458 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
459 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 460 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 461 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
462 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
463 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
464 PAVGB((%%eax), %%mm5) // 211 /4
465 PAVGB(%%mm5, %%mm3) // 2 2211 /8
466 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
467 "movq %%mm3, (%0,%1) \n\t" // X
468 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 469 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 470 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 471 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 472 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
473 PAVGB(%%mm1, %%mm0) // 2 11/4
474 PAVGB(%%mm6, %%mm0) //222 11/8
475 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
476 "movq (%0, %1, 2), %%mm2 \n\t" // 1
477 "movq %%mm0, (%0, %1, 2) \n\t" // X
478 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
479 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
480 PAVGB((%%ebx), %%mm0) // 11 /2
481 PAVGB(%%mm0, %%mm6) //11 11 /4
482 PAVGB(%%mm1, %%mm4) // 11 /2
483 PAVGB(%%mm2, %%mm1) // 11 /2
484 PAVGB(%%mm1, %%mm6) //1122 11 /8
485 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
486 "movq (%%eax), %%mm5 \n\t" // 1
487 "movq %%mm6, (%%eax) \n\t" // X
488 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
489 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
490 PAVGB(%%mm7, %%mm6) // 11 /2
491 PAVGB(%%mm4, %%mm6) // 11 11 /4
492 PAVGB(%%mm3, %%mm6) // 11 2211 /8
493 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 494 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
495 PAVGB(%%mm4, %%mm2) // 112 /4
496 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
497 "movq %%mm6, (%0, %1, 4) \n\t" // X
498 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
499 PAVGB(%%mm7, %%mm1) // 11 2 /4
500 PAVGB(%%mm4, %%mm5) // 11 /2
501 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 502 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
503 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
504 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
505 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
506 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 507 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 508 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
509 PAVGB(%%mm0, %%mm6) // 1 1 /2
510 PAVGB(%%mm7, %%mm6) // 1 12 /4
511 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
512 "movq %%mm6, (%%ebx) \n\t" // X
513 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
514 PAVGB(%%mm7, %%mm5) // 11 2 /4
515 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 516
13e00528
A
517 PAVGB(%%mm3, %%mm0) // 112 /4
518 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66
A
519 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
520 "popl %0\n\t"
521
522 :
523 : "r" (src), "r" (stride)
524 : "%eax", "%ebx"
525 );
3057fa66
A
526#else
527 const int l1= stride;
528 const int l2= stride + l1;
529 const int l3= stride + l2;
530 const int l4= stride + l3;
531 const int l5= stride + l4;
532 const int l6= stride + l5;
533 const int l7= stride + l6;
534 const int l8= stride + l7;
535 const int l9= stride + l8;
d5a1a995 536 int x;
acced553 537 src+= stride*3;
d5a1a995 538 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
539 {
540 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
541 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
542
543 int sums[9];
544 sums[0] = first + src[l1];
545 sums[1] = src[l1] + src[l2];
546 sums[2] = src[l2] + src[l3];
547 sums[3] = src[l3] + src[l4];
548 sums[4] = src[l4] + src[l5];
549 sums[5] = src[l5] + src[l6];
550 sums[6] = src[l6] + src[l7];
551 sums[7] = src[l7] + src[l8];
552 sums[8] = src[l8] + last;
553
554 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
555 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
556 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
557 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
558 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
559 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
560 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
561 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
562
563 src++;
564 }
565
566#endif
567}
568
13e00528
A
569/**
570 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
571 * values are correctly clipped (MMX2)
572 * values are wraparound (C)
573 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
574 0 8 16 24
575 x = 8
576 x/2 = 4
577 x/8 = 1
578 1 12 12 23
579 */
9f45d04d 580static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 581{
d5a1a995 582#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 583 src+= stride*3;
13e00528
A
584// FIXME rounding
585 asm volatile(
586 "pxor %%mm7, %%mm7 \n\t" // 0
587 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
588 "leal (%0, %1), %%eax \n\t"
589 "leal (%%eax, %1, 4), %%ebx \n\t"
590// 0 1 2 3 4 5 6 7 8 9
591// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
592 "movq pQPb, %%mm0 \n\t" // QP,..., QP
593 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
594 "paddusb b02, %%mm0 \n\t"
595 "psrlw $2, %%mm0 \n\t"
596 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
597 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
598 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
599 "movq (%%ebx), %%mm3 \n\t" // line 5
600 "movq %%mm2, %%mm4 \n\t" // line 4
601 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
602 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 603 PAVGB(%%mm3, %%mm5)
13e00528
A
604 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
605 "psubusb %%mm3, %%mm4 \n\t"
606 "psubusb %%mm2, %%mm3 \n\t"
607 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
608 "psubusb %%mm0, %%mm4 \n\t"
609 "pcmpeqb %%mm7, %%mm4 \n\t"
610 "pand %%mm4, %%mm5 \n\t" // d/2
611
612// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
613 "paddb %%mm5, %%mm2 \n\t"
614// "psubb %%mm6, %%mm2 \n\t"
615 "movq %%mm2, (%0,%1, 4) \n\t"
616
617 "movq (%%ebx), %%mm2 \n\t"
618// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
619 "psubb %%mm5, %%mm2 \n\t"
620// "psubb %%mm6, %%mm2 \n\t"
621 "movq %%mm2, (%%ebx) \n\t"
622
623 "paddb %%mm6, %%mm5 \n\t"
624 "psrlw $2, %%mm5 \n\t"
625 "pand b3F, %%mm5 \n\t"
626 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
627
628 "movq (%%eax, %1, 2), %%mm2 \n\t"
629 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
630 "paddsb %%mm5, %%mm2 \n\t"
631 "psubb %%mm6, %%mm2 \n\t"
632 "movq %%mm2, (%%eax, %1, 2) \n\t"
633
634 "movq (%%ebx, %1), %%mm2 \n\t"
635 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
636 "psubsb %%mm5, %%mm2 \n\t"
637 "psubb %%mm6, %%mm2 \n\t"
638 "movq %%mm2, (%%ebx, %1) \n\t"
639
640 :
641 : "r" (src), "r" (stride)
642 : "%eax", "%ebx"
643 );
644#else
645 const int l1= stride;
646 const int l2= stride + l1;
647 const int l3= stride + l2;
648 const int l4= stride + l3;
649 const int l5= stride + l4;
650 const int l6= stride + l5;
e5c30e06
MN
651// const int l7= stride + l6;
652// const int l8= stride + l7;
653// const int l9= stride + l8;
d5a1a995 654 int x;
acced553 655 src+= stride*3;
d5a1a995 656 for(x=0; x<BLOCK_SIZE; x++)
13e00528
A
657 {
658 if(ABS(src[l4]-src[l5]) < QP + QP/4)
659 {
d5a1a995
MN
660 int v = (src[l5] - src[l4]);
661
662 src[l3] +=v/8;
663 src[l4] +=v/2;
664 src[l5] -=v/2;
665 src[l6] -=v/8;
13e00528 666
13e00528
A
667 }
668 src++;
669 }
670
671#endif
672}
673
674/**
675 * Experimental Filter 1
9f45d04d
MN
676 * will not damage linear gradients
677 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
678 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
679 * MMX2 version does correct clipping C version doesnt
13e00528
A
680 */
681static inline void vertX1Filter(uint8_t *src, int stride, int QP)
682{
d5a1a995 683#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
684 src+= stride*3;
685
13e00528 686 asm volatile(
d5a1a995
MN
687 "pxor %%mm7, %%mm7 \n\t" // 0
688// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
689 "leal (%0, %1), %%eax \n\t"
690 "leal (%%eax, %1, 4), %%ebx \n\t"
691// 0 1 2 3 4 5 6 7 8 9
692// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
693 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
694 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
695 "movq %%mm1, %%mm2 \n\t" // line 4
696 "psubusb %%mm0, %%mm1 \n\t"
697 "psubusb %%mm2, %%mm0 \n\t"
698 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
699 "movq (%%ebx), %%mm3 \n\t" // line 5
700 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
701 "movq %%mm3, %%mm5 \n\t" // line 5
702 "psubusb %%mm4, %%mm3 \n\t"
703 "psubusb %%mm5, %%mm4 \n\t"
704 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
705 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
706 "movq %%mm2, %%mm1 \n\t" // line 4
707 "psubusb %%mm5, %%mm2 \n\t"
708 "movq %%mm2, %%mm4 \n\t"
709 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
710 "psubusb %%mm1, %%mm5 \n\t"
711 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
712 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
713 "movq %%mm4, %%mm3 \n\t" // d
714 "psubusb pQPb, %%mm4 \n\t"
715 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 716 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
717 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
718
719 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
720 "movq %%mm3, %%mm1 \n\t" // d/2
721 PAVGB(%%mm7, %%mm3) // d/4
722 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
723
724 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
725 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
726 "psubusb %%mm3, %%mm0 \n\t"
727 "pxor %%mm2, %%mm0 \n\t"
728 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
729
730 "movq (%%ebx), %%mm0 \n\t" // line 5
731 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
732 "paddusb %%mm3, %%mm0 \n\t"
733 "pxor %%mm2, %%mm0 \n\t"
734 "movq %%mm0, (%%ebx) \n\t" // line 5
735
9f45d04d 736 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
737
738 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
739 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 740 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
741 "pxor %%mm2, %%mm0 \n\t"
742 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
743
744 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
745 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 746 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
747 "pxor %%mm2, %%mm0 \n\t"
748 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
749
9f45d04d 750 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
751
752 "movq (%%eax, %1), %%mm0 \n\t" // line 2
753 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 754 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
755 "pxor %%mm2, %%mm0 \n\t"
756 "movq %%mm0, (%%eax, %1) \n\t" // line 2
757
758 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
759 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 760 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
761 "pxor %%mm2, %%mm0 \n\t"
762 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
763
764 :
765 : "r" (src), "r" (stride)
766 : "%eax", "%ebx"
767 );
768#else
d5a1a995
MN
769
770 const int l1= stride;
771 const int l2= stride + l1;
772 const int l3= stride + l2;
773 const int l4= stride + l3;
774 const int l5= stride + l4;
775 const int l6= stride + l5;
776 const int l7= stride + l6;
e5c30e06
MN
777// const int l8= stride + l7;
778// const int l9= stride + l8;
d5a1a995 779 int x;
acced553
MN
780
781 src+= stride*3;
d5a1a995
MN
782 for(x=0; x<BLOCK_SIZE; x++)
783 {
784 int a= src[l3] - src[l4];
785 int b= src[l4] - src[l5];
9f45d04d 786 int c= src[l5] - src[l6];
d5a1a995
MN
787
788 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
789
790 if(d < QP)
791 {
792 int v = d * SIGN(-b);
793
794 src[l2] +=v/8;
795 src[l3] +=v/4;
9f45d04d
MN
796 src[l4] +=3*v/8;
797 src[l5] -=3*v/8;
d5a1a995
MN
798 src[l6] -=v/4;
799 src[l7] -=v/8;
800
801 }
802 src++;
803 }
804 /*
13e00528
A
805 const int l1= stride;
806 const int l2= stride + l1;
807 const int l3= stride + l2;
808 const int l4= stride + l3;
809 const int l5= stride + l4;
810 const int l6= stride + l5;
811 const int l7= stride + l6;
812 const int l8= stride + l7;
813 const int l9= stride + l8;
814 for(int x=0; x<BLOCK_SIZE; x++)
815 {
816 int v2= src[l2];
817 int v3= src[l3];
818 int v4= src[l4];
819 int v5= src[l5];
820 int v6= src[l6];
821 int v7= src[l7];
822
823 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
824 {
825 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
826 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
827 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
828 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
829 }
830 src++;
831 }
d5a1a995 832*/
13e00528
A
833#endif
834}
835
9f45d04d
MN
836/**
837 * Experimental Filter 1 (Horizontal)
838 * will not damage linear gradients
839 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
840 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
841 * MMX2 version does correct clipping C version doesnt
842 * not identical with the vertical one
843 */
844static inline void horizX1Filter(uint8_t *src, int stride, int QP)
845{
846 int y;
847 static uint64_t *lut= NULL;
848 if(lut==NULL)
849 {
850 int i;
851 lut= (uint64_t*)memalign(8, 256*8);
852 for(i=0; i<256; i++)
853 {
854 int v= i < 128 ? 2*i : 2*(i-256);
855/*
856//Simulate 112242211 9-Tap filter
857 uint64_t a= (v/16) & 0xFF;
858 uint64_t b= (v/8) & 0xFF;
859 uint64_t c= (v/4) & 0xFF;
860 uint64_t d= (3*v/8) & 0xFF;
861*/
862//Simulate piecewise linear interpolation
863 uint64_t a= (v/16) & 0xFF;
864 uint64_t b= (v*3/16) & 0xFF;
865 uint64_t c= (v*5/16) & 0xFF;
866 uint64_t d= (7*v/16) & 0xFF;
867 uint64_t A= (0x100 - a)&0xFF;
868 uint64_t B= (0x100 - b)&0xFF;
869 uint64_t C= (0x100 - c)&0xFF;
870 uint64_t D= (0x100 - c)&0xFF;
871
872 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
873 (D<<24) | (C<<16) | (B<<8) | (A);
874 //lut[i] = (v<<32) | (v<<24);
875 }
876 }
877
4e4dcbc5 878#if 0
9f45d04d
MN
879 asm volatile(
880 "pxor %%mm7, %%mm7 \n\t" // 0
881// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
882 "leal (%0, %1), %%eax \n\t"
883 "leal (%%eax, %1, 4), %%ebx \n\t"
884
885 "movq b80, %%mm6 \n\t"
79cccf70 886 "movd pQPb, %%mm5 \n\t" // QP
9f45d04d
MN
887 "movq %%mm5, %%mm4 \n\t"
888 "paddusb %%mm5, %%mm5 \n\t" // 2QP
889 "paddusb %%mm5, %%mm4 \n\t" // 3QP
890 "pxor %%mm5, %%mm5 \n\t" // 0
891 "psubb %%mm4, %%mm5 \n\t" // -3QP
892 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
893 "psllq $24, %%mm5 \n\t"
894
895// 0 1 2 3 4 5 6 7 8 9
896// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
897
898#define HX1old(a) \
899 "movd " #a ", %%mm0 \n\t"\
900 "movd 4" #a ", %%mm1 \n\t"\
901 "punpckldq %%mm1, %%mm0 \n\t"\
902 "movq %%mm0, %%mm1 \n\t"\
903 "movq %%mm0, %%mm2 \n\t"\
904 "psrlq $8, %%mm1 \n\t"\
905 "psubusb %%mm1, %%mm2 \n\t"\
906 "psubusb %%mm0, %%mm1 \n\t"\
907