/*
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
			C	MMX	MMX2	3DNow
isVertDC		Ec	Ec
isVertMinMaxOk		Ec	Ec
doVertLowPass		E		e	e
doVertDefFilter		Ec	Ec	Ec
isHorizDC		Ec	Ec
isHorizMinMaxOk		a	E
doHorizLowPass		E		e	e
doHorizDefFilter	Ec	Ec	Ec
deRing
Vertical RKAlgo1	E		a	a
Horizontal RKAlgo1			a	a
Vertical X1		a	E	E
Horizontal X1		a	E	E
LinIpolDeinterlace	e	E	E*
CubicIpolDeinterlace	a	e	e*
LinBlendDeinterlace	e	E	E*
MedianDeinterlace	Ec	Ec


* i don't have a 3DNow CPU -> it's untested
E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
a = alternative / approximate implementation
c = checked against the other implementations (-vo md5)
*/

/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement dering
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move the YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once;
	the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
border remover
...

Notes:
fixed the difference with -vo md5 between the doVertDefFilter() C and MMX / MMX2 versions
*/

//Changelog: use the CVS log

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>	// for memalign(), used by horizX1Filter() (assumes a glibc-style malloc.h)
#include "../config.h"
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
#include "postprocess.h"

#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
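
/*
 * For reference (an illustrative sketch, not used by the filters below): both
 * "pavgb" (MMX2) and "pavgusb" (3DNow) emitted by PAVGB() compute a per-byte
 * average rounded up, i.e. for a single byte pair:
 */
static inline int pavgb_ref(int a, int b)
{
	return (a + b + 1) >> 1; // rounded-up average, matching pavgb/pavgusb
}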

#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10


static uint64_t packedYOffset= 0x0000000000000000LL;
static uint64_t packedYScale= 0x0100010001000100LL;
static uint64_t w05= 0x0005000500050005LL;
static uint64_t w20= 0x0020002000200020LL;
static uint64_t w1400= 0x1400140014001400LL;
static uint64_t bm00000001= 0x00000000000000FFLL;
static uint64_t bm00010000= 0x000000FF00000000LL;
static uint64_t bm00001000= 0x00000000FF000000LL;
static uint64_t bm10000000= 0xFF00000000000000LL;
static uint64_t bm10000001= 0xFF000000000000FFLL;
static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t bm00000011= 0x000000000000FFFFLL;
static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000= 0xFFFF000000000000LL;
static uint64_t bm00011000= 0x000000FFFF000000LL;
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
static uint64_t b00= 0x0000000000000000LL;
static uint64_t b01= 0x0101010101010101LL;
static uint64_t b02= 0x0202020202020202LL;
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
static uint64_t b20= 0x2020202020202020LL;
static uint64_t b80= 0x8080808080808080LL;
static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;
static uint64_t pQPb=0;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
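
/*
 * Naming convention (reference note): the bNN constants above replicate the
 * byte 0xNN into all 8 bytes of a qword, the wNNNN constants replicate a
 * 16 bit word into all 4 words, and the bmXXXXXXXX constants are per-byte
 * masks where each digit selects (1) or clears (0) the corresponding byte,
 * most significant byte first.
 */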

int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness-corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;

static struct PPFilter filters[]=
{
	{"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
	{"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
	{"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
	{"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
	{"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
	{"dr", "dering", 1, 5, 6, DERING},
	{"al", "autolevels", 0, 1, 2, LEVEL_FIX},
	{"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
	{"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
	{"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
	{"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
	{NULL, NULL, 0, 0, 0, 0} //End Marker
};

static char *replaceTable[]=
{
	"default", "hdeblock:a,vdeblock:a,dering:a,autolevels",
	"de", "hdeblock:a,vdeblock:a,dering:a,autolevels",
	"fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	"fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	NULL //End Marker
};

static inline void unusedVariableWarningFixer()
{
	if(
	   packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
	 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
	 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
	 + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
	 + temp5 + pQPb == 0) b00=0;
}

#ifdef TIMING
static inline long long rdtsc()
{
	long long l;
	asm volatile( "rdtsc\n\t"
		: "=A" (l)
	);
//	printf("%d\n", int(l/1000));
	return l;
}
#endif

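/*
 * Usage sketch (illustrative, only meaningful when TIMING is defined):
 *	long long t0= rdtsc();
 *	... run a filter on a block ...
 *	printf("%lld cycles\n", rdtsc() - t0);
 */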
#ifdef HAVE_MMX2
static inline void prefetchnta(void *p)
{
	asm volatile( "prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile( "prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile( "prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile( "prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif

//FIXME? |255-0| = 1 (shouldn't be a problem ...)
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 */
static inline int isVertDC(uint8_t src[], int stride){
	int numEq= 0;
#ifndef HAVE_MMX
	int y;
#endif
	src+= stride*4; // src points to the beginning of the 8x8 Block
#ifdef HAVE_MMX
asm volatile(
		"leal (%1, %2), %%eax \n\t"
		"leal (%%eax, %2, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%1	eax	eax+%2	eax+2%2	%1+4%2	ebx	ebx+%2	ebx+2%2	%1+8%2	ebx+4%2
		"movq b7E, %%mm7 \n\t" // mm7 = 0x7E
		"movq b7C, %%mm6 \n\t" // mm6 = 0x7C
		"movq (%1), %%mm0 \n\t"
		"movq (%%eax), %%mm1 \n\t"
		"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
		"paddb %%mm7, %%mm0 \n\t"
		"pcmpgtb %%mm6, %%mm0 \n\t"

		"movq (%%eax,%2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"movq (%%eax, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"movq (%1, %2, 4), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"movq (%%ebx), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"movq (%%ebx, %2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"movq (%%ebx, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		" \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
#ifdef HAVE_MMX2
		"pshufw $0xF9, %%mm0, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"pshufw $0xFE, %%mm0, %%mm1 \n\t"
#else
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
#endif
		"paddb %%mm1, %%mm0 \n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
		);

	numEq= (256 - numEq) &0xFF;

#else
	for(y=0; y<BLOCK_SIZE-1; y++)
	{
		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
		printf("\nasm:%d c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", temp[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	for(int i=0; i<numEq/8; i++) src[i]=255;
	return (numEq > vFlatnessThreshold) ? 1 : 0;
}
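
/*
 * Reference sketch (illustrative, not called anywhere): the
 * "((a - b + 1)&0xFFFF) < 3" trick in the C fall-back above counts a vertical
 * neighbour pair as "equal" exactly when |a - b| <= 1, i.e. it is equivalent to:
 */
static inline int pixelsAlmostEqualRef(int a, int b)
{
	return ABS(a - b) <= 1; // same test, written out plainly
}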

static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	int isOk;
	src+= stride*3;
	asm volatile(
//		"int $3 \n\t"
		"movq (%1, %2), %%mm0 \n\t"
		"movq (%1, %2, 8), %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"psubusb %%mm1, %%mm0 \n\t"
		"psubusb %%mm2, %%mm1 \n\t"
		"por %%mm1, %%mm0 \n\t" // ABS Diff

		"movq pQPb, %%mm7 \n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
		"pcmpeqd b00, %%mm0 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"pcmpeqd bFF, %%mm0 \n\t"
//		"movd %%mm0, (%1, %2, 4)\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride)
		);
	return isOk;
#else

	int isOk2= 1;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
	}
/*	if(isOk && !isOk2 || !isOk && isOk2)
	{
		printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
		for(int y=0; y<9; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	} */

	return isOk2;
#endif

}
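
/*
 * Reference sketch (illustrative, not called anywhere): the condition tested
 * above, per column, is that the outermost rows of the filtered area differ
 * by no more than 2*QP (the MMX version checks all 8 columns at once):
 */
static inline int vertMinMaxOkColRef(int top, int bottom, int QP)
{
	return ABS(top - bottom) <= 2*QP;
}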

/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 */
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile( //"movv %0 %1 %2\n\t"
		"pushl %0 \n\t"
		"movq pQPb, %%mm0 \n\t" // QP,..., QP

		"movq (%0), %%mm6 \n\t"
		"movq (%0, %1), %%mm5 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm6, %%mm2 \n\t"
		"psubusb %%mm6, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm6 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm6 \n\t" // First Line to Filter

		"movq (%0, %1, 8), %%mm5 \n\t"
		"leal (%0, %1, 4), %%eax \n\t"
		"leal (%0, %1, 8), %%ebx \n\t"
		"subl %1, %%ebx \n\t"
		"addl %1, %0 \n\t" // %0 points to line 1 not 0
		"movq (%0, %1, 8), %%mm7 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm7 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm7 \n\t" // Last Line to Filter


		// 1 2 3 4 5 6 7 8
		// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
		// 6 4 2 2 1 1
		// 6 4 4 2
		// 6 8 2

		"movq (%0, %1), %%mm0 \n\t" // 1
		"movq %%mm0, %%mm1 \n\t" // 1
		PAVGB(%%mm6, %%mm0) //1 1 /2
		PAVGB(%%mm6, %%mm0) //3 1 /4

		"movq (%0, %1, 4), %%mm2 \n\t" // 1
		"movq %%mm2, %%mm5 \n\t" // 1
		PAVGB((%%eax), %%mm2) // 11 /2
		PAVGB((%0, %1, 2), %%mm2) // 211 /4
		"movq %%mm2, %%mm3 \n\t" // 211 /4
		"movq (%0), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm3) // 4 211 /8
		PAVGB(%%mm0, %%mm3) //642211 /16
		"movq %%mm3, (%0) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
		"movq %%mm1, %%mm0 \n\t" // 1
		PAVGB(%%mm6, %%mm0) //1 1 /2
		"movq %%mm4, %%mm3 \n\t" // 1
		PAVGB((%0,%1,2), %%mm3) // 1 1 /2
		PAVGB((%%eax,%1,2), %%mm5) // 11 /2
		PAVGB((%%eax), %%mm5) // 211 /4
		PAVGB(%%mm5, %%mm3) // 2 2211 /8
		PAVGB(%%mm0, %%mm3) //4242211 /16
		"movq %%mm3, (%0,%1) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
		PAVGB(%%mm4, %%mm6) //11 /2
		"movq (%%ebx), %%mm0 \n\t" // 1
		PAVGB((%%eax, %1, 2), %%mm0) // 11/2
		"movq %%mm0, %%mm3 \n\t" // 11/2
		PAVGB(%%mm1, %%mm0) // 2 11/4
		PAVGB(%%mm6, %%mm0) //222 11/8
		PAVGB(%%mm2, %%mm0) //22242211/16
		"movq (%0, %1, 2), %%mm2 \n\t" // 1
		"movq %%mm0, (%0, %1, 2) \n\t" // X
		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
		"movq (%%eax, %1, 4), %%mm0 \n\t" // 1
		PAVGB((%%ebx), %%mm0) // 11 /2
		PAVGB(%%mm0, %%mm6) //11 11 /4
		PAVGB(%%mm1, %%mm4) // 11 /2
		PAVGB(%%mm2, %%mm1) // 11 /2
		PAVGB(%%mm1, %%mm6) //1122 11 /8
		PAVGB(%%mm5, %%mm6) //112242211 /16
		"movq (%%eax), %%mm5 \n\t" // 1
		"movq %%mm6, (%%eax) \n\t" // X
		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
		"movq (%%eax, %1, 4), %%mm6 \n\t" // 1
		PAVGB(%%mm7, %%mm6) // 11 /2
		PAVGB(%%mm4, %%mm6) // 11 11 /4
		PAVGB(%%mm3, %%mm6) // 11 2211 /8
		PAVGB(%%mm5, %%mm2) // 11 /2
		"movq (%0, %1, 4), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm2) // 112 /4
		PAVGB(%%mm2, %%mm6) // 112242211 /16
		"movq %%mm6, (%0, %1, 4) \n\t" // X
		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
		PAVGB(%%mm7, %%mm1) // 11 2 /4
		PAVGB(%%mm4, %%mm5) // 11 /2
		PAVGB(%%mm5, %%mm0) // 11 11 /4
		"movq (%%eax, %1, 2), %%mm6 \n\t" // 1
		PAVGB(%%mm6, %%mm1) // 11 4 2 /8
		PAVGB(%%mm0, %%mm1) // 11224222 /16
		"movq %%mm1, (%%eax, %1, 2) \n\t" // X
		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
		PAVGB((%%ebx), %%mm2) // 112 4 /8
		"movq (%%eax, %1, 4), %%mm0 \n\t" // 1
		PAVGB(%%mm0, %%mm6) // 1 1 /2
		PAVGB(%%mm7, %%mm6) // 1 12 /4
		PAVGB(%%mm2, %%mm6) // 1122424 /4
		"movq %%mm6, (%%ebx) \n\t" // X
		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
		PAVGB(%%mm7, %%mm5) // 11 2 /4
		PAVGB(%%mm7, %%mm5) // 11 6 /8

		PAVGB(%%mm3, %%mm0) // 112 /4
		PAVGB(%%mm0, %%mm5) // 112246 /16
		"movq %%mm5, (%%eax, %1, 4) \n\t" // X
		"popl %0\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];

		int sums[9];
		sums[0] = first + src[l1];
		sums[1] = src[l1] + src[l2];
		sums[2] = src[l2] + src[l3];
		sums[3] = src[l3] + src[l4];
		sums[4] = src[l4] + src[l5];
		sums[5] = src[l5] + src[l6];
		sums[6] = src[l6] + src[l7];
		sums[7] = src[l7] + src[l8];
		sums[8] = src[l8] + last;

		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;

		src++;
	}

#endif
}
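
/*
 * Worked reference (illustrative, not called anywhere): the topmost filtered
 * line produced by the (1,1,2,2,4,2,2,1,1)/16 pass above, with the taps that
 * fall above the block clamped to the QP-selected "first" value:
 */
static inline int vertLowPassFirstLineRef(int first, int l1, int l2, int l3,
                                          int l4, int l5)
{
	// 6*first covers the 1,1,2,2 taps that run off the top of the block
	return (6*first + 4*l1 + 2*l2 + 2*l3 + l4 + l5 + 8) >> 4;
}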

/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
 *
 * example (gradient 0 8 16 24, so the step across the edge is x = 8):
 *	x = 8
 *	x/2 = 4
 *	x/8 = 1
 *	result: 1 12 12 23, i.e. a visible bend appears in a previously smooth ramp
 */
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
// FIXME rounding
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq pQPb, %%mm0 \n\t" // QP,..., QP
		"movq %%mm0, %%mm1 \n\t" // QP,..., QP
		"paddusb b02, %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
		"movq (%0, %1, 4), %%mm2 \n\t" // line 4
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq %%mm2, %%mm4 \n\t" // line 4
		"pcmpeqb %%mm5, %%mm5 \n\t" // -1
		"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
		PAVGB(%%mm3, %%mm5)
		"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
		"psubusb %%mm3, %%mm4 \n\t"
		"psubusb %%mm2, %%mm3 \n\t"
		"por %%mm3, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t"
		"pand %%mm4, %%mm5 \n\t" // d/2

//		"paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
		"paddb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%0,%1, 4) \n\t"

		"movq (%%ebx), %%mm2 \n\t"
//		"paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
		"psubb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t"

		"paddb %%mm6, %%mm5 \n\t"
		"psrlw $2, %%mm5 \n\t"
		"pand b3F, %%mm5 \n\t"
		"psubb b20, %%mm5 \n\t" // (l5-l4)/8

		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
		"paddsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ebx, %1), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
		"psubsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1) \n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
//	const int l7= stride + l6;
//	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(ABS(src[l4]-src[l5]) < QP + QP/4)
		{
			int v = (src[l5] - src[l4]);

			src[l3] +=v/8;
			src[l4] +=v/2;
			src[l5] -=v/2;
			src[l6] -=v/8;

		}
		src++;
	}

#endif
}
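
/*
 * Reference sketch (illustrative, not called anywhere): the b02/psrlw/b3F
 * sequence in the MMX2/3DNow code above builds its per-byte threshold as
 * QP + (QP+2)/4, i.e. roughly the "QP + QP/4" used by the C fall-back:
 */
static inline int rk1ThresholdRef(int QP)
{
	return QP + (QP + 2)/4;
}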

/**
 * Experimental Filter 1
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
 * the MMX2 version does correct clipping, the C version doesn't
 */
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;

	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
//		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"movq (%0, %1, 4), %%mm1 \n\t" // line 4
		"movq %%mm1, %%mm2 \n\t" // line 4
		"psubusb %%mm0, %%mm1 \n\t"
		"psubusb %%mm2, %%mm0 \n\t"
		"por %%mm1, %%mm0 \n\t" // |l3 - l4|
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq (%%ebx, %1), %%mm4 \n\t" // line 6
		"movq %%mm3, %%mm5 \n\t" // line 5
		"psubusb %%mm4, %%mm3 \n\t"
		"psubusb %%mm5, %%mm4 \n\t"
		"por %%mm4, %%mm3 \n\t" // |l5 - l6|
		PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
		"movq %%mm2, %%mm1 \n\t" // line 4
		"psubusb %%mm5, %%mm2 \n\t"
		"movq %%mm2, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
		"psubusb %%mm1, %%mm5 \n\t"
		"por %%mm5, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
		"movq %%mm4, %%mm3 \n\t" // d
		"psubusb pQPb, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
		"psubusb b01, %%mm3 \n\t"
		"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0

		PAVGB(%%mm7, %%mm3) // d/2
		"movq %%mm3, %%mm1 \n\t" // d/2
		PAVGB(%%mm7, %%mm3) // d/4
		PAVGB(%%mm1, %%mm3) // 3*d/8

		"movq (%0, %1, 4), %%mm0 \n\t" // line 4
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t" // line 4

		"movq (%%ebx), %%mm0 \n\t" // line 5
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx) \n\t" // line 5

		PAVGB(%%mm7, %%mm1) // d/4

		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3

		"movq (%%ebx, %1), %%mm0 \n\t" // line 6
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1) \n\t" // line 6

		PAVGB(%%mm7, %%mm1) // d/8

		"movq (%%eax, %1), %%mm0 \n\t" // line 2
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1) \n\t" // line 2

		"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else

	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
//	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;

	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		int a= src[l3] - src[l4];
		int b= src[l4] - src[l5];
		int c= src[l5] - src[l6];

		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			src[l2] +=v/8;
			src[l3] +=v/4;
			src[l4] +=3*v/8;
			src[l5] -=3*v/8;
			src[l6] -=v/4;
			src[l7] -=v/8;

		}
		src++;
	}
	/*
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	for(int x=0; x<BLOCK_SIZE; x++)
	{
		int v2= src[l2];
		int v3= src[l3];
		int v4= src[l4];
		int v5= src[l5];
		int v6= src[l6];
		int v7= src[l7];

		if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
		{
			src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
			src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
			src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
			src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
		}
		src++;
	}
*/
#endif
}

/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
 * the MMX2 version does correct clipping, the C version doesn't
 * not identical with the vertical one
 */
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
	int y;
	static uint64_t *lut= NULL;
	if(lut==NULL)
	{
		int i;
		lut= (uint64_t*)memalign(8, 256*8);
		for(i=0; i<256; i++)
		{
			int v= i < 128 ? 2*i : 2*(i-256);
/*
//Simulate 112242211 9-Tap filter
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v/8) & 0xFF;
			uint64_t c= (v/4) & 0xFF;
			uint64_t d= (3*v/8) & 0xFF;
*/
//Simulate piecewise linear interpolation
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v*3/16) & 0xFF;
			uint64_t c= (v*5/16) & 0xFF;
			uint64_t d= (7*v/16) & 0xFF;
			uint64_t A= (0x100 - a)&0xFF;
			uint64_t B= (0x100 - b)&0xFF;
			uint64_t C= (0x100 - c)&0xFF;
			uint64_t D= (0x100 - d)&0xFF; // complements d, like A..C complement a..c

			lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
				(D<<24) | (C<<16) | (B<<8) | (A);
			//lut[i] = (v<<32) | (v<<24);
		}
	}
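	/*
	 * Reference note (illustrative): for a step of height v across the block
	 * edge, the "piecewise linear interpolation" weights packed above nudge
	 * the pixel k steps away from the edge (k = 0..3) by roughly
	 * v*(7 - 2*k)/16, so d = 7*v/16 sits next to the edge and a = v/16 is the
	 * farthest, while the complemented bytes A..D carry the correction for
	 * the other side of the edge.
	 */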

#if 0
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
//		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"

		"movq b80, %%mm6 \n\t"
		"movd pQPb, %%mm5 \n\t" // QP
		"movq %%mm5, %%mm4 \n\t"
		"paddusb %%mm5, %%mm5 \n\t" // 2QP
		"paddusb %%mm5, %%mm4 \n\t" // 3QP
		"pxor %%mm5, %%mm5 \n\t" // 0
		"psubb %%mm4, %%mm5 \n\t" // -3QP
		"por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
		"psllq $24, %%mm5 \n\t"

//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

#define HX1old(a) \
	"movd " #a ", %%mm0 \n\t"\
	"movd 4" #a ", %%mm1 \n\t"\
	"punpckldq %%mm1, %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"psrlq $8, %%mm1 \n\t"\
	"psubusb %%mm1, %%mm2 \n\t"\
	"psubusb %%mm0, %%mm1 \n\t"\