Independed compilation of SUBDIRS
[libav.git] / postproc / postprocess_template.c
CommitLineData
3057fa66
A
1/*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
/*
 C MMX MMX2 3DNow
isVertDC Ec Ec
isVertMinMaxOk Ec Ec
doVertLowPass E e e
doVertDefFilter Ec Ec Ec
isHorizDC Ec Ec
isHorizMinMaxOk a
doHorizLowPass E a a
doHorizDefFilter E ac ac
deRing
Vertical RKAlgo1 E a a
Vertical X1 a E E
Horizontal X1 a E E
LinIpolDeinterlace e E E*
CubicIpolDeinterlace a e e*
LinBlendDeinterlace e E E*
MedianDeinterlace Ec Ec


* I don't have a 3DNow CPU -> it's untested
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
*/
45
/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement dering
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once)
	(the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
write an exact implementation of the horizontal deblocking filter
...

Notes:

*/
69
a6be8111 70//Changelog: use the CVS log
3057fa66
A
71
72#include <inttypes.h>
73#include <stdio.h>
d5a1a995 74#include <stdlib.h>
911879d1 75#include <string.h>
3057fa66 76#include "../config.h"
3057fa66 77//#undef HAVE_MMX2
13e00528 78//#define HAVE_3DNOW
3057fa66 79//#undef HAVE_MMX
13e00528 80#include "postprocess.h"
3057fa66 81
e939e1c3
A
/*
 * Small arithmetic helpers.
 * NOTE: these are function-like macros, every argument may be evaluated
 * more than once -- never pass an expression with side effects.
 * ABS(0) is 0 and SIGN(0) is -1 (only strictly positive values map to +1).
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)

/*
 * PAVGB(a,b) expands to the inline-asm text of a packed unsigned byte average
 * "b = avg(a, b)": pavgb when HAVE_MMX2 is defined, pavgusb when only
 * HAVE_3DNOW is defined. It is intentionally left undefined otherwise; all
 * users are guarded by the same two symbols.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
3057fa66 92
911879d1
MN
93#define GET_MODE_BUFFER_SIZE 500
94#define OPTIONS_ARRAY_SIZE 10
95
96
3057fa66
A
/*
 * 64-bit constants and scratch qwords used by the MMX code.
 * They are referenced BY NAME from the inline-asm strings in this file
 * (e.g. "movq b7E, %%mm7"), so they must stay real, non-const, file-scope
 * objects with exactly these identifiers.
 *
 * Naming, as visible from the values:
 *   wXX        - the 16-bit word 0x00XX (or 0xXXXX) repeated 4 times
 *   bXX        - the byte 0xXX repeated 8 times
 *   bmDDDDDDDD - byte mask, one digit per byte, most significant byte first
 *                (1 -> 0xFF, 0 -> 0x00)
 */
static uint64_t packedYOffset=	0x0000000000000000LL;
static uint64_t packedYScale=	0x0100010001000100LL;
static uint64_t w05=		0x0005000500050005LL;
static uint64_t w20=		0x0020002000200020LL;
static uint64_t w1400=		0x1400140014001400LL;
static uint64_t bm00000001=	0x00000000000000FFLL;
static uint64_t bm00010000=	0x000000FF00000000LL;
static uint64_t bm00001000=	0x00000000FF000000LL;
static uint64_t bm10000000=	0xFF00000000000000LL;
static uint64_t bm10000001=	0xFF000000000000FFLL;
static uint64_t bm11000011=	0xFFFF00000000FFFFLL;
static uint64_t bm00000011=	0x000000000000FFFFLL;
static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000=	0xFFFF000000000000LL;
static uint64_t bm00011000=	0x000000FFFF000000LL;
static uint64_t bm00110011=	0x0000FFFF0000FFFFLL;
static uint64_t bm11001100=	0xFFFF0000FFFF0000LL;
static uint64_t b00= 		0x0000000000000000LL;
static uint64_t b01= 		0x0101010101010101LL;
static uint64_t b02= 		0x0202020202020202LL;
static uint64_t b0F= 		0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF= 		0xFFFFFFFFFFFFFFFFLL;
static uint64_t b20= 		0x2020202020202020LL;
static uint64_t b80= 		0x8080808080808080LL;
static uint64_t b7E= 		0x7E7E7E7E7E7E7E7ELL;
static uint64_t b7C= 		0x7C7C7C7C7C7C7C7CLL;
static uint64_t b3F= 		0x3F3F3F3F3F3F3F3FLL;

/* scratch qwords for the asm code */
static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;

/* the asm filters read the quantizer from here ("QP,..., QP" per their comments) instead of from their QP argument */
static uint64_t pQPb=0;

static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data
3057fa66
A
132
/*
 * Flatness thresholds. vFlatnessThreshold is the number of "nearly equal"
 * vertical pixel pairs that isVertDC() must exceed to call a block flat;
 * hFlatnessThreshold is the horizontal counterpart.
 */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness-corrected picture
double maxClippedThreshold= 0.01;

/* allowed luma range (defaults: 16..234) */
int maxAllowedY=234;
int minAllowedY=16;
3057fa66 141
911879d1
MN
142static struct PPFilter filters[]=
143{
144 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
145 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
146 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
147 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
148 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
149 {"dr", "dering", 1, 5, 6, DERING},
150 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
151 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
152 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
153 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
154 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
155 {NULL, NULL,0,0,0,0} //End Marker
156};
157
/*
 * Alias table, stored as consecutive (alias, expansion) string pairs and
 * terminated by a single NULL. "default"/"de" and "fast"/"fa" are the long
 * and short spellings of the same two presets.
 */
static char *replaceTable[]=
{
	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	NULL //End Marker
};
166
#ifdef TIMING
/**
 * Read the CPU time stamp counter with the x86 RDTSC instruction.
 * Only compiled when TIMING is defined.
 * The "=A" constraint is what binds the 64-bit result to the register pair
 * RDTSC writes on 32-bit x86 (see the GCC machine-constraint docs).
 * @return the counter value
 */
static inline long long rdtsc(void)
{
	long long l;
	asm volatile(	"rdtsc\n\t"
		: "=A" (l)
	);
	return l;
}
#endif
3057fa66 178
#ifdef HAVE_MMX2
/*
 * Thin wrappers around the four cache prefetch instructions.
 * Each one issues a single prefetch for the address p; they have no result.
 * Only compiled when HAVE_MMX2 is defined.
 */

/** Issue "prefetchnta" for the memory at p. */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht0" for the memory at p. */
static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht1" for the memory at p. */
static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

/** Issue "prefetcht2" for the memory at p. */
static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
3057fa66
A
208
209//FIXME? |255-0| = 1 (shouldnt be a problem ...)
210/**
acced553 211 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 212 */
d5a1a995 213static inline int isVertDC(uint8_t src[], int stride){
3057fa66 214 int numEq= 0;
d5a1a995 215 int y;
acced553 216 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 217#ifdef HAVE_MMX
37da00fc
MN
218asm volatile(
219 "leal (%1, %2), %%eax \n\t"
220 "leal (%%eax, %2, 4), %%ebx \n\t"
221// 0 1 2 3 4 5 6 7 8 9
222// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
223 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
224 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
225 "movq (%1), %%mm0 \n\t"
37da00fc 226 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
227 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
228 "paddb %%mm7, %%mm0 \n\t"
229 "pcmpgtb %%mm6, %%mm0 \n\t"
230
37da00fc 231 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
232 "psubb %%mm2, %%mm1 \n\t"
233 "paddb %%mm7, %%mm1 \n\t"
234 "pcmpgtb %%mm6, %%mm1 \n\t"
235 "paddb %%mm1, %%mm0 \n\t"
236
37da00fc 237 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
238 "psubb %%mm1, %%mm2 \n\t"
239 "paddb %%mm7, %%mm2 \n\t"
240 "pcmpgtb %%mm6, %%mm2 \n\t"
241 "paddb %%mm2, %%mm0 \n\t"
242
37da00fc 243 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
244 "psubb %%mm2, %%mm1 \n\t"
245 "paddb %%mm7, %%mm1 \n\t"
246 "pcmpgtb %%mm6, %%mm1 \n\t"
247 "paddb %%mm1, %%mm0 \n\t"
248
37da00fc 249 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
250 "psubb %%mm1, %%mm2 \n\t"
251 "paddb %%mm7, %%mm2 \n\t"
252 "pcmpgtb %%mm6, %%mm2 \n\t"
253 "paddb %%mm2, %%mm0 \n\t"
254
37da00fc 255 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
256 "psubb %%mm2, %%mm1 \n\t"
257 "paddb %%mm7, %%mm1 \n\t"
258 "pcmpgtb %%mm6, %%mm1 \n\t"
259 "paddb %%mm1, %%mm0 \n\t"
260
37da00fc 261 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
262 "psubb %%mm1, %%mm2 \n\t"
263 "paddb %%mm7, %%mm2 \n\t"
264 "pcmpgtb %%mm6, %%mm2 \n\t"
265 "paddb %%mm2, %%mm0 \n\t"
266
267 " \n\t"
268 "movq %%mm0, %%mm1 \n\t"
269 "psrlw $8, %%mm0 \n\t"
270 "paddb %%mm1, %%mm0 \n\t"
271 "movq %%mm0, %%mm1 \n\t"
272 "psrlq $16, %%mm0 \n\t"
273 "paddb %%mm1, %%mm0 \n\t"
274 "movq %%mm0, %%mm1 \n\t"
275 "psrlq $32, %%mm0 \n\t"
276 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
277 "movd %%mm0, %0 \n\t"
278 : "=r" (numEq)
279 : "r" (src), "r" (stride)
280 );
3057fa66 281
37da00fc 282 numEq= (256 - numEq) &0xFF;
3057fa66
A
283
284#else
d5a1a995 285 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
286 {
287 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
288 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
289 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
290 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
291 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
292 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
293 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
294 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
295 src+= stride;
296 }
297#endif
298/* if(abs(numEq - asmEq) > 0)
299 {
300 printf("\nasm:%d c:%d\n", asmEq, numEq);
301 for(int y=0; y<8; y++)
302 {
303 for(int x=0; x<8; x++)
304 {
305 printf("%d ", temp[x + y*stride]);
306 }
307 printf("\n");
308 }
309 }
310*/
d5a1a995
MN
311// for(int i=0; i<numEq/8; i++) src[i]=255;
312 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
313}
314
d5a1a995 315static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
316{
317#ifdef HAVE_MMX
318 int isOk;
acced553 319 src+= stride*3;
3057fa66
A
320 asm volatile(
321// "int $3 \n\t"
322 "movq (%1, %2), %%mm0 \n\t"
323 "movq (%1, %2, 8), %%mm1 \n\t"
324 "movq %%mm0, %%mm2 \n\t"
325 "psubusb %%mm1, %%mm0 \n\t"
326 "psubusb %%mm2, %%mm1 \n\t"
327 "por %%mm1, %%mm0 \n\t" // ABS Diff
328
329 "movq pQPb, %%mm7 \n\t" // QP,..., QP
330 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
331 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
332 "pcmpeqd b00, %%mm0 \n\t"
333 "psrlq $16, %%mm0 \n\t"
334 "pcmpeqd bFF, %%mm0 \n\t"
335// "movd %%mm0, (%1, %2, 4)\n\t"
336 "movd %%mm0, %0 \n\t"
337 : "=r" (isOk)
338 : "r" (src), "r" (stride)
339 );
ac0b0b2f 340 return isOk;
3057fa66
A
341#else
342
d5a1a995
MN
343 int isOk2= 1;
344 int x;
acced553 345 src+= stride*3;
d5a1a995 346 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 347 {
d5a1a995 348 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
349 }
350/* if(isOk && !isOk2 || !isOk && isOk2)
351 {
352 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
353 for(int y=0; y<9; y++)
354 {
355 for(int x=0; x<8; x++)
356 {
357 printf("%d ", src[x + y*stride]);
358 }
359 printf("\n");
360 }
361 } */
362
363 return isOk2;
364#endif
365
366}
367
368/**
acced553 369 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 370 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
371 */
372static inline void doVertLowPass(uint8_t *src, int stride, int QP)
373{
13e00528 374#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 375 src+= stride*3;
3057fa66
A
376 asm volatile( //"movv %0 %1 %2\n\t"
377 "pushl %0 \n\t"
378 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
379
380 "movq (%0), %%mm6 \n\t"
381 "movq (%0, %1), %%mm5 \n\t"
382 "movq %%mm5, %%mm1 \n\t"
383 "movq %%mm6, %%mm2 \n\t"
384 "psubusb %%mm6, %%mm5 \n\t"
385 "psubusb %%mm1, %%mm2 \n\t"
386 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
387 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
388 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
389
390 "pand %%mm2, %%mm6 \n\t"
391 "pandn %%mm1, %%mm2 \n\t"
392 "por %%mm2, %%mm6 \n\t"// First Line to Filter
393
394 "movq (%0, %1, 8), %%mm5 \n\t"
395 "leal (%0, %1, 4), %%eax \n\t"
396 "leal (%0, %1, 8), %%ebx \n\t"
397 "subl %1, %%ebx \n\t"
398 "addl %1, %0 \n\t" // %0 points to line 1 not 0
399 "movq (%0, %1, 8), %%mm7 \n\t"
400 "movq %%mm5, %%mm1 \n\t"
401 "movq %%mm7, %%mm2 \n\t"
402 "psubusb %%mm7, %%mm5 \n\t"
403 "psubusb %%mm1, %%mm2 \n\t"
404 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
405 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
406 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
407
408 "pand %%mm2, %%mm7 \n\t"
409 "pandn %%mm1, %%mm2 \n\t"
410 "por %%mm2, %%mm7 \n\t" // First Line to Filter
411
412
413 // 1 2 3 4 5 6 7 8
414 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
415 // 6 4 2 2 1 1
416 // 6 4 4 2
417 // 6 8 2
acced553 418
3057fa66
A
419 "movq (%0, %1), %%mm0 \n\t" // 1
420 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
421 PAVGB(%%mm6, %%mm0) //1 1 /2
422 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
423
424 "movq (%0, %1, 4), %%mm2 \n\t" // 1
425 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
426 PAVGB((%%eax), %%mm2) // 11 /2
427 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
428 "movq %%mm2, %%mm3 \n\t" // 211 /4
429 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
430 PAVGB(%%mm4, %%mm3) // 4 211 /8
431 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
432 "movq %%mm3, (%0) \n\t" // X
433 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
434 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 435 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 436 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
437 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
438 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
439 PAVGB((%%eax), %%mm5) // 211 /4
440 PAVGB(%%mm5, %%mm3) // 2 2211 /8
441 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
442 "movq %%mm3, (%0,%1) \n\t" // X
443 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 444 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 445 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 446 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 447 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
448 PAVGB(%%mm1, %%mm0) // 2 11/4
449 PAVGB(%%mm6, %%mm0) //222 11/8
450 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
451 "movq (%0, %1, 2), %%mm2 \n\t" // 1
452 "movq %%mm0, (%0, %1, 2) \n\t" // X
453 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
454 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
455 PAVGB((%%ebx), %%mm0) // 11 /2
456 PAVGB(%%mm0, %%mm6) //11 11 /4
457 PAVGB(%%mm1, %%mm4) // 11 /2
458 PAVGB(%%mm2, %%mm1) // 11 /2
459 PAVGB(%%mm1, %%mm6) //1122 11 /8
460 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
461 "movq (%%eax), %%mm5 \n\t" // 1
462 "movq %%mm6, (%%eax) \n\t" // X
463 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
464 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
465 PAVGB(%%mm7, %%mm6) // 11 /2
466 PAVGB(%%mm4, %%mm6) // 11 11 /4
467 PAVGB(%%mm3, %%mm6) // 11 2211 /8
468 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 469 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
470 PAVGB(%%mm4, %%mm2) // 112 /4
471 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
472 "movq %%mm6, (%0, %1, 4) \n\t" // X
473 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
474 PAVGB(%%mm7, %%mm1) // 11 2 /4
475 PAVGB(%%mm4, %%mm5) // 11 /2
476 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 477 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
478 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
479 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
480 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
481 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 482 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 483 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
484 PAVGB(%%mm0, %%mm6) // 1 1 /2
485 PAVGB(%%mm7, %%mm6) // 1 12 /4
486 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
487 "movq %%mm6, (%%ebx) \n\t" // X
488 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
489 PAVGB(%%mm7, %%mm5) // 11 2 /4
490 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 491
13e00528
A
492 PAVGB(%%mm3, %%mm0) // 112 /4
493 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66
A
494 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
495 "popl %0\n\t"
496
497 :
498 : "r" (src), "r" (stride)
499 : "%eax", "%ebx"
500 );
3057fa66
A
501#else
502 const int l1= stride;
503 const int l2= stride + l1;
504 const int l3= stride + l2;
505 const int l4= stride + l3;
506 const int l5= stride + l4;
507 const int l6= stride + l5;
508 const int l7= stride + l6;
509 const int l8= stride + l7;
510 const int l9= stride + l8;
d5a1a995 511 int x;
acced553 512 src+= stride*3;
d5a1a995 513 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
514 {
515 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
516 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
517
518 int sums[9];
519 sums[0] = first + src[l1];
520 sums[1] = src[l1] + src[l2];
521 sums[2] = src[l2] + src[l3];
522 sums[3] = src[l3] + src[l4];
523 sums[4] = src[l4] + src[l5];
524 sums[5] = src[l5] + src[l6];
525 sums[6] = src[l6] + src[l7];
526 sums[7] = src[l7] + src[l8];
527 sums[8] = src[l8] + last;
528
529 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
530 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
531 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
532 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
533 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
534 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
535 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
536 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
537
538 src++;
539 }
540
541#endif
542}
543
13e00528
A
544/**
545 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
546 * values are correctly clipped (MMX2)
547 * values are wraparound (C)
548 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
549 0 8 16 24
550 x = 8
551 x/2 = 4
552 x/8 = 1
553 1 12 12 23
554 */
9f45d04d 555static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 556{
d5a1a995 557#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 558 src+= stride*3;
13e00528
A
559// FIXME rounding
560 asm volatile(
561 "pxor %%mm7, %%mm7 \n\t" // 0
562 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
563 "leal (%0, %1), %%eax \n\t"
564 "leal (%%eax, %1, 4), %%ebx \n\t"
565// 0 1 2 3 4 5 6 7 8 9
566// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
567 "movq pQPb, %%mm0 \n\t" // QP,..., QP
568 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
569 "paddusb b02, %%mm0 \n\t"
570 "psrlw $2, %%mm0 \n\t"
571 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
572 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
573 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
574 "movq (%%ebx), %%mm3 \n\t" // line 5
575 "movq %%mm2, %%mm4 \n\t" // line 4
576 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
577 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 578 PAVGB(%%mm3, %%mm5)
13e00528
A
579 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
580 "psubusb %%mm3, %%mm4 \n\t"
581 "psubusb %%mm2, %%mm3 \n\t"
582 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
583 "psubusb %%mm0, %%mm4 \n\t"
584 "pcmpeqb %%mm7, %%mm4 \n\t"
585 "pand %%mm4, %%mm5 \n\t" // d/2
586
587// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
588 "paddb %%mm5, %%mm2 \n\t"
589// "psubb %%mm6, %%mm2 \n\t"
590 "movq %%mm2, (%0,%1, 4) \n\t"
591
592 "movq (%%ebx), %%mm2 \n\t"
593// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
594 "psubb %%mm5, %%mm2 \n\t"
595// "psubb %%mm6, %%mm2 \n\t"
596 "movq %%mm2, (%%ebx) \n\t"
597
598 "paddb %%mm6, %%mm5 \n\t"
599 "psrlw $2, %%mm5 \n\t"
600 "pand b3F, %%mm5 \n\t"
601 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
602
603 "movq (%%eax, %1, 2), %%mm2 \n\t"
604 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
605 "paddsb %%mm5, %%mm2 \n\t"
606 "psubb %%mm6, %%mm2 \n\t"
607 "movq %%mm2, (%%eax, %1, 2) \n\t"
608
609 "movq (%%ebx, %1), %%mm2 \n\t"
610 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
611 "psubsb %%mm5, %%mm2 \n\t"
612 "psubb %%mm6, %%mm2 \n\t"
613 "movq %%mm2, (%%ebx, %1) \n\t"
614
615 :
616 : "r" (src), "r" (stride)
617 : "%eax", "%ebx"
618 );
619#else
620 const int l1= stride;
621 const int l2= stride + l1;
622 const int l3= stride + l2;
623 const int l4= stride + l3;
624 const int l5= stride + l4;
625 const int l6= stride + l5;
626 const int l7= stride + l6;
627 const int l8= stride + l7;
628 const int l9= stride + l8;
d5a1a995 629 int x;
acced553 630 src+= stride*3;
d5a1a995 631 for(x=0; x<BLOCK_SIZE; x++)
13e00528
A
632 {
633 if(ABS(src[l4]-src[l5]) < QP + QP/4)
634 {
d5a1a995
MN
635 int v = (src[l5] - src[l4]);
636
637 src[l3] +=v/8;
638 src[l4] +=v/2;
639 src[l5] -=v/2;
640 src[l6] -=v/8;
13e00528 641
13e00528
A
642 }
643 src++;
644 }
645
646#endif
647}
648
649/**
650 * Experimental Filter 1
9f45d04d
MN
651 * will not damage linear gradients
652 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
653 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
654 * MMX2 version does correct clipping C version doesnt
13e00528
A
655 */
656static inline void vertX1Filter(uint8_t *src, int stride, int QP)
657{
d5a1a995 658#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
659 src+= stride*3;
660
13e00528 661 asm volatile(
d5a1a995
MN
662 "pxor %%mm7, %%mm7 \n\t" // 0
663// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
664 "leal (%0, %1), %%eax \n\t"
665 "leal (%%eax, %1, 4), %%ebx \n\t"
666// 0 1 2 3 4 5 6 7 8 9
667// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
668 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
669 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
670 "movq %%mm1, %%mm2 \n\t" // line 4
671 "psubusb %%mm0, %%mm1 \n\t"
672 "psubusb %%mm2, %%mm0 \n\t"
673 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
674 "movq (%%ebx), %%mm3 \n\t" // line 5
675 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
676 "movq %%mm3, %%mm5 \n\t" // line 5
677 "psubusb %%mm4, %%mm3 \n\t"
678 "psubusb %%mm5, %%mm4 \n\t"
679 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
680 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
681 "movq %%mm2, %%mm1 \n\t" // line 4
682 "psubusb %%mm5, %%mm2 \n\t"
683 "movq %%mm2, %%mm4 \n\t"
684 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
685 "psubusb %%mm1, %%mm5 \n\t"
686 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
687 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
688 "movq %%mm4, %%mm3 \n\t" // d
689 "psubusb pQPb, %%mm4 \n\t"
690 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 691 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
692 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
693
694 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
695 "movq %%mm3, %%mm1 \n\t" // d/2
696 PAVGB(%%mm7, %%mm3) // d/4
697 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
698
699 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
701 "psubusb %%mm3, %%mm0 \n\t"
702 "pxor %%mm2, %%mm0 \n\t"
703 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
704
705 "movq (%%ebx), %%mm0 \n\t" // line 5
706 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
707 "paddusb %%mm3, %%mm0 \n\t"
708 "pxor %%mm2, %%mm0 \n\t"
709 "movq %%mm0, (%%ebx) \n\t" // line 5
710
9f45d04d 711 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
712
713 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 715 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
716 "pxor %%mm2, %%mm0 \n\t"
717 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
718
719 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
720 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 721 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
722 "pxor %%mm2, %%mm0 \n\t"
723 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
724
9f45d04d 725 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
726
727 "movq (%%eax, %1), %%mm0 \n\t" // line 2
728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 729 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
730 "pxor %%mm2, %%mm0 \n\t"
731 "movq %%mm0, (%%eax, %1) \n\t" // line 2
732
733 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
734 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 735 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
736 "pxor %%mm2, %%mm0 \n\t"
737 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
738
739 :
740 : "r" (src), "r" (stride)
741 : "%eax", "%ebx"
742 );
743#else
d5a1a995
MN
744
745 const int l1= stride;
746 const int l2= stride + l1;
747 const int l3= stride + l2;
748 const int l4= stride + l3;
749 const int l5= stride + l4;
750 const int l6= stride + l5;
751 const int l7= stride + l6;
752 const int l8= stride + l7;
753 const int l9= stride + l8;
754 int x;
acced553
MN
755
756 src+= stride*3;
d5a1a995
MN
757 for(x=0; x<BLOCK_SIZE; x++)
758 {
759 int a= src[l3] - src[l4];
760 int b= src[l4] - src[l5];
9f45d04d 761 int c= src[l5] - src[l6];
d5a1a995
MN
762
763 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
764
765 if(d < QP)
766 {
767 int v = d * SIGN(-b);
768
769 src[l2] +=v/8;
770 src[l3] +=v/4;
9f45d04d
MN
771 src[l4] +=3*v/8;
772 src[l5] -=3*v/8;
d5a1a995
MN
773 src[l6] -=v/4;
774 src[l7] -=v/8;
775
776 }
777 src++;
778 }
779 /*
13e00528
A
780 const int l1= stride;
781 const int l2= stride + l1;
782 const int l3= stride + l2;
783 const int l4= stride + l3;
784 const int l5= stride + l4;
785 const int l6= stride + l5;
786 const int l7= stride + l6;
787 const int l8= stride + l7;
788 const int l9= stride + l8;
789 for(int x=0; x<BLOCK_SIZE; x++)
790 {
791 int v2= src[l2];
792 int v3= src[l3];
793 int v4= src[l4];
794 int v5= src[l5];
795 int v6= src[l6];
796 int v7= src[l7];
797
798 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
799 {
800 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
801 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
802 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
803 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
804 }
805 src++;
806 }
d5a1a995 807*/
13e00528
A
808#endif
809}
810
9f45d04d
MN
811/**
812 * Experimental Filter 1 (Horizontal)
813 * will not damage linear gradients
814 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
815 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
816 * MMX2 version does correct clipping C version doesnt
817 * not identical with the vertical one
818 */
819static inline void horizX1Filter(uint8_t *src, int stride, int QP)
820{
821 int y;
822 static uint64_t *lut= NULL;
823 if(lut==NULL)
824 {
825 int i;
826 lut= (uint64_t*)memalign(8, 256*8);
827 for(i=0; i<256; i++)
828 {
829 int v= i < 128 ? 2*i : 2*(i-256);
830/*
831//Simulate 112242211 9-Tap filter
832 uint64_t a= (v/16) & 0xFF;
833 uint64_t b= (v/8) & 0xFF;
834 uint64_t c= (v/4) & 0xFF;
835 uint64_t d= (3*v/8) & 0xFF;
836*/
837//Simulate piecewise linear interpolation
838 uint64_t a= (v/16) & 0xFF;
839 uint64_t b= (v*3/16) & 0xFF;
840 uint64_t c= (v*5/16) & 0xFF;
841 uint64_t d= (7*v/16) & 0xFF;
842 uint64_t A= (0x100 - a)&0xFF;
843 uint64_t B= (0x100 - b)&0xFF;
844 uint64_t C= (0x100 - c)&0xFF;
845 uint64_t D= (0x100 - c)&0xFF;
846
847 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
848 (D<<24) | (C<<16) | (B<<8) | (A);
849 //lut[i] = (v<<32) | (v<<24);
850 }
851 }
852
853#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
854 asm volatile(
855 "pxor %%mm7, %%mm7 \n\t" // 0
856// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
857 "leal (%0, %1), %%eax \n\t"
858 "leal (%%eax, %1, 4), %%ebx \n\t"
859
860 "movq b80, %%mm6 \n\t"
79cccf70 861 "movd pQPb, %%mm5 \n\t" // QP
9f45d04d
MN
862 "movq %%mm5, %%mm4 \n\t"
863 "paddusb %%mm5, %%mm5 \n\t" // 2QP
864 "paddusb %%mm5, %%mm4 \n\t" // 3QP
865 "pxor %%mm5, %%mm5 \n\t" // 0
866 "psubb %%mm4, %%mm5 \n\t" // -3QP
867 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
868 "psllq $24, %%mm5 \n\t"
869
870// 0 1 2 3 4 5 6 7 8 9
871// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
872
873#define HX1old(a) \
874 "movd " #a ", %%mm0 \n\t"\
875 "movd 4" #a ", %%mm1 \n\t"\
876 "punpckldq %%mm1, %%mm0 \n\t"\
877 "movq %%mm0, %%mm1 \n\t"\
878 "movq %%mm0, %%mm2 \n\t"\
879 "psrlq $8, %%mm1 \n\t"\
880 "psubusb %%mm1, %%mm2 \n\t"\
881 "psubusb %%mm0, %%mm1 \n\t"\
882