MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
[libav.git] / postproc / postprocess.c
CommitLineData
/*
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
3057fa66
A
24doVertDefFilter Ec Ec Ec
25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
e5c30e06 28doHorizDefFilter Ec Ec Ec
2e212618 29deRing E e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
3b58b885
MN
32Vertical X1 a E E
33Horizontal X1 a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
a6be8111 37MedianDeinterlace Ec Ec


* I don't have a 3DNow CPU -> it's untested
E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
a = alternative / approximate implementation
c = checked against the other implementations (-vo md5)
*/
46
/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement dering
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once)
	(the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
border remover
optimize c versions
...

Notes:
*/
70
a6be8111 71//Changelog: use the CVS log
3057fa66
A
72
73#include <inttypes.h>
74#include <stdio.h>
d5a1a995 75#include <stdlib.h>
911879d1 76#include <string.h>
3057fa66 77#include "../config.h"
dda87e9f
PL
78#ifdef HAVE_MALLOC_H
79#include <malloc.h>
80#endif
3057fa66 81//#undef HAVE_MMX2
13e00528 82//#define HAVE_3DNOW
3057fa66 83//#undef HAVE_MMX
13e00528 84#include "postprocess.h"
3057fa66 85
/*
 * Small arithmetic helper macros.
 * Every argument may be evaluated more than once, so never pass an
 * expression with side effects (e.g. i++).
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
/* Note: SIGN(0) evaluates to -1, only strictly positive values give 1. */
#define SIGN(a) ((a) > 0 ? 1 : -1)
90
/*
 * Inline-asm building blocks. Each macro expands to a string literal that
 * is pasted into an asm() template (AT&T syntax: source first, destination
 * last).
 */

/* Packed unsigned byte average: b = avg(a, b). Only defined for MMX2 / 3DNow. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * Packed unsigned byte minimum, result in the second argument.
 * The plain MMX emulation needs a scratch register t; note that its
 * parameter list is (b,a,t) so that the call syntax PMINUB(x, y, t)
 * leaves min(x, y) in y in both variants.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
	"movq " #a ", " #t " \n\t"\
	"psubusb " #b ", " #t " \n\t"\
	"psubb " #t ", " #a " \n\t"
#endif

/*
 * Packed unsigned byte maximum: b = max(a, b).
 * The plain MMX emulation computes saturate(b - a) + a.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
113
114
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10


/*
 * 64-bit constants and scratch slots that the inline asm in this file
 * references directly by symbol name.
 *
 * Naming scheme (as can be read off the values):
 *   wXXXX      - the 16-bit word 0xXXXX replicated 4 times
 *   bXX        - the byte 0xXX replicated 8 times
 *   bmDDDDDDDD - byte mask, one digit per byte, most significant byte
 *                first; 1 -> 0xFF, 0 -> 0x00
 *
 * They are deliberately not const: temp0..temp5 / pQPb / pQPb2 are
 * scratch or parameter slots, and b00 is assigned in
 * unusedVariableWarningFixer().
 */
static uint64_t packedYOffset=	0x0000000000000000ULL;
static uint64_t packedYScale=	0x0100010001000100ULL;
static uint64_t w05=		0x0005000500050005ULL;
static uint64_t w20=		0x0020002000200020ULL;
static uint64_t w1400=		0x1400140014001400ULL;
static uint64_t bm00000001=	0x00000000000000FFULL;
static uint64_t bm00010000=	0x000000FF00000000ULL;
static uint64_t bm00001000=	0x00000000FF000000ULL;
static uint64_t bm10000000=	0xFF00000000000000ULL;
static uint64_t bm10000001=	0xFF000000000000FFULL;
static uint64_t bm11000011=	0xFFFF00000000FFFFULL;
static uint64_t bm00000011=	0x000000000000FFFFULL;
static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00ULL;
static uint64_t bm11000000=	0xFFFF000000000000ULL;
static uint64_t bm00011000=	0x000000FFFF000000ULL;
static uint64_t bm00110011=	0x0000FFFF0000FFFFULL;
static uint64_t bm11001100=	0xFFFF0000FFFF0000ULL;
static uint64_t b00=		0x0000000000000000ULL;
static uint64_t b01=		0x0101010101010101ULL;
static uint64_t b02=		0x0202020202020202ULL;
static uint64_t b0F=		0x0F0F0F0F0F0F0F0FULL;
static uint64_t b04=		0x0404040404040404ULL;
static uint64_t b08=		0x0808080808080808ULL;
static uint64_t bFF=		0xFFFFFFFFFFFFFFFFULL;
static uint64_t b20=		0x2020202020202020ULL;
static uint64_t b80=		0x8080808080808080ULL;
static uint64_t b7E=		0x7E7E7E7E7E7E7E7EULL;
static uint64_t b7C=		0x7C7C7C7C7C7C7C7CULL;
static uint64_t b3F=		0x3F3F3F3F3F3F3F3FULL;
static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;
/* the asm comments treat pQPb as "QP,..., QP" (QP in every byte); it is filled in elsewhere */
static uint64_t pQPb=0;
static uint64_t pQPb2=0;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
157
/* flatness thresholds; vFlatnessThreshold is what isVertDC() compares its count against */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;
3057fa66 166
911879d1
MN
167static struct PPFilter filters[]=
168{
169 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
170 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
171 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
172 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
173 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
174 {"dr", "dering", 1, 5, 6, DERING},
175 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
176 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
177 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
178 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
179 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
180 {NULL, NULL,0,0,0,0} //End Marker
181};
182
/*
 * Alias table stored as consecutive (name, expansion) string pairs and
 * terminated by a single NULL. "de" is the short form of "default" and
 * "fa" the short form of "fast" (same expansion strings).
 */
static char *replaceTable[]=
{
	"default",	"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"de",		"hdeblock:a,vdeblock:a,dering:a,autolevels",
	"fast",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	"fa",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
	NULL //End Marker
};
191
e5c30e06
MN
192static inline void unusedVariableWarningFixer()
193{
194if(
195 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
196 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
197 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
198 + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
199 + temp5 + pQPb== 0) b00=0;
200}
201
#ifdef TIMING
/**
 * Execute the x86 "rdtsc" instruction and return its 64-bit result.
 * The "=A" output constraint is used to receive the value; on 32-bit x86
 * GCC documents it as the edx:eax register pair.
 * NOTE(review): on x86-64 "=A" does not denote that pair - confirm before
 * building this with TIMING for a 64-bit target.
 */
static inline long long rdtsc(void)
{
	long long l;
	asm volatile(	"rdtsc\n\t"
		: "=A" (l)
	);
	return l;
}
#endif
3057fa66 213
#ifdef HAVE_MMX2
/*
 * Thin wrappers that issue one prefetch instruction for the address p.
 * They only differ in the instruction variant (nta / t0 / t1 / t2).
 */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
243
244//FIXME? |255-0| = 1 (shouldnt be a problem ...)
245/**
acced553 246 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 247 */
d5a1a995 248static inline int isVertDC(uint8_t src[], int stride){
3057fa66 249 int numEq= 0;
e5c30e06 250#ifndef HAVE_MMX
d5a1a995 251 int y;
e5c30e06 252#endif
acced553 253 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 254#ifdef HAVE_MMX
37da00fc
MN
255asm volatile(
256 "leal (%1, %2), %%eax \n\t"
257 "leal (%%eax, %2, 4), %%ebx \n\t"
258// 0 1 2 3 4 5 6 7 8 9
259// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
260 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
261 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
262 "movq (%1), %%mm0 \n\t"
37da00fc 263 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
264 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
265 "paddb %%mm7, %%mm0 \n\t"
266 "pcmpgtb %%mm6, %%mm0 \n\t"
267
37da00fc 268 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
269 "psubb %%mm2, %%mm1 \n\t"
270 "paddb %%mm7, %%mm1 \n\t"
271 "pcmpgtb %%mm6, %%mm1 \n\t"
272 "paddb %%mm1, %%mm0 \n\t"
273
37da00fc 274 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
275 "psubb %%mm1, %%mm2 \n\t"
276 "paddb %%mm7, %%mm2 \n\t"
277 "pcmpgtb %%mm6, %%mm2 \n\t"
278 "paddb %%mm2, %%mm0 \n\t"
279
37da00fc 280 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
281 "psubb %%mm2, %%mm1 \n\t"
282 "paddb %%mm7, %%mm1 \n\t"
283 "pcmpgtb %%mm6, %%mm1 \n\t"
284 "paddb %%mm1, %%mm0 \n\t"
285
37da00fc 286 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
287 "psubb %%mm1, %%mm2 \n\t"
288 "paddb %%mm7, %%mm2 \n\t"
289 "pcmpgtb %%mm6, %%mm2 \n\t"
290 "paddb %%mm2, %%mm0 \n\t"
291
37da00fc 292 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
293 "psubb %%mm2, %%mm1 \n\t"
294 "paddb %%mm7, %%mm1 \n\t"
295 "pcmpgtb %%mm6, %%mm1 \n\t"
296 "paddb %%mm1, %%mm0 \n\t"
297
37da00fc 298 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
299 "psubb %%mm1, %%mm2 \n\t"
300 "paddb %%mm7, %%mm2 \n\t"
301 "pcmpgtb %%mm6, %%mm2 \n\t"
302 "paddb %%mm2, %%mm0 \n\t"
303
304 " \n\t"
305 "movq %%mm0, %%mm1 \n\t"
306 "psrlw $8, %%mm0 \n\t"
307 "paddb %%mm1, %%mm0 \n\t"
e5c30e06
MN
308#ifdef HAVE_MMX2
309 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
310 "paddb %%mm1, %%mm0 \n\t"
311 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
312#else
3057fa66
A
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlq $16, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
316 "movq %%mm0, %%mm1 \n\t"
317 "psrlq $32, %%mm0 \n\t"
e5c30e06 318#endif
3057fa66 319 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
320 "movd %%mm0, %0 \n\t"
321 : "=r" (numEq)
322 : "r" (src), "r" (stride)
4e4dcbc5 323 : "%eax", "%ebx"
3057fa66 324 );
3057fa66 325
37da00fc 326 numEq= (256 - numEq) &0xFF;
3057fa66
A
327
328#else
d5a1a995 329 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
330 {
331 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
332 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
333 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
334 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
335 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
336 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
337 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
338 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
339 src+= stride;
340 }
341#endif
342/* if(abs(numEq - asmEq) > 0)
343 {
344 printf("\nasm:%d c:%d\n", asmEq, numEq);
345 for(int y=0; y<8; y++)
346 {
347 for(int x=0; x<8; x++)
348 {
349 printf("%d ", temp[x + y*stride]);
350 }
351 printf("\n");
352 }
353 }
354*/
d5a1a995
MN
355// for(int i=0; i<numEq/8; i++) src[i]=255;
356 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
357}
358
d5a1a995 359static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
360{
361#ifdef HAVE_MMX
362 int isOk;
acced553 363 src+= stride*3;
3057fa66
A
364 asm volatile(
365// "int $3 \n\t"
366 "movq (%1, %2), %%mm0 \n\t"
367 "movq (%1, %2, 8), %%mm1 \n\t"
368 "movq %%mm0, %%mm2 \n\t"
369 "psubusb %%mm1, %%mm0 \n\t"
370 "psubusb %%mm2, %%mm1 \n\t"
371 "por %%mm1, %%mm0 \n\t" // ABS Diff
372
373 "movq pQPb, %%mm7 \n\t" // QP,..., QP
374 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
375 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
376 "pcmpeqd b00, %%mm0 \n\t"
377 "psrlq $16, %%mm0 \n\t"
378 "pcmpeqd bFF, %%mm0 \n\t"
379// "movd %%mm0, (%1, %2, 4)\n\t"
380 "movd %%mm0, %0 \n\t"
381 : "=r" (isOk)
382 : "r" (src), "r" (stride)
383 );
ac0b0b2f 384 return isOk;
3057fa66
A
385#else
386
d5a1a995
MN
387 int isOk2= 1;
388 int x;
acced553 389 src+= stride*3;
d5a1a995 390 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 391 {
d5a1a995 392 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
393 }
394/* if(isOk && !isOk2 || !isOk && isOk2)
395 {
396 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
397 for(int y=0; y<9; y++)
398 {
399 for(int x=0; x<8; x++)
400 {
401 printf("%d ", src[x + y*stride]);
402 }
403 printf("\n");
404 }
405 } */
406
407 return isOk2;
408#endif
409
410}
411
412/**
acced553 413 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 414 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
415 */
416static inline void doVertLowPass(uint8_t *src, int stride, int QP)
417{
13e00528 418#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 419 src+= stride*3;
3057fa66 420 asm volatile( //"movv %0 %1 %2\n\t"
3057fa66 421 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
422
423 "movq (%0), %%mm6 \n\t"
424 "movq (%0, %1), %%mm5 \n\t"
425 "movq %%mm5, %%mm1 \n\t"
426 "movq %%mm6, %%mm2 \n\t"
427 "psubusb %%mm6, %%mm5 \n\t"
428 "psubusb %%mm1, %%mm2 \n\t"
429 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
430 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
431 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
432
433 "pand %%mm2, %%mm6 \n\t"
434 "pandn %%mm1, %%mm2 \n\t"
435 "por %%mm2, %%mm6 \n\t"// First Line to Filter
436
437 "movq (%0, %1, 8), %%mm5 \n\t"
438 "leal (%0, %1, 4), %%eax \n\t"
439 "leal (%0, %1, 8), %%ebx \n\t"
440 "subl %1, %%ebx \n\t"
441 "addl %1, %0 \n\t" // %0 points to line 1 not 0
442 "movq (%0, %1, 8), %%mm7 \n\t"
443 "movq %%mm5, %%mm1 \n\t"
444 "movq %%mm7, %%mm2 \n\t"
445 "psubusb %%mm7, %%mm5 \n\t"
446 "psubusb %%mm1, %%mm2 \n\t"
447 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
448 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
449 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
450
451 "pand %%mm2, %%mm7 \n\t"
452 "pandn %%mm1, %%mm2 \n\t"
453 "por %%mm2, %%mm7 \n\t" // First Line to Filter
454
455
456 // 1 2 3 4 5 6 7 8
457 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
458 // 6 4 2 2 1 1
459 // 6 4 4 2
460 // 6 8 2
acced553 461
3057fa66
A
462 "movq (%0, %1), %%mm0 \n\t" // 1
463 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
464 PAVGB(%%mm6, %%mm0) //1 1 /2
465 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
466
467 "movq (%0, %1, 4), %%mm2 \n\t" // 1
468 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
469 PAVGB((%%eax), %%mm2) // 11 /2
470 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
471 "movq %%mm2, %%mm3 \n\t" // 211 /4
472 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
473 PAVGB(%%mm4, %%mm3) // 4 211 /8
474 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
475 "movq %%mm3, (%0) \n\t" // X
476 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
477 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 478 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 479 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
480 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
481 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
482 PAVGB((%%eax), %%mm5) // 211 /4
483 PAVGB(%%mm5, %%mm3) // 2 2211 /8
484 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
485 "movq %%mm3, (%0,%1) \n\t" // X
486 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 487 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 488 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 489 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 490 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
491 PAVGB(%%mm1, %%mm0) // 2 11/4
492 PAVGB(%%mm6, %%mm0) //222 11/8
493 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
494 "movq (%0, %1, 2), %%mm2 \n\t" // 1
495 "movq %%mm0, (%0, %1, 2) \n\t" // X
496 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
497 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
498 PAVGB((%%ebx), %%mm0) // 11 /2
499 PAVGB(%%mm0, %%mm6) //11 11 /4
500 PAVGB(%%mm1, %%mm4) // 11 /2
501 PAVGB(%%mm2, %%mm1) // 11 /2
502 PAVGB(%%mm1, %%mm6) //1122 11 /8
503 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
504 "movq (%%eax), %%mm5 \n\t" // 1
505 "movq %%mm6, (%%eax) \n\t" // X
506 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
507 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
508 PAVGB(%%mm7, %%mm6) // 11 /2
509 PAVGB(%%mm4, %%mm6) // 11 11 /4
510 PAVGB(%%mm3, %%mm6) // 11 2211 /8
511 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 512 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
513 PAVGB(%%mm4, %%mm2) // 112 /4
514 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
515 "movq %%mm6, (%0, %1, 4) \n\t" // X
516 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
517 PAVGB(%%mm7, %%mm1) // 11 2 /4
518 PAVGB(%%mm4, %%mm5) // 11 /2
519 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 520 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
521 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
522 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
523 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
524 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 525 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 526 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
527 PAVGB(%%mm0, %%mm6) // 1 1 /2
528 PAVGB(%%mm7, %%mm6) // 1 12 /4
529 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
530 "movq %%mm6, (%%ebx) \n\t" // X
531 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
532 PAVGB(%%mm7, %%mm5) // 11 2 /4
533 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 534
13e00528
A
535 PAVGB(%%mm3, %%mm0) // 112 /4
536 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 537 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 538 "subl %1, %0 \n\t"
3057fa66
A
539
540 :
541 : "r" (src), "r" (stride)
542 : "%eax", "%ebx"
543 );
3057fa66
A
544#else
545 const int l1= stride;
546 const int l2= stride + l1;
547 const int l3= stride + l2;
548 const int l4= stride + l3;
549 const int l5= stride + l4;
550 const int l6= stride + l5;
551 const int l7= stride + l6;
552 const int l8= stride + l7;
553 const int l9= stride + l8;
d5a1a995 554 int x;
acced553 555 src+= stride*3;
d5a1a995 556 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
557 {
558 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
559 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
560
561 int sums[9];
562 sums[0] = first + src[l1];
563 sums[1] = src[l1] + src[l2];
564 sums[2] = src[l2] + src[l3];
565 sums[3] = src[l3] + src[l4];
566 sums[4] = src[l4] + src[l5];
567 sums[5] = src[l5] + src[l6];
568 sums[6] = src[l6] + src[l7];
569 sums[7] = src[l7] + src[l8];
570 sums[8] = src[l8] + last;
571
572 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
573 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
574 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
575 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
576 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
577 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
578 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
579 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
580
581 src++;
582 }
583
584#endif
585}
586
13e00528
A
587/**
588 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
589 * values are correctly clipped (MMX2)
590 * values are wraparound (C)
591 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
592 0 8 16 24
593 x = 8
594 x/2 = 4
595 x/8 = 1
596 1 12 12 23
597 */
9f45d04d 598static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 599{
d5a1a995 600#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 601 src+= stride*3;
13e00528
A
602// FIXME rounding
603 asm volatile(
604 "pxor %%mm7, %%mm7 \n\t" // 0
605 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
606 "leal (%0, %1), %%eax \n\t"
607 "leal (%%eax, %1, 4), %%ebx \n\t"
608// 0 1 2 3 4 5 6 7 8 9
609// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
610 "movq pQPb, %%mm0 \n\t" // QP,..., QP
611 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
612 "paddusb b02, %%mm0 \n\t"
613 "psrlw $2, %%mm0 \n\t"
614 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
615 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
616 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
617 "movq (%%ebx), %%mm3 \n\t" // line 5
618 "movq %%mm2, %%mm4 \n\t" // line 4
619 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
620 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 621 PAVGB(%%mm3, %%mm5)
13e00528
A
622 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
623 "psubusb %%mm3, %%mm4 \n\t"
624 "psubusb %%mm2, %%mm3 \n\t"
625 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
626 "psubusb %%mm0, %%mm4 \n\t"
627 "pcmpeqb %%mm7, %%mm4 \n\t"
628 "pand %%mm4, %%mm5 \n\t" // d/2
629
630// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
631 "paddb %%mm5, %%mm2 \n\t"
632// "psubb %%mm6, %%mm2 \n\t"
633 "movq %%mm2, (%0,%1, 4) \n\t"
634
635 "movq (%%ebx), %%mm2 \n\t"
636// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
637 "psubb %%mm5, %%mm2 \n\t"
638// "psubb %%mm6, %%mm2 \n\t"
639 "movq %%mm2, (%%ebx) \n\t"
640
641 "paddb %%mm6, %%mm5 \n\t"
642 "psrlw $2, %%mm5 \n\t"
643 "pand b3F, %%mm5 \n\t"
644 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
645
646 "movq (%%eax, %1, 2), %%mm2 \n\t"
647 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
648 "paddsb %%mm5, %%mm2 \n\t"
649 "psubb %%mm6, %%mm2 \n\t"
650 "movq %%mm2, (%%eax, %1, 2) \n\t"
651
652 "movq (%%ebx, %1), %%mm2 \n\t"
653 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
654 "psubsb %%mm5, %%mm2 \n\t"
655 "psubb %%mm6, %%mm2 \n\t"
656 "movq %%mm2, (%%ebx, %1) \n\t"
657
658 :
659 : "r" (src), "r" (stride)
660 : "%eax", "%ebx"
661 );
662#else
663 const int l1= stride;
664 const int l2= stride + l1;
665 const int l3= stride + l2;
666 const int l4= stride + l3;
667 const int l5= stride + l4;
668 const int l6= stride + l5;
e5c30e06
MN
669// const int l7= stride + l6;
670// const int l8= stride + l7;
671// const int l9= stride + l8;
d5a1a995 672 int x;
3407a972 673 const int QP15= QP + (QP>>2);
acced553 674 src+= stride*3;
d5a1a995 675 for(x=0; x<BLOCK_SIZE; x++)
13e00528 676 {
3407a972
MN
677 const int v = (src[x+l5] - src[x+l4]);
678 if(ABS(v) < QP15)
13e00528 679 {
3407a972
MN
680 src[x+l3] +=v>>3;
681 src[x+l4] +=v>>1;
682 src[x+l5] -=v>>1;
683 src[x+l6] -=v>>3;
13e00528 684
13e00528 685 }
13e00528
A
686 }
687
688#endif
689}
690
691/**
692 * Experimental Filter 1
9f45d04d
MN
693 * will not damage linear gradients
694 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
695 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
696 * MMX2 version does correct clipping C version doesnt
13e00528
A
697 */
698static inline void vertX1Filter(uint8_t *src, int stride, int QP)
699{
d5a1a995 700#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
701 src+= stride*3;
702
13e00528 703 asm volatile(
d5a1a995
MN
704 "pxor %%mm7, %%mm7 \n\t" // 0
705// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
706 "leal (%0, %1), %%eax \n\t"
707 "leal (%%eax, %1, 4), %%ebx \n\t"
708// 0 1 2 3 4 5 6 7 8 9
709// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
710 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
711 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
712 "movq %%mm1, %%mm2 \n\t" // line 4
713 "psubusb %%mm0, %%mm1 \n\t"
714 "psubusb %%mm2, %%mm0 \n\t"
715 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
716 "movq (%%ebx), %%mm3 \n\t" // line 5
717 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
718 "movq %%mm3, %%mm5 \n\t" // line 5
719 "psubusb %%mm4, %%mm3 \n\t"
720 "psubusb %%mm5, %%mm4 \n\t"
721 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
722 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
723 "movq %%mm2, %%mm1 \n\t" // line 4
724 "psubusb %%mm5, %%mm2 \n\t"
725 "movq %%mm2, %%mm4 \n\t"
726 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
727 "psubusb %%mm1, %%mm5 \n\t"
728 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
729 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
730 "movq %%mm4, %%mm3 \n\t" // d
731 "psubusb pQPb, %%mm4 \n\t"
732 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 733 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
734 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
735
736 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
737 "movq %%mm3, %%mm1 \n\t" // d/2
738 PAVGB(%%mm7, %%mm3) // d/4
739 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
740
741 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
742 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
743 "psubusb %%mm3, %%mm0 \n\t"
744 "pxor %%mm2, %%mm0 \n\t"
745 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
746
747 "movq (%%ebx), %%mm0 \n\t" // line 5
748 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
749 "paddusb %%mm3, %%mm0 \n\t"
750 "pxor %%mm2, %%mm0 \n\t"
751 "movq %%mm0, (%%ebx) \n\t" // line 5
752
9f45d04d 753 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
754
755 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 757 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
760
761 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
762 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 763 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
764 "pxor %%mm2, %%mm0 \n\t"
765 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
766
9f45d04d 767 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
768
769 "movq (%%eax, %1), %%mm0 \n\t" // line 2
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 771 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%eax, %1) \n\t" // line 2
774
775 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
776 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 777 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
778 "pxor %%mm2, %%mm0 \n\t"
779 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
780
781 :
782 : "r" (src), "r" (stride)
783 : "%eax", "%ebx"
784 );
785#else
d5a1a995
MN
786
787 const int l1= stride;
788 const int l2= stride + l1;
789 const int l3= stride + l2;
790 const int l4= stride + l3;
791 const int l5= stride + l4;
792 const int l6= stride + l5;
793 const int l7= stride + l6;
e5c30e06
MN
794// const int l8= stride + l7;
795// const int l9= stride + l8;
d5a1a995 796 int x;
acced553
MN
797
798 src+= stride*3;
d5a1a995
MN
799 for(x=0; x<BLOCK_SIZE; x++)
800 {
801 int a= src[l3] - src[l4];
802 int b= src[l4] - src[l5];
9f45d04d 803 int c= src[l5] - src[l6];
d5a1a995 804
3407a972
MN
805 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
806 d= MAX(d, 0);
d5a1a995
MN
807
808 if(d < QP)
809 {
810 int v = d * SIGN(-b);
811
3407a972
MN
812 src[l2] +=v>>3;
813 src[l3] +=v>>2;
814 src[l4] +=(3*v)>>3;
815 src[l5] -=(3*v)>>3;
816 src[l6] -=v>>2;
817 src[l7] -=v>>3;
d5a1a995
MN
818
819 }
820 src++;
821 }
822 /*
13e00528
A
823 const int l1= stride;
824 const int l2= stride + l1;
825 const int l3= stride + l2;
826 const int l4= stride + l3;
827 const int l5= stride + l4;
828 const int l6= stride + l5;
829 const int l7= stride + l6;
830 const int l8= stride + l7;
831 const int l9= stride + l8;
832 for(int x=0; x<BLOCK_SIZE; x++)
833 {
834 int v2= src[l2];
835 int v3= src[l3];
836 int v4= src[l4];
837 int v5= src[l5];
838 int v6= src[l6];
839 int v7= src[l7];
840
841 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
842 {
843 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
844 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
845 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
846 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
847 }
848 src++;
849 }
d5a1a995 850*/
13e00528
A
851#endif
852}
853
9f45d04d
MN
854/**
855 * Experimental Filter 1 (Horizontal)
856 * will not damage linear gradients
857 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
858 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
859 * MMX2 version does correct clipping C version doesnt
860 * not identical with the vertical one
861 */
862static inline void horizX1Filter(uint8_t *src, int stride, int QP)
863{
864 int y;
865 static uint64_t *lut= NULL;
866 if(lut==NULL)
867 {
868 int i;
869 lut= (uint64_t*)memalign(8, 256*8);
870 for(i=0; i<256; i++)
871 {
872 int v= i < 128 ? 2*i : 2*(i-256);
873/*
874//Simulate 112242211 9-Tap filter
875 uint64_t a= (v/16) & 0xFF;
876 uint64_t b= (v/8) & 0xFF;
877 uint64_t c= (v/4) & 0xFF;
878 uint64_t d= (3*v/8) & 0xFF;
879*/
880//Simulate piecewise linear interpolation
881 uint64_t a= (v/16) & 0xFF;
882 uint64_t b= (v*3/16) & 0xFF;
883 uint64_t c= (v*5/16) & 0xFF;
884 uint64_t d= (7*v/16) & 0xFF;
885 uint64_t A= (0x100 - a)&0xFF;
886 uint64_t B= (0x100 - b)&0xFF;
887 uint64_t C= (0x100 - c)&0xFF;
888 uint64_t D= (0x100 - c)&0xFF;
889
890 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
891 (D<<24) | (C<<16) | (B<<8) | (A);
892 //lut[i] = (v<<32) | (v<<24);
893 }
894 }
895
4e4dcbc5 896#if 0
9f45d04d
MN
897 asm volatile(
898 "pxor %%mm7, %%mm7 \n\t" // 0
899// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
900 "leal (%0, %1), %%eax \n\t"
901 "leal (%%eax, %1, 4), %%ebx \n\t"
902
903 "movq b80, %%mm6 \n\t"
79cccf70 904 "movd pQPb, %%mm5 \n\t" // QP
9f45d04d
MN
905 "movq %%mm5, %%mm4 \n\t"
906 "paddusb %%mm5, %%mm5 \n\t" // 2QP
907 "paddusb %%mm5, %%mm4 \n\t" // 3QP
908 "pxor %%mm5, %%mm5 \n\t" // 0
909 "psubb %%mm4, %%mm5 \n\t" // -3QP
910 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
911 "psllq $24, %%mm5 \n\t"
912
913// 0 1 2 3 4 5 6 7 8 9
914// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
915
916#define HX1old(a) \
917 "movd " #a ", %%mm0 \n\t"\
918 "movd 4" #a ", %%mm1 \n\t"\
919 "punpckldq %%mm1, %%mm0 \n\t"\
920 "movq %%mm0, %%mm1 \n\t"\
921 "movq %%mm0, %%mm2 \n\t"\
922 "psrlq $8, %%mm1 \n\t"\
923 "psubusb %%mm1, %%mm2 \n\t"\
924 "psubusb %%mm0, %%mm1 \n\t"\
925