cleanup
[libav.git] / postproc / postprocess.c
CommitLineData
3057fa66
A
/*
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
7f16f6e6 24doVertDefFilter Ec Ec e e
3057fa66 25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
7f16f6e6 28doHorizDefFilter Ec Ec e e
2e212618 29deRing E e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
117e45b0
MN
32Vertical X1# a E E
33Horizontal X1# a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
117e45b0 37MedianDeinterlace# Ec Ec
be44a4d7 38TempDeNoiser# E e e
d5a1a995 39

* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
46*/
47
/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once)
	(the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
border remover
optimize C versions
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
smart blur
command line option for the deblock thresholds
...
*/
68
a6be8111 69//Changelog: use the CVS log
3057fa66 70
6c426cff 71#include "../config.h"
3057fa66
A
72#include <inttypes.h>
73#include <stdio.h>
d5a1a995 74#include <stdlib.h>
911879d1 75#include <string.h>
dda87e9f
PL
76#ifdef HAVE_MALLOC_H
77#include <malloc.h>
78#endif
3057fa66 79//#undef HAVE_MMX2
13e00528 80//#define HAVE_3DNOW
3057fa66 81//#undef HAVE_MMX
7f16f6e6 82//#define DEBUG_BRIGHTNESS
13e00528 83#include "postprocess.h"
3057fa66 84
/*
 * Small arithmetic helper macros.
 * NOTE: every argument may be evaluated more than once, so never pass an
 * expression with side effects (e.g. i++).
 * SIGN() treats 0 as negative: SIGN(0) yields -1.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)
89
/*
 * Inline-asm string builders (AT&T syntax: the LAST register operand is the
 * destination). Which variant is emitted depends on HAVE_MMX2 / HAVE_3DNOW /
 * HAVE_MMX; on builds without them the macros are not defined at all.
 */

/* PAVGB(a,b): packed byte average of a and b, result in b. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(a,b,t): unsigned byte minimum of a and b, result in b.
 * The plain-MMX fallback computes b - saturate(b - a) and destroys the
 * temporary register t; the MMX2 form ignores t.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
	"movq " #b ", " #t " \n\t"\
	"psubusb " #a ", " #t " \n\t"\
	"psubb " #t ", " #b " \n\t"
#endif

/*
 * PMAXUB(a,b): unsigned byte maximum of a and b, result in b.
 * The plain-MMX fallback computes saturate(b - a) + a.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
112
113
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10

/*
 * File-scope constants and scratch storage.
 *
 * In the MMX build these objects are referenced BY NAME from the inline asm
 * in this file (e.g. "movq b7E, %%mm7", "movq pQPb, %%mm0"), so they must
 * keep their exact names and 8-byte alignment.
 *   wXX        - a 16-bit value replicated into all four words
 *   bXX        - a byte value replicated into all eight bytes
 *   bmNNNNNNNN - byte mask, one digit per byte (most significant first)
 *   tempN/pQPb/pQPb2 - mutable scratch; pQPb is read by the asm filters as
 *                "QP in every byte" (it is written outside this excerpt).
 */
#ifdef HAVE_MMX
static volatile uint64_t __attribute__((aligned(8))) packedYOffset=	0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale=	0x0100010001000100LL;
static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400=		0x1400140014001400LL;
static uint64_t __attribute__((aligned(8))) bm00000001=	0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000=	0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000=	0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000=	0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001=	0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011=	0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011=	0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110=	0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000=	0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000=	0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011=	0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100=	0xFFFF0000FFFF0000LL;
static uint64_t __attribute__((aligned(8))) b00=		0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01=		0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02=		0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F=		0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04=		0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08=		0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF=		0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20=		0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80=		0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E=		0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C=		0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F=		0x3F3F3F3F3F3F3F3FLL;
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
static uint64_t __attribute__((aligned(8))) pQPb=0;
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
static uint64_t packedYOffset=	0x0000000000000000LL;
static uint64_t packedYScale=	0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif

/*
 * Flatness thresholds. isVertDC() reports a block as flat when more than
 * vFlatnessThreshold of its vertically adjacent pixel pairs differ by at
 * most 1; hFlatnessThreshold is presumably the horizontal counterpart
 * (its user is outside this excerpt).
 */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;
3057fa66 171
911879d1
MN
172static struct PPFilter filters[]=
173{
174 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
175 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
176 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
177 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
178 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
179 {"dr", "dering", 1, 5, 6, DERING},
180 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
181 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
117e45b0 185 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
911879d1
MN
186 {NULL, NULL,0,0,0,0} //End Marker
187};
188
/*
 * Alias table stored as consecutive (name, expansion) string pairs and
 * terminated by a single NULL. "default"/"de" and "fast"/"fa" are long and
 * short spellings of the same expansion.
 */
static char *replaceTable[]=
{
	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	NULL //End Marker
};
197
#ifdef HAVE_MMX
/*
 * Never meant to do useful work: it mentions every MMX constant from C so
 * the compiler does not warn about (or discard) objects that are otherwise
 * referenced only by name from inside inline-asm strings.
 */
static inline void unusedVariableWarningFixer(void)
{
if(
 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
 + temp5 + pQPb== 0) b00=0;
}
#endif
e5c30e06 209
#ifdef TIMING
/*
 * Read the CPU time stamp counter (only built when TIMING is defined).
 * NOTE(review): the "=A" constraint names the edx:eax pair only on 32-bit
 * x86; on x86-64 it does not describe a 64-bit rdtsc result - confirm the
 * target before enabling TIMING there.
 */
static inline long long rdtsc(void)
{
	long long l;
	asm volatile(	"rdtsc\n\t"
		: "=A" (l)
	);
	return l;
}
#endif
3057fa66 221
#ifdef HAVE_MMX2
/*
 * Thin wrappers that issue one prefetch instruction for the address p.
 * They differ only in the instruction variant (nta / t0 / t1 / t2).
 */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
3057fa66
A
251
252//FIXME? |255-0| = 1 (shouldnt be a problem ...)
253/**
acced553 254 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 255 */
d5a1a995 256static inline int isVertDC(uint8_t src[], int stride){
3057fa66 257 int numEq= 0;
e5c30e06 258#ifndef HAVE_MMX
d5a1a995 259 int y;
e5c30e06 260#endif
acced553 261 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 262#ifdef HAVE_MMX
37da00fc
MN
263asm volatile(
264 "leal (%1, %2), %%eax \n\t"
265 "leal (%%eax, %2, 4), %%ebx \n\t"
266// 0 1 2 3 4 5 6 7 8 9
267// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
268 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
269 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
270 "movq (%1), %%mm0 \n\t"
37da00fc 271 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
272 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
273 "paddb %%mm7, %%mm0 \n\t"
274 "pcmpgtb %%mm6, %%mm0 \n\t"
275
37da00fc 276 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
277 "psubb %%mm2, %%mm1 \n\t"
278 "paddb %%mm7, %%mm1 \n\t"
279 "pcmpgtb %%mm6, %%mm1 \n\t"
280 "paddb %%mm1, %%mm0 \n\t"
281
37da00fc 282 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
283 "psubb %%mm1, %%mm2 \n\t"
284 "paddb %%mm7, %%mm2 \n\t"
285 "pcmpgtb %%mm6, %%mm2 \n\t"
286 "paddb %%mm2, %%mm0 \n\t"
287
37da00fc 288 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
289 "psubb %%mm2, %%mm1 \n\t"
290 "paddb %%mm7, %%mm1 \n\t"
291 "pcmpgtb %%mm6, %%mm1 \n\t"
292 "paddb %%mm1, %%mm0 \n\t"
293
37da00fc 294 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
295 "psubb %%mm1, %%mm2 \n\t"
296 "paddb %%mm7, %%mm2 \n\t"
297 "pcmpgtb %%mm6, %%mm2 \n\t"
298 "paddb %%mm2, %%mm0 \n\t"
299
37da00fc 300 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
301 "psubb %%mm2, %%mm1 \n\t"
302 "paddb %%mm7, %%mm1 \n\t"
303 "pcmpgtb %%mm6, %%mm1 \n\t"
304 "paddb %%mm1, %%mm0 \n\t"
305
37da00fc 306 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
307 "psubb %%mm1, %%mm2 \n\t"
308 "paddb %%mm7, %%mm2 \n\t"
309 "pcmpgtb %%mm6, %%mm2 \n\t"
310 "paddb %%mm2, %%mm0 \n\t"
311
312 " \n\t"
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlw $8, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
e5c30e06
MN
316#ifdef HAVE_MMX2
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
318 "paddb %%mm1, %%mm0 \n\t"
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
320#else
3057fa66
A
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
e5c30e06 326#endif
3057fa66 327 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
328 "movd %%mm0, %0 \n\t"
329 : "=r" (numEq)
330 : "r" (src), "r" (stride)
4e4dcbc5 331 : "%eax", "%ebx"
3057fa66 332 );
3057fa66 333
37da00fc 334 numEq= (256 - numEq) &0xFF;
3057fa66
A
335
336#else
d5a1a995 337 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
338 {
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
346 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
347 src+= stride;
348 }
349#endif
350/* if(abs(numEq - asmEq) > 0)
351 {
352 printf("\nasm:%d c:%d\n", asmEq, numEq);
353 for(int y=0; y<8; y++)
354 {
355 for(int x=0; x<8; x++)
356 {
357 printf("%d ", temp[x + y*stride]);
358 }
359 printf("\n");
360 }
361 }
362*/
d5a1a995
MN
363// for(int i=0; i<numEq/8; i++) src[i]=255;
364 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
365}
366
d5a1a995 367static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
368{
369#ifdef HAVE_MMX
370 int isOk;
acced553 371 src+= stride*3;
3057fa66
A
372 asm volatile(
373// "int $3 \n\t"
374 "movq (%1, %2), %%mm0 \n\t"
375 "movq (%1, %2, 8), %%mm1 \n\t"
376 "movq %%mm0, %%mm2 \n\t"
377 "psubusb %%mm1, %%mm0 \n\t"
378 "psubusb %%mm2, %%mm1 \n\t"
379 "por %%mm1, %%mm0 \n\t" // ABS Diff
380
381 "movq pQPb, %%mm7 \n\t" // QP,..., QP
382 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
383 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
384 "pcmpeqd b00, %%mm0 \n\t"
385 "psrlq $16, %%mm0 \n\t"
386 "pcmpeqd bFF, %%mm0 \n\t"
387// "movd %%mm0, (%1, %2, 4)\n\t"
388 "movd %%mm0, %0 \n\t"
389 : "=r" (isOk)
390 : "r" (src), "r" (stride)
391 );
ac0b0b2f 392 return isOk;
3057fa66
A
393#else
394
d5a1a995
MN
395 int isOk2= 1;
396 int x;
acced553 397 src+= stride*3;
d5a1a995 398 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 399 {
d5a1a995 400 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
401 }
402/* if(isOk && !isOk2 || !isOk && isOk2)
403 {
404 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
405 for(int y=0; y<9; y++)
406 {
407 for(int x=0; x<8; x++)
408 {
409 printf("%d ", src[x + y*stride]);
410 }
411 printf("\n");
412 }
413 } */
414
415 return isOk2;
416#endif
417
418}
419
420/**
acced553 421 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 422 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
423 */
424static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425{
13e00528 426#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 427 src+= stride*3;
3057fa66 428 asm volatile( //"movv %0 %1 %2\n\t"
3057fa66 429 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
430
431 "movq (%0), %%mm6 \n\t"
432 "movq (%0, %1), %%mm5 \n\t"
433 "movq %%mm5, %%mm1 \n\t"
434 "movq %%mm6, %%mm2 \n\t"
435 "psubusb %%mm6, %%mm5 \n\t"
436 "psubusb %%mm1, %%mm2 \n\t"
437 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
438 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
439 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
440
441 "pand %%mm2, %%mm6 \n\t"
442 "pandn %%mm1, %%mm2 \n\t"
443 "por %%mm2, %%mm6 \n\t"// First Line to Filter
444
445 "movq (%0, %1, 8), %%mm5 \n\t"
446 "leal (%0, %1, 4), %%eax \n\t"
447 "leal (%0, %1, 8), %%ebx \n\t"
448 "subl %1, %%ebx \n\t"
449 "addl %1, %0 \n\t" // %0 points to line 1 not 0
450 "movq (%0, %1, 8), %%mm7 \n\t"
451 "movq %%mm5, %%mm1 \n\t"
452 "movq %%mm7, %%mm2 \n\t"
453 "psubusb %%mm7, %%mm5 \n\t"
454 "psubusb %%mm1, %%mm2 \n\t"
455 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
456 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
457 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
458
459 "pand %%mm2, %%mm7 \n\t"
460 "pandn %%mm1, %%mm2 \n\t"
461 "por %%mm2, %%mm7 \n\t" // First Line to Filter
462
463
464 // 1 2 3 4 5 6 7 8
465 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
466 // 6 4 2 2 1 1
467 // 6 4 4 2
468 // 6 8 2
acced553 469
3057fa66
A
470 "movq (%0, %1), %%mm0 \n\t" // 1
471 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
472 PAVGB(%%mm6, %%mm0) //1 1 /2
473 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
474
475 "movq (%0, %1, 4), %%mm2 \n\t" // 1
476 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
477 PAVGB((%%eax), %%mm2) // 11 /2
478 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
479 "movq %%mm2, %%mm3 \n\t" // 211 /4
480 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
481 PAVGB(%%mm4, %%mm3) // 4 211 /8
482 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
483 "movq %%mm3, (%0) \n\t" // X
484 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
485 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 486 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 487 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
488 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
489 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
490 PAVGB((%%eax), %%mm5) // 211 /4
491 PAVGB(%%mm5, %%mm3) // 2 2211 /8
492 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
493 "movq %%mm3, (%0,%1) \n\t" // X
494 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 495 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 496 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 497 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 498 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
499 PAVGB(%%mm1, %%mm0) // 2 11/4
500 PAVGB(%%mm6, %%mm0) //222 11/8
501 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
502 "movq (%0, %1, 2), %%mm2 \n\t" // 1
503 "movq %%mm0, (%0, %1, 2) \n\t" // X
504 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
505 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
506 PAVGB((%%ebx), %%mm0) // 11 /2
507 PAVGB(%%mm0, %%mm6) //11 11 /4
508 PAVGB(%%mm1, %%mm4) // 11 /2
509 PAVGB(%%mm2, %%mm1) // 11 /2
510 PAVGB(%%mm1, %%mm6) //1122 11 /8
511 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
512 "movq (%%eax), %%mm5 \n\t" // 1
513 "movq %%mm6, (%%eax) \n\t" // X
514 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
515 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
516 PAVGB(%%mm7, %%mm6) // 11 /2
517 PAVGB(%%mm4, %%mm6) // 11 11 /4
518 PAVGB(%%mm3, %%mm6) // 11 2211 /8
519 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 520 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
521 PAVGB(%%mm4, %%mm2) // 112 /4
522 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
523 "movq %%mm6, (%0, %1, 4) \n\t" // X
524 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
525 PAVGB(%%mm7, %%mm1) // 11 2 /4
526 PAVGB(%%mm4, %%mm5) // 11 /2
527 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 528 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
529 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
530 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
531 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
532 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 533 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 534 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
535 PAVGB(%%mm0, %%mm6) // 1 1 /2
536 PAVGB(%%mm7, %%mm6) // 1 12 /4
537 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
538 "movq %%mm6, (%%ebx) \n\t" // X
539 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
540 PAVGB(%%mm7, %%mm5) // 11 2 /4
541 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 542
13e00528
A
543 PAVGB(%%mm3, %%mm0) // 112 /4
544 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 545 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 546 "subl %1, %0 \n\t"
3057fa66
A
547
548 :
549 : "r" (src), "r" (stride)
550 : "%eax", "%ebx"
551 );
3057fa66
A
552#else
553 const int l1= stride;
554 const int l2= stride + l1;
555 const int l3= stride + l2;
556 const int l4= stride + l3;
557 const int l5= stride + l4;
558 const int l6= stride + l5;
559 const int l7= stride + l6;
560 const int l8= stride + l7;
561 const int l9= stride + l8;
d5a1a995 562 int x;
acced553 563 src+= stride*3;
d5a1a995 564 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
565 {
566 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
567 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
568
569 int sums[9];
570 sums[0] = first + src[l1];
571 sums[1] = src[l1] + src[l2];
572 sums[2] = src[l2] + src[l3];
573 sums[3] = src[l3] + src[l4];
574 sums[4] = src[l4] + src[l5];
575 sums[5] = src[l5] + src[l6];
576 sums[6] = src[l6] + src[l7];
577 sums[7] = src[l7] + src[l8];
578 sums[8] = src[l8] + last;
579
580 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
581 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
582 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
583 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
584 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
585 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
586 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
587 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
588
589 src++;
590 }
591
592#endif
593}
594
13e00528
A
595/**
596 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
597 * values are correctly clipped (MMX2)
598 * values are wraparound (C)
599 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
600 0 8 16 24
601 x = 8
602 x/2 = 4
603 x/8 = 1
604 1 12 12 23
605 */
9f45d04d 606static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 607{
d5a1a995 608#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 609 src+= stride*3;
13e00528
A
610// FIXME rounding
611 asm volatile(
612 "pxor %%mm7, %%mm7 \n\t" // 0
613 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
614 "leal (%0, %1), %%eax \n\t"
615 "leal (%%eax, %1, 4), %%ebx \n\t"
616// 0 1 2 3 4 5 6 7 8 9
617// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
618 "movq pQPb, %%mm0 \n\t" // QP,..., QP
619 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
620 "paddusb b02, %%mm0 \n\t"
621 "psrlw $2, %%mm0 \n\t"
622 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
623 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
624 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
625 "movq (%%ebx), %%mm3 \n\t" // line 5
626 "movq %%mm2, %%mm4 \n\t" // line 4
627 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
628 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 629 PAVGB(%%mm3, %%mm5)
13e00528
A
630 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
631 "psubusb %%mm3, %%mm4 \n\t"
632 "psubusb %%mm2, %%mm3 \n\t"
633 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
634 "psubusb %%mm0, %%mm4 \n\t"
635 "pcmpeqb %%mm7, %%mm4 \n\t"
636 "pand %%mm4, %%mm5 \n\t" // d/2
637
638// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
639 "paddb %%mm5, %%mm2 \n\t"
640// "psubb %%mm6, %%mm2 \n\t"
641 "movq %%mm2, (%0,%1, 4) \n\t"
642
643 "movq (%%ebx), %%mm2 \n\t"
644// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
645 "psubb %%mm5, %%mm2 \n\t"
646// "psubb %%mm6, %%mm2 \n\t"
647 "movq %%mm2, (%%ebx) \n\t"
648
649 "paddb %%mm6, %%mm5 \n\t"
650 "psrlw $2, %%mm5 \n\t"
651 "pand b3F, %%mm5 \n\t"
652 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
653
654 "movq (%%eax, %1, 2), %%mm2 \n\t"
655 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
656 "paddsb %%mm5, %%mm2 \n\t"
657 "psubb %%mm6, %%mm2 \n\t"
658 "movq %%mm2, (%%eax, %1, 2) \n\t"
659
660 "movq (%%ebx, %1), %%mm2 \n\t"
661 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
662 "psubsb %%mm5, %%mm2 \n\t"
663 "psubb %%mm6, %%mm2 \n\t"
664 "movq %%mm2, (%%ebx, %1) \n\t"
665
666 :
667 : "r" (src), "r" (stride)
668 : "%eax", "%ebx"
669 );
670#else
671 const int l1= stride;
672 const int l2= stride + l1;
673 const int l3= stride + l2;
674 const int l4= stride + l3;
675 const int l5= stride + l4;
676 const int l6= stride + l5;
e5c30e06
MN
677// const int l7= stride + l6;
678// const int l8= stride + l7;
679// const int l9= stride + l8;
d5a1a995 680 int x;
3407a972 681 const int QP15= QP + (QP>>2);
acced553 682 src+= stride*3;
d5a1a995 683 for(x=0; x<BLOCK_SIZE; x++)
13e00528 684 {
3407a972
MN
685 const int v = (src[x+l5] - src[x+l4]);
686 if(ABS(v) < QP15)
13e00528 687 {
3407a972
MN
688 src[x+l3] +=v>>3;
689 src[x+l4] +=v>>1;
690 src[x+l5] -=v>>1;
691 src[x+l6] -=v>>3;
13e00528 692
13e00528 693 }
13e00528
A
694 }
695
696#endif
697}
698
699/**
700 * Experimental Filter 1
9f45d04d
MN
701 * will not damage linear gradients
702 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
703 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
704 * MMX2 version does correct clipping C version doesnt
13e00528
A
705 */
706static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707{
d5a1a995 708#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
709 src+= stride*3;
710
13e00528 711 asm volatile(
d5a1a995
MN
712 "pxor %%mm7, %%mm7 \n\t" // 0
713// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
714 "leal (%0, %1), %%eax \n\t"
715 "leal (%%eax, %1, 4), %%ebx \n\t"
716// 0 1 2 3 4 5 6 7 8 9
717// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
718 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
719 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
720 "movq %%mm1, %%mm2 \n\t" // line 4
721 "psubusb %%mm0, %%mm1 \n\t"
722 "psubusb %%mm2, %%mm0 \n\t"
723 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
724 "movq (%%ebx), %%mm3 \n\t" // line 5
725 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
726 "movq %%mm3, %%mm5 \n\t" // line 5
727 "psubusb %%mm4, %%mm3 \n\t"
728 "psubusb %%mm5, %%mm4 \n\t"
729 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
730 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
731 "movq %%mm2, %%mm1 \n\t" // line 4
732 "psubusb %%mm5, %%mm2 \n\t"
733 "movq %%mm2, %%mm4 \n\t"
734 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
735 "psubusb %%mm1, %%mm5 \n\t"
736 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
737 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
738 "movq %%mm4, %%mm3 \n\t" // d
739 "psubusb pQPb, %%mm4 \n\t"
740 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 741 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
742 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
743
744 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
745 "movq %%mm3, %%mm1 \n\t" // d/2
746 PAVGB(%%mm7, %%mm3) // d/4
747 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
748
749 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
750 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
751 "psubusb %%mm3, %%mm0 \n\t"
752 "pxor %%mm2, %%mm0 \n\t"
753 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
754
755 "movq (%%ebx), %%mm0 \n\t" // line 5
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
757 "paddusb %%mm3, %%mm0 \n\t"
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%ebx) \n\t" // line 5
760
9f45d04d 761 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
762
763 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
764 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 765 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
766 "pxor %%mm2, %%mm0 \n\t"
767 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
768
769 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 771 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
774
9f45d04d 775 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
776
777 "movq (%%eax, %1), %%mm0 \n\t" // line 2
778 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 779 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
780 "pxor %%mm2, %%mm0 \n\t"
781 "movq %%mm0, (%%eax, %1) \n\t" // line 2
782
783 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
784 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 785 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
786 "pxor %%mm2, %%mm0 \n\t"
787 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
788
789 :
790 : "r" (src), "r" (stride)
791 : "%eax", "%ebx"
792 );
793#else
d5a1a995
MN
794
795 const int l1= stride;
796 const int l2= stride + l1;
797 const int l3= stride + l2;
798 const int l4= stride + l3;
799 const int l5= stride + l4;
800 const int l6= stride + l5;
801 const int l7= stride + l6;
e5c30e06
MN
802// const int l8= stride + l7;
803// const int l9= stride + l8;
d5a1a995 804 int x;
acced553
MN
805
806 src+= stride*3;
d5a1a995
MN
807 for(x=0; x<BLOCK_SIZE; x++)
808 {
809 int a= src[l3] - src[l4];
810 int b= src[l4] - src[l5];
9f45d04d 811 int c= src[l5] - src[l6];
d5a1a995 812
3407a972
MN
813 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
814 d= MAX(d, 0);
d5a1a995
MN
815
816 if(d < QP)
817 {
818 int v = d * SIGN(-b);
819
3407a972
MN
820 src[l2] +=v>>3;
821 src[l3] +=v>>2;
822 src[l4] +=(3*v)>>3;
823 src[l5] -=(3*v)>>3;
824 src[l6] -=v>>2;
825 src[l7] -=v>>3;
d5a1a995
MN
826
827 }
828 src++;
829 }
830 /*
13e00528
A
831 const int l1= stride;
832 const int l2= stride + l1;
833 const int l3= stride + l2;
834 const int l4= stride + l3;
835 const int l5= stride + l4;
836 const int l6= stride + l5;
837 const int l7= stride + l6;
838 const int l8= stride + l7;
839 const int l9= stride + l8;
840 for(int x=0; x<BLOCK_SIZE; x++)
841 {
842 int v2= src[l2];
843 int v3= src[l3];
844 int v4= src[l4];
845 int v5= src[l5];
846 int v6= src[l6];
847 int v7= src[l7];
848
849 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
850 {
851 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
852 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
853 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
854 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
855 }
856 src++;
857 }
d5a1a995 858*/
13e00528
A
859#endif
860}
861
3057fa66
A
862static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
863{
7f16f6e6
MN
864#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
865/*
866 uint8_t tmp[16];
867 const int l1= stride;
868 const int l2= stride + l1;
869 const int l3= stride + l2;
870 const int l4= (int)tmp - (int)src - stride*3;
871 const int l5= (int)tmp - (int)src - stride*3 + 8;
872 const int l6= stride*3 + l3;
873 const int l7= stride + l6;
874 const int l8= stride + l7;
875
876 memcpy(tmp, src+stride*7, 8);
877 memcpy(tmp+8, src+stride*8, 8);
878*/
879 src+= stride*4;
880 asm volatile(
881
882#if 0 //sligtly more accurate and slightly slower
883 "pxor %%mm7, %%mm7 \n\t" // 0
884 "leal (%0, %1), %%eax \n\t"
885 "leal (%%eax, %1, 4), %%ebx \n\t"
886// 0 1 2 3 4 5 6 7
887// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
888// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
889
890
891 "movq (%0, %1, 2), %%mm0 \n\t" // l2
892 "movq (%0), %%mm1 \n\t" // l0
893 "movq %%mm0, %%mm2 \n\t" // l2
894 PAVGB(%%mm7, %%mm0) // ~l2/2
895 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
896 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
897
898 "movq (%%eax), %%mm1 \n\t" // l1
899 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
900 "movq %%mm1, %%mm4 \n\t" // l1
901 PAVGB(%%mm7, %%mm1) // ~l1/2
902 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
903 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
904
905 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
906 "psubusb %%mm1, %%mm0 \n\t"
907 "psubusb %%mm4, %%mm1 \n\t"
908 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
909// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
910
911 "movq (%0, %1, 4), %%mm0 \n\t" // l4
912 "movq %%mm0, %%mm4 \n\t" // l4
913 PAVGB(%%mm7, %%mm0) // ~l4/2
914 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
915 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
916
917 "movq (%%ebx), %%mm2 \n\t" // l5
918 "movq %%mm3, %%mm5 \n\t" // l3
919 PAVGB(%%mm7, %%mm3) // ~l3/2
920 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
921 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
922
923 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
924 "psubusb %%mm3, %%mm0 \n\t"
925 "psubusb %%mm6, %%mm3 \n\t"
926 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
927 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
928// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
929
930 "movq (%%ebx, %1), %%mm6 \n\t" // l6
931 "movq %%mm6, %%mm5 \n\t" // l6
932 PAVGB(%%mm7, %%mm6) // ~l6/2
933 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
934 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
935
936 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
937 "movq %%mm2, %%mm4 \n\t" // l5
938 PAVGB(%%mm7, %%mm2) // ~l5/2
939 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
940 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
941
942 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
943 "psubusb %%mm2, %%mm6 \n\t"
944 "psubusb %%mm4, %%mm2 \n\t"
945 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
946// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
947
948
949 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
950 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
951 "paddusb b01, %%mm4 \n\t"
952 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
953 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
954 "pand %%mm4, %%mm3 \n\t"
955
956 "movq %%mm3, %%mm1 \n\t"
957// "psubusb b01, %%mm3 \n\t"
958 PAVGB(%%mm7, %%mm3)
959 PAVGB(%%mm7, %%mm3)
960 "paddusb %%mm1, %%mm3 \n\t"
961// "paddusb b01, %%mm3 \n\t"
962
963 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
964 "movq (%0, %1, 4), %%mm5 \n\t" //l4
965 "movq (%0, %1, 4), %%mm4 \n\t" //l4
966 "psubusb %%mm6, %%mm5 \n\t"
967 "psubusb %%mm4, %%mm6 \n\t"
968 "por %%mm6, %%mm5 \n\t" // |l3-l4|
969 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
970 "pxor %%mm6, %%mm0 \n\t"
971 "pand %%mm0, %%mm3 \n\t"
972 PMINUB(%%mm5, %%mm3, %%mm0)
973
974 "psubusb b01, %%mm3 \n\t"
975 PAVGB(%%mm7, %%mm3)
976
977 "movq (%%eax, %1, 2), %%mm0 \n\t"
978 "movq (%0, %1, 4), %%mm2 \n\t"
979 "pxor %%mm6, %%mm0 \n\t"
980 "pxor %%mm6, %%mm2 \n\t"
981 "psubb %%mm3, %%mm0 \n\t"
982 "paddb %%mm3, %%mm2 \n\t"
983 "pxor %%mm6, %%mm0 \n\t"
984 "pxor %%mm6, %%mm2 \n\t"
985 "movq %%mm0, (%%eax, %1, 2) \n\t"
986 "movq %%mm2, (%0, %1, 4) \n\t"
987#endif
988
989 "leal (%0, %1), %%eax \n\t"
990 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
991// 0 1 2 3 4 5 6 7
992// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
993// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
994
995
996 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
997 "movq (%0, %1, 4), %%mm0 \n\t" // l4
998 "pxor %%mm6, %%mm1 \n\t" // -l3-1
999 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1000// mm1=-l3-1, mm0=128-q
1001
1002 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1003 "movq (%%eax, %1), %%mm3 \n\t" // l2
1004 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1005 "movq %%mm2, %%mm5 \n\t" // -l5-1
1006 "movq b80, %%mm4 \n\t" // 128
1007 "leal (%%eax, %1, 4), %%ebx \n\t"
1008 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1009 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1010 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1011 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1012// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1013
1014 "movq (%%eax), %%mm2 \n\t" // l1
1015 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1016 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1017 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1018 "movq b80, %%mm3 \n\t" // 128
1019 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1020 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1021 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1022// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1023
1024 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1025 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1026 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1027 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1028 "movq b80, %%mm2 \n\t" // 128
1029 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1030 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1031 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1032// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1033
1034 "movq b00, %%mm1 \n\t" // 0
1035 "movq b00, %%mm5 \n\t" // 0
1036 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1037 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1038 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1039 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1040 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1041
1042// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1043
1044 "movq b00, %%mm7 \n\t" // 0
1045 "movq pQPb, %%mm2 \n\t" // QP
1046 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1047 "psubb %%mm6, %%mm2 \n\t"
1048
1049 "movq %%mm4, %%mm1 \n\t"
1050 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1051 "pxor %%mm1, %%mm4 \n\t"
1052 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1053 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1054 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1055// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1056
1057 "movq %%mm4, %%mm3 \n\t" // d
1058 "psubusb b01, %%mm4 \n\t"
1059 PAVGB(%%mm7, %%mm4) // d/32
1060 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1061 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1062 "pand %%mm2, %%mm4 \n\t"
1063
1064 "movq b80, %%mm5 \n\t" // 128
1065 "psubb %%mm0, %%mm5 \n\t" // q
1066 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1067 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1068 "pxor %%mm7, %%mm5 \n\t"
1069
1070 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1071 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1072
1073 "pand %%mm7, %%mm4 \n\t"
1074 "movq (%%eax, %1, 2), %%mm0 \n\t"
1075 "movq (%0, %1, 4), %%mm2 \n\t"
1076 "pxor %%mm1, %%mm0 \n\t"
1077 "pxor %%mm1, %%mm2 \n\t"
1078 "paddb %%mm4, %%mm0 \n\t"
1079 "psubb %%mm4, %%mm2 \n\t"
1080 "pxor %%mm1, %%mm0 \n\t"
1081 "pxor %%mm1, %%mm2 \n\t"
1082 "movq %%mm0, (%%eax, %1, 2) \n\t"
1083 "movq %%mm2, (%0, %1, 4) \n\t"
1084
1085 :
1086 : "r" (src), "r" (stride)
1087 : "%eax", "%ebx"
1088 );
1089
1090/*
1091 {
1092 int x;
1093 src-= stride;
1094 for(x=0; x<BLOCK_SIZE; x++)
1095 {
1096 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1097 if(ABS(middleEnergy)< 8*QP)
1098 {
1099 const int q=(src[l4] - src[l5])/2;
1100 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1101 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1102
1103 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1104 d= MAX(d, 0);
1105
1106 d= (5*d + 32) >> 6;
1107 d*= SIGN(-middleEnergy);
1108
1109 if(q>0)
1110 {
1111 d= d<0 ? 0 : d;
1112 d= d>q ? q : d;
1113 }
1114 else
1115 {
1116 d= d>0 ? 0 : d;
1117 d= d<q ? q : d;
1118 }
1119
1120 src[l4]-= d;
1121 src[l5]+= d;
1122 }
1123 src++;
1124 }
1125src-=8;
1126 for(x=0; x<8; x++)
1127 {
1128 int y;
1129 for(y=4; y<6; y++)
1130 {
1131 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1132 int ad= ABS(d);
1133 static int max=0;
1134 static int sum=0;
1135 static int num=0;
1136 static int bias=0;
1137
1138 if(max<ad) max=ad;
1139 sum+= ad>3 ? 1 : 0;
1140 if(ad>3)
1141 {
1142 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1143 }
1144 if(y==4) bias+=d;
1145 num++;
1146 if(num%1000000 == 0)
1147 {
1148 printf(" %d %d %d %d\n", num, sum, max, bias);
1149 }
1150 }
1151 }
1152}
1153*/
1154#elif defined (HAVE_MMX)
acced553 1155 src+= stride*4;
7f16f6e6 1156
3057fa66
A
1157 asm volatile(
1158 "pxor %%mm7, %%mm7 \n\t"
1159 "leal (%0, %1), %%eax \n\t"
1160 "leal (%%eax, %1, 4), %%ebx \n\t"
1161// 0 1 2 3 4 5 6 7
1162// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1163// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1164
1165 "movq (%0), %%mm0 \n\t"
1166 "movq %%mm0, %%mm1 \n\t"
1167 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1168 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1169
1170 "movq (%%eax), %%mm2 \n\t"
1171 "movq %%mm2, %%mm3 \n\t"
1172 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1173 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1174
1175 "movq (%%eax, %1), %%mm4 \n\t"
1176 "movq %%mm4, %%mm5 \n\t"
1177 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1178 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1179
1180 "paddw %%mm0, %%mm0 \n\t" // 2L0
1181 "paddw %%mm1, %%mm1 \n\t" // 2H0
1182 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1183 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1184 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1185 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1186
1187 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1188 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1189 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1190 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1191
1192 "movq (%%eax, %1, 2), %%mm2 \n\t"
1193 "movq %%mm2, %%mm3 \n\t"
1194 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1195 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1196
1197 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1198 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1199 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1200 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1201 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1202 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1203
1204 "movq (%0, %1, 4), %%mm0 \n\t"
1205 "movq %%mm0, %%mm1 \n\t"
1206 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1207 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1208
1209 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1210 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1211 "movq %%mm2, temp2 \n\t" // L3 - L4
1212 "movq %%mm3, temp3 \n\t" // H3 - H4
1213 "paddw %%mm4, %%mm4 \n\t" // 2L2
1214 "paddw %%mm5, %%mm5 \n\t" // 2H2
1215 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1216 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1217
1218 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1219 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1220 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1221 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1222//50 opcodes so far
1223 "movq (%%ebx), %%mm2 \n\t"
1224 "movq %%mm2, %%mm3 \n\t"
1225 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1226 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1227 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1228 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1229 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1230 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1231
1232 "movq (%%ebx, %1), %%mm6 \n\t"
1233 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1234 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1235 "movq (%%ebx, %1), %%mm6 \n\t"
1236 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1237 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1238
1239 "paddw %%mm0, %%mm0 \n\t" // 2L4
1240 "paddw %%mm1, %%mm1 \n\t" // 2H4
1241 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1242 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1243
1244 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1245 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1246 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1247 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1248
1249 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1250 "movq %%mm2, %%mm3 \n\t"
1251 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1252 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1253
1254 "paddw %%mm2, %%mm2 \n\t" // 2L7
1255 "paddw %%mm3, %%mm3 \n\t" // 2H7
1256 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1257 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1258
1259 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1260 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
8405b3fd
MN
1261
1262#ifdef HAVE_MMX2
1263 "movq %%mm7, %%mm6 \n\t" // 0
1264 "psubw %%mm0, %%mm6 \n\t"
1265 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1266 "movq %%mm7, %%mm6 \n\t" // 0
1267 "psubw %%mm1, %%mm6 \n\t"
1268 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1269 "movq %%mm7, %%mm6 \n\t" // 0
1270 "psubw %%mm2, %%mm6 \n\t"
1271 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1272 "movq %%mm7, %%mm6 \n\t" // 0
1273 "psubw %%mm3, %%mm6 \n\t"
1274 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1275#else
3057fa66
A
1276 "movq %%mm7, %%mm6 \n\t" // 0
1277 "pcmpgtw %%mm0, %%mm6 \n\t"
1278 "pxor %%mm6, %%mm0 \n\t"
1279 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1280 "movq %%mm7, %%mm6 \n\t" // 0
1281 "pcmpgtw %%mm1, %%mm6 \n\t"
1282 "pxor %%mm6, %%mm1 \n\t"
1283 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3057fa66
A
1284 "movq %%mm7, %%mm6 \n\t" // 0
1285 "pcmpgtw %%mm2, %%mm6 \n\t"
1286 "pxor %%mm6, %%mm2 \n\t"
1287 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1288 "movq %%mm7, %%mm6 \n\t" // 0
1289 "pcmpgtw %%mm3, %%mm6 \n\t"
1290 "pxor %%mm6, %%mm3 \n\t"
1291 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
8405b3fd 1292#endif
3057fa66
A
1293
1294#ifdef HAVE_MMX2
1295 "pminsw %%mm2, %%mm0 \n\t"
1296 "pminsw %%mm3, %%mm1 \n\t"
1297#else
1298 "movq %%mm0, %%mm6 \n\t"
1299 "psubusw %%mm2, %%mm6 \n\t"
1300 "psubw %%mm6, %%mm0 \n\t"
1301 "movq %%mm1, %%mm6 \n\t"
1302 "psubusw %%mm3, %%mm6 \n\t"
1303 "psubw %%mm6, %%mm1 \n\t"
1304#endif
1305
1306 "movq %%mm7, %%mm6 \n\t" // 0
1307 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1308 "pxor %%mm6, %%mm4 \n\t"
1309 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1310 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1311 "pxor %%mm7, %%mm5 \n\t"
1312 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1313// 100 opcodes
1314 "movd %2, %%mm2 \n\t" // QP
3057fa66
A
1315 "punpcklwd %%mm2, %%mm2 \n\t"
1316 "punpcklwd %%mm2, %%mm2 \n\t"
1317 "psllw $3, %%mm2 \n\t" // 8QP
1318 "movq %%mm2, %%mm3 \n\t" // 8QP
1319 "pcmpgtw %%mm4, %%mm2 \n\t"
1320 "pcmpgtw %%mm5, %%mm3 \n\t"
1321 "pand %%mm2, %%mm4 \n\t"
1322 "pand %%mm3, %%mm5 \n\t"
1323
1324
1325 "psubusw %%mm0, %%mm4 \n\t" // hd
1326 "psubusw %%mm1, %%mm5 \n\t" // ld
1327
1328
1329 "movq w05, %%mm2 \n\t" // 5
1330 "pmullw %%mm2, %%mm4 \n\t"
1331 "pmullw %%mm2, %%mm5 \n\t"
1332 "movq w20, %%mm2 \n\t" // 32
1333 "paddw %%mm2, %%mm4 \n\t"
1334 "paddw %%mm2, %%mm5 \n\t"
1335 "psrlw $6, %%mm4 \n\t"
1336 "psrlw $6, %%mm5 \n\t"
1337
1338/*
1339 "movq w06, %%mm2 \n\t" // 6
1340 "paddw %%mm2, %%mm4 \n\t"
1341 "paddw %%mm2, %%mm5 \n\t"
1342 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1343//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1344 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1345 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1346*/
1347
1348 "movq temp2, %%mm0 \n\t" // L3 - L4
1349 "movq temp3, %%mm1 \n\t" // H3 - H4
1350
1351 "pxor %%mm2, %%mm2 \n\t"
1352 "pxor %%mm3, %%mm3 \n\t"
1353
3057fa66
A
1354 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1355 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1356 "pxor %%mm2, %%mm0 \n\t"
1357 "pxor %%mm3, %%mm1 \n\t"
1358 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1359 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
e5c30e06
MN
1360 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1361 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3057fa66
A
1362
1363 "pxor %%mm6, %%mm2 \n\t"
1364 "pxor %%mm7, %%mm3 \n\t"
1365 "pand %%mm2, %%mm4 \n\t"
1366 "pand %%mm3, %%mm5 \n\t"
1367
1368#ifdef HAVE_MMX2
1369 "pminsw %%mm0, %%mm4 \n\t"
1370 "pminsw %%mm1, %%mm5 \n\t"
1371#else
1372 "movq %%mm4, %%mm2 \n\t"
1373 "psubusw %%mm0, %%mm2 \n\t"
1374 "psubw %%mm2, %%mm4 \n\t"
1375 "movq %%mm5, %%mm2 \n\t"
1376 "psubusw %%mm1, %%mm2 \n\t"
1377 "psubw %%mm2, %%mm5 \n\t"
1378#endif
1379 "pxor %%mm6, %%mm4 \n\t"
1380 "pxor %%mm7, %%mm5 \n\t"
1381 "psubw %%mm6, %%mm4 \n\t"
1382 "psubw %%mm7, %%mm5 \n\t"
1383 "packsswb %%mm5, %%mm4 \n\t"
1384 "movq (%%eax, %1, 2), %%mm0 \n\t"
1385 "paddb %%mm4, %%mm0 \n\t"
1386 "movq %%mm0, (%%eax, %1, 2) \n\t"
1387 "movq (%0, %1, 4), %%mm0 \n\t"
1388 "psubb %%mm4, %%mm0 \n\t"
3057fa66
A
1389 "movq %%mm0, (%0, %1, 4) \n\t"
1390
1391 :
1392 : "r" (src), "r" (stride), "r" (QP)
1393 : "%eax", "%ebx"
1394 );
1395#else
1396 const int l1= stride;
1397 const int l2= stride + l1;
1398 const int l3= stride + l2;
1399 const int l4= stride + l3;
1400 const int l5= stride + l4;
1401 const int l6= stride + l5;
1402 const int l7= stride + l6;
1403 const int l8= stride + l7;
1404// const int l9= stride + l8;
d5a1a995 1405 int x;
acced553 1406 src+= stride*3;
d5a1a995 1407 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
1408 {
1409 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1410 if(ABS(middleEnergy) < 8*QP)
1411 {
1412 const int q=(src[l4] - src[l5])/2;
1413 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1414 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1415
1416 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1417 d= MAX(d, 0);
1418
1419 d= (5*d + 32) >> 6;
1420 d*= SIGN(-middleEnergy);
1421
1422 if(q>0)
1423 {
1424 d= d<0 ? 0 : d;
1425 d= d>q ? q : d;
1426 }
1427 else
1428 {
1429 d= d>0 ? 0 : d;
1430 d= d<q ? q : d;
1431 }
1432
1433 src[l4]-= d;
1434 src[l5]+= d;
1435 }
1436 src++;
1437 }
1438#endif
1439}
1440
3057fa66
A
/**
 * Deringing filter for one 8x8 block.
 *
 * The C version works on the 8x8 pixels at rows 1..8 / columns 1..8 relative
 * to src and reads a one pixel border around them (rows and columns 0..9):
 *  1. find min and max of the 8x8 area, avg= (min + max + 1)/2
 *  2. classify every pixel of the 10x10 area as above / not above avg
 *  3. a pixel is filtered only if the whole 3x3 neighbourhood around it falls
 *     on the same side of avg; it is then replaced by a 3x3 (1 2 1 / 2 4 2 /
 *     1 2 1)/16 lowpass, with the change limited to +-2*QP
 * Filtering is done in place, so already filtered pixels feed into the
 * lowpass of the pixels after them.
 *
 * @param src    top-left pixel of the 10x10 area
 * @param stride distance in bytes between two rows
 * @param QP     quantizer of the block; NOTE: the asm branch passes it as an
 *               operand but reads the global pQPb instead
 *
 * The asm branch additionally uses the globals pQPb2, temp0, temp1, b00 and
 * b08 (defined elsewhere in the file) and reads 4 bytes left of / 8 bytes
 * right of each row start.
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq pQPb, %%mm0 \n\t"
		"paddusb %%mm0, %%mm0 \n\t"
		"movq %%mm0, pQPb2 \n\t"

		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"pcmpeqb %%mm6, %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0 \n\t"\
		"pminub %%mm0, %%mm6 \n\t"\
		"pmaxub %%mm0, %%mm7 \n\t"
#else
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0 \n\t"\
		"movq %%mm6, %%mm1 \n\t"\
		"psubusb %%mm0, %%mm7 \n\t"\
		"paddb %%mm0, %%mm7 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"psubb %%mm1, %%mm6 \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

		"movq %%mm6, %%mm4 \n\t"
		"psrlq $8, %%mm6 \n\t"
#ifdef HAVE_MMX2
		"pminub %%mm4, %%mm6 \n\t" // min of pixels
		"pshufw $0xF9, %%mm6, %%mm4 \n\t"
		"pminub %%mm4, %%mm6 \n\t" // min of pixels
		"pshufw $0xFE, %%mm6, %%mm4 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
#else
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $16, %%mm6 \n\t"
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $32, %%mm6 \n\t"
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
#endif


		"movq %%mm7, %%mm4 \n\t"
		"psrlq $8, %%mm7 \n\t"
#ifdef HAVE_MMX2
		"pmaxub %%mm4, %%mm7 \n\t" // max of pixels
		"pshufw $0xF9, %%mm7, %%mm4 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"pshufw $0xFE, %%mm7, %%mm4 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
#else
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $16, %%mm7 \n\t"
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $32, %%mm7 \n\t"
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
#endif
		PAVGB(%%mm6, %%mm7) // a=(max + min)/2
		"punpcklbw %%mm7, %%mm7 \n\t"
		"punpcklbw %%mm7, %%mm7 \n\t"
		"punpcklbw %%mm7, %%mm7 \n\t"
		"movq %%mm7, temp0 \n\t"

		"movq (%0), %%mm0 \n\t" // L10
		"movq %%mm0, %%mm1 \n\t" // L10
		"movq %%mm0, %%mm2 \n\t" // L10
		"psllq $8, %%mm1 \n\t"
		"psrlq $8, %%mm2 \n\t"
		"movd -4(%0), %%mm3 \n\t"
		"movd 8(%0), %%mm4 \n\t"
		"psrlq $24, %%mm3 \n\t"
		"psllq $56, %%mm4 \n\t"
		"por %%mm3, %%mm1 \n\t" // L00
		"por %%mm4, %%mm2 \n\t" // L20
		"movq %%mm1, %%mm3 \n\t" // L00
		PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
		PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
		"psubusb %%mm7, %%mm0 \n\t"
		"psubusb %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm3 \n\t"
		"pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
		"pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
		"pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
		"paddb %%mm2, %%mm0 \n\t"
		"paddb %%mm3, %%mm0 \n\t"

		"movq (%%eax), %%mm2 \n\t" // L11
		"movq %%mm2, %%mm3 \n\t" // L11
		"movq %%mm2, %%mm4 \n\t" // L11
		"psllq $8, %%mm3 \n\t"
		"psrlq $8, %%mm4 \n\t"
		"movd -4(%%eax), %%mm5 \n\t"
		"movd 8(%%eax), %%mm6 \n\t"
		"psrlq $24, %%mm5 \n\t"
		"psllq $56, %%mm6 \n\t"
		"por %%mm5, %%mm3 \n\t" // L01
		"por %%mm6, %%mm4 \n\t" // L21
		"movq %%mm3, %%mm5 \n\t" // L01
		PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
		PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
		"psubusb %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm4 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
		"pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
		"pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
		"paddb %%mm4, %%mm2 \n\t"
		"paddb %%mm5, %%mm2 \n\t"
// 0, 2, 3, 1
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
		"movq " #src ", " #sx " \n\t" /* src[0] */\
		"movq " #sx ", " #lx " \n\t" /* src[0] */\
		"movq " #sx ", " #t0 " \n\t" /* src[0] */\
		"psllq $8, " #lx " \n\t"\
		"psrlq $8, " #t0 " \n\t"\
		"movd -4" #src ", " #t1 " \n\t"\
		"psrlq $24, " #t1 " \n\t"\
		"por " #t1 ", " #lx " \n\t" /* src[-1] */\
		"movd 8" #src ", " #t1 " \n\t"\
		"psllq $56, " #t1 " \n\t"\
		"por " #t1 ", " #t0 " \n\t" /* src[+1] */\
		"movq " #lx ", " #t1 " \n\t" /* src[-1] */\
		PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
		PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
		PAVGB(lx, pplx) \
		"movq " #lx ", temp1 \n\t"\
		"movq temp0, " #lx " \n\t"\
		"psubusb " #lx ", " #t1 " \n\t"\
		"psubusb " #lx ", " #t0 " \n\t"\
		"psubusb " #lx ", " #sx " \n\t"\
		"movq b00, " #lx " \n\t"\
		"pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
		"paddb " #t1 ", " #t0 " \n\t"\
		"paddb " #t0 ", " #sx " \n\t"\
\
		PAVGB(plx, pplx) /* filtered */\
		"movq " #dst ", " #t0 " \n\t" /* dst */\
		"movq " #t0 ", " #t1 " \n\t" /* dst */\
		"psubusb pQPb2, " #t0 " \n\t"\
		"paddusb pQPb2, " #t1 " \n\t"\
		PMAXUB(t0, pplx)\
		PMINUB(t1, pplx, t0)\
		"paddb " #sx ", " #ppsx " \n\t"\
		"paddb " #psx ", " #ppsx " \n\t"\
		"#paddb b02, " #ppsx " \n\t"\
		"pand b08, " #ppsx " \n\t"\
		"pcmpeqb " #lx ", " #ppsx " \n\t"\
		"pand " #ppsx ", " #pplx " \n\t"\
		"pandn " #dst ", " #ppsx " \n\t"\
		"por " #pplx ", " #ppsx " \n\t"\
		"movq " #ppsx ", " #dst " \n\t"\
		"movq temp1, " #lx " \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	int y;
	int min=255;
	int max=0;
	int avg;
	uint8_t *p;
	/* per row classification masks, see below; unsigned so that the shifts
	   are well defined (the masks use the upper 16 bits too) */
	uint32_t s[10];

	/* min / max of the inner 8x8 pixels */
	for(y=1; y<9; y++)
	{
		int x;
		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(*p > max) max= *p;
			if(*p < min) min= *p;
		}
	}
	avg= (min + max + 1)/2;

	/* s[y] bit x    : pixels x-1, x, x+1 of row y are all above avg
	   s[y] bit x+16 : pixels x-1, x, x+1 of row y are all not above avg */
	for(y=0; y<10; y++)
	{
		int x;
		uint32_t t = 0;
		p= src + stride*y;
		for(x=0; x<10; x++)
		{
			if(*p > avg) t |= (uint32_t)1<<x;
			p++;
		}
		t |= (uint32_t)(~t)<<16;
		t &= (t<<1) & (t>>1);
		s[y] = t;
	}

	for(y=1; y<9; y++)
	{
		int x;
		/* same test vertically -> whole 3x3 neighbourhood on one side of avg */
		uint32_t t = s[y-1] & s[y] & s[y+1];
		t|= t>>16;

		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(t & ((uint32_t)1<<x))
			{
				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
				f= (f + 8)>>4;

				/* limit the change to +-2*QP */
				if     (*p + 2*QP < f) *p= *p + 2*QP;
				else if(*p - 2*QP > f) *p= *p - 2*QP;
				else *p=f;
			}
		}
	}

#endif
}
3b58b885
MN
/**
 * Deinterlaces the given block by linear interpolation.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already,
 * but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be
 * deinterlaced.
 *
 * This filter reads lines 4, 6, 8, 10, 12 and overwrites lines 5, 7, 9, 11
 * with the average of the line above and the line below, 8 pixels wide.
 * The C version rounds the average up ((a + b + 1)>>1) so that it gives the
 * same result as the PAVGB based version (assuming PAVGB expands to
 * pavgb / pavgusb, the macro is defined elsewhere in the file).
 *
 * @param src    top-left pixel of the block (line 0)
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ebx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		/* +1: round up like PAVGB does */
		src[stride  ] = (src[0       ] + src[stride*2] + 1)>>1;
		src[stride*3] = (src[stride*2] + src[stride*4] + 1)>>1;
		src[stride*5] = (src[stride*4] + src[stride*6] + 1)>>1;
		src[stride*7] = (src[stride*6] + src[stride*8] + 1)>>1;
		src++;
	}
#endif
}
1761
/**
 * Deinterlaces the given block by cubic interpolation.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already,
 * but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be
 * deinterlaced.
 *
 * This filter reads lines 3, 5, 7, ..., 15 and overwrites lines 6, 8, 10, 12
 * with (-a + 9b + 9c - d)/16, where b, c are the lines directly above / below
 * and a, d the lines 3 above / below the written line.
 * The result is clipped to 0..255 in both versions (packuswb in the asm
 * version, explicitly in the C version).
 *
 * @param src    top-left pixel of the block (line 0)
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
		"leal (%%ebx, %1, 4), %%ecx \n\t"
		"addl %1, %%ecx \n\t"
		"pxor %%mm7, %%mm7 \n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1	ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm1 \n\t"\
		"movq " #d ", %%mm2 \n\t"\
		"movq " #e ", %%mm3 \n\t"\
		PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
		"movq %%mm0, %%mm2 \n\t"\
		"punpcklbw %%mm7, %%mm0 \n\t"\
		"punpckhbw %%mm7, %%mm2 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm1 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1 \n\t"\
		"movq %%mm1, " #c " \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx", "ecx"
	);
#else
	int x;
	src+= stride*3;
	for(x=0; x<8; x++)
	{
		int y;
		/* only odd lines are written and only even lines are read,
		   so the 4 output lines do not depend on each other */
		for(y=3; y<=9; y+=2)
		{
			int v= 9*(src[stride*(y-1)] + src[stride*(y+1)])
			        - src[stride*(y-3)] - src[stride*(y+3)];

			/* the cubic filter can over/undershoot, saturate instead of
			   letting the uint8_t store wrap around */
			if(v < 0) v= 0;
			else
			{
				v>>= 4;
				if(v > 255) v= 255;
			}
			src[stride*y]= v;
		}
		src++;
	}
#endif
}
1826
/**
 * Deinterlaces the given block by blending each line with its neighbours.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already,
 * but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be
 * deinterlaced.
 * Will shift the image up by 1 line (FIXME if this is a problem).
 * This filter will read lines 4-13 and write 4-11.
 *
 * Every written line n becomes roughly (L[n] + 2*L[n+1] + L[n+2])/4, computed
 * as avg(avg(L[n], L[n+2]), L[n+1]) with both averages rounded up. The C
 * version uses the same two step rounding as the PAVGB based version so that
 * both give the same result (assuming PAVGB expands to pavgb / pavgusb, the
 * macro is defined elsewhere in the file).
 *
 * @param src    top-left pixel of the block (line 0)
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm1 \n\t" // L2
		PAVGB(%%mm1, %%mm0) // L0+L2
		"movq (%%eax), %%mm2 \n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0) \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t" // L3
		PAVGB(%%mm0, %%mm2) // L1+L3
		PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
		"movq %%mm2, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm2 \n\t" // L4
		PAVGB(%%mm2, %%mm1) // L2+L4
		PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1) \n\t"
		"movq (%%ebx), %%mm1 \n\t" // L5
		PAVGB(%%mm1, %%mm0) // L3+L5
		PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm0 \n\t" // L6
		PAVGB(%%mm0, %%mm2) // L4+L6
		PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4) \n\t"
		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		PAVGB(%%mm2, %%mm1) // L5+L7
		PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm1 \n\t" // L8
		PAVGB(%%mm1, %%mm0) // L6+L8
		PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1) \n\t"
		"movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
		PAVGB(%%mm0, %%mm2) // L7+L9
		PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2) \n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		int y;
		/* top to bottom: line y only needs lines y, y+1 and y+2, none of
		   which has been overwritten yet */
		for(y=0; y<8; y++)
		{
			const int outer= (src[stride*y] + src[stride*(y+2)] + 1)>>1;
			src[stride*y]= (outer + src[stride*(y+1)] + 1)>>1;
		}
		src++;
	}
#endif
}
1901
/**
 * Deinterlaces the given block with a vertical 3 tap median
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * Counting from 4 lines below src, lines 1, 3, 5 and 7 are replaced by the
 * median of themselves and their upper and lower neighbour, the even lines
 * 0-8 are only read.
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
	src+= 4*stride;
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// median(a,b,c) = min(max(a,b), max(min(a,b), c))
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm2 \n\t" // L2
		"movq (%%eax), %%mm1 \n\t" // L1
		"movq %%mm0, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm3, %%mm1 \n\t" //
		"pmaxub %%mm2, %%mm1 \n\t" //
		"pminub %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%%eax) \n\t"

		"movq (%0, %1, 4), %%mm0 \n\t" // L4
		"movq (%%eax, %1, 2), %%mm1 \n\t" // L3
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm2 \n\t" //
		"pminub %%mm3, %%mm1 \n\t" //
		"pmaxub %%mm0, %%mm1 \n\t" //
		"pminub %%mm1, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ebx), %%mm2 \n\t" // L5
		"movq (%%ebx, %1), %%mm1 \n\t" // L6
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" //
		"pminub %%mm3, %%mm0 \n\t" //
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t"

		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		"movq (%0, %1, 8), %%mm0 \n\t" // L8
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" //
		"pminub %%mm3, %%mm0 \n\t" //
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		// the picture is accessed through %0 only, tell the compiler
		: "%eax", "%ebx", "memory"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7 \n\t"

// stores the median of a, b and c into b; pminub / pmaxub dont exist here,
// so the median is selected with compare masks
#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"movq " #c ", %%mm1 \n\t"\
		"movq %%mm0, %%mm3 \n\t"\
		"movq %%mm1, %%mm4 \n\t"\
		"movq %%mm2, %%mm5 \n\t"\
		"psubusb %%mm1, %%mm3 \n\t"\
		"psubusb %%mm2, %%mm4 \n\t"\
		"psubusb %%mm0, %%mm5 \n\t"\
		"pcmpeqb %%mm7, %%mm3 \n\t"\
		"pcmpeqb %%mm7, %%mm4 \n\t"\
		"pcmpeqb %%mm7, %%mm5 \n\t"\
		"movq %%mm3, %%mm6 \n\t"\
		"pxor %%mm4, %%mm3 \n\t"\
		"pxor %%mm5, %%mm4 \n\t"\
		"pxor %%mm6, %%mm5 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		"por %%mm5, %%mm0 \n\t"\
		"pand %%mm2, %%mm0 \n\t"\
		"pand %%mm1, %%mm0 \n\t"\
		"movq %%mm0, " #b " \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		// the picture is accessed through %0 only, tell the compiler
		: "%eax", "%ebx", "memory"
	);
#endif // HAVE_MMX2
#else // no MMX
	int x, y;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		// only odd lines are written and only even lines are used as
		// neighbours, so the result doesnt depend on the processing order
		for(y=1; y<8; y+=2)
		{
			const int above= src[(y-1)*stride];
			const int cur  = src[ y   *stride];
			const int below= src[(y+1)*stride];
			const int lo= above < cur ? above : cur;
			const int hi= above < cur ? cur : above;
			const int m = lo > below ? lo : below;	// max(min(above,cur), below)

			src[y*stride]= hi < m ? hi : m;		// min(max(above,cur), m)
		}
		src++;
	}
#endif
}
2020
e5c30e06 2021#ifdef HAVE_MMX
4e4dcbc5
MN
2022/**
2023 * transposes and shift the given 8x8 Block into dst1 and dst2
2024 */
2025static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2026{
2027 asm(
2028 "leal (%0, %1), %%eax \n\t"
2029 "leal (%%eax, %1, 4), %%ebx \n\t"
2030// 0 1 2 3 4 5 6 7 8 9
2031// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2032 "movq (%0), %%mm0 \n\t" // 12345678
2033 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2034 "movq %%mm0, %%mm2 \n\t" // 12345678
2035 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2036 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2037
2038 "movq (%%eax, %1), %%mm1 \n\t"
2039 "movq (%%eax, %1, 2), %%mm3 \n\t"
2040 "movq %%mm1, %%mm4 \n\t"
2041 "punpcklbw %%mm3, %%mm1 \n\t"
2042 "punpckhbw %%mm3, %%mm4 \n\t"
2043
2044 "movq %%mm0, %%mm3 \n\t"
2045 "punpcklwd %%mm1, %%mm0 \n\t"
2046 "punpckhwd %%mm1, %%mm3 \n\t"
2047 "movq %%mm2, %%mm1 \n\t"
2048 "punpcklwd %%mm4, %%mm2 \n\t"
2049 "punpckhwd %%mm4, %%mm1 \n\t"
2050
2051 "movd %%mm0, 128(%2) \n\t"
2052 "psrlq $32, %%mm0 \n\t"
2053 "movd %%mm0, 144(%2) \n\t"
2054 "movd %%mm3, 160(%2) \n\t"
2055 "psrlq $32, %%mm3 \n\t"
2056 "movd %%mm3, 176(%2) \n\t"
2057 "movd %%mm3, 48(%3) \n\t"
2058 "movd %%mm2, 192(%2) \n\t"
2059 "movd %%mm2, 64(%3) \n\t"
2060 "psrlq $32, %%mm2 \n\t"
2061 "movd %%mm2, 80(%3) \n\t"
2062 "movd %%mm1, 96(%3) \n\t"
2063 "psrlq $32, %%mm1 \n\t"
2064 "movd %%mm1, 112(%3) \n\t"
2065
2066 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2067 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2068 "movq %%mm0, %%mm2 \n\t" // 12345678
2069 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2070 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2071
2072 "movq (%%ebx, %1), %%mm1 \n\t"
2073 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2074 "movq %%mm1, %%mm4 \n\t"
2075 "punpcklbw %%mm3, %%mm1 \n\t"
2076 "punpckhbw %%mm3, %%mm4 \n\t"
2077
2078 "movq %%mm0, %%mm3 \n\t"
2079 "punpcklwd %%mm1, %%mm0 \n\t"
2080 "punpckhwd %%mm1, %%mm3 \n\t"
2081 "movq %%mm2, %%mm1 \n\t"
2082 "punpcklwd %%mm4, %%mm2 \n\t"
2083 "punpckhwd %%mm4, %%mm1 \n\t"
2084
2085 "movd %%mm0, 132(%2) \n\t"
2086 "psrlq $32, %%mm0 \n\t"
2087 "movd %%mm0, 148(%2) \n\t"
2088 "movd %%mm3, 164(%2) \n\t"
2089 "psrlq $32, %%mm3 \n\t"
2090 "movd %%mm3, 180(%2) \n\t"
2091 "movd %%mm3, 52(%3) \n\t"
2092 "movd %%mm2, 196(%2) \n\t"
2093 "movd %%mm2, 68(%3) \n\t"
2094 "psrlq $32, %%mm2 \n\t"
2095 "movd %%mm2, 84(%3) \n\t"
2096 "movd %%mm1, 100(%3) \n\t"
2097 "psrlq $32, %%mm1 \n\t"
2098 "movd %%mm1, 116(%3) \n\t"
2099
2100
2101 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2102 : "%eax", "%ebx"
2103 );
2104}
2105
2106/**
2107 * transposes the given 8x8 block
2108 */
2109static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2110{
2111 asm(
2112 "leal (%0, %1), %%eax \n\t"
2113 "leal (%%eax, %1, 4), %%ebx \n\t"
2114// 0 1 2 3 4 5 6 7 8 9
2115// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2116 "movq (%2), %%mm0 \n\t" // 12345678
2117 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2118 "movq %%mm0, %%mm2 \n\t" // 12345678
2119 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2120 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2121
2122 "movq 32(%2), %%mm1 \n\t"
2123 "movq 48(%2), %%mm3 \n\t"
2124 "movq %%mm1, %%mm4 \n\t"
2125 "punpcklbw %%mm3, %%mm1 \n\t"
2126 "punpckhbw %%mm3, %%mm4 \n\t"
2127
2128 "movq %%mm0, %%mm3 \n\t"
2129 "punpcklwd %%mm1, %%mm0 \n\t"
2130 "punpckhwd %%mm1, %%mm3 \n\t"
2131 "movq %%mm2, %%mm1 \n\t"
2132 "punpcklwd %%mm4, %%mm2 \n\t"
2133 "punpckhwd %%mm4, %%mm1 \n\t"
2134
2135 "movd %%mm0, (%0) \n\t"
2136 "psrlq $32, %%mm0 \n\t"
2137 "movd %%mm0, (%%eax) \n\t"
2138 "movd %%mm3, (%%eax, %1) \n\t"
2139 "psrlq $32, %%mm3 \n\t"
2140 "movd %%mm3, (%%eax, %1, 2) \n\t"
2141 "movd %%mm2, (%0, %1, 4) \n\t"
2142 "psrlq $32, %%mm2 \n\t"
2143 "movd %%mm2, (%%ebx) \n\t"
2144 "movd %%mm1, (%%ebx, %1) \n\t"
2145 "psrlq $32, %%mm1 \n\t"
2146 "movd %%mm1, (%%ebx, %1, 2) \n\t"
2147
2148
2149 "movq 64(%2), %%mm0 \n\t" // 12345678
2150 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2151 "movq %%mm0, %%mm2 \n\t" // 12345678
2152 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2153 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2154
2155 "movq 96(%2), %%mm1 \n\t"
2156 "movq 112(%2), %%mm3 \n\t"
2157 "movq %%mm1, %%mm4 \n\t"
2158 "punpcklbw %%mm3, %%mm1 \n\t"
2159 "punpckhbw %%mm3, %%mm4 \n\t"
2160
2161 "movq %%mm0, %%mm3 \n\t"
2162 "punpcklwd %%mm1, %%mm0 \n\t"
2163 "punpckhwd %%mm1, %%mm3 \n\t"
2164 "movq %%mm2, %%mm1 \n\t"
2165 "punpcklwd %%mm4, %%mm2 \n\t"
2166 "punpckhwd %%mm4, %%mm1 \n\t"
2167
2168 "movd %%mm0, 4(%0) \n\t"
2169 "psrlq $32, %%mm0 \n\t"
2170 "movd %%mm0, 4(%%eax) \n\t"
2171 "movd %%mm3, 4(%%eax, %1) \n\t"
2172 "psrlq $32, %%mm3 \n\t"
2173 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2174 "movd %%mm2, 4(%0, %1, 4) \n\t"
2175 "psrlq $32, %%mm2 \n\t"
2176 "movd %%mm2, 4(%%ebx) \n\t"
2177 "movd %%mm1, 4(%%ebx, %1) \n\t"
2178 "psrlq $32, %%mm1 \n\t"
2179 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2180
2181 :: "r" (dst), "r" (dstStride), "r" (src)
2182 : "%eax", "%ebx"
2183 );
2184}
e5c30e06 2185#endif
be44a4d7 2186//static int test=0;
4e4dcbc5 2187
117e45b0 2188static void inline tempNoiseReducer(uint8_t *src, int stride,
a9c77978 2189 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
117e45b0 2190{
be44a4d7
MN
2191#define FAST_L2_DIFF
2192//#define L1_DIFF //u should change the thresholds too if u try that one
2193#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2194 asm volatile(
2195 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
2196 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride
2197 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2198// 0 1 2 3 4 5 6 7 8 9
2199// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2
2200//FIXME reorder?
2201#ifdef L1_DIFF //needs mmx2
2202 "movq (%0), %%mm0 \n\t" // L0
2203 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2204 "movq (%0, %2), %%mm1 \n\t" // L1
2205 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2206 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2207 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2208 "movq (%0, %%eax), %%mm3 \n\t" // L3
2209 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
2210
2211 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2212 "paddw %%mm1, %%mm0 \n\t"
2213 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2214 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2215 "paddw %%mm2, %%mm0 \n\t"
2216 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5|
2217 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2218 "paddw %%mm3, %%mm0 \n\t"
2219 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
2220 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2221 "paddw %%mm4, %%mm0 \n\t"
2222 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
2223 "paddw %%mm5, %%mm6 \n\t"
2224 "paddw %%mm7, %%mm6 \n\t"
2225 "paddw %%mm6, %%mm0 \n\t"
2226#elif defined (FAST_L2_DIFF)
2227 "pcmpeqb %%mm7, %%mm7 \n\t"
2228 "movq b80, %%mm6 \n\t"
2229 "pxor %%mm0, %%mm0 \n\t"
2230#define L2_DIFF_CORE(a, b)\
2231 "movq " #a ", %%mm5 \n\t"\
2232 "movq " #b ", %%mm2 \n\t"\
2233 "pxor %%mm7, %%mm2 \n\t"\
2234 PAVGB(%%mm2, %%mm5)\
2235 "paddb %%mm6, %%mm5 \n\t"\
2236 "movq %%mm5, %%mm2 \n\t"\
2237 "psllw $8, %%mm5 \n\t"\
2238 "pmaddwd %%mm5, %%mm5 \n\t"\
2239 "pmaddwd %%mm2, %%mm2 \n\t"\
2240 "paddd %%mm2, %%mm5 \n\t"\
2241 "psrld $14, %%mm5 \n\t"\
2242 "paddd %%mm5, %%mm0 \n\t"
2243
2244L2_DIFF_CORE((%0), (%1))
2245L2_DIFF_CORE((%0, %2), (%1, %2))
2246L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2247L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2248L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2249L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2250L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2251L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2252
2253#else
2254 "pxor %%mm7, %%mm7 \n\t"
2255 "pxor %%mm0, %%mm0 \n\t"
2256#define L2_DIFF_CORE(a, b)\
2257 "movq " #a ", %%mm5 \n\t"\
2258 "movq " #b ", %%mm2 \n\t"\
2259 "movq %%mm5, %%mm1 \n\t"\
2260 "movq %%mm2, %%mm3 \n\t"\
2261 "punpcklbw %%mm7, %%mm5 \n\t"\
2262 "punpckhbw %%mm7, %%mm1 \n\t"\
2263 "punpcklbw %%mm7, %%mm2 \n\t"\
2264 "punpckhbw %%mm7, %%mm3 \n\t"\
2265 "psubw %%mm2, %%mm5 \n\t"\
2266 "psubw %%mm3, %%mm1 \n\t"\
2267 "pmaddwd %%mm5, %%mm5 \n\t"\
2268 "pmaddwd %%mm1, %%mm1 \n\t"\
2269 "paddd %%mm1, %%mm5 \n\t"\
2270 "paddd %%mm5, %%mm0 \n\t"
2271
2272L2_DIFF_CORE((%0), (%1))
2273L2_DIFF_CORE((%0, %2), (%1, %2))
2274L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2275L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2276L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2277L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2278L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2279L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2280
2281#endif
2282
2283 "movq %%mm0, %%mm4 \n\t"
2284 "psrlq $32, %%mm0 \n\t"
2285 "paddd %%mm0, %%mm4 \n\t"
2286 "movd %%mm4, %%ecx \n\t"
a9c77978
MN
2287 "shll $2, %%ecx \n\t"
2288 "movl %3, %%ebx \n\t"
2289 "addl -4(%%ebx), %%ecx \n\t"
2290 "addl 4(%%ebx), %%ecx \n\t"
2291 "addl -1024(%%ebx), %%ecx \n\t"
2292 "addl $4, %%ecx \n\t"
2293 "addl 1024(%%ebx), %%ecx \n\t"
2294 "shrl $3, %%ecx \n\t"
2295 "movl %%ecx, (%%ebx) \n\t"
2296 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride
2297
be44a4d7
MN
2298// "movl %3, %%ecx \n\t"
2299// "movl %%ecx, test \n\t"
2300// "jmp 4f \n\t"
a9c77978 2301 "cmpl 4+maxTmpNoise, %%ecx \n\t"
be44a4d7 2302 " jb 2f \n\t"
a9c77978 2303 "cmpl 8+maxTmpNoise, %%ecx \n\t"
be44a4d7
MN
2304 " jb 1f \n\t"
2305
2306 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2307 "movq (%0), %%mm0 \n\t" // L0
2308 "movq (%0, %2), %%mm1 \n\t" // L1
2309 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2310 "movq (%0, %%eax), %%mm3 \n\t" // L3
2311 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2312 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2313 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2314 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2315 "movq %%mm0, (%1) \n\t" // L0
2316 "movq %%mm1, (%1, %2) \n\t" // L1
2317 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2318 "movq %%mm3, (%1, %%eax) \n\t" // L3
2319 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2320 "movq %%mm5, (%1, %%ebx) \n\t" // L5
2321 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2322 "movq %%mm7, (%1, %%ecx) \n\t" // L7
2323 "jmp 4f \n\t"
2324
2325 "1: \n\t"
2326 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2327 "movq (%0), %%mm0 \n\t" // L0
2328 "pavgb (%1), %%mm0 \n\t" // L0
2329 "movq (%0, %2), %%mm1 \n\t" // L1
2330 "pavgb (%1, %2), %%mm1 \n\t" // L1
2331 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2332 "pavgb (%1, %2, 2), %%mm2 \n\t" // L2
2333 "movq (%0, %%eax), %%mm3 \n\t" // L3
2334 "pavgb (%1, %%eax), %%mm3 \n\t" // L3
2335 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2336 "pavgb (%1, %2, 4), %%mm4 \n\t" // L4
2337 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2338 "pavgb (%1, %%ebx), %%mm5 \n\t" // L5
2339 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2340 "pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6
2341 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2342 "pavgb (%1, %%ecx), %%mm7 \n\t" // L7
2343 "movq %%mm0, (%1) \n\t" // R0
2344 "movq %%mm1, (%1, %2) \n\t" // R1
2345 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2346 "movq %%mm3, (%1, %%eax) \n\t" // R3
2347 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2348 "movq %%mm5, (%1, %%ebx) \n\t" // R5
2349 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2350 "movq %%mm7, (%1, %%ecx) \n\t" // R7
2351 "movq %%mm0, (%0) \n\t" // L0
2352 "movq %%mm1, (%0, %2) \n\t" // L1
2353 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2354 "movq %%mm3, (%0, %%eax) \n\t" // L3
2355 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2356 "movq %%mm5, (%0, %%ebx) \n\t" // L5
2357 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2358 "movq %%mm7, (%0, %%ecx) \n\t" // L7
2359 "jmp 4f \n\t"
2360
2361 "2: \n\t"
a9c77978 2362 "cmpl maxTmpNoise, %%ecx \n\t"
be44a4d7
MN
2363 " jb 3f \n\t"
2364
2365 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2366 "movq (%0), %%mm0 \n\t" // L0
2367 "movq (%0, %2), %%mm1 \n\t" // L1
2368 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2369 "movq (%0, %%eax), %%mm3 \n\t" // L3
2370 "movq (%1), %%mm4 \n\t" // R0
2371 "movq (%1, %2), %%mm5 \n\t" // R1
2372 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2373 "movq (%1, %%eax), %%mm7 \n\t" // R3
2374 PAVGB(%%mm4, %%mm0)
2375 PAVGB(%%mm5, %%mm1)
2376 PAVGB(%%mm6, %%mm2)
2377 PAVGB(%%mm7, %%mm3)
2378 PAVGB(%%mm4, %%mm0)
2379 PAVGB(%%mm5, %%mm1)
2380 PAVGB(%%mm6, %%mm2)
2381 PAVGB(%%mm7, %%mm3)
2382 "movq %%mm0, (%1) \n\t" // R0
2383 "movq %%mm1, (%1, %2) \n\t" // R1
2384 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2385 "movq %%mm3, (%1, %%eax) \n\t" // R3
2386 "movq %%mm0, (%0) \n\t" // L0
2387 "movq %%mm1, (%0, %2) \n\t" // L1
2388 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2389 "movq %%mm3, (%0, %%eax) \n\t" // L3
2390
2391 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2392 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2393 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2394 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2395 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2396 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2397 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2398 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2399 PAVGB(%%mm4, %%mm0)
2400 PAVGB(%%mm5, %%mm1)
2401 PAVGB(%%mm6, %%mm2)
2402 PAVGB(%%mm7, %%mm3)
2403 PAVGB(%%mm4, %%mm0)
2404 PAVGB(%%mm5, %%mm1)
2405 PAVGB(%%mm6, %%mm2)
2406 PAVGB(%%mm7, %%mm3)
2407 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2408 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2409 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2410 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2411 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2412 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2413 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2414 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2415 "jmp 4f \n\t"
2416
2417 "3: \n\t"
2418 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2419 "movq (%0), %%mm0 \n\t" // L0
2420 "movq (%0, %2), %%mm1 \n\t" // L1
2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2422 "movq (%0, %%eax), %%mm3 \n\t" // L3
2423 "movq (%1), %%mm4 \n\t" // R0
2424 "movq (%1, %2), %%mm5 \n\t" // R1
2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2426 "movq (%1, %%eax), %%mm7 \n\t" // R3
2427 PAVGB(%%mm4, %%mm0)
2428 PAVGB(%%mm5, %%mm1)
2429 PAVGB(%%mm6, %%mm2)
2430 PAVGB(%%mm7, %%mm3)
2431 PAVGB(%%mm4, %%mm0)
2432 PAVGB(%%mm5, %%mm1)
2433 PAVGB(%%mm6, %%mm2)
2434 PAVGB(%%mm7, %%mm3)
2435 PAVGB(%%mm4, %%mm0)
2436 PAVGB(%%mm5, %%mm1)
2437 PAVGB(%%mm6, %%mm2)
2438 PAVGB(%%mm7, %%mm3)
2439 "movq %%mm0, (%1) \n\t" // R0
2440 "movq %%mm1, (%1, %2) \n\t" // R1
2441 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2442 "movq %%mm3, (%1, %%eax) \n\t" // R3
2443 "movq %%mm0, (%0) \n\t" // L0
2444 "movq %%mm1, (%0, %2) \n\t" // L1
2445 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2446 "movq %%mm3, (%0, %%eax) \n\t" // L3
2447
2448 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2449 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2450 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2451 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2452 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2453 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2454 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2455 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2456 PAVGB(%%mm4, %%mm0)
2457 PAVGB(%%mm5, %%mm1)
2458 PAVGB(%%mm6, %%mm2)
2459 PAVGB(%%mm7, %%mm3)
2460 PAVGB(%%mm4, %%mm0)
2461 PAVGB(%%mm5, %%mm1)
2462 PAVGB(%%mm6, %%mm2)
2463 PAVGB(%%mm7, %%mm3)
2464 PAVGB(%%mm4, %%mm0)
2465 PAVGB(%%mm5, %%mm1)
2466 PAVGB(%%mm6, %%mm2)
2467 PAVGB(%%mm7, %%mm3)
2468 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2469 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2470 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2471 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2472 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2473 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2474 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2475 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2476
2477 "4: \n\t"
2478
a9c77978 2479 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
be44a4d7
MN
2480 : "%eax", "%ebx", "%ecx", "memory"
2481 );
2482//printf("%d\n", test);
2483#else
117e45b0
MN
2484 int y;
2485 int d=0;
2486 int sysd=0;
a9c77978 2487 int i;
117e45b0
MN
2488
2489 for(y=0; y<8; y++)
2490 {
2491 int x;
2492 for(x=0; x<8; x++)
2493 {
2494 int ref= tempBlured[ x + y*stride ];
2495 int cur= src[ x + y*stride ];
2496 int d1=ref - cur;
be44a4d7
MN
2497// if(x==0 || x==7) d1+= d1>>1;
2498// if(y==0 || y==7) d1+= d1>>1;
2499// d+= ABS(d1);
2500 d+= d1*d1;
117e45b0
MN
2501 sysd+= d1;
2502 }
2503 }
a9c77978
MN
2504 i=d;
2505 d= (
2506 4*d
2507 +(*(tempBluredPast-256))
2508 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2509 +(*(tempBluredPast+256))
2510 +4)>>3;
2511 *tempBluredPast=i;
2512// ((*tempBluredPast)*3 + d + 2)>>2;
2513
117e45b0
MN
2514//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2515/*
2516Switch between
2517 1 0 0 0 0 0 0 (0)
251864 32 16 8 4 2 1 (1)
251964 48 36 27 20 15 11 (33) (approx)
252064 56 49 43 37 33 29 (200) (approx)
2521*/
2522 if(d > maxNoise[1])
2523 {
2524 if(d < maxNoise[2])
2525 {
2526 for(y=0; y<8; y++)
2527 {
2528 int x;
2529 for(x=0; x<8; x++)
2530 {
2531 int ref= tempBlured[ x + y*stride ];
2532 int cur= src[ x + y*stride ];
2533 tempBlured[ x + y*stride ]=
2534 src[ x + y*stride ]=
2535 (ref + cur + 1)>>1;
2536 }
2537 }
2538 }
2539 else
2540 {
2541 for(y=0; y<8; y++)
2542 {
2543 int x;
2544 for(x=0; x<8; x++)
2545 {
2546 tempBlured[ x + y*stride ]= src[ x + y*stride ];
2547 }
2548 }
2549 }
2550 }
2551 else
2552 {
2553 if(d < maxNoise[0])
2554 {
2555 for(y=0; y<8; y++)
2556 {
2557 int x;
2558 for(x=0; x<8; x++)
2559 {
2560 int ref= tempBlured[ x + y*stride ];
2561 int cur= src[ x + y*stride ];
2562 tempBlured[ x + y*stride ]=
2563 src[ x + y*stride ]=
2564 (ref*7 + cur + 4)>>3;
2565 }
2566 }
2567 }
2568 else
2569 {
2570 for(y=0; y<8; y++)
2571 {
2572 int x;
2573 for(x=0; x<8; x++)
2574 {
2575 int ref= tempBlured[ x + y*stride ];
2576 int cur= src[ x + y*stride ];
2577 tempBlured[ x + y*stride ]=
2578 src[ x + y*stride ]=
2579 (ref*3 + cur + 2)>>2;
2580 }
2581 }
2582 }
2583 }
be44a4d7 2584#endif
117e45b0
MN
2585}
2586
9a722af7
A
2587#ifdef HAVE_ODIVX_POSTPROCESS
2588#include "../opendivx/postprocess.h"
2589int use_old_pp=0;
2590#endif
13e00528 2591
9a722af7 2592static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 2593 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
13e00528 2594
911879d1
MN
/* -pp Command line Help
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?

-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...

long form example:
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint    -pp default,-vdeblock
short form example:
-pp vb:a,hb:a,lb                                   -pp de,-vb
more examples:
-pp tn:64:128:256

Filters                 Options
short  long name        short  long option     Description
*      *                a      autoq           cpu power dependent enabler
                        c      chrom           chrominance filtering enabled
                        y      nochrom         chrominance filtering disabled
hb     hdeblock                                horizontal deblocking filter
vb     vdeblock                                vertical deblocking filter
vr     rkvdeblock
h1     x1hdeblock                              Experimental horizontal deblock filter 1
v1     x1vdeblock                              Experimental vertical deblock filter 1
dr     dering                                  not implemented yet
al     autolevels                              automatic brightness / contrast fixer
                        f      fullyrange      stretch luminance range to (0..255)
lb     linblenddeint                           linear blend deinterlacer
li     linipoldeint                            linear interpolating deinterlacer
ci     cubicipoldeint                          cubic interpolating deinterlacer
md     mediandeint                             median deinterlacer
de     default                                 hdeblock:a,vdeblock:a,dering:a,autolevels
fa     fast                                    x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
tn     tmpnoise         (3 Thresholds)         Temporal Noise Reducer
*/
2628
2629/**
2630 * returns a PPMode struct which will have a non 0 error variable if an error occured
2631 * name is the string after "-pp" on the command line
2632 * quality is a number from 0 to GET_PP_QUALITY_MAX
2633 */
2634struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2635{
2636 char temp[GET_MODE_BUFFER_SIZE];
2637 char *p= temp;
2638 char *filterDelimiters= ",";
2639 char *optionDelimiters= ":";
117e45b0 2640 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
911879d1
MN
2641 char *filterToken;
2642
2643 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2644
117e45b0
MN
2645 printf("%s\n", name);
2646
911879d1 2647 for(;;){
911879d1 2648 char *filterName;
117e45b0 2649 int q= 1000000; //GET_PP_QUALITY_MAX;
911879d1
MN
2650 int chrom=-1;
2651 char *option;
2652 char *options[OPTIONS_ARRAY_SIZE];
2653 int i;
2654 int filterNameOk=0;
2655 int numOfUnknownOptions=0;
2656 int enable=1; //does the user want us to enabled or disabled the filter
2657
2658 filterToken= strtok(p, filterDelimiters);
2659 if(filterToken == NULL) break;
117e45b0 2660 p+= strlen(filterToken) + 1; // p points to next filterToken
911879d1
MN
2661 filterName= strtok(filterToken, optionDelimiters);
2662 printf("%s::%s\n", filterToken, filterName);
2663
2664 if(*filterName == '-')
2665 {
2666 enable=0;
2667 filterName++;
2668 }
117e45b0 2669
911879d1
MN
2670 for(;;){ //for all options
2671 option= strtok(NULL, optionDelimiters);
2672 if(option == NULL) break;
2673
2674 printf("%s\n", option);
2675 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2676 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2677 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2678 else
2679 {
2680 options[numOfUnknownOptions] = option;
2681 numOfUnknownOptions++;
911879d1
MN
2682 }
2683 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2684 }
117e45b0 2685 options[numOfUnknownOptions] = NULL;
911879d1
MN
2686
2687 /* replace stuff from the replace Table */
2688 for(i=0; replaceTable[2*i]!=NULL; i++)
2689 {
2690 if(!strcmp(replaceTable[2*i], filterName))
2691 {
2692 int newlen= strlen(replaceTable[2*i + 1]);
2693 int plen;
2694 int spaceLeft;
2695
2696 if(p==NULL) p= temp, *p=0; //last filter
2697 else p--, *p=','; //not last filter
2698
2699 plen= strlen(p);
2700 spaceLeft= (int)p - (int)temp + plen;
2701 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
2702 {
2703 ppMode.error++;
2704 break;
2705 }
2706 memmove(p + newlen, p, plen+1);
2707 memcpy(p, replaceTable[2*i + 1], newlen);
2708 filterNameOk=1;
2709 }
2710 }
2711
2712 for(i=0; filters[i].shortName!=NULL; i++)
2713 {
117e45b0 2714// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
911879d1
MN
2715 if( !strcmp(filters[i].longName, filterName)
2716 || !strcmp(filters[i].shortName, filterName))
2717 {
2718 ppMode.lumMode &= ~filters[i].mask;
2719 ppMode.chromMode &= ~filters[i].mask;
2720
2721 filterNameOk=1;
2722 if(!enable) break; // user wants to disable it
2723
2724 if(q >= filters[i].minLumQuality)
2725 ppMode.lumMode|= filters[i].mask;
2726 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2727 if(q >= filters[i].minChromQuality)
2728 ppMode.chromMode|= filters[i].mask;
2729
2730 if(filters[i].mask == LEVEL_FIX)
2731 {
2732 int o;
2733 ppMode.minAllowedY= 16;
2734 ppMode.maxAllowedY= 234;
2735 for(o=0; options[o]!=NULL; o++)
2736 if( !strcmp(options[o],"fullyrange")
2737 ||!strcmp(options[o],"f"))
2738 {
2739 ppMode.minAllowedY= 0;
2740 ppMode.maxAllowedY= 255;
2741 numOfUnknownOptions--;
2742 }
2743 }
117e45b0
MN
2744 else if(filters[i].mask == TEMP_NOISE_FILTER)
2745 {
2746 int o;
2747 int numOfNoises=0;
2748 ppMode.maxTmpNoise[0]= 150;
2749 ppMode.maxTmpNoise[1]= 200;
2750 ppMode.maxTmpNoise[2]= 400;
2751
2752 for(o=0; options[o]!=NULL; o++)
2753 {
2754 char *tail;
2755 ppMode.maxTmpNoise[numOfNoises]=
2756 strtol(options[o], &tail, 0);
2757 if(tail!=options[o])
2758 {
2759 numOfNoises++;
2760 numOfUnknownOptions--;
2761 if(numOfNoises >= 3) break;
2762 }
2763 }
2764 }
911879d1
MN
2765 }
2766 }
2767 if(!filterNameOk) ppMode.error++;
2768 ppMode.error += numOfUnknownOptions;
2769 }
2770
815cbfe7 2771#ifdef HAVE_ODIVX_POSTPROCESS
911879d1
MN
2772 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2773 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2774 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2775 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2776 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2777 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
815cbfe7 2778#endif
911879d1
MN
2779
2780 return ppMode;
2781}
2782
3057fa66 2783/**
117e45b0 2784 * Obsolete, dont use it, use postprocess2() instead
3057fa66 2785 */
3057fa66
A
2786void postprocess(unsigned char * src[], int src_stride,
2787 unsigned char * dst[], int dst_stride,
2788 int horizontal_size, int vertical_size,
2789 QP_STORE_T *QP_store, int QP_stride,
2790 int mode)
2791{
117e45b0
MN
2792 struct PPMode ppMode;
2793 static QP_STORE_T zeroArray[2048/8];
911879d1
MN
2794/*
2795 static int qual=0;
2796
117e45b0
MN
2797 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
2798 printf("OK\n");
911879d1
MN
2799 qual++;
2800 qual%=7;
117e45b0
MN
2801 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
2802 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
911879d1
MN
2803 postprocess2(src, src_stride, dst, dst_stride,
2804 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2805
2806 return;
2807*/
815cbfe7
MN
2808 if(QP_store==NULL)
2809 {
2810 QP_store= zeroArray;
2811 QP_stride= 0;
2812 }
13e00528 2813
117e45b0
MN
2814 ppMode.lumMode= mode;
2815 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2816 ppMode.chromMode= mode;
be44a4d7
MN
2817 ppMode.maxTmpNoise[0]= 700;
2818 ppMode.maxTmpNoise[1]= 1500;
2819 ppMode.maxTmpNoise[2]= 3000;
117e45b0 2820
9a722af7
A
2821#ifdef HAVE_ODIVX_POSTPROCESS
2822// Note: I could make this shit outside of this file, but it would mean one
2823// more function call...
2824 if(use_old_pp){
2825 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2826 return;
2827 }
2828#endif
2829
13e00528 2830 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 2831 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
3057fa66
A
2832
2833 horizontal_size >>= 1;
2834 vertical_size >>= 1;
2835 src_stride >>= 1;
2836 dst_stride >>= 1;
2837
2838 if(1)
2839 {
13e00528 2840 postProcess(src[1], src_stride, dst[1], dst_stride,
117e45b0 2841 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
13e00528 2842 postProcess(src[2], src_stride, dst[2], dst_stride,
117e45b0 2843 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
3057fa66
A
2844 }
2845 else
2846 {
117e45b0
MN
2847 memset(dst[1], 128, dst_stride*vertical_size);
2848 memset(dst[2], 128, dst_stride*vertical_size);
2849// memcpy(dst[1], src[1], src_stride*horizontal_size);
2850// memcpy(dst[2], src[2], src_stride*horizontal_size);
3057fa66
A
2851 }
2852}
9a722af7 2853
911879d1
MN
2854void postprocess2(unsigned char * src[], int src_stride,
2855 unsigned char * dst[], int dst_stride,
2856 int horizontal_size, int vertical_size,
2857 QP_STORE_T *QP_store, int QP_stride,
2858 struct PPMode *mode)
2859{
2860
815cbfe7
MN
2861 static QP_STORE_T zeroArray[2048/8];
2862 if(QP_store==NULL)
2863 {
2864 QP_store= zeroArray;
2865 QP_stride= 0;
2866 }
2867
911879d1
MN
2868#ifdef HAVE_ODIVX_POSTPROCESS
2869// Note: I could make this shit outside of this file, but it would mean one
2870// more function call...
2871 if(use_old_pp){
2872 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
2873 mode->oldMode);
2874 return;
2875 }
2876#endif
2877
2878 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 2879 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
911879d1
MN
2880
2881 horizontal_size >>= 1;
2882 vertical_size >>= 1;
2883 src_stride >>= 1;
2884 dst_stride >>= 1;
2885
2886 postProcess(src[1], src_stride, dst[1], dst_stride,
117e45b0 2887 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
911879d1 2888 postProcess(src[2], src_stride, dst[2], dst_stride,
117e45b0 2889 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
911879d1
MN
2890}
2891
2892
13e00528
A
2893/**
2894 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
9a722af7 2895 * 0 <= quality <= 6
13e00528 2896 */
9a722af7
A
2897int getPpModeForQuality(int quality){
2898 int modes[1+GET_PP_QUALITY_MAX]= {
2899 0,
2900#if 1
2901 // horizontal filters first
2902 LUM_H_DEBLOCK,
2903 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2904 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2905 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2906 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2907 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2908#else
2909 // vertical filters first
13e00528
A
2910 LUM_V_DEBLOCK,
2911 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2912 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2913 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2914 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2915 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
9a722af7
A
2916#endif
2917 };
2918
2919#ifdef HAVE_ODIVX_POSTPROCESS
2920 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2921 0,
2922 PP_DEBLOCK_Y_H,
2923 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2924 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2925 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2926 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2927 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2928 };
2929 if(use_old_pp) return odivx_modes[quality];
2930#endif
2931 return modes[quality];
3057fa66
A
2932}
2933
2934/**
2935 * Copies a block from src to dst and fixes the blacklevel
d5a1a995
MN
2936 * numLines must be a multiple of 4
2937 * levelFix == 0 -> dont touch the brighness & contrast
3057fa66 2938 */
d5a1a995 2939static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
c09dc465 2940 int levelFix)
3057fa66 2941{
e5c30e06 2942#ifndef HAVE_MMX
d5a1a995 2943 int i;
e5c30e06 2944#endif
d5a1a995
MN
2945 if(levelFix)
2946 {
3057fa66
A
2947#ifdef HAVE_MMX
2948 asm volatile(
3057fa66
A
2949 "leal (%2,%2), %%eax \n\t"
2950 "leal (%3,%3), %%ebx \n\t"
2951 "movq packedYOffset, %%mm2 \n\t"
2952 "movq packedYScale, %%mm3 \n\t"
5b65f0df 2953 "pxor %%mm4, %%mm4 \n\t"
3057fa66 2954
3057fa66
A
2955#define SCALED_CPY \
2956 "movq (%0), %%mm0 \n\t"\
534a602d 2957 "movq (%0), %%mm5 \n\t"\
5b65f0df
MN
2958 "punpcklbw %%mm4, %%mm0 \n\t"\
2959 "punpckhbw %%mm4, %%mm5 \n\t"\
57d04d3f
MN
2960 "psubw %%mm2, %%mm0 \n\t"\
2961 "psubw %%mm2, %%mm5 \n\t"\
534a602d 2962 "movq (%0,%2), %%mm1 \n\t"\
57d04d3f
MN
2963 "psllw $6, %%mm0 \n\t"\
2964 "psllw $6, %%mm5 \n\t"\
5b65f0df 2965 "pmulhw %%mm3, %%mm0 \n\t"\
534a602d 2966 "movq (%0,%2), %%mm6 \n\t"\
5b65f0df 2967 "pmulhw %%mm3, %%mm5 \n\t"\
5b65f0df 2968 "punpcklbw %%mm4, %%mm1 \n\t"\
534a602d 2969 "punpckhbw %%mm4, %%mm6 \n\t"\
57d04d3f 2970 "psubw %%mm2, %%mm1 \n\t"\
534a602d 2971 "psubw %%mm2, %%mm6 \n\t"\
57d04d3f 2972 "psllw $6, %%mm1 \n\t"\
534a602d 2973 "psllw $6, %%mm6 \n\t"\
5b65f0df 2974 "pmulhw %%mm3, %%mm1 \n\t"\
534a602d
MN
2975 "pmulhw %%mm3, %%mm6 \n\t"\
2976 "addl %%eax, %0 \n\t"\
2977 "packuswb %%mm5, %%mm0 \n\t"\
2978 "packuswb %%mm6, %%mm1 \n\t"\
2979 "movq %%mm0, (%1) \n\t"\
5b65f0df 2980 "movq %%mm1, (%1, %3) \n\t"\
3057fa66 2981
d5a1a995 2982SCALED_CPY
3057fa66 2983 "addl %%ebx, %1 \n\t"
d5a1a995 2984SCALED_CPY
3057fa66 2985 "addl %%ebx, %1 \n\t"
534a602d
MN
2986SCALED_CPY
2987 "addl %%ebx, %1 \n\t"
2988SCALED_CPY
d5a1a995 2989
37da00fc
MN
2990 : "+r"(src),
2991 "+r"(dst)
2992 :"r" (srcStride),
534a602d 2993 "r" (dstStride)
d5a1a995
MN
2994 : "%eax", "%ebx"
2995 );
2996#else
c09dc465 2997 for(i=0; i<8; i++)
d5a1a995
MN
2998 memcpy( &(dst[dstStride*i]),
2999 &(src[srcStride*i]), BLOCK_SIZE);
3000#endif
3001 }
3002 else
3003 {
3004#ifdef HAVE_MMX
3005 asm volatile(
d5a1a995
MN
3006 "pushl %0 \n\t"
3007 "pushl %1 \n\t"
3008 "leal (%2,%2), %%eax \n\t"
3009 "leal (%3,%3), %%ebx \n\t"
d5a1a995
MN
3010
3011#define SIMPLE_CPY \
3012 "movq (%0), %%mm0 \n\t"\
3013 "movq (%0,%2), %%mm1 \n\t"\
3014 "movq %%mm0, (%1) \n\t"\
3015 "movq %%mm1, (%1, %3) \n\t"\
3016
d5a1a995
MN
3017SIMPLE_CPY
3018 "addl %%eax, %0 \n\t"
3019 "addl %%ebx, %1 \n\t"
3020SIMPLE_CPY
3057fa66
A
3021 "addl %%eax, %0 \n\t"
3022 "addl %%ebx, %1 \n\t"
c09dc465
MN
3023SIMPLE_CPY
3024 "addl %%eax, %0 \n\t"
3025 "addl %%ebx, %1 \n\t"
3026SIMPLE_CPY
d5a1a995 3027
3057fa66
A
3028 "popl %1 \n\t"
3029 "popl %0 \n\t"
3030 : : "r" (src),
3031 "r" (dst),
3032 "r" (srcStride),
c09dc465 3033 "r" (dstStride)
3057fa66
A
3034 : "%eax", "%ebx"
3035 );
3036#else
c09dc465 3037 for(i=0; i<8; i++)
3057fa66
A
3038 memcpy( &(dst[dstStride*i]),
3039 &(src[srcStride*i]), BLOCK_SIZE);
3040#endif
d5a1a995 3041 }
3057fa66
A
3042}
3043
3044
3045/**
3046 * Filters array of bytes (Y or U or V values)
3047 */
9a722af7 3048static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 3049 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3057fa66 3050{
d5a1a995 3051 int x,y;
117e45b0
MN
3052 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3053
d5a1a995
MN
3054