use fastmemcpy for chrominance if no chrominance filtering is done
[libav.git] / postproc / postprocess.c
CommitLineData
3057fa66
A
1/*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
7f16f6e6 24doVertDefFilter Ec Ec e e
3057fa66 25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
7f16f6e6 28doHorizDefFilter Ec Ec e e
2e212618 29deRing E e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
117e45b0
MN
32Vertical X1# a E E
33Horizontal X1# a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
117e45b0 37MedianDeinterlace# Ec Ec
be44a4d7 38TempDeNoiser# E e e
d5a1a995 39
117e45b0
MN
* I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
46*/
47
48/*
49TODO:
3057fa66 50reduce the time wasted on the mem transfer
13e00528 51implement everything in C at least (done at the moment but ...)
3057fa66
A
52unroll stuff if instructions depend too much on the prior one
53we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54move YScale thing to the end instead of fixing QP
13e00528 55write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once)
	(the if/else stuff per block is slowing things down)
9f45d04d 58compare the quality & speed of all filters
9f45d04d 59split this huge file
e5c30e06 60border remover
8405b3fd 61optimize c versions
117e45b0 62try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
be44a4d7 63smart blur
cd38e322 64commandline option for the deblock / dering thresholds
3057fa66 65...
13e00528
A
66*/
67
a6be8111 68//Changelog: use the CVS log
3057fa66 69
6c426cff 70#include "../config.h"
3057fa66
A
71#include <inttypes.h>
72#include <stdio.h>
d5a1a995 73#include <stdlib.h>
911879d1 74#include <string.h>
dda87e9f
PL
75#ifdef HAVE_MALLOC_H
76#include <malloc.h>
77#endif
3057fa66 78//#undef HAVE_MMX2
13e00528 79//#define HAVE_3DNOW
3057fa66 80//#undef HAVE_MMX
7f16f6e6 81//#define DEBUG_BRIGHTNESS
4e1349d4 82#include "../libvo/fastmemcpy.h"
13e00528 83#include "postprocess.h"
3057fa66 84
/*
 * Small arithmetic helpers.
 * The arguments are expanded more than once, so never pass an expression
 * with side effects. Note that SIGN(0) evaluates to -1.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)

/*
 * PAVGB(a,b): emits the byte-wise average instruction of the available
 * instruction set (AT&T operand order: a is the source, b the destination).
 * Not defined when neither HAVE_MMX2 nor HAVE_3DNOW is set.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(a,b,t): b = unsigned byte-wise minimum of a and b.
 * t is a scratch register; it is only touched by the plain MMX fallback,
 * which computes b - MAX(b - a, 0) with saturating subtraction.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(a,b,t) \
	"movq " #b ", " #t " \n\t"\
	"psubusb " #a ", " #t " \n\t"\
	"psubb " #t ", " #b " \n\t"
#endif

/*
 * PMAXUB(a,b): b = unsigned byte-wise maximum of a and b.
 * The plain MMX fallback computes MAX(b - a, 0) + a.
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
112
113
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10

#ifdef HAVE_MMX
/*
 * 64-bit constants and scratch variables which the inline asm refers to by
 * symbol name.
 * Naming scheme:
 *  wXX        every 16-bit word holds 0xXX
 *  bXX        every byte holds 0xXX
 *  bmABCDEFGH byte mask, each '1' in the name is a 0xFF byte (leftmost digit =
 *             most significant byte)
 */
static volatile uint64_t __attribute__((aligned(8))) packedYOffset=	0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale=	0x0100010001000100LL;
static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400=		0x1400140014001400LL;
static uint64_t __attribute__((aligned(8))) bm00000001=	0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000=	0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000=	0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000=	0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001=	0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011=	0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011=	0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110=	0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000=	0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000=	0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011=	0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100=	0xFFFF0000FFFF0000LL;
static uint64_t __attribute__((aligned(8))) b00=		0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01=		0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02=		0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F=		0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04=		0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08=		0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF=		0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20=		0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80=		0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E=		0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C=		0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F=		0x3F3F3F3F3F3F3F3FLL;
// scratch space for the asm code
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
// the asm code reads pQPb as "QP in every byte"; it is filled in outside this section
static uint64_t __attribute__((aligned(8))) pQPb=0;
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
static uint64_t packedYOffset=	0x0000000000000000LL;
static uint64_t packedYScale=	0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif

// isHorizDC / isVertDC call a block flat if more than this many neighbour pairs are (nearly) equal
int hFlatnessThreshold=	56 - 16;
int vFlatnessThreshold=	56 - 16;
int deringThreshold=	20;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;
3057fa66 172
911879d1
MN
173static struct PPFilter filters[]=
174{
175 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
176 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
177 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
178 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
179 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
180 {"dr", "dering", 1, 5, 6, DERING},
181 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
182 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
183 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
184 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
185 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
117e45b0 186 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
911879d1
MN
187 {NULL, NULL,0,0,0,0} //End Marker
188};
189
/*
 * Pairs of strings: a preset name followed by the filter list it stands for.
 * Every preset is listed twice, once with its long and once with its short
 * name. A single NULL terminates the table.
 */
static char *replaceTable[]=
{
	"default",	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"de",		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fast",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fa",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	NULL //End Marker
};
198
b28daef8 199#ifdef HAVE_MMX
e5c30e06
MN
200static inline void unusedVariableWarningFixer()
201{
202if(
203 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
204 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
205 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
b28daef8 206 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
e5c30e06
MN
207 + temp5 + pQPb== 0) b00=0;
208}
b28daef8 209#endif
e5c30e06 210
a6be8111 211#ifdef TIMING
3057fa66
A
212static inline long long rdtsc()
213{
214 long long l;
215 asm volatile( "rdtsc\n\t"
216 : "=A" (l)
217 );
218// printf("%d\n", int(l/1000));
219 return l;
220}
9a722af7 221#endif
3057fa66 222
9a722af7 223#ifdef HAVE_MMX2
3057fa66
A
224static inline void prefetchnta(void *p)
225{
226 asm volatile( "prefetchnta (%0)\n\t"
227 : : "r" (p)
228 );
229}
230
231static inline void prefetcht0(void *p)
232{
233 asm volatile( "prefetcht0 (%0)\n\t"
234 : : "r" (p)
235 );
236}
237
238static inline void prefetcht1(void *p)
239{
240 asm volatile( "prefetcht1 (%0)\n\t"
241 : : "r" (p)
242 );
243}
244
245static inline void prefetcht2(void *p)
246{
247 asm volatile( "prefetcht2 (%0)\n\t"
248 : : "r" (p)
249 );
250}
9a722af7 251#endif
3057fa66
A
252
253//FIXME? |255-0| = 1 (shouldnt be a problem ...)
254/**
acced553 255 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 256 */
d5a1a995 257static inline int isVertDC(uint8_t src[], int stride){
3057fa66 258 int numEq= 0;
e5c30e06 259#ifndef HAVE_MMX
d5a1a995 260 int y;
e5c30e06 261#endif
acced553 262 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 263#ifdef HAVE_MMX
37da00fc
MN
264asm volatile(
265 "leal (%1, %2), %%eax \n\t"
266 "leal (%%eax, %2, 4), %%ebx \n\t"
267// 0 1 2 3 4 5 6 7 8 9
268// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
269 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
270 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
271 "movq (%1), %%mm0 \n\t"
37da00fc 272 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
273 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
274 "paddb %%mm7, %%mm0 \n\t"
275 "pcmpgtb %%mm6, %%mm0 \n\t"
276
37da00fc 277 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
278 "psubb %%mm2, %%mm1 \n\t"
279 "paddb %%mm7, %%mm1 \n\t"
280 "pcmpgtb %%mm6, %%mm1 \n\t"
281 "paddb %%mm1, %%mm0 \n\t"
282
37da00fc 283 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
284 "psubb %%mm1, %%mm2 \n\t"
285 "paddb %%mm7, %%mm2 \n\t"
286 "pcmpgtb %%mm6, %%mm2 \n\t"
287 "paddb %%mm2, %%mm0 \n\t"
288
37da00fc 289 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
290 "psubb %%mm2, %%mm1 \n\t"
291 "paddb %%mm7, %%mm1 \n\t"
292 "pcmpgtb %%mm6, %%mm1 \n\t"
293 "paddb %%mm1, %%mm0 \n\t"
294
37da00fc 295 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
296 "psubb %%mm1, %%mm2 \n\t"
297 "paddb %%mm7, %%mm2 \n\t"
298 "pcmpgtb %%mm6, %%mm2 \n\t"
299 "paddb %%mm2, %%mm0 \n\t"
300
37da00fc 301 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
302 "psubb %%mm2, %%mm1 \n\t"
303 "paddb %%mm7, %%mm1 \n\t"
304 "pcmpgtb %%mm6, %%mm1 \n\t"
305 "paddb %%mm1, %%mm0 \n\t"
306
37da00fc 307 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
308 "psubb %%mm1, %%mm2 \n\t"
309 "paddb %%mm7, %%mm2 \n\t"
310 "pcmpgtb %%mm6, %%mm2 \n\t"
311 "paddb %%mm2, %%mm0 \n\t"
312
313 " \n\t"
cd38e322
MN
314#ifdef HAVE_MMX2
315 "pxor %%mm7, %%mm7 \n\t"
316 "psadbw %%mm7, %%mm0 \n\t"
317#else
3057fa66
A
318 "movq %%mm0, %%mm1 \n\t"
319 "psrlw $8, %%mm0 \n\t"
320 "paddb %%mm1, %%mm0 \n\t"
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
326 "paddb %%mm1, %%mm0 \n\t"
cd38e322 327#endif
3057fa66
A
328 "movd %%mm0, %0 \n\t"
329 : "=r" (numEq)
330 : "r" (src), "r" (stride)
cd38e322 331 : "%ebx"
3057fa66 332 );
cd38e322 333 numEq= (-numEq) &0xFF;
3057fa66
A
334
335#else
d5a1a995 336 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
337 {
338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
346 src+= stride;
347 }
348#endif
349/* if(abs(numEq - asmEq) > 0)
350 {
351 printf("\nasm:%d c:%d\n", asmEq, numEq);
352 for(int y=0; y<8; y++)
353 {
354 for(int x=0; x<8; x++)
355 {
356 printf("%d ", temp[x + y*stride]);
357 }
358 printf("\n");
359 }
360 }
361*/
d5a1a995
MN
362// for(int i=0; i<numEq/8; i++) src[i]=255;
363 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
364}
365
d5a1a995 366static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
367{
368#ifdef HAVE_MMX
369 int isOk;
acced553 370 src+= stride*3;
3057fa66
A
371 asm volatile(
372// "int $3 \n\t"
373 "movq (%1, %2), %%mm0 \n\t"
374 "movq (%1, %2, 8), %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "psubusb %%mm1, %%mm0 \n\t"
377 "psubusb %%mm2, %%mm1 \n\t"
378 "por %%mm1, %%mm0 \n\t" // ABS Diff
379
380 "movq pQPb, %%mm7 \n\t" // QP,..., QP
381 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
382 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
383 "pcmpeqd b00, %%mm0 \n\t"
384 "psrlq $16, %%mm0 \n\t"
385 "pcmpeqd bFF, %%mm0 \n\t"
386// "movd %%mm0, (%1, %2, 4)\n\t"
387 "movd %%mm0, %0 \n\t"
388 : "=r" (isOk)
389 : "r" (src), "r" (stride)
390 );
ac0b0b2f 391 return isOk;
3057fa66
A
392#else
393
d5a1a995
MN
394 int isOk2= 1;
395 int x;
acced553 396 src+= stride*3;
d5a1a995 397 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 398 {
d5a1a995 399 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
400 }
401/* if(isOk && !isOk2 || !isOk && isOk2)
402 {
403 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
404 for(int y=0; y<9; y++)
405 {
406 for(int x=0; x<8; x++)
407 {
408 printf("%d ", src[x + y*stride]);
409 }
410 printf("\n");
411 }
412 } */
413
414 return isOk2;
415#endif
416
417}
418
419/**
acced553 420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
422 */
423static inline void doVertLowPass(uint8_t *src, int stride, int QP)
424{
13e00528 425#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 426 src+= stride*3;
3057fa66 427 asm volatile( //"movv %0 %1 %2\n\t"
3057fa66 428 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
429
430 "movq (%0), %%mm6 \n\t"
431 "movq (%0, %1), %%mm5 \n\t"
432 "movq %%mm5, %%mm1 \n\t"
433 "movq %%mm6, %%mm2 \n\t"
434 "psubusb %%mm6, %%mm5 \n\t"
435 "psubusb %%mm1, %%mm2 \n\t"
436 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
437 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
438 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
439
440 "pand %%mm2, %%mm6 \n\t"
441 "pandn %%mm1, %%mm2 \n\t"
442 "por %%mm2, %%mm6 \n\t"// First Line to Filter
443
444 "movq (%0, %1, 8), %%mm5 \n\t"
445 "leal (%0, %1, 4), %%eax \n\t"
446 "leal (%0, %1, 8), %%ebx \n\t"
447 "subl %1, %%ebx \n\t"
448 "addl %1, %0 \n\t" // %0 points to line 1 not 0
449 "movq (%0, %1, 8), %%mm7 \n\t"
450 "movq %%mm5, %%mm1 \n\t"
451 "movq %%mm7, %%mm2 \n\t"
452 "psubusb %%mm7, %%mm5 \n\t"
453 "psubusb %%mm1, %%mm2 \n\t"
454 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
455 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
456 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
457
458 "pand %%mm2, %%mm7 \n\t"
459 "pandn %%mm1, %%mm2 \n\t"
460 "por %%mm2, %%mm7 \n\t" // First Line to Filter
461
462
463 // 1 2 3 4 5 6 7 8
464 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
465 // 6 4 2 2 1 1
466 // 6 4 4 2
467 // 6 8 2
acced553 468
3057fa66
A
469 "movq (%0, %1), %%mm0 \n\t" // 1
470 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
471 PAVGB(%%mm6, %%mm0) //1 1 /2
472 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
473
474 "movq (%0, %1, 4), %%mm2 \n\t" // 1
475 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
476 PAVGB((%%eax), %%mm2) // 11 /2
477 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
478 "movq %%mm2, %%mm3 \n\t" // 211 /4
479 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
480 PAVGB(%%mm4, %%mm3) // 4 211 /8
481 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
482 "movq %%mm3, (%0) \n\t" // X
483 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 485 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 486 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
487 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
488 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
489 PAVGB((%%eax), %%mm5) // 211 /4
490 PAVGB(%%mm5, %%mm3) // 2 2211 /8
491 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
492 "movq %%mm3, (%0,%1) \n\t" // X
493 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 494 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 495 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 496 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 497 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
498 PAVGB(%%mm1, %%mm0) // 2 11/4
499 PAVGB(%%mm6, %%mm0) //222 11/8
500 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
501 "movq (%0, %1, 2), %%mm2 \n\t" // 1
502 "movq %%mm0, (%0, %1, 2) \n\t" // X
503 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
505 PAVGB((%%ebx), %%mm0) // 11 /2
506 PAVGB(%%mm0, %%mm6) //11 11 /4
507 PAVGB(%%mm1, %%mm4) // 11 /2
508 PAVGB(%%mm2, %%mm1) // 11 /2
509 PAVGB(%%mm1, %%mm6) //1122 11 /8
510 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
511 "movq (%%eax), %%mm5 \n\t" // 1
512 "movq %%mm6, (%%eax) \n\t" // X
513 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
515 PAVGB(%%mm7, %%mm6) // 11 /2
516 PAVGB(%%mm4, %%mm6) // 11 11 /4
517 PAVGB(%%mm3, %%mm6) // 11 2211 /8
518 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 519 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
520 PAVGB(%%mm4, %%mm2) // 112 /4
521 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
522 "movq %%mm6, (%0, %1, 4) \n\t" // X
523 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
524 PAVGB(%%mm7, %%mm1) // 11 2 /4
525 PAVGB(%%mm4, %%mm5) // 11 /2
526 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 527 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
528 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
529 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
530 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
531 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 532 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 533 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
534 PAVGB(%%mm0, %%mm6) // 1 1 /2
535 PAVGB(%%mm7, %%mm6) // 1 12 /4
536 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
537 "movq %%mm6, (%%ebx) \n\t" // X
538 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
539 PAVGB(%%mm7, %%mm5) // 11 2 /4
540 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 541
13e00528
A
542 PAVGB(%%mm3, %%mm0) // 112 /4
543 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 544 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 545 "subl %1, %0 \n\t"
3057fa66
A
546
547 :
548 : "r" (src), "r" (stride)
549 : "%eax", "%ebx"
550 );
3057fa66
A
551#else
552 const int l1= stride;
553 const int l2= stride + l1;
554 const int l3= stride + l2;
555 const int l4= stride + l3;
556 const int l5= stride + l4;
557 const int l6= stride + l5;
558 const int l7= stride + l6;
559 const int l8= stride + l7;
560 const int l9= stride + l8;
d5a1a995 561 int x;
acced553 562 src+= stride*3;
d5a1a995 563 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
564 {
565 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
567
568 int sums[9];
569 sums[0] = first + src[l1];
570 sums[1] = src[l1] + src[l2];
571 sums[2] = src[l2] + src[l3];
572 sums[3] = src[l3] + src[l4];
573 sums[4] = src[l4] + src[l5];
574 sums[5] = src[l5] + src[l6];
575 sums[6] = src[l6] + src[l7];
576 sums[7] = src[l7] + src[l8];
577 sums[8] = src[l8] + last;
578
579 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
580 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
587
588 src++;
589 }
590
591#endif
592}
593
13e00528
A
594/**
595 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
596 * values are correctly clipped (MMX2)
597 * values are wraparound (C)
598 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
599 0 8 16 24
600 x = 8
601 x/2 = 4
602 x/8 = 1
603 1 12 12 23
604 */
9f45d04d 605static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 606{
d5a1a995 607#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 608 src+= stride*3;
13e00528
A
609// FIXME rounding
610 asm volatile(
611 "pxor %%mm7, %%mm7 \n\t" // 0
612 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
613 "leal (%0, %1), %%eax \n\t"
614 "leal (%%eax, %1, 4), %%ebx \n\t"
615// 0 1 2 3 4 5 6 7 8 9
616// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
617 "movq pQPb, %%mm0 \n\t" // QP,..., QP
618 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
619 "paddusb b02, %%mm0 \n\t"
620 "psrlw $2, %%mm0 \n\t"
621 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
622 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
623 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
624 "movq (%%ebx), %%mm3 \n\t" // line 5
625 "movq %%mm2, %%mm4 \n\t" // line 4
626 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
627 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 628 PAVGB(%%mm3, %%mm5)
13e00528
A
629 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
630 "psubusb %%mm3, %%mm4 \n\t"
631 "psubusb %%mm2, %%mm3 \n\t"
632 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
633 "psubusb %%mm0, %%mm4 \n\t"
634 "pcmpeqb %%mm7, %%mm4 \n\t"
635 "pand %%mm4, %%mm5 \n\t" // d/2
636
637// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
638 "paddb %%mm5, %%mm2 \n\t"
639// "psubb %%mm6, %%mm2 \n\t"
640 "movq %%mm2, (%0,%1, 4) \n\t"
641
642 "movq (%%ebx), %%mm2 \n\t"
643// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
644 "psubb %%mm5, %%mm2 \n\t"
645// "psubb %%mm6, %%mm2 \n\t"
646 "movq %%mm2, (%%ebx) \n\t"
647
648 "paddb %%mm6, %%mm5 \n\t"
649 "psrlw $2, %%mm5 \n\t"
650 "pand b3F, %%mm5 \n\t"
651 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
652
653 "movq (%%eax, %1, 2), %%mm2 \n\t"
654 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
655 "paddsb %%mm5, %%mm2 \n\t"
656 "psubb %%mm6, %%mm2 \n\t"
657 "movq %%mm2, (%%eax, %1, 2) \n\t"
658
659 "movq (%%ebx, %1), %%mm2 \n\t"
660 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
661 "psubsb %%mm5, %%mm2 \n\t"
662 "psubb %%mm6, %%mm2 \n\t"
663 "movq %%mm2, (%%ebx, %1) \n\t"
664
665 :
666 : "r" (src), "r" (stride)
667 : "%eax", "%ebx"
668 );
669#else
670 const int l1= stride;
671 const int l2= stride + l1;
672 const int l3= stride + l2;
673 const int l4= stride + l3;
674 const int l5= stride + l4;
675 const int l6= stride + l5;
e5c30e06
MN
676// const int l7= stride + l6;
677// const int l8= stride + l7;
678// const int l9= stride + l8;
d5a1a995 679 int x;
3407a972 680 const int QP15= QP + (QP>>2);
acced553 681 src+= stride*3;
d5a1a995 682 for(x=0; x<BLOCK_SIZE; x++)
13e00528 683 {
3407a972
MN
684 const int v = (src[x+l5] - src[x+l4]);
685 if(ABS(v) < QP15)
13e00528 686 {
3407a972
MN
687 src[x+l3] +=v>>3;
688 src[x+l4] +=v>>1;
689 src[x+l5] -=v>>1;
690 src[x+l6] -=v>>3;
13e00528 691
13e00528 692 }
13e00528
A
693 }
694
695#endif
696}
697
698/**
699 * Experimental Filter 1
9f45d04d
MN
700 * will not damage linear gradients
701 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
702 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
703 * MMX2 version does correct clipping C version doesnt
13e00528
A
704 */
705static inline void vertX1Filter(uint8_t *src, int stride, int QP)
706{
d5a1a995 707#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
708 src+= stride*3;
709
13e00528 710 asm volatile(
d5a1a995
MN
711 "pxor %%mm7, %%mm7 \n\t" // 0
712// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
713 "leal (%0, %1), %%eax \n\t"
714 "leal (%%eax, %1, 4), %%ebx \n\t"
715// 0 1 2 3 4 5 6 7 8 9
716// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
717 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
718 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
719 "movq %%mm1, %%mm2 \n\t" // line 4
720 "psubusb %%mm0, %%mm1 \n\t"
721 "psubusb %%mm2, %%mm0 \n\t"
722 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
723 "movq (%%ebx), %%mm3 \n\t" // line 5
724 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
725 "movq %%mm3, %%mm5 \n\t" // line 5
726 "psubusb %%mm4, %%mm3 \n\t"
727 "psubusb %%mm5, %%mm4 \n\t"
728 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
729 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
730 "movq %%mm2, %%mm1 \n\t" // line 4
731 "psubusb %%mm5, %%mm2 \n\t"
732 "movq %%mm2, %%mm4 \n\t"
733 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
734 "psubusb %%mm1, %%mm5 \n\t"
735 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
736 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
737 "movq %%mm4, %%mm3 \n\t" // d
738 "psubusb pQPb, %%mm4 \n\t"
739 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 740 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
741 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
742
743 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
744 "movq %%mm3, %%mm1 \n\t" // d/2
745 PAVGB(%%mm7, %%mm3) // d/4
746 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
747
748 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
749 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750 "psubusb %%mm3, %%mm0 \n\t"
751 "pxor %%mm2, %%mm0 \n\t"
752 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
753
754 "movq (%%ebx), %%mm0 \n\t" // line 5
755 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756 "paddusb %%mm3, %%mm0 \n\t"
757 "pxor %%mm2, %%mm0 \n\t"
758 "movq %%mm0, (%%ebx) \n\t" // line 5
759
9f45d04d 760 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
761
762 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
763 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 764 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
765 "pxor %%mm2, %%mm0 \n\t"
766 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
767
768 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
769 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 770 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
771 "pxor %%mm2, %%mm0 \n\t"
772 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
773
9f45d04d 774 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
775
776 "movq (%%eax, %1), %%mm0 \n\t" // line 2
777 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 778 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
779 "pxor %%mm2, %%mm0 \n\t"
780 "movq %%mm0, (%%eax, %1) \n\t" // line 2
781
782 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
783 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 784 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
785 "pxor %%mm2, %%mm0 \n\t"
786 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
787
788 :
789 : "r" (src), "r" (stride)
790 : "%eax", "%ebx"
791 );
792#else
d5a1a995
MN
793
794 const int l1= stride;
795 const int l2= stride + l1;
796 const int l3= stride + l2;
797 const int l4= stride + l3;
798 const int l5= stride + l4;
799 const int l6= stride + l5;
800 const int l7= stride + l6;
e5c30e06
MN
801// const int l8= stride + l7;
802// const int l9= stride + l8;
d5a1a995 803 int x;
acced553
MN
804
805 src+= stride*3;
d5a1a995
MN
806 for(x=0; x<BLOCK_SIZE; x++)
807 {
808 int a= src[l3] - src[l4];
809 int b= src[l4] - src[l5];
9f45d04d 810 int c= src[l5] - src[l6];
d5a1a995 811
3407a972
MN
812 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
813 d= MAX(d, 0);
d5a1a995
MN
814
815 if(d < QP)
816 {
817 int v = d * SIGN(-b);
818
3407a972
MN
819 src[l2] +=v>>3;
820 src[l3] +=v>>2;
821 src[l4] +=(3*v)>>3;
822 src[l5] -=(3*v)>>3;
823 src[l6] -=v>>2;
824 src[l7] -=v>>3;
d5a1a995
MN
825
826 }
827 src++;
828 }
829 /*
13e00528
A
830 const int l1= stride;
831 const int l2= stride + l1;
832 const int l3= stride + l2;
833 const int l4= stride + l3;
834 const int l5= stride + l4;
835 const int l6= stride + l5;
836 const int l7= stride + l6;
837 const int l8= stride + l7;
838 const int l9= stride + l8;
839 for(int x=0; x<BLOCK_SIZE; x++)
840 {
841 int v2= src[l2];
842 int v3= src[l3];
843 int v4= src[l4];
844 int v5= src[l5];
845 int v6= src[l6];
846 int v7= src[l7];
847
848 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
849 {
850 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
851 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
852 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
854 }
855 src++;
856 }
d5a1a995 857*/
13e00528
A
858#endif
859}
860
cf5ec61d
MN
861/**
862 * Experimental Filter 1 (Horizontal)
863 * will not damage linear gradients
864 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
866 * MMX2 version does correct clipping C version doesnt
867 * not identical with the vertical one
868 */
869static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870{
871 int y;
872//FIXME (has little in common with the mmx2 version)
873 for(y=0; y<BLOCK_SIZE; y++)
874 {
875 int a= src[1] - src[2];
876 int b= src[3] - src[4];
877 int c= src[5] - src[6];
878
879 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
880
881 if(d < QP)
882 {
883 int v = d * SIGN(-b);
884
885 src[1] +=v/8;
886 src[2] +=v/4;
887 src[3] +=3*v/8;
888 src[4] -=3*v/8;
889 src[5] -=v/4;
890 src[6] -=v/8;
891
892 }
893 src+=stride;
894 }
895}
896
897
3057fa66
A
/**
 * Default-mode vertical deblocking filter for one horizontal block edge.
 *
 * Every code path first advances src (by 4 rows in the asm paths, by 3 rows in
 * the C path) and then reads rows 4..11 and writes only rows 7 and 8, counted
 * from the src pointer that was passed in. The asm paths process 8 bytes per
 * row (one movq), the C path processes BLOCK_SIZE columns.
 *
 * For every column the filter computes
 *   middleEnergy = 2*r6 - 5*r7 + 5*r8 - 2*r9
 * and the same expression shifted 2 rows up (leftEnergy) and 2 rows down
 * (rightEnergy). If |middleEnergy| < 8*QP the two edge rows are moved towards
 * each other by
 *   d = (5*max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0) + 32) >> 6
 * limited so that it never exceeds half of the step between the two rows and
 * never acts against it (see the C path at the bottom for the exact rules).
 *
 * Implementation selected at compile time:
 *  - HAVE_MMX2 or HAVE_3DNOW: byte-wise approximation; takes the quantizer from
 *    the global pQPb, the QP argument is not an operand of that asm statement
 *  - HAVE_MMX: 16-bit arithmetic, QP is passed in as asm operand %2
 *  - otherwise: plain C
 *
 * NOTE(review): the asm statements write to the image but list no "memory"
 * clobber, and the MMX path spills into temp0..temp3, which are symbols
 * defined outside this block; if those are file-scope variables that path is
 * not reentrant. Confirm before calling this from more than one thread.
 *
 * @param src    top-left corner of the area described above
 * @param stride distance in bytes between two consecutive rows
 * @param QP     quantizer of the block
 */
898static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
899{
7f16f6e6
MN
900#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// MMX2 / 3DNow path. PAVGB, PMINUB and PMAXUB are macros defined elsewhere in
// this file (presumably they expand to the matching MMX2 / 3DNow opcodes).
901/*
902 uint8_t tmp[16];
903 const int l1= stride;
904 const int l2= stride + l1;
905 const int l3= stride + l2;
906 const int l4= (int)tmp - (int)src - stride*3;
907 const int l5= (int)tmp - (int)src - stride*3 + 8;
908 const int l6= stride*3 + l3;
909 const int l7= stride + l6;
910 const int l8= stride + l7;
911
912 memcpy(tmp, src+stride*7, 8);
913 memcpy(tmp+8, src+stride*8, 8);
914*/
915 src+= stride*4;
916 asm volatile(
917
// disabled alternative implementation, kept for reference only
918#if 0 //slightly more accurate and slightly slower
919 "pxor %%mm7, %%mm7 \n\t" // 0
920 "leal (%0, %1), %%eax \n\t"
921 "leal (%%eax, %1, 4), %%ebx \n\t"
922// 0 1 2 3 4 5 6 7
923// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
924// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
925
926
927 "movq (%0, %1, 2), %%mm0 \n\t" // l2
928 "movq (%0), %%mm1 \n\t" // l0
929 "movq %%mm0, %%mm2 \n\t" // l2
930 PAVGB(%%mm7, %%mm0) // ~l2/2
931 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
932 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
933
934 "movq (%%eax), %%mm1 \n\t" // l1
935 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
936 "movq %%mm1, %%mm4 \n\t" // l1
937 PAVGB(%%mm7, %%mm1) // ~l1/2
938 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
939 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
940
941 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
942 "psubusb %%mm1, %%mm0 \n\t"
943 "psubusb %%mm4, %%mm1 \n\t"
944 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
945// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
946
947 "movq (%0, %1, 4), %%mm0 \n\t" // l4
948 "movq %%mm0, %%mm4 \n\t" // l4
949 PAVGB(%%mm7, %%mm0) // ~l4/2
950 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
951 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
952
953 "movq (%%ebx), %%mm2 \n\t" // l5
954 "movq %%mm3, %%mm5 \n\t" // l3
955 PAVGB(%%mm7, %%mm3) // ~l3/2
956 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
957 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
958
959 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
960 "psubusb %%mm3, %%mm0 \n\t"
961 "psubusb %%mm6, %%mm3 \n\t"
962 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
963 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
964// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
965
966 "movq (%%ebx, %1), %%mm6 \n\t" // l6
967 "movq %%mm6, %%mm5 \n\t" // l6
968 PAVGB(%%mm7, %%mm6) // ~l6/2
969 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
970 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
971
972 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
973 "movq %%mm2, %%mm4 \n\t" // l5
974 PAVGB(%%mm7, %%mm2) // ~l5/2
975 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
976 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
977
978 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
979 "psubusb %%mm2, %%mm6 \n\t"
980 "psubusb %%mm4, %%mm2 \n\t"
981 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
982// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
983
984
985 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
986 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
987 "paddusb b01, %%mm4 \n\t"
988 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
989 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
990 "pand %%mm4, %%mm3 \n\t"
991
992 "movq %%mm3, %%mm1 \n\t"
993// "psubusb b01, %%mm3 \n\t"
994 PAVGB(%%mm7, %%mm3)
995 PAVGB(%%mm7, %%mm3)
996 "paddusb %%mm1, %%mm3 \n\t"
997// "paddusb b01, %%mm3 \n\t"
998
999 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
1000 "movq (%0, %1, 4), %%mm5 \n\t" //l4
1001 "movq (%0, %1, 4), %%mm4 \n\t" //l4
1002 "psubusb %%mm6, %%mm5 \n\t"
1003 "psubusb %%mm4, %%mm6 \n\t"
1004 "por %%mm6, %%mm5 \n\t" // |l3-l4|
1005 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
1006 "pxor %%mm6, %%mm0 \n\t"
1007 "pand %%mm0, %%mm3 \n\t"
1008 PMINUB(%%mm5, %%mm3, %%mm0)
1009
1010 "psubusb b01, %%mm3 \n\t"
1011 PAVGB(%%mm7, %%mm3)
1012
1013 "movq (%%eax, %1, 2), %%mm0 \n\t"
1014 "movq (%0, %1, 4), %%mm2 \n\t"
1015 "pxor %%mm6, %%mm0 \n\t"
1016 "pxor %%mm6, %%mm2 \n\t"
1017 "psubb %%mm3, %%mm0 \n\t"
1018 "paddb %%mm3, %%mm2 \n\t"
1019 "pxor %%mm6, %%mm0 \n\t"
1020 "pxor %%mm6, %%mm2 \n\t"
1021 "movq %%mm0, (%%eax, %1, 2) \n\t"
1022 "movq %%mm2, (%0, %1, 4) \n\t"
1023#endif
1024
// Active implementation. mm6 is set to all ones, so "pxor mm6, x" yields
// 255-x; averaging that with another row gives a difference biased by 128,
// which is what the "+128" / "+256" terms in the comments below refer to.
1025 "leal (%0, %1), %%eax \n\t"
1026 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
1027// 0 1 2 3 4 5 6 7
1028// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1029// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1030
1031
1032 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
1033 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1034 "pxor %%mm6, %%mm1 \n\t" // -l3-1
1035 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1036// mm1=-l3-1, mm0=128-q
1037
1038 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1039 "movq (%%eax, %1), %%mm3 \n\t" // l2
1040 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1041 "movq %%mm2, %%mm5 \n\t" // -l5-1
1042 "movq b80, %%mm4 \n\t" // 128
1043 "leal (%%eax, %1, 4), %%ebx \n\t"
1044 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1045 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1046 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1047 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1048// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1049
1050 "movq (%%eax), %%mm2 \n\t" // l1
1051 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1052 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1053 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1054 "movq b80, %%mm3 \n\t" // 128
1055 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1056 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1057 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1058// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1059
1060 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1061 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1062 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1063 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1064 "movq b80, %%mm2 \n\t" // 128
1065 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1066 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1067 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1068// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1069
1070 "movq b00, %%mm1 \n\t" // 0
1071 "movq b00, %%mm5 \n\t" // 0
1072 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1073 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1074 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1075 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1076 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1077
1078// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1079
1080 "movq b00, %%mm7 \n\t" // 0
1081 "movq pQPb, %%mm2 \n\t" // QP
1082 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1083 "psubb %%mm6, %%mm2 \n\t"
1084
1085 "movq %%mm4, %%mm1 \n\t"
1086 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1087 "pxor %%mm1, %%mm4 \n\t"
1088 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1089 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1090 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1091// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1092
1093 "movq %%mm4, %%mm3 \n\t" // d
1094 "psubusb b01, %%mm4 \n\t"
1095 PAVGB(%%mm7, %%mm4) // d/32
1096 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1097 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1098 "pand %%mm2, %%mm4 \n\t"
1099
1100 "movq b80, %%mm5 \n\t" // 128
1101 "psubb %%mm0, %%mm5 \n\t" // q
1102 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1103 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1104 "pxor %%mm7, %%mm5 \n\t"
1105
1106 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1107 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1108
// apply the correction to the two edge rows (l3 and l4) and store them back
1109 "pand %%mm7, %%mm4 \n\t"
1110 "movq (%%eax, %1, 2), %%mm0 \n\t"
1111 "movq (%0, %1, 4), %%mm2 \n\t"
1112 "pxor %%mm1, %%mm0 \n\t"
1113 "pxor %%mm1, %%mm2 \n\t"
1114 "paddb %%mm4, %%mm0 \n\t"
1115 "psubb %%mm4, %%mm2 \n\t"
1116 "pxor %%mm1, %%mm0 \n\t"
1117 "pxor %%mm1, %%mm2 \n\t"
1118 "movq %%mm0, (%%eax, %1, 2) \n\t"
1119 "movq %%mm2, (%0, %1, 4) \n\t"
1120
1121 :
1122 : "r" (src), "r" (stride)
1123 : "%eax", "%ebx"
1124 );
1125
// The block comment below holds a disabled C reference loop plus code that
// collected statistics about the difference between the two implementations.
1126/*
1127 {
1128 int x;
1129 src-= stride;
1130 for(x=0; x<BLOCK_SIZE; x++)
1131 {
1132 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1133 if(ABS(middleEnergy)< 8*QP)
1134 {
1135 const int q=(src[l4] - src[l5])/2;
1136 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1137 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1138
1139 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1140 d= MAX(d, 0);
1141
1142 d= (5*d + 32) >> 6;
1143 d*= SIGN(-middleEnergy);
1144
1145 if(q>0)
1146 {
1147 d= d<0 ? 0 : d;
1148 d= d>q ? q : d;
1149 }
1150 else
1151 {
1152 d= d>0 ? 0 : d;
1153 d= d<q ? q : d;
1154 }
1155
1156 src[l4]-= d;
1157 src[l5]+= d;
1158 }
1159 src++;
1160 }
1161src-=8;
1162 for(x=0; x<8; x++)
1163 {
1164 int y;
1165 for(y=4; y<6; y++)
1166 {
1167 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1168 int ad= ABS(d);
1169 static int max=0;
1170 static int sum=0;
1171 static int num=0;
1172 static int bias=0;
1173
1174 if(max<ad) max=ad;
1175 sum+= ad>3 ? 1 : 0;
1176 if(ad>3)
1177 {
1178 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1179 }
1180 if(y==4) bias+=d;
1181 num++;
1182 if(num%1000000 == 0)
1183 {
1184 printf(" %d %d %d %d\n", num, sum, max, bias);
1185 }
1186 }
1187 }
1188}
1189*/
1190#elif defined (HAVE_MMX)
// Plain MMX path: every row is unpacked into a low (L) and a high (H) half of
// four 16-bit words each, so the energies are computed without the rounding
// of the byte-wise path above. QP is asm operand %2.
// NOTE: HAVE_MMX2 cannot be defined inside this branch, because the outer #if
// already took that case; of the nested "#ifdef HAVE_MMX2" sections below only
// the #else parts are ever compiled.
acced553 1191 src+= stride*4;
7f16f6e6 1192
3057fa66
A
1193 asm volatile(
1194 "pxor %%mm7, %%mm7 \n\t"
1195 "leal (%0, %1), %%eax \n\t"
1196 "leal (%%eax, %1, 4), %%ebx \n\t"
1197// 0 1 2 3 4 5 6 7
1198// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1199// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1200
1201 "movq (%0), %%mm0 \n\t"
1202 "movq %%mm0, %%mm1 \n\t"
1203 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1204 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1205
1206 "movq (%%eax), %%mm2 \n\t"
1207 "movq %%mm2, %%mm3 \n\t"
1208 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1209 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1210
1211 "movq (%%eax, %1), %%mm4 \n\t"
1212 "movq %%mm4, %%mm5 \n\t"
1213 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1214 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1215
1216 "paddw %%mm0, %%mm0 \n\t" // 2L0
1217 "paddw %%mm1, %%mm1 \n\t" // 2H0
1218 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1219 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1220 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1221 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1222
1223 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1224 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1225 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1226 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1227
1228 "movq (%%eax, %1, 2), %%mm2 \n\t"
1229 "movq %%mm2, %%mm3 \n\t"
1230 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1231 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1232
1233 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1234 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1235 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1236 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1237 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1238 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1239
1240 "movq (%0, %1, 4), %%mm0 \n\t"
1241 "movq %%mm0, %%mm1 \n\t"
1242 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1243 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1244
1245 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1246 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1247 "movq %%mm2, temp2 \n\t" // L3 - L4
1248 "movq %%mm3, temp3 \n\t" // H3 - H4
1249 "paddw %%mm4, %%mm4 \n\t" // 2L2
1250 "paddw %%mm5, %%mm5 \n\t" // 2H2
1251 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1252 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1253
1254 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1255 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1256 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1257 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1258//50 opcodes so far
1259 "movq (%%ebx), %%mm2 \n\t"
1260 "movq %%mm2, %%mm3 \n\t"
1261 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1262 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1263 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1264 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1265 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1266 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1267
1268 "movq (%%ebx, %1), %%mm6 \n\t"
1269 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1270 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1271 "movq (%%ebx, %1), %%mm6 \n\t"
1272 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1273 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1274
1275 "paddw %%mm0, %%mm0 \n\t" // 2L4
1276 "paddw %%mm1, %%mm1 \n\t" // 2H4
1277 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1278 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1279
1280 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1281 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1282 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1283 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1284
1285 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1286 "movq %%mm2, %%mm3 \n\t"
1287 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1288 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1289
1290 "paddw %%mm2, %%mm2 \n\t" // 2L7
1291 "paddw %%mm3, %%mm3 \n\t" // 2H7
1292 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1293 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1294
1295 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1296 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
8405b3fd
MN
1297
// absolute values of the right (mm0/mm1) and left (mm2/mm3) energies
1298#ifdef HAVE_MMX2
1299 "movq %%mm7, %%mm6 \n\t" // 0
1300 "psubw %%mm0, %%mm6 \n\t"
1301 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1302 "movq %%mm7, %%mm6 \n\t" // 0
1303 "psubw %%mm1, %%mm6 \n\t"
1304 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1305 "movq %%mm7, %%mm6 \n\t" // 0
1306 "psubw %%mm2, %%mm6 \n\t"
1307 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1308 "movq %%mm7, %%mm6 \n\t" // 0
1309 "psubw %%mm3, %%mm6 \n\t"
1310 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1311#else
3057fa66
A
1312 "movq %%mm7, %%mm6 \n\t" // 0
1313 "pcmpgtw %%mm0, %%mm6 \n\t"
1314 "pxor %%mm6, %%mm0 \n\t"
1315 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1316 "movq %%mm7, %%mm6 \n\t" // 0
1317 "pcmpgtw %%mm1, %%mm6 \n\t"
1318 "pxor %%mm6, %%mm1 \n\t"
1319 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3057fa66
A
1320 "movq %%mm7, %%mm6 \n\t" // 0
1321 "pcmpgtw %%mm2, %%mm6 \n\t"
1322 "pxor %%mm6, %%mm2 \n\t"
1323 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1324 "movq %%mm7, %%mm6 \n\t" // 0
1325 "pcmpgtw %%mm3, %%mm6 \n\t"
1326 "pxor %%mm6, %%mm3 \n\t"
1327 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
8405b3fd 1328#endif
3057fa66
A
1329
// mm0/mm1 = MIN(|left energy|, |right energy|)
1330#ifdef HAVE_MMX2
1331 "pminsw %%mm2, %%mm0 \n\t"
1332 "pminsw %%mm3, %%mm1 \n\t"
1333#else
1334 "movq %%mm0, %%mm6 \n\t"
1335 "psubusw %%mm2, %%mm6 \n\t"
1336 "psubw %%mm6, %%mm0 \n\t"
1337 "movq %%mm1, %%mm6 \n\t"
1338 "psubusw %%mm3, %%mm6 \n\t"
1339 "psubw %%mm6, %%mm1 \n\t"
1340#endif
1341
1342 "movq %%mm7, %%mm6 \n\t" // 0
1343 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1344 "pxor %%mm6, %%mm4 \n\t"
1345 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1346 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1347 "pxor %%mm7, %%mm5 \n\t"
1348 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1349// 100 opcodes
// broadcast QP into all four words, scale to 8*QP and zero every column whose
// |middle energy| is not below that threshold
1350 "movd %2, %%mm2 \n\t" // QP
3057fa66
A
1351 "punpcklwd %%mm2, %%mm2 \n\t"
1352 "punpcklwd %%mm2, %%mm2 \n\t"
1353 "psllw $3, %%mm2 \n\t" // 8QP
1354 "movq %%mm2, %%mm3 \n\t" // 8QP
1355 "pcmpgtw %%mm4, %%mm2 \n\t"
1356 "pcmpgtw %%mm5, %%mm3 \n\t"
1357 "pand %%mm2, %%mm4 \n\t"
1358 "pand %%mm3, %%mm5 \n\t"
1359
1360
1361 "psubusw %%mm0, %%mm4 \n\t" // hd
1362 "psubusw %%mm1, %%mm5 \n\t" // ld
1363
1364
// d = (5*d + 32) >> 6
1365 "movq w05, %%mm2 \n\t" // 5
1366 "pmullw %%mm2, %%mm4 \n\t"
1367 "pmullw %%mm2, %%mm5 \n\t"
1368 "movq w20, %%mm2 \n\t" // 32
1369 "paddw %%mm2, %%mm4 \n\t"
1370 "paddw %%mm2, %%mm5 \n\t"
1371 "psrlw $6, %%mm4 \n\t"
1372 "psrlw $6, %%mm5 \n\t"
1373
1374/*
1375 "movq w06, %%mm2 \n\t" // 6
1376 "paddw %%mm2, %%mm4 \n\t"
1377 "paddw %%mm2, %%mm5 \n\t"
1378 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1379//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1380 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1381 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1382*/
1383
1384 "movq temp2, %%mm0 \n\t" // L3 - L4
1385 "movq temp3, %%mm1 \n\t" // H3 - H4
1386
1387 "pxor %%mm2, %%mm2 \n\t"
1388 "pxor %%mm3, %%mm3 \n\t"
1389
3057fa66
A
1390 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1391 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1392 "pxor %%mm2, %%mm0 \n\t"
1393 "pxor %%mm3, %%mm1 \n\t"
1394 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1395 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
e5c30e06
MN
1396 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1397 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3057fa66
A
1398
// keep d only where the sign of the middle energy and the sign of (L3-L4)
// differ, then limit it to |L3-L4|/2
1399 "pxor %%mm6, %%mm2 \n\t"
1400 "pxor %%mm7, %%mm3 \n\t"
1401 "pand %%mm2, %%mm4 \n\t"
1402 "pand %%mm3, %%mm5 \n\t"
1403
1404#ifdef HAVE_MMX2
1405 "pminsw %%mm0, %%mm4 \n\t"
1406 "pminsw %%mm1, %%mm5 \n\t"
1407#else
1408 "movq %%mm4, %%mm2 \n\t"
1409 "psubusw %%mm0, %%mm2 \n\t"
1410 "psubw %%mm2, %%mm4 \n\t"
1411 "movq %%mm5, %%mm2 \n\t"
1412 "psubusw %%mm1, %%mm2 \n\t"
1413 "psubw %%mm2, %%mm5 \n\t"
1414#endif
// restore the sign, pack back to bytes and apply to rows 3 and 4
1415 "pxor %%mm6, %%mm4 \n\t"
1416 "pxor %%mm7, %%mm5 \n\t"
1417 "psubw %%mm6, %%mm4 \n\t"
1418 "psubw %%mm7, %%mm5 \n\t"
1419 "packsswb %%mm5, %%mm4 \n\t"
1420 "movq (%%eax, %1, 2), %%mm0 \n\t"
1421 "paddb %%mm4, %%mm0 \n\t"
1422 "movq %%mm0, (%%eax, %1, 2) \n\t"
1423 "movq (%0, %1, 4), %%mm0 \n\t"
1424 "psubb %%mm4, %%mm0 \n\t"
3057fa66
A
1425 "movq %%mm0, (%0, %1, 4) \n\t"
1426
1427 :
1428 : "r" (src), "r" (stride), "r" (QP)
1429 : "%eax", "%ebx"
1430 );
1431#else
// Portable C path. l1..l8 are byte offsets of rows 1..8 relative to src after
// it has been advanced by 3 rows; the block edge lies between l4 and l5.
1432 const int l1= stride;
1433 const int l2= stride + l1;
1434 const int l3= stride + l2;
1435 const int l4= stride + l3;
1436 const int l5= stride + l4;
1437 const int l6= stride + l5;
1438 const int l7= stride + l6;
1439 const int l8= stride + l7;
1440// const int l9= stride + l8;
d5a1a995 1441 int x;
acced553 1442 src+= stride*3;
d5a1a995 1443 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
1444 {
1445 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1446 if(ABS(middleEnergy) < 8*QP)
1447 {
// q is half of the step across the edge; it bounds the correction below
1448 const int q=(src[l4] - src[l5])/2;
1449 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1450 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1451
1452 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1453 d= MAX(d, 0);
1454
1455 d= (5*d + 32) >> 6;
1456 d*= SIGN(-middleEnergy);
1457
// clamp d into the range between 0 and q, so the correction never reverses
// the step and never acts against it
1458 if(q>0)
1459 {
1460 d= d<0 ? 0 : d;
1461 d= d>q ? q : d;
1462 }
1463 else
1464 {
1465 d= d>0 ? 0 : d;
1466 d= d<q ? q : d;
1467 }
1468
1469 src[l4]-= d;
1470 src[l5]+= d;
1471 }
1472 src++;
1473 }
1474#endif
1475}
1476
cf5ec61d
MN
1477/**
1478 * Check if the given 8x8 Block is mostly "flat"
1479 */
1480static inline int isHorizDC(uint8_t src[], int stride)
1481{
1482 int numEq= 0;
1483 int y;
1484 for(y=0; y<BLOCK_SIZE; y++)
1485 {
1486 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1487 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1488 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1489 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1490 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1491 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1492 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1493 src+= stride;
1494 }
1495 return numEq > hFlatnessThreshold;
1496}
1497
/**
 * Cheap range check used before low-pass filtering a flat block.
 * Only the first and the last pixel of the first row are compared;
 * stride is accepted for symmetry with the other filters but not used.
 *
 * @return 1 if |src[0] - src[7]| <= 2*QP, otherwise 0
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int edgeDiff= src[0] - src[7];

	return abs(edgeDiff) <= 2*QP;
}
1504
1505static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1506{
1507 int y;
1508 for(y=0; y<BLOCK_SIZE; y++)
1509 {
1510 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1511
1512 if(ABS(middleEnergy) < 8*QP)
1513 {
1514 const int q=(dst[3] - dst[4])/2;
1515 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1516 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1517
1518 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1519 d= MAX(d, 0);
1520
1521 d= (5*d + 32) >> 6;
1522 d*= SIGN(-middleEnergy);
1523
1524 if(q>0)
1525 {
1526 d= d<0 ? 0 : d;
1527 d= d>q ? q : d;
1528 }
1529 else
1530 {
1531 d= d>0 ? 0 : d;
1532 d= d<q ? q : d;
1533 }
1534
1535 dst[3]-= d;
1536 dst[4]+= d;
1537 }
1538 dst+= stride;
1539 }
1540}
1541
1542/**
1543 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1544 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1545 */
1546static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1547{
1548
1549 int y;
1550 for(y=0; y<BLOCK_SIZE; y++)
1551 {
1552 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1553 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1554
1555 int sums[9];
1556 sums[0] = first + dst[0];
1557 sums[1] = dst[0] + dst[1];
1558 sums[2] = dst[1] + dst[2];
1559 sums[3] = dst[2] + dst[3];
1560 sums[4] = dst[3] + dst[4];
1561 sums[5] = dst[4] + dst[5];
1562 sums[6] = dst[5] + dst[6];
1563 sums[7] = dst[6] + dst[7];
1564 sums[8] = dst[7] + last;
1565
1566 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1567 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1568 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1569 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1570 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1571 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1572 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1573 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1574
1575 dst+= stride;
1576 }
1577}
1578
1579
3057fa66
A
/**
 * Deringing filter for one 8x8 block that is surrounded by a one pixel border.
 *
 * Outline, as implemented by the C path at the bottom:
 *  1. find the minimum and maximum pixel value of the 8x8 block; if
 *     max - min < deringThreshold the block is left untouched
 *  2. classify every pixel of the 10x10 area as above / not above
 *     avg = (min + max + 1)/2
 *  3. a pixel of the 8x8 block is filtered only if all pixels of its 3x3
 *     neighbourhood fall on the same side of avg
 *  4. such a pixel is replaced by the 3x3 smoothing (1 2 1 / 2 4 2 / 1 2 1)/16,
 *     with the change limited to +-2*QP
 *
 * The MMX2 / 3DNow path follows the same outline with byte-wise arithmetic. It
 * takes the quantizer from the global pQPb (doubled into pQPb2) and uses temp0
 * and temp1 as scratch storage; QP is listed as an asm operand but %2 is never
 * referenced.
 *
 * NOTE(review): the asm path loads and stores 8 bytes starting at column 0 of
 * rows 1..8 and fetches the neighbours from offsets -4 and 8, whereas the C
 * path handles columns 1..8 (it increments p before the first access). The two
 * paths therefore appear to be shifted by one column relative to src; confirm
 * against the caller which one is intended.
 *
 * @param src    top-left corner of the area (rows 0..9 are read, 1..8 written)
 * @param stride distance in bytes between two consecutive rows
 * @param QP     quantizer of the block (used by the C path only)
 */
1580static inline void dering(uint8_t src[], int stride, int QP)
1581{
e0f8ffae 1582#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
3057fa66 1583 asm volatile(
70c5ae87
MN
// pQPb2 = 2*pQPb (saturating); used further down as the per-pixel change limit
1584 "movq pQPb, %%mm0 \n\t"
1585 "paddusb %%mm0, %%mm0 \n\t"
1586 "movq %%mm0, pQPb2 \n\t"
1587
3057fa66
A
1588 "leal (%0, %1), %%eax \n\t"
1589 "leal (%%eax, %1, 4), %%ebx \n\t"
1590// 0 1 2 3 4 5 6 7 8 9
1591// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1592
cd38e322
MN
// mm7 = running per-byte minimum (starts at 255), mm6 = running per-byte
// maximum (starts at 0); FIND_MIN_MAX folds one row into both
1593 "pcmpeqb %%mm7, %%mm7 \n\t"
1594 "pxor %%mm6, %%mm6 \n\t"
e0f8ffae 1595#ifdef HAVE_MMX2
3057fa66 1596#define FIND_MIN_MAX(addr)\
70c5ae87 1597 "movq " #addr ", %%mm0 \n\t"\
cd38e322
MN
1598 "pminub %%mm0, %%mm7 \n\t"\
1599 "pmaxub %%mm0, %%mm6 \n\t"
e0f8ffae
MN
1600#else
1601#define FIND_MIN_MAX(addr)\
1602 "movq " #addr ", %%mm0 \n\t"\
cd38e322
MN
1603 "movq %%mm7, %%mm1 \n\t"\
1604 "psubusb %%mm0, %%mm6 \n\t"\
1605 "paddb %%mm0, %%mm6 \n\t"\
e0f8ffae 1606 "psubusb %%mm0, %%mm1 \n\t"\
cd38e322 1607 "psubb %%mm1, %%mm7 \n\t"
e0f8ffae 1608#endif
3057fa66 1609
70c5ae87
MN
// rows 1..8
1610FIND_MIN_MAX((%%eax))
1611FIND_MIN_MAX((%%eax, %1))
1612FIND_MIN_MAX((%%eax, %1, 2))
1613FIND_MIN_MAX((%0, %1, 4))
1614FIND_MIN_MAX((%%ebx))
1615FIND_MIN_MAX((%%ebx, %1))
1616FIND_MIN_MAX((%%ebx, %1, 2))
1617FIND_MIN_MAX((%0, %1, 8))
3057fa66 1618
// horizontal reduction: fold the 8 per-column minima in mm7 into one value
// that ends up in the lowest byte
3057fa66 1619 "movq %%mm7, %%mm4 \n\t"
e5c30e06 1620 "psrlq $8, %%mm7 \n\t"
e5c30e06 1621#ifdef HAVE_MMX2
cd38e322 1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels
e5c30e06 1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
cd38e322 1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels
e5c30e06 1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
cd38e322 1626 "pminub %%mm4, %%mm7 \n\t"
e5c30e06 1627#else
cd38e322
MN
1628 "movq %%mm7, %%mm1 \n\t"
1629 "psubusb %%mm4, %%mm1 \n\t"
1630 "psubb %%mm1, %%mm7 \n\t"
3057fa66
A
1631 "movq %%mm7, %%mm4 \n\t"
1632 "psrlq $16, %%mm7 \n\t"
cd38e322
MN
1633 "movq %%mm7, %%mm1 \n\t"
1634 "psubusb %%mm4, %%mm1 \n\t"
1635 "psubb %%mm1, %%mm7 \n\t"
3057fa66 1636 "movq %%mm7, %%mm4 \n\t"
e5c30e06 1637 "psrlq $32, %%mm7 \n\t"
cd38e322
MN
1638 "movq %%mm7, %%mm1 \n\t"
1639 "psubusb %%mm4, %%mm1 \n\t"
1640 "psubb %%mm1, %%mm7 \n\t"
e5c30e06 1641#endif
cd38e322
MN
1642
1643
// the same reduction for the maxima in mm6
1644 "movq %%mm6, %%mm4 \n\t"
1645 "psrlq $8, %%mm6 \n\t"
1646#ifdef HAVE_MMX2
1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1649 "pmaxub %%mm4, %%mm6 \n\t"
1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1651 "pmaxub %%mm4, %%mm6 \n\t"
1652#else
1653 "psubusb %%mm4, %%mm6 \n\t"
1654 "paddb %%mm4, %%mm6 \n\t"
1655 "movq %%mm6, %%mm4 \n\t"
1656 "psrlq $16, %%mm6 \n\t"
1657 "psubusb %%mm4, %%mm6 \n\t"
1658 "paddb %%mm4, %%mm6 \n\t"
1659 "movq %%mm6, %%mm4 \n\t"
1660 "psrlq $32, %%mm6 \n\t"
1661 "psubusb %%mm4, %%mm6 \n\t"
1662 "paddb %%mm4, %%mm6 \n\t"
1663#endif
// skip the whole block (jump to label 1) if max - min is below deringThreshold
1664 "movq %%mm6, %%mm0 \n\t" // max
1665 "psubb %%mm7, %%mm6 \n\t" // max - min
1666 "movd %%mm6, %%ecx \n\t"
1667 "cmpb deringThreshold, %%cl \n\t"
1668 " jb 1f \n\t"
1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
e5c30e06
MN
// broadcast the low byte (a) into all 8 bytes and keep a copy in temp0
1670 "punpcklbw %%mm7, %%mm7 \n\t"
1671 "punpcklbw %%mm7, %%mm7 \n\t"
1672 "punpcklbw %%mm7, %%mm7 \n\t"
70c5ae87
MN
1673 "movq %%mm7, temp0 \n\t"
1674
// row 0: L10 is the row itself, L00 / L20 are the row shifted by one pixel to
// either side (the missing byte is fetched from offsets -4 and 8)
1675 "movq (%0), %%mm0 \n\t" // L10
1676 "movq %%mm0, %%mm1 \n\t" // L10
1677 "movq %%mm0, %%mm2 \n\t" // L10
1678 "psllq $8, %%mm1 \n\t"
1679 "psrlq $8, %%mm2 \n\t"
1680 "movd -4(%0), %%mm3 \n\t"
1681 "movd 8(%0), %%mm4 \n\t"
1682 "psrlq $24, %%mm3 \n\t"
1683 "psllq $56, %%mm4 \n\t"
1684 "por %%mm3, %%mm1 \n\t" // L00
1685 "por %%mm4, %%mm2 \n\t" // L20
1686 "movq %%mm1, %%mm3 \n\t" // L00
1687 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1688 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1689 "psubusb %%mm7, %%mm0 \n\t"
1690 "psubusb %%mm7, %%mm2 \n\t"
1691 "psubusb %%mm7, %%mm3 \n\t"
1692 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1693 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1694 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1695 "paddb %%mm2, %%mm0 \n\t"
1696 "paddb %%mm3, %%mm0 \n\t"
1697
// row 1, same processing
1698 "movq (%%eax), %%mm2 \n\t" // L11
1699 "movq %%mm2, %%mm3 \n\t" // L11
1700 "movq %%mm2, %%mm4 \n\t" // L11
1701 "psllq $8, %%mm3 \n\t"
1702 "psrlq $8, %%mm4 \n\t"
1703 "movd -4(%%eax), %%mm5 \n\t"
1704 "movd 8(%%eax), %%mm6 \n\t"
1705 "psrlq $24, %%mm5 \n\t"
1706 "psllq $56, %%mm6 \n\t"
1707 "por %%mm5, %%mm3 \n\t" // L01
1708 "por %%mm6, %%mm4 \n\t" // L21
1709 "movq %%mm3, %%mm5 \n\t" // L01
1710 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1711 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1712 "psubusb %%mm7, %%mm2 \n\t"
1713 "psubusb %%mm7, %%mm4 \n\t"
1714 "psubusb %%mm7, %%mm5 \n\t"
1715 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1716 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1717 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1718 "paddb %%mm4, %%mm2 \n\t"
1719 "paddb %%mm5, %%mm2 \n\t"
1720// 0, 2, 3, 1
// DERING_CORE processes one further source row (src) and finishes the output
// row two above it (dst). The ppsx/psx/sx registers hold the per-pixel
// threshold counts and pplx/plx/lx the horizontally smoothed values of the
// previous-previous, previous and current row; t0 and t1 are scratch.
1721#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1722 "movq " #src ", " #sx " \n\t" /* src[0] */\
1723 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1724 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1725 "psllq $8, " #lx " \n\t"\
1726 "psrlq $8, " #t0 " \n\t"\
1727 "movd -4" #src ", " #t1 " \n\t"\
1728 "psrlq $24, " #t1 " \n\t"\
1729 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1730 "movd 8" #src ", " #t1 " \n\t"\
1731 "psllq $56, " #t1 " \n\t"\
1732 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1733 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1734 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1735 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
9927c7ee
MN
1736 PAVGB(lx, pplx) \
1737 "movq " #lx ", temp1 \n\t"\
1738 "movq temp0, " #lx " \n\t"\
8405b3fd
MN
1739 "psubusb " #lx ", " #t1 " \n\t"\
1740 "psubusb " #lx ", " #t0 " \n\t"\
1741 "psubusb " #lx ", " #sx " \n\t"\
9927c7ee 1742 "movq b00, " #lx " \n\t"\
8405b3fd
MN
1743 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1744 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1745 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
70c5ae87
MN
1746 "paddb " #t1 ", " #t0 " \n\t"\
1747 "paddb " #t0 ", " #sx " \n\t"\
1748\
70c5ae87
MN
1749 PAVGB(plx, pplx) /* filtered */\
1750 "movq " #dst ", " #t0 " \n\t" /* dst */\
2e212618
MN
1751 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1752 "psubusb pQPb2, " #t0 " \n\t"\
1753 "paddusb pQPb2, " #t1 " \n\t"\
1754 PMAXUB(t0, pplx)\
1755 PMINUB(t1, pplx, t0)\
70c5ae87
MN
1756 "paddb " #sx ", " #ppsx " \n\t"\
1757 "paddb " #psx ", " #ppsx " \n\t"\
1758 "#paddb b02, " #ppsx " \n\t"\
1759 "pand b08, " #ppsx " \n\t"\
8405b3fd 1760 "pcmpeqb " #lx ", " #ppsx " \n\t"\
2e212618 1761 "pand " #ppsx ", " #pplx " \n\t"\
70c5ae87 1762 "pandn " #dst ", " #ppsx " \n\t"\
8405b3fd 1763 "por " #pplx ", " #ppsx " \n\t"\
9927c7ee
MN
1764 "movq " #ppsx ", " #dst " \n\t"\
1765 "movq temp1, " #lx " \n\t"
2e212618 1766
70c5ae87
MN
1767/*
17680000000
17691111111
e5c30e06 1770
70c5ae87
MN
17711111110
17721111101
17731111100
17741111011
17751111010
17761111001
e5c30e06 1777
70c5ae87
MN
17781111000
17791110111
e5c30e06 1780
70c5ae87
MN
1781*/
// one invocation per output row 1..8; the three register sets rotate so that
// the values of the two previous rows stay available without reloading
1782//DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1783DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1784DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1785DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1786DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1787DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1788DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1789DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1790DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
3057fa66 1791
cd38e322 1792 "1: \n\t"
3057fa66 1793 : : "r" (src), "r" (stride), "r" (QP)
cd38e322 1794 : "%eax", "%ebx", "%ecx"
3057fa66
A
1795 );
1796#else
2e212618
MN
// Portable C path.
1797 int y;
1798 int min=255;
1799 int max=0;
1800 int avg;
1801 uint8_t *p;
1802 int s[10];
1803
// step 1: value range of the inner 8x8 block (rows 1..8, columns 1..8)
1804 for(y=1; y<9; y++)
1805 {
1806 int x;
1807 p= src + stride*y;
1808 for(x=1; x<9; x++)
1809 {
1810 p++;
1811 if(*p > max) max= *p;
1812 if(*p < min) min= *p;
1813 }
1814 }
1815 avg= (min + max + 1)/2;
1816
cd38e322
MN
1817 if(max - min <deringThreshold) return;
1818
2e212618
MN
// step 2: per row, bit x of t is set if pixel x is above avg; the complement
// is stored 16 bits higher so that "all above" and "all not above" can both
// be tested with a plain AND. Afterwards a bit survives only if the pixel and
// both of its horizontal neighbours are on the same side.
1819 for(y=0; y<10; y++)
1820 {
1821 int x;
1822 int t = 0;
1823 p= src + stride*y;
1824 for(x=0; x<10; x++)
1825 {
1826 if(*p > avg) t |= (1<<x);
1827 p++;
1828 }
1829 t |= (~t)<<16;
1830 t &= (t<<1) & (t>>1);
1831 s[y] = t;
1832 }
1833
// steps 3 and 4: require agreement with the rows above and below as well,
// then smooth the selected pixels
1834 for(y=1; y<9; y++)
1835 {
1836 int x;
1837 int t = s[y-1] & s[y] & s[y+1];
1838 t|= t>>16;
1839
1840 p= src + stride*y;
1841 for(x=1; x<9; x++)
1842 {
1843 p++;
1844 if(t & (1<<x))
1845 {
1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1849 f= (f + 8)>>4;
1850
cd38e322
MN
// optional instrumentation used to tune deringThreshold; prints statistics
// about the blocks whose value range is below 20
1851#ifdef DEBUG_DERING_THRESHOLD
1852 asm volatile("emms\n\t":);
1853 {
1854 static long long numPixels=0;
1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1856// if((max-min)<20 || (max-min)*QP<200)
1857// if((max-min)*QP < 500)
1858// if(max-min<QP/2)
1859 if(max-min < 20)
1860 {
1861 static int numSkiped=0;
1862 static int errorSum=0;
1863 static int worstQP=0;
1864 static int worstRange=0;
1865 static int worstDiff=0;
1866 int diff= (f - *p);
1867 int absDiff= ABS(diff);
1868 int error= diff*diff;
1869
1870 if(x==1 || x==8 || y==1 || y==8) continue;
1871
1872 numSkiped++;
1873 if(absDiff > worstDiff)
1874 {
1875 worstDiff= absDiff;
1876 worstQP= QP;
1877 worstRange= max-min;
1878 }
1879 errorSum+= error;
1880
1881 if(1024LL*1024LL*1024LL % numSkiped == 0)
1882 {
1883 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1886 worstDiff, (float)numSkiped/numPixels);
1887 }
1888 }
1889 }
1890#endif
2e212618
MN
// limit the change of the pixel to +-2*QP
1891 if (*p + 2*QP < f) *p= *p + 2*QP;
1892 else if(*p - 2*QP > f) *p= *p - 2*QP;
1893 else *p=f;
1894 }
1895 }
1896 }
cd38e322
MN
// debug aid: brighten the blocks whose value range is below 20 to make them
// visible in the output
1897#ifdef DEBUG_DERING_THRESHOLD
1898 if(max-min < 20)
1899 {
1900 for(y=1; y<9; y++)
1901 {
1902 int x;
1903 int t = 0;
1904 p= src + stride*y;
1905 for(x=1; x<9; x++)
1906 {
1907 p++;
1908 *p = MIN(*p + 20, 255);
1909 }
1910 }
1911// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1912 }
1913#endif
3057fa66
A
1914#endif
1915}
1916
3b58b885
MN
1917/**
 * Deinterlaces the given block by linear interpolation.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already,
 * but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be
 * deinterlaced.
 *
 * As implemented below: src is advanced by 4 lines, then lines 5, 7, 9 and 11
 * (counted from the incoming src) are each overwritten with the average of
 * the line directly above and the line directly below; lines 4, 6, 8, 10 and
 * 12 are only read. 8 pixels per line are processed.
 *
 * The C version averages with (a + b)>>1, i.e. it rounds down; the asm version
 * uses the PAVGB macro, which is defined elsewhere in this file (presumably it
 * rounds up, so the two versions may differ by 1 - TODO confirm).
 *
 * @param src    line 0 of the area described above
 * @param stride distance in bytes between two consecutive lines
 */
1923static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1924{
1925#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
7fb36f6c 1926 src+= 4*stride;
3b58b885
MN
1927 asm volatile(
1928 "leal (%0, %1), %%eax \n\t"
1929 "leal (%%eax, %1, 4), %%ebx \n\t"
1930// 0 1 2 3 4 5 6 7 8 9
1931// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1932
// mm0 and mm1 alternately hold the even line above and below the odd line
// that is being rebuilt, so every even line is loaded only once
1933 "movq (%0), %%mm0 \n\t"
1934 "movq (%%eax, %1), %%mm1 \n\t"
acced553 1935 PAVGB(%%mm1, %%mm0)
3b58b885
MN
1936 "movq %%mm0, (%%eax) \n\t"
1937 "movq (%0, %1, 4), %%mm0 \n\t"
acced553 1938 PAVGB(%%mm0, %%mm1)
3b58b885
MN
1939 "movq %%mm1, (%%eax, %1, 2) \n\t"
1940 "movq (%%ebx, %1), %%mm1 \n\t"
acced553 1941 PAVGB(%%mm1, %%mm0)
3b58b885
MN
1942 "movq %%mm0, (%%ebx) \n\t"
1943 "movq (%0, %1, 8), %%mm0 \n\t"
acced553 1944 PAVGB(%%mm0, %%mm1)
3b58b885
MN
1945 "movq %%mm1, (%%ebx, %1, 2) \n\t"
1946
1947 : : "r" (src), "r" (stride)
1948 : "%eax", "%ebx"
1949 );
1950#else
// portable C path: one column per iteration
1951 int x;
7fb36f6c 1952 src+= 4*stride;
3b58b885
MN
1953 for(x=0; x<8; x++)
1954 {
1955 src[stride] = (src[0] + src[stride*2])>>1;
1956 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1957 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1958 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1959 src++;
1960 }
1961#endif
1962}
1963
/**
 * Deinterlaces the given block by cubic interpolation.
 * Will be called for every 8x8 block and can read & write from line 4-15;
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too;
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter reads lines 3-15 (the odd ones) and writes lines 6, 8, 10 and 12; every
 * written line becomes (-a + 9b + 9c - d)/16 of the 4 nearest lines of the other field.
 * The result is clipped to 0..255: the MMX2 / 3DNow version saturates through packuswb,
 * the C version clips explicitly (without clipping, over/undershoot at sharp edges
 * wrapped around when it was stored into the uint8_t).
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
		"leal (%%ebx, %1, 4), %%ecx \n\t"
		"addl %1, %%ecx \n\t"
		"pxor %%mm7, %%mm7 \n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1	ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm1 \n\t"\
		"movq " #d ", %%mm2 \n\t"\
		"movq " #e ", %%mm3 \n\t"\
		PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
		"movq %%mm0, %%mm2 \n\t"\
		"punpcklbw %%mm7, %%mm0 \n\t"\
		"punpckhbw %%mm7, %%mm2 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm1 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1 \n\t"\
		"movq %%mm1, " #c " \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx", "ecx"
	);
#else
	int x;
	src+= stride*3;
	for(x=0; x<8; x++)
	{
		int y;
		// relative to src: lines 3, 5, 7, 9 are written, lines 0, 2, ..., 12 are read
		for(y=3; y<=9; y+=2)
		{
			int v= 9*(src[stride*(y-1)] + src[stride*(y+1)])
				- src[stride*(y-3)] - src[stride*(y+3)];

			if(v < 0) v= 0;			// undershoot
			else
			{
				v>>= 4;
				if(v > 255) v= 255;	// overshoot
			}
			src[stride*y]= v;
		}
		src++;
	}
#endif
}
2028
/**
 * Deinterlaces the given block with a vertical (1 2 1)/4 blend.
 * Will be called for every 8x8 block and can read & write from line 4-15;
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too;
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * Will shift the image up by 1 line (FIXME if this is a problem).
 * This filter will read lines 4-13 and write 4-11.
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm1 \n\t" // L2
		PAVGB(%%mm1, %%mm0) // L0+L2
		"movq (%%eax), %%mm2 \n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0) \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t" // L3
		PAVGB(%%mm0, %%mm2) // L1+L3
		PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
		"movq %%mm2, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm2 \n\t" // L4
		PAVGB(%%mm2, %%mm1) // L2+L4
		PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1) \n\t"
		"movq (%%ebx), %%mm1 \n\t" // L5
		PAVGB(%%mm1, %%mm0) // L3+L5
		PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm0 \n\t" // L6
		PAVGB(%%mm0, %%mm2) // L4+L6
		PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4) \n\t"
		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		PAVGB(%%mm2, %%mm1) // L5+L7
		PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm1 \n\t" // L8
		PAVGB(%%mm1, %%mm0) // L6+L8
		PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1) \n\t"
		"movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
		PAVGB(%%mm0, %%mm2) // L7+L9
		PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2) \n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	uint8_t *column= src + 4*stride;
	int x;

	for(x=0; x<8; x++, column++)
	{
		int y;
		// must run top to bottom: line y is computed from the still unmodified
		// lines y, y+1 and y+2 and is then overwritten in place
		for(y=0; y<8; y++)
			column[stride*y]= (  column[stride* y   ]
					 + 2*column[stride*(y+1)]
					 +   column[stride*(y+2)])>>2;
	}
#endif
}
2103
/**
 * Deinterlaces the given block with a vertical 3-tap median.
 * Will be called for every 8x8 block and can read & write from line 4-15;
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too;
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter reads lines 4-12 and replaces each of the lines 5, 7, 9, 11 by the median of
 * itself, the line above and the line below.
 * All three versions (MMX2, plain MMX, C) implement the same median; the C version used to
 * be a copy of the linear blend filter.
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
	src+= 4*stride;
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// median(a,b,c) = min( max(a,b), max( min(a,b), c ) )
		"movq (%0), %%mm0 \n\t" //
		"movq (%%eax, %1), %%mm2 \n\t" //
		"movq (%%eax), %%mm1 \n\t" //
		"movq %%mm0, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm3, %%mm1 \n\t" //
		"pmaxub %%mm2, %%mm1 \n\t" //
		"pminub %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%%eax) \n\t"

		"movq (%0, %1, 4), %%mm0 \n\t" //
		"movq (%%eax, %1, 2), %%mm1 \n\t" //
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm2 \n\t" //
		"pminub %%mm3, %%mm1 \n\t" //
		"pmaxub %%mm0, %%mm1 \n\t" //
		"pminub %%mm1, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ebx), %%mm2 \n\t" //
		"movq (%%ebx, %1), %%mm1 \n\t" //
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" //
		"pminub %%mm3, %%mm0 \n\t" //
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t"

		"movq (%%ebx, %1, 2), %%mm2 \n\t" //
		"movq (%0, %1, 8), %%mm0 \n\t" //
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" //
		"pminub %%mm3, %%mm0 \n\t" //
		"pmaxub %%mm1, %%mm0 \n\t" //
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1, 2) \n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7 \n\t"

#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"movq " #c ", %%mm1 \n\t"\
		"movq %%mm0, %%mm3 \n\t"\
		"movq %%mm1, %%mm4 \n\t"\
		"movq %%mm2, %%mm5 \n\t"\
		"psubusb %%mm1, %%mm3 \n\t"\
		"psubusb %%mm2, %%mm4 \n\t"\
		"psubusb %%mm0, %%mm5 \n\t"\
		"pcmpeqb %%mm7, %%mm3 \n\t"\
		"pcmpeqb %%mm7, %%mm4 \n\t"\
		"pcmpeqb %%mm7, %%mm5 \n\t"\
		"movq %%mm3, %%mm6 \n\t"\
		"pxor %%mm4, %%mm3 \n\t"\
		"pxor %%mm5, %%mm4 \n\t"\
		"pxor %%mm6, %%mm5 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		"por %%mm5, %%mm0 \n\t"\
		"pand %%mm2, %%mm0 \n\t"\
		"pand %%mm1, %%mm0 \n\t"\
		"movq %%mm0, " #b " \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#endif // MMX
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		int y;
		// relative to src: lines 1, 3, 5, 7 are written; their neighbours
		// (the even lines) are never modified
		for(y=1; y<8; y+=2)
		{
			const int above= src[stride*(y-1)];
			const int cur  = src[stride* y   ];
			const int below= src[stride*(y+1)];
			const int lo= above < cur ? above : cur;
			const int hi= above < cur ? cur : above;
			// median(a,b,c) = min( max(a,b), max( min(a,b), c ) )
			const int mid= lo > below ? lo : below;

			src[stride*y]= mid < hi ? mid : hi;
		}
		src++;
	}
#endif
}
2222
e5c30e06 2223#ifdef HAVE_MMX
4e4dcbc5
MN
2224/**
2225 * transposes and shift the given 8x8 Block into dst1 and dst2
 *
 * Reads 8 rows of 8 bytes from src (consecutive rows are srcStride bytes apart) and
 * stores the columns of that block as rows, 16 bytes apart, in the two destinations:
 *  - source columns 0-4 are stored to dst1 at byte offsets 128, 144, 160, 176, 192
 *  - source columns 3-7 are stored to dst2 at byte offsets 48, 64, 80, 96, 112
 * (columns 3 and 4 are therefore stored to both buffers).
 * Every destination row is written as two 4 byte halves: the part coming from source
 * rows 0-3 at the offsets above, the part coming from source rows 4-7 4 bytes later.
 *
 * Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2; eax and ebx are used as
 * row pointers (src + 1 row, src + 5 rows).
 * NOTE(review): the asm stores through dst1 / dst2 but has no output operands and no
 * "memory" clobber - confirm that no caller relies on the compiler knowing about
 * these stores.
2226 */
2227static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2228{
2229 asm(
2230 "leal (%0, %1), %%eax \n\t"
2231 "leal (%%eax, %1, 4), %%ebx \n\t"
2232// 0 1 2 3 4 5 6 7 8 9
2233// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2234 "movq (%0), %%mm0 \n\t" // 12345678
2235 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2236 "movq %%mm0, %%mm2 \n\t" // 12345678
2237 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2238 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2239
2240 "movq (%%eax, %1), %%mm1 \n\t" // source row 2
2241 "movq (%%eax, %1, 2), %%mm3 \n\t" // source row 3
2242 "movq %%mm1, %%mm4 \n\t"
2243 "punpcklbw %%mm3, %%mm1 \n\t"
2244 "punpckhbw %%mm3, %%mm4 \n\t"
2245
2246 "movq %%mm0, %%mm3 \n\t"
2247 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0 and 1 of source rows 0-3
2248 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2 and 3
2249 "movq %%mm2, %%mm1 \n\t"
2250 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4 and 5
2251 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6 and 7
2252
2253 "movd %%mm0, 128(%2) \n\t" // column 0 -> dst1
2254 "psrlq $32, %%mm0 \n\t"
2255 "movd %%mm0, 144(%2) \n\t" // column 1 -> dst1
2256 "movd %%mm3, 160(%2) \n\t" // column 2 -> dst1
2257 "psrlq $32, %%mm3 \n\t"
2258 "movd %%mm3, 176(%2) \n\t" // column 3 -> dst1
2259 "movd %%mm3, 48(%3) \n\t" // column 3 -> dst2
2260 "movd %%mm2, 192(%2) \n\t" // column 4 -> dst1
2261 "movd %%mm2, 64(%3) \n\t" // column 4 -> dst2
2262 "psrlq $32, %%mm2 \n\t"
2263 "movd %%mm2, 80(%3) \n\t" // column 5 -> dst2
2264 "movd %%mm1, 96(%3) \n\t" // column 6 -> dst2
2265 "psrlq $32, %%mm1 \n\t"
2266 "movd %%mm1, 112(%3) \n\t" // column 7 -> dst2
2267
// same again for source rows 4-7, stored 4 bytes further right
2268 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2269 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2270 "movq %%mm0, %%mm2 \n\t" // 12345678
2271 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2272 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2273
2274 "movq (%%ebx, %1), %%mm1 \n\t"
2275 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2276 "movq %%mm1, %%mm4 \n\t"
2277 "punpcklbw %%mm3, %%mm1 \n\t"
2278 "punpckhbw %%mm3, %%mm4 \n\t"
2279
2280 "movq %%mm0, %%mm3 \n\t"
2281 "punpcklwd %%mm1, %%mm0 \n\t"
2282 "punpckhwd %%mm1, %%mm3 \n\t"
2283 "movq %%mm2, %%mm1 \n\t"
2284 "punpcklwd %%mm4, %%mm2 \n\t"
2285 "punpckhwd %%mm4, %%mm1 \n\t"
2286
2287 "movd %%mm0, 132(%2) \n\t"
2288 "psrlq $32, %%mm0 \n\t"
2289 "movd %%mm0, 148(%2) \n\t"
2290 "movd %%mm3, 164(%2) \n\t"
2291 "psrlq $32, %%mm3 \n\t"
2292 "movd %%mm3, 180(%2) \n\t"
2293 "movd %%mm3, 52(%3) \n\t"
2294 "movd %%mm2, 196(%2) \n\t"
2295 "movd %%mm2, 68(%3) \n\t"
2296 "psrlq $32, %%mm2 \n\t"
2297 "movd %%mm2, 84(%3) \n\t"
2298 "movd %%mm1, 100(%3) \n\t"
2299 "psrlq $32, %%mm1 \n\t"
2300 "movd %%mm1, 116(%3) \n\t"
2301
2302
2303 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2304 : "%eax", "%ebx"
2305 );
2306}
2307
2308/**
2309 * transposes the given 8x8 block
 *
 * Reads 8 rows of 8 bytes from src, where consecutive rows are 16 bytes apart
 * (offsets 0, 16, ..., 112), and writes the transposed block to dst, where
 * consecutive rows are dstStride bytes apart.
 * Source rows 0-3 end up in bytes 0-3 of each destination row, source rows 4-7 in
 * bytes 4-7.
 *
 * Operands: %0 = dst, %1 = dstStride, %2 = src; eax and ebx are used as row
 * pointers (dst + 1 row, dst + 5 rows).
 * NOTE(review): the asm stores through dst but has no output operands and no
 * "memory" clobber - confirm that no caller relies on the compiler knowing about
 * these stores.
2310 */
2311static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2312{
2313 asm(
2314 "leal (%0, %1), %%eax \n\t"
2315 "leal (%%eax, %1, 4), %%ebx \n\t"
2316// 0 1 2 3 4 5 6 7 8 9
2317// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2318 "movq (%2), %%mm0 \n\t" // 12345678
2319 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2320 "movq %%mm0, %%mm2 \n\t" // 12345678
2321 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2322 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2323
2324 "movq 32(%2), %%mm1 \n\t" // source row 2
2325 "movq 48(%2), %%mm3 \n\t" // source row 3
2326 "movq %%mm1, %%mm4 \n\t"
2327 "punpcklbw %%mm3, %%mm1 \n\t"
2328 "punpckhbw %%mm3, %%mm4 \n\t"
2329
2330 "movq %%mm0, %%mm3 \n\t"
2331 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0 and 1 of source rows 0-3
2332 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2 and 3
2333 "movq %%mm2, %%mm1 \n\t"
2334 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4 and 5
2335 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6 and 7
2336
2337 "movd %%mm0, (%0) \n\t" // column 0 -> destination row 0
2338 "psrlq $32, %%mm0 \n\t"
2339 "movd %%mm0, (%%eax) \n\t" // column 1 -> destination row 1
2340 "movd %%mm3, (%%eax, %1) \n\t"
2341 "psrlq $32, %%mm3 \n\t"
2342 "movd %%mm3, (%%eax, %1, 2) \n\t"
2343 "movd %%mm2, (%0, %1, 4) \n\t"
2344 "psrlq $32, %%mm2 \n\t"
2345 "movd %%mm2, (%%ebx) \n\t"
2346 "movd %%mm1, (%%ebx, %1) \n\t"
2347 "psrlq $32, %%mm1 \n\t"
2348 "movd %%mm1, (%%ebx, %1, 2) \n\t" // column 7 -> destination row 7
2349
2350
// same again for source rows 4-7, stored to bytes 4-7 of the destination rows
2351 "movq 64(%2), %%mm0 \n\t" // 12345678
2352 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2353 "movq %%mm0, %%mm2 \n\t" // 12345678
2354 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2355 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2356
2357 "movq 96(%2), %%mm1 \n\t"
2358 "movq 112(%2), %%mm3 \n\t"
2359 "movq %%mm1, %%mm4 \n\t"
2360 "punpcklbw %%mm3, %%mm1 \n\t"
2361 "punpckhbw %%mm3, %%mm4 \n\t"
2362
2363 "movq %%mm0, %%mm3 \n\t"
2364 "punpcklwd %%mm1, %%mm0 \n\t"
2365 "punpckhwd %%mm1, %%mm3 \n\t"
2366 "movq %%mm2, %%mm1 \n\t"
2367 "punpcklwd %%mm4, %%mm2 \n\t"
2368 "punpckhwd %%mm4, %%mm1 \n\t"
2369
2370 "movd %%mm0, 4(%0) \n\t"
2371 "psrlq $32, %%mm0 \n\t"
2372 "movd %%mm0, 4(%%eax) \n\t"
2373 "movd %%mm3, 4(%%eax, %1) \n\t"
2374 "psrlq $32, %%mm3 \n\t"
2375 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2376 "movd %%mm2, 4(%0, %1, 4) \n\t"
2377 "psrlq $32, %%mm2 \n\t"
2378 "movd %%mm2, 4(%%ebx) \n\t"
2379 "movd %%mm1, 4(%%ebx, %1) \n\t"
2380 "psrlq $32, %%mm1 \n\t"
2381 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2382
2383 :: "r" (dst), "r" (dstStride), "r" (src)
2384 : "%eax", "%ebx"
2385 );
2386}
e5c30e06 2387#endif
be44a4d7 2388//static int test=0;
4e4dcbc5 2389
/**
 * Temporal noise reducer for one 8x8 block.
 * Compares the current block (src) with the temporally blurred reference block
 * (tempBlured), smooths the difference measure with the values stored for the
 * neighbouring blocks and then, depending on that measure, blends reference and
 * current block with one of four strengths. The result is written to src and to
 * tempBlured (in the "copy" case only tempBlured changes).
 *
 * @param src            current 8x8 block, rows are stride bytes apart
 * @param stride         distance in bytes between 2 rows; used for src and tempBlured
 * @param tempBlured     8x8 reference block; updated with the output
 * @param tempBluredPast difference measure of this block; the entries at -256, -1, +1
 *                       and +256 are read as the measures of the neighbouring blocks,
 *                       so all of them must be readable
 * @param maxNoise       the 3 thresholds, in increasing order of "noise"; only used by
 *                       the C version, the MMX2 / 3DNow version reads the global
 *                       maxTmpNoise instead
 *
 * Blend weights (reference : current) selected by the smoothed measure d:
 *   d <  maxNoise[0]               7 : 1
 *   maxNoise[0] <= d <= maxNoise[1] 3 : 1
 *   maxNoise[1] <  d <  maxNoise[2] 1 : 1
 *   d >= maxNoise[2]               0 : 1 (reference is replaced by the current block)
 *
 * NOTE(review): the two versions do not agree on every detail - the asm version stores
 * the smoothed measure into *tempBluredPast while the C version stores the unsmoothed
 * one, and the asm version treats d == maxTmpNoise[1] as "above" the threshold.
 * Confirm which behaviour is intended before unifying them.
 */
static inline void tempNoiseReducer(uint8_t *src, int stride,
				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
#define FAST_L2_DIFF
//#define L1_DIFF //u should change the thresholds too if u try that one
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%2, %2, 2), %%eax \n\t" // 3*stride
		"leal (%2, %2, 4), %%ebx \n\t" // 5*stride
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
//	0	1	2	3	4	5	6	7	8	9
//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+ebx	%x+2eax	%x+ecx	%x+8%2
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
		"movq (%0), %%mm0 \n\t" // L0
		"psadbw (%1), %%mm0 \n\t" // |L0-R0|
		"movq (%0, %2), %%mm1 \n\t" // L1
		"psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|

		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"paddw %%mm1, %%mm0 \n\t"
		"psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"paddw %%mm2, %%mm0 \n\t"
		"psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5|
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"paddw %%mm3, %%mm0 \n\t"
		"psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"paddw %%mm4, %%mm0 \n\t"
		"psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
		"paddw %%mm5, %%mm6 \n\t"
		"paddw %%mm7, %%mm6 \n\t"
		"paddw %%mm6, %%mm0 \n\t"
#elif defined (FAST_L2_DIFF)
		"pcmpeqb %%mm7, %%mm7 \n\t"
		"movq b80, %%mm6 \n\t"
		"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"pxor %%mm7, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm5)\
		"paddb %%mm6, %%mm5 \n\t"\
		"movq %%mm5, %%mm2 \n\t"\
		"psllw $8, %%mm5 \n\t"\
		"pmaddwd %%mm5, %%mm5 \n\t"\
		"pmaddwd %%mm2, %%mm2 \n\t"\
		"paddd %%mm2, %%mm5 \n\t"\
		"psrld $14, %%mm5 \n\t"\
		"paddd %%mm5, %%mm0 \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#else
		"pxor %%mm7, %%mm7 \n\t"
		"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"movq %%mm5, %%mm1 \n\t"\
		"movq %%mm2, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm5 \n\t"\
		"punpckhbw %%mm7, %%mm1 \n\t"\
		"punpcklbw %%mm7, %%mm2 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm2, %%mm5 \n\t"\
		"psubw %%mm3, %%mm1 \n\t"\
		"pmaddwd %%mm5, %%mm5 \n\t"\
		"pmaddwd %%mm1, %%mm1 \n\t"\
		"paddd %%mm1, %%mm5 \n\t"\
		"paddd %%mm5, %%mm0 \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#endif

		// ecx = ( 4*diff + the 4 neighbouring measures + 4 )>>3, stored back to (%3)
		"movq %%mm0, %%mm4 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddd %%mm0, %%mm4 \n\t"
		"movd %%mm4, %%ecx \n\t"
		"shll $2, %%ecx \n\t"
		"movl %3, %%ebx \n\t"
		"addl -4(%%ebx), %%ecx \n\t"
		"addl 4(%%ebx), %%ecx \n\t"
		"addl -1024(%%ebx), %%ecx \n\t"
		"addl $4, %%ecx \n\t"
		"addl 1024(%%ebx), %%ecx \n\t"
		"shrl $3, %%ecx \n\t"
		"movl %%ecx, (%%ebx) \n\t"
		"leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride

		"cmpl 4+maxTmpNoise, %%ecx \n\t"
		" jb 2f \n\t"
		"cmpl 8+maxTmpNoise, %%ecx \n\t"
		" jb 1f \n\t"

		// copy: reference = current
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"movq %%mm0, (%1) \n\t" // L0
		"movq %%mm1, (%1, %2) \n\t" // L1
		"movq %%mm2, (%1, %2, 2) \n\t" // L2
		"movq %%mm3, (%1, %%eax) \n\t" // L3
		"movq %%mm4, (%1, %2, 4) \n\t" // L4
		"movq %%mm5, (%1, %%ebx) \n\t" // L5
		"movq %%mm6, (%1, %%eax, 2) \n\t" // L6
		"movq %%mm7, (%1, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		"1: \n\t"
		// 1 : 1 blend
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"pavgb (%1), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"pavgb (%1, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"pavgb (%1, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"pavgb (%1, %%eax), %%mm3 \n\t" // L3
		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"pavgb (%1, %2, 4), %%mm4 \n\t" // L4
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"pavgb (%1, %%ebx), %%mm5 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"pavgb (%1, %%ecx), %%mm7 \n\t" // L7
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm4, (%1, %2, 4) \n\t" // R4
		"movq %%mm5, (%1, %%ebx) \n\t" // R5
		"movq %%mm6, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm7, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3
		"movq %%mm4, (%0, %2, 4) \n\t" // L4
		"movq %%mm5, (%0, %%ebx) \n\t" // L5
		"movq %%mm6, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm7, (%0, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		"2: \n\t"
		"cmpl maxTmpNoise, %%ecx \n\t"
		" jb 3f \n\t"

		// 3 : 1 blend (2 averaging steps with the reference)
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%1), %%mm4 \n\t" // R0
		"movq (%1, %2), %%mm5 \n\t" // R1
		"movq (%1, %2, 2), %%mm6 \n\t" // R2
		"movq (%1, %%eax), %%mm7 \n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3

		"movq (%0, %2, 4), %%mm0 \n\t" // L4
		"movq (%0, %%ebx), %%mm1 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
		"movq (%0, %%ecx), %%mm3 \n\t" // L7
		"movq (%1, %2, 4), %%mm4 \n\t" // R4
		"movq (%1, %%ebx), %%mm5 \n\t" // R5
		"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
		"movq (%1, %%ecx), %%mm7 \n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4) \n\t" // R4
		"movq %%mm1, (%1, %%ebx) \n\t" // R5
		"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm3, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0, %2, 4) \n\t" // L4
		"movq %%mm1, (%0, %%ebx) \n\t" // L5
		"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm3, (%0, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		"3: \n\t"
		// 7 : 1 blend (3 averaging steps with the reference)
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%1), %%mm4 \n\t" // R0
		"movq (%1, %2), %%mm5 \n\t" // R1
		"movq (%1, %2, 2), %%mm6 \n\t" // R2
		"movq (%1, %%eax), %%mm7 \n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3

		"movq (%0, %2, 4), %%mm0 \n\t" // L4
		"movq (%0, %%ebx), %%mm1 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
		"movq (%0, %%ecx), %%mm3 \n\t" // L7
		"movq (%1, %2, 4), %%mm4 \n\t" // R4
		"movq (%1, %%ebx), %%mm5 \n\t" // R5
		"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
		"movq (%1, %%ecx), %%mm7 \n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4) \n\t" // R4
		"movq %%mm1, (%1, %%ebx) \n\t" // R5
		"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm3, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0, %2, 4) \n\t" // L4
		"movq %%mm1, (%0, %%ebx) \n\t" // L5
		"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm3, (%0, %%ecx) \n\t" // L7

		"4: \n\t"

		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
		: "%eax", "%ebx", "%ecx", "memory"
	);
#else
	int x, y;
	int d= 0;		// sum of the squared differences of the block
	int rawDiff;
	int refWeight;		// weight of the reference pixel, the current one has weight 1
	int shift;		// log2(refWeight + 1)
	int rounding;

	for(y=0; y<8; y++)
	{
		const uint8_t *refLine= tempBlured + y*stride;
		const uint8_t *curLine= src + y*stride;
		for(x=0; x<8; x++)
		{
			const int diff= refLine[x] - curLine[x];
			d+= diff*diff;
		}
	}

	// smooth the measure with the ones of the 4 neighbouring blocks, but
	// remember the unsmoothed one for this block
	rawDiff= d;
	d=	(
		4*d
		+tempBluredPast[-256]
		+tempBluredPast[-1] + tempBluredPast[1]
		+tempBluredPast[256]
		+4)>>3;
	*tempBluredPast= rawDiff;

	if(d > maxNoise[1])
	{
		if(d >= maxNoise[2])
		{
			// too different: restart the temporal filter from the current block
			for(y=0; y<8; y++)
				for(x=0; x<8; x++)
					tempBlured[ x + y*stride ]= src[ x + y*stride ];
			return;
		}
		refWeight= 1; shift= 1;
	}
	else if(d < maxNoise[0])
	{
		refWeight= 7; shift= 3;
	}
	else
	{
		refWeight= 3; shift= 2;
	}
	rounding= 1<<(shift-1);

	for(y=0; y<8; y++)
	{
		uint8_t *refLine= tempBlured + y*stride;
		uint8_t *curLine= src + y*stride;
		for(x=0; x<8; x++)
		{
			const int blended= (refLine[x]*refWeight + curLine[x] + rounding)>>shift;
			curLine[x]= blended;
			refLine[x]= blended;
		}
	}
#endif
}
2788
9a722af7
A
// The OpenDivX postprocess header and the use_old_pp flag are only compiled in when
// HAVE_ODIVX_POSTPROCESS is defined.
2789#ifdef HAVE_ODIVX_POSTPROCESS
2790#include "../opendivx/postprocess.h"
2791int use_old_pp=0; // NOTE(review): presumably selects the OpenDivX postprocessor when non zero - confirm where it is read
2792#endif
13e00528 2793
// Forward declaration only; the definition is not part of this section of the file.
9a722af7 2794static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 2795 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
13e00528 2796
911879d1
MN
2797/* -pp Command line Help
2798NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2799
2800-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2801
2802long form example:
2803-pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2804short form example:
2805-pp vb:a,hb:a,lb -pp de,-vb
117e45b0
MN
2806more examples:
2807-pp tn:64:128:256
911879d1
MN
2808
2809Filters Options
2810short long name short long option Description
2811* * a autoq cpu power dependent enabler
2812 c chrom chrominance filtering enabled
2813 y nochrom chrominance filtering disabled
2814hb hdeblock horizontal deblocking filter
2815vb vdeblock vertical deblocking filter
2816vr rkvdeblock
2817h1 x1hdeblock Experimental horizontal deblock filter 1
2818v1 x1vdeblock Experimental vertical deblock filter 1
2819dr dering not implemented yet
2820al autolevels automatic brightness / contrast fixer
2821 f fullyrange stretch luminance range to (0..255)
2822lb linblenddeint linear blend deinterlacer
2823li linipoldeint linear interpolating deinterlacer
2824ci cubicipoldeint cubic interpolating deinterlacer
2825md mediandeint median deinterlacer
2826de default hdeblock:a,vdeblock:a,dering:a,autolevels
2827fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
117e45b0 2828tn tmpnoise (3 Thresholds) Temporal Noise Reducer
911879d1
MN
2829*/
2830
2831/**
2832 * returns a PPMode struct which will have a non 0 error variable if an error occurred
2833 * name is the string after "-pp" on the command line
2834 * quality is a number from 0 to GET_PP_QUALITY_MAX
2835 */
2836struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2837{
2838 char temp[GET_MODE_BUFFER_SIZE];
2839 char *p= temp;
2840 char *filterDelimiters= ",";
2841 char *optionDelimiters= ":";
117e45b0 2842 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
911879d1
MN
2843 char *filterToken;
2844
2845 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2846
117e45b0
MN
2847 printf("%s\n", name);
2848
911879d1 2849 for(;;){
911879d1 2850 char *filterName;
117e45b0 2851 int q= 1000000; //GET_PP_QUALITY_MAX;
911879d1
MN
2852 int chrom=-1;
2853 char *option;
2854 char *options[OPTIONS_ARRAY_SIZE];
2855 int i;
2856 int filterNameOk=0;
2857 int numOfUnknownOptions=0;
2858 int enable=1; //does the user want us to enabled or disabled the filter
2859
2860 filterToken= strtok(p, filterDelimiters);
2861 if(filterToken == NULL) break;
117e45b0 2862 p+= strlen(filterToken) + 1; // p points to next filterToken
911879d1
MN
2863 filterName= strtok(filterToken, optionDelimiters);
2864 printf("%s::%s\n", filterToken, filterName);
2865
2866 if(*filterName == '-')
2867 {
2868 enable=0;
2869 filterName++;
2870 }
117e45b0 2871
911879d1
MN
2872 for(;;){ //for all options
2873 option= strtok(NULL, optionDelimiters);
2874 if(option == NULL) break;
2875
2876 printf("%s\n", option);
2877 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2878 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2879 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2880 else
2881 {
2882 options[numOfUnknownOptions] = option;
2883 numOfUnknownOptions++;
911879d1
MN
2884 }
2885 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2886 }
117e45b0 2887 options[numOfUnknownOptions] = NULL;
911879d1
MN
2888
2889 /* replace stuff from the replace Table */
2890 for(i=0; replaceTable[2*i]!=NULL; i++)
2891 {
2892 if(!strcmp(replaceTable[2*i], filterName))
2893 {
2894 int newlen= strlen(replaceTable[2*i + 1]);
2895 int plen;
2896 int spaceLeft;
2897
2898 if(p==NULL) p= temp, *p=0; //last filter
2899 else p--, *p=','; //not last filter
2900
2901 plen= strlen(p);
2902 spaceLeft= (int)p - (int)temp + plen;
2903 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
2904 {
2905 ppMode.error++;
2906 break;
2907 }
2908 memmove(p + newlen, p, plen+1);
2909 memcpy(p, replaceTable[2*i + 1], newlen);
2910 filterNameOk=1;
2911 }
2912 }
2913
2914 for(i=0; filters[i].shortName!=NULL; i++)
2915 {
117e45b0 2916// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
911879d1
MN
2917 if( !strcmp(filters[i].longName, filterName)
2918 || !strcmp(filters[i].shortName, filterName))
2919 {
2920 ppMode.lumMode &= ~filters[i].mask;
2921 ppMode.chromMode &= ~filters[i].mask;
2922
2923 filterNameOk=1;
2924 if(!enable) break; // user wants to disable it
2925
2926 if(q >= filters[i].minLumQuality)
2927 ppMode.lumMode|= filters[i].mask;
2928 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2929 if(q >= filters[i].minChromQuality)
2930 ppMode.chromMode|= filters[i].mask;
2931
2932 if(filters[i].mask == LEVEL_FIX)
2933 {
2934 int o;
2935 ppMode.minAllowedY= 16;
2936 ppMode.maxAllowedY= 234;
2937 for(o=0; options[o]!=NULL; o++)
2938 if( !strcmp(options[o],"fullyrange")
2939 ||!strcmp(options[o],"f"))
2940 {
2941 ppMode.minAllowedY= 0;
2942 ppMode.maxAllowedY= 255;
2943 numOfUnknownOptions--;
2944 }
2945 }
117e45b0
MN
2946 else if(filters[i].mask == TEMP_NOISE_FILTER)
2947 {
2948 int o;
2949 int numOfNoises=0;
2950 ppMode.maxTmpNoise[0]= 150;
2951 ppMode.maxTmpNoise[1]= 200;
2952 ppMode.maxTmpNoise[2]= 400;
2953
2954 for(o=0; options[o]!=NULL; o++)
2955 {
2956 char *tail;
2957 ppMode.maxTmpNoise[numOfNoises]=
2958 strtol(options[o], &tail, 0);
2959 if(tail!=options[o])
2960 {
2961 numOfNoises++;
2962 numOfUnknownOptions--;
2963 if(numOfNoises >= 3) break;
2964 }
2965 }
2966 }
911879d1
MN
2967 }
2968 }
2969 if(!filterNameOk) ppMode.error++;
2970 ppMode.error += numOfUnknownOptions;
2971 }
2972
815cbfe7 2973#ifdef HAVE_ODIVX_POSTPROCESS
911879d1
MN
2974 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2975 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2976 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2977 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2978 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2979 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
815cbfe7 2980#endif
911879d1
MN
2981
2982 return ppMode;
2983}
2984
3057fa66 2985/**
117e45b0 2986 * Obsolete, dont use it, use postprocess2() instead
3057fa66 2987 */
3057fa66
A
2988void postprocess(unsigned char * src[], int src_stride,
2989 unsigned char * dst[], int dst_stride,
2990 int horizontal_size, int vertical_size,
2991 QP_STORE_T *QP_store, int QP_stride,
2992 int mode)
2993{
117e45b0
MN
2994 struct PPMode ppMode;
2995 static QP_STORE_T zeroArray[2048/8];
911879d1
MN
2996/*
2997 static int qual=0;
2998
117e45b0
MN
2999 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
3000 printf("OK\n");
911879d1
MN
3001 qual++;
3002 qual%=7;
117e45b0
MN
3003 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
3004 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
911879d1
MN
3005 postprocess2(src, src_stride, dst, dst_stride,
3006 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
3007
3008 return;
3009*/
815cbfe7
MN
3010 if(QP_store==NULL)
3011 {
3012 QP_store= zeroArray;
3013 QP_stride= 0;
3014 }
13e00528 3015
117e45b0
MN
3016 ppMode.lumMode= mode;
3017 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
3018 ppMode.chromMode= mode;
be44a4d7
MN
3019 ppMode.maxTmpNoise[0]= 700;
3020 ppMode.maxTmpNoise[1]= 1500;
3021 ppMode.maxTmpNoise[2]= 3000;
117e45b0 3022
9a722af7
A
3023#ifdef HAVE_ODIVX_POSTPROCESS
3024// Note: I could make this shit outside of this file, but it would mean one
3025// more function call...
3026 if(use_old_pp){
3027 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
3028 return;
3029 }
3030#endif
3031
13e00528 3032 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 3033 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
3057fa66
A
3034
3035 horizontal_size >>= 1;
3036 vertical_size >>= 1;
3037 src_stride >>= 1;
3038 dst_stride >>= 1;
3039
4e1349d4 3040 if(ppMode.chromMode)
3057fa66 3041 {
13e00528 3042 postProcess(src[1], src_stride, dst[1], dst_stride,
117e45b0 3043 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
13e00528 3044 postProcess(src[2], src_stride, dst[2], dst_stride,
117e45b0 3045 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
3057fa66 3046 }
4e1349d4
MN
3047 else if(src_stride == dst_stride)
3048 {
3049 memcpy(dst[1], src[1], src_stride*vertical_size);
3050 memcpy(dst[2], src[2], src_stride*vertical_size);
3051 }
3057fa66
A
3052 else
3053 {
4e1349d4
MN
3054 int y;
3055 for(y=0; y<vertical_size; y++)
3056 {
3057 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
3058 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
3059 }
3060 }
3061
3062#if 0
117e45b0
MN
3063 memset(dst[1], 128, dst_stride*vertical_size);
3064 memset(dst[2], 128, dst_stride*vertical_size);
4e1349d4 3065#endif
3057fa66 3066}
9a722af7 3067
911879d1
MN
3068void postprocess2(unsigned char * src[], int src_stride,
3069 unsigned char * dst[], int dst_stride,
3070 int horizontal_size, int vertical_size,
3071 QP_STORE_T *QP_store, int QP_stride,
3072 struct PPMode *mode)
3073{
3074
815cbfe7
MN
3075 static QP_STORE_T zeroArray[2048/8];
3076 if(QP_store==NULL)
3077 {
3078 QP_store= zeroArray;
3079 QP_stride= 0;
3080 }
3081
911879d1
MN
3082#ifdef HAVE_ODIVX_POSTPROCESS
3083// Note: I could make this shit outside of this file, but it would mean one
3084// more function call...
3085 if(use_old_pp){
3086 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
3087 mode->oldMode);
3088 return;
3089 }
3090#endif
3091
3092 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 3093 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
911879d1
MN
3094
3095 horizontal_size >>= 1;
3096 vertical_size >>= 1;
3097 src_stride >>= 1;
3098 dst_stride >>= 1;
3099
4e1349d4
MN
3100 if(mode->chromMode)
3101 {
3102 postProcess(src[1], src_stride, dst[1], dst_stride,
3103 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3104 postProcess(src[2], src_stride, dst[2], dst_stride,
3105 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3106 }
3107 else if(src_stride == dst_stride)
3108 {
3109 memcpy(dst[1], src[1], src_stride*vertical_size);
3110 memcpy(dst[2], src[2], src_stride*vertical_size);
3111 }
3112 else
3113 {
3114 int y;
3115 for(y=0; y<vertical_size; y++)
3116 {
3117 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
3118 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
3119 }
3120 }
911879d1
MN
3121}
3122
3123
13e00528
A
3124/**
3125 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
9a722af7 3126 * 0 <= quality <= 6
13e00528 3127 */
9a722af7
A
3128int getPpModeForQuality(int quality){
3129 int modes[1+GET_PP_QUALITY_MAX]= {
3130 0,
3131#if 1
3132 // horizontal filters first
3133 LUM_H_DEBLOCK,
3134 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3135 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3136 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3137 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3138 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3139#else
3140 // vertical filters first
13e00528
A
3141 LUM_V_DEBLOCK,
3142 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3143 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3144 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3145 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3146 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
9a722af7
A
3147#endif
3148 };
3149
3150#ifdef HAVE_ODIVX_POSTPROCESS
3151 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3152 0,
3153 PP_DEBLOCK_Y_H,
3154 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3155 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3156 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3157 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3158 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3159 };
3160 if(use_old_pp) return odivx_modes[quality];
3161#endif
3162 return modes[quality];
3057fa66
A
3163}
3164
3165/**
3166 * Copies a block from src to dst and fixes the blacklevel
d5a1a995
MN
3167 * numLines must be a multiple of 4
3168 * levelFix == 0 -> dont touch the brighness & contrast
3057fa66 3169 */
d5a1a995 3170static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
c09dc465 3171 int levelFix)
3057fa66 3172{
e5c30e06 3173#ifndef HAVE_MMX
d5a1a995 3174 int i;
e5c30e06 3175#endif
d5a1a995
MN
3176 if(levelFix)
3177 {
3057fa66
A
3178#ifdef HAVE_MMX
3179 asm volatile(
043ba56f
MN
3180 "leal (%0,%2), %%eax \n\t"
3181 "leal (%1,%3), %%ebx \n\t"
3057fa66
A
3182 "movq packedYOffset, %%mm2 \n\t"
3183 "movq packedYScale, %%mm3 \n\t"
5b65f0df 3184 "pxor %%mm4, %%mm4 \n\t"
3057fa66 3185
043ba56f
MN
3186#define SCALED_CPY(src1, src2, dst1, dst2) \
3187 "movq " #src1 ", %%mm0 \n\t"\
3188 "movq " #src1 ", %%mm5 \n\t"\
5b65f0df
MN
3189 "punpcklbw %%mm4, %%mm0 \n\t"\
3190 "punpckhbw %%mm4, %%mm5 \n\t"\
57d04d3f
MN
3191 "psubw %%mm2, %%mm0 \n\t"\
3192 "psubw %%mm2, %%mm5 \n\t"\
043ba56f 3193 "movq " #src2 ", %%mm1 \n\t"\
57d04d3f
MN
3194 "psllw $6, %%mm0 \n\t"\
3195 "psllw $6, %%mm5 \n\t"\
5b65f0df 3196 "pmulhw %%mm3, %%mm0 \n\t"\
043ba56f 3197 "movq " #src2 ", %%mm6 \n\t"\
5b65f0df 3198 "pmulhw %%mm3, %%mm5 \n\t"\
5b65f0df 3199 "punpcklbw %%mm4, %%mm1 \n\t"\
534a602d 3200 "punpckhbw %%mm4, %%mm6 \n\t"\
57d04d3f 3201 "psubw %%mm2, %%mm1 \n\t"\
534a602d 3202 "psubw %%mm2, %%mm6 \n\t"\
57d04d3f 3203 "psllw $6, %%mm1 \n\t"\
534a602d 3204 "psllw $6, %%mm6 \n\t"\
5b65f0df 3205 "pmulhw %%mm3, %%mm1 \n\t"\
534a602d 3206 "pmulhw %%mm3, %%mm6 \n\t"\
534a602d
MN
3207 "packuswb %%mm5, %%mm0 \n\t"\
3208 "packuswb %%mm6, %%mm1 \n\t"\
043ba56f
MN
3209 "movq %%mm0, " #dst1 " \n\t"\
3210 "movq %%mm1, " #dst2 " \n\t"\
3211
3212SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3213SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3214SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3215 "leal (%%eax,%2,4), %%eax \n\t"
3216 "leal (%%ebx,%3,4), %%ebx \n\t"
3217SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3218
3219
3220 : : "r"(src),
3221 "r"(dst),
3222 "r" (srcStride),
534a602d 3223 "r" (dstStride)
d5a1a995
MN
3224 : "%eax", "%ebx"
3225 );
3226#else
c09dc465 3227 for(i=0; i<8; i++)
d5a1a995
MN
3228 memcpy( &(dst[dstStride*i]),
3229 &(src[srcStride*i]), BLOCK_SIZE);
3230#endif
3231 }
3232 else
3233 {
3234#ifdef HAVE_MMX
3235 asm volatile(
043ba56f
MN
3236 "leal (%0,%2), %%eax \n\t"
3237 "leal (%1,%3), %%ebx \n\t"
3238
3239#define SIMPLE_CPY(src1, src2, dst1, dst2) \
3240 "movq " #src1 ", %%mm0 \n\t"\
3241 "movq " #src2 ", %%mm1 \n\t"\
3242 "movq %%mm0, " #dst1 " \n\t"\
3243 "movq %%mm1, " #dst2 " \n\t"\
3244
3245SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3246SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3247SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3248 "leal (%%eax,%2,4), %%eax \n\t"
3249 "leal (%%ebx,%3,4), %%ebx \n\t"
3250SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3251
3057fa66
A
3252 : : "r" (src),
3253 "r" (dst),
3254 "r" (srcStride),
c09dc465 3255 "r" (dstStride)
3057fa66
A
3256 : "%eax", "%ebx"
3257 );
3258#else
c09dc465 3259 for(i=0; i<8; i++)
3057fa66
A
3260 memcpy( &(dst[dstStride*i]),
3261 &(src[srcStride*i]), BLOCK_SIZE);
3262#endif
d5a1a995 3263 }
3057fa66
A
3264}
3265
3266
3267/**
3268 * Filters array of bytes (Y or U or V values)
3269 */
9a722af7 3270static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 3271 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3057fa66 3272{
d5a1a995 3273 int x,y;
117e45b0
MN
3274 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3275
d5a1a995
MN
3276