1% speedup
[libav.git] / postproc / postprocess_template.c
CommitLineData
3057fa66
A
1/*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
7f16f6e6 24doVertDefFilter Ec Ec e e
3057fa66 25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
7f16f6e6 28doHorizDefFilter Ec Ec e e
2e212618 29deRing E e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
117e45b0
MN
32Vertical X1# a E E
33Horizontal X1# a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
117e45b0 37MedianDeinterlace# Ec Ec
be44a4d7 38TempDeNoiser# E e e
d5a1a995 39
117e45b0
MN
* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
3057fa66 42E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
3057fa66
A
44a = alternative / approximate impl
45c = checked against the other implementations (-vo md5)
46*/
47
48/*
49TODO:
verify that everything works as it should (how?)
3057fa66 51reduce the time wasted on the mem transfer
13e00528 52implement everything in C at least (done at the moment but ...)
3057fa66
A
53unroll stuff if instructions depend too much on the prior one
54we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55move YScale thing to the end instead of fixing QP
13e00528 56write a faster and higher quality deblocking filter :)
d5a1a995
MN
57make the mainloop more flexible (variable number of blocks at once
58 (the if/else stuff per block is slowing things down)
9f45d04d 59compare the quality & speed of all filters
9f45d04d 60split this huge file
e5c30e06 61border remover
8405b3fd 62optimize c versions
117e45b0 63try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
be44a4d7 64smart blur
c09dc465 65commandline option for the deblock thresholds
3057fa66 66...
13e00528
A
67*/
68
a6be8111 69//Changelog: use the CVS log
3057fa66 70
6c426cff 71#include "../config.h"
3057fa66
A
72#include <inttypes.h>
73#include <stdio.h>
d5a1a995 74#include <stdlib.h>
911879d1 75#include <string.h>
dda87e9f
PL
76#ifdef HAVE_MALLOC_H
77#include <malloc.h>
78#endif
3057fa66 79//#undef HAVE_MMX2
13e00528 80//#define HAVE_3DNOW
3057fa66 81//#undef HAVE_MMX
7f16f6e6 82//#define DEBUG_BRIGHTNESS
13e00528 83#include "postprocess.h"
3057fa66 84
e939e1c3
A
/*
 * Small arithmetic helper macros.
 * Every argument may be expanded (and therefore evaluated) more than once,
 * so never pass an expression with side effects.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
/* NOTE: SIGN(0) evaluates to -1, not 0 */
#define SIGN(a) ((a) > 0 ? 1 : -1)
89
/*
 * Inline-asm text generators. Each macro expands to a string literal that is
 * pasted into an asm() statement; the arguments are stringified verbatim.
 */

/* PAVGB(a,b): packed byte average, result ends up in the second operand (b) */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(x,y,t): unsigned byte minimum, result ends up in the second operand.
 * The plain MMX fallback names its parameters (b,a,t) so that "a" is still the
 * second (destination) operand: t = a -sat b; a -= t  ==>  a = MIN(a,b).
 * t is a scratch register and is only used by the fallback.
 */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
	"movq " #a ", " #t " \n\t"\
	"psubusb " #b ", " #t " \n\t"\
	"psubb " #t ", " #a " \n\t"
#endif

/*
 * PMAXUB(a,b): unsigned byte maximum, result ends up in b.
 * MMX fallback: b = b -sat a; b += a  ==>  b = MAX(a,b).
 */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
112
113
911879d1
MN
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10

/*
 * File-scope constants and scratch variables.
 * In the MMX build they are 8-byte aligned and referenced by symbol name from
 * the inline asm in this file (e.g. "movq b7E, %%mm7"), so they must keep
 * their names. Naming convention visible from the values:
 *   wXX  = the 16-bit value XX replicated in all 4 words
 *   bXX  = the byte XX replicated in all 8 bytes
 *   bmNNNNNNNN = byte mask, one digit per byte (1 -> 0xFF), most significant first
 */
#ifdef HAVE_MMX
static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
static uint64_t __attribute__((aligned(8))) pQPb=0;	// QP replicated per byte (see the "QP,..., QP" asm comments)
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
static uint64_t packedYOffset= 0x0000000000000000LL;
static uint64_t packedYScale= 0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif
3057fa66
A
162
/*
 * Flatness thresholds: vFlatnessThreshold is the number of "nearly equal"
 * neighbouring pixel pairs isVertDC() must exceed to call a block flat.
 * hFlatnessThreshold is presumably the horizontal counterpart (its user is
 * not in this part of the file).
 */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=234;
int minAllowedY=16;
3057fa66 171
911879d1
MN
172static struct PPFilter filters[]=
173{
174 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
175 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
176 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
177 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
178 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
179 {"dr", "dering", 1, 5, 6, DERING},
180 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
181 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
117e45b0 185 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
911879d1
MN
186 {NULL, NULL,0,0,0,0} //End Marker
187};
188
/*
 * Alias table stored as consecutive (name, expansion) string pairs and
 * terminated by a single NULL. "de" carries the same expansion as "default"
 * and "fa" the same as "fast".
 */
static char *replaceTable[]=
{
	"default",	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"de",		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fast",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	"fa",		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
	NULL //End Marker
};
197
#ifdef HAVE_MMX
/**
 * Dummy function that references every file-scope MMX constant from C.
 * Most of them are only used by name inside asm strings, which the compiler
 * cannot see, so without this it would report them as unused.
 */
static inline void unusedVariableWarningFixer()
{
	if(
	   packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
	 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
	 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
	 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
	 + temp5 + pQPb== 0) b00=0;
}
#endif
e5c30e06 209
#ifdef TIMING
/**
 * Read the CPU time stamp counter (only built when TIMING is defined).
 * The "=A" constraint returns the edx:eax register pair as one 64-bit value
 * on 32-bit x86, which is what the rest of the asm in this file targets.
 */
static inline long long rdtsc()
{
	long long l;
	asm volatile(	"rdtsc\n\t"
		: "=A" (l)
	);
//	printf("%d\n", int(l/1000));
	return l;
}
#endif
3057fa66 221
#ifdef HAVE_MMX2
/*
 * Thin wrappers around the four prefetch instructions; each one issues the
 * instruction named in its asm string for the address p and does nothing else.
 */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
3057fa66
A
251
252//FIXME? |255-0| = 1 (shouldnt be a problem ...)
253/**
acced553 254 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 255 */
d5a1a995 256static inline int isVertDC(uint8_t src[], int stride){
3057fa66 257 int numEq= 0;
e5c30e06 258#ifndef HAVE_MMX
d5a1a995 259 int y;
e5c30e06 260#endif
acced553 261 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 262#ifdef HAVE_MMX
37da00fc
MN
263asm volatile(
264 "leal (%1, %2), %%eax \n\t"
265 "leal (%%eax, %2, 4), %%ebx \n\t"
266// 0 1 2 3 4 5 6 7 8 9
267// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
268 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
269 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
270 "movq (%1), %%mm0 \n\t"
37da00fc 271 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
272 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
273 "paddb %%mm7, %%mm0 \n\t"
274 "pcmpgtb %%mm6, %%mm0 \n\t"
275
37da00fc 276 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
277 "psubb %%mm2, %%mm1 \n\t"
278 "paddb %%mm7, %%mm1 \n\t"
279 "pcmpgtb %%mm6, %%mm1 \n\t"
280 "paddb %%mm1, %%mm0 \n\t"
281
37da00fc 282 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
283 "psubb %%mm1, %%mm2 \n\t"
284 "paddb %%mm7, %%mm2 \n\t"
285 "pcmpgtb %%mm6, %%mm2 \n\t"
286 "paddb %%mm2, %%mm0 \n\t"
287
37da00fc 288 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
289 "psubb %%mm2, %%mm1 \n\t"
290 "paddb %%mm7, %%mm1 \n\t"
291 "pcmpgtb %%mm6, %%mm1 \n\t"
292 "paddb %%mm1, %%mm0 \n\t"
293
37da00fc 294 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
295 "psubb %%mm1, %%mm2 \n\t"
296 "paddb %%mm7, %%mm2 \n\t"
297 "pcmpgtb %%mm6, %%mm2 \n\t"
298 "paddb %%mm2, %%mm0 \n\t"
299
37da00fc 300 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
301 "psubb %%mm2, %%mm1 \n\t"
302 "paddb %%mm7, %%mm1 \n\t"
303 "pcmpgtb %%mm6, %%mm1 \n\t"
304 "paddb %%mm1, %%mm0 \n\t"
305
37da00fc 306 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
307 "psubb %%mm1, %%mm2 \n\t"
308 "paddb %%mm7, %%mm2 \n\t"
309 "pcmpgtb %%mm6, %%mm2 \n\t"
310 "paddb %%mm2, %%mm0 \n\t"
311
312 " \n\t"
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlw $8, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
e5c30e06
MN
316#ifdef HAVE_MMX2
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
318 "paddb %%mm1, %%mm0 \n\t"
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
320#else
3057fa66
A
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
e5c30e06 326#endif
3057fa66 327 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
328 "movd %%mm0, %0 \n\t"
329 : "=r" (numEq)
330 : "r" (src), "r" (stride)
4e4dcbc5 331 : "%eax", "%ebx"
3057fa66 332 );
3057fa66 333
37da00fc 334 numEq= (256 - numEq) &0xFF;
3057fa66
A
335
336#else
d5a1a995 337 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
338 {
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
346 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
347 src+= stride;
348 }
349#endif
350/* if(abs(numEq - asmEq) > 0)
351 {
352 printf("\nasm:%d c:%d\n", asmEq, numEq);
353 for(int y=0; y<8; y++)
354 {
355 for(int x=0; x<8; x++)
356 {
357 printf("%d ", temp[x + y*stride]);
358 }
359 printf("\n");
360 }
361 }
362*/
d5a1a995
MN
363// for(int i=0; i<numEq/8; i++) src[i]=255;
364 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
365}
366
d5a1a995 367static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
368{
369#ifdef HAVE_MMX
370 int isOk;
acced553 371 src+= stride*3;
3057fa66
A
372 asm volatile(
373// "int $3 \n\t"
374 "movq (%1, %2), %%mm0 \n\t"
375 "movq (%1, %2, 8), %%mm1 \n\t"
376 "movq %%mm0, %%mm2 \n\t"
377 "psubusb %%mm1, %%mm0 \n\t"
378 "psubusb %%mm2, %%mm1 \n\t"
379 "por %%mm1, %%mm0 \n\t" // ABS Diff
380
381 "movq pQPb, %%mm7 \n\t" // QP,..., QP
382 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
383 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
384 "pcmpeqd b00, %%mm0 \n\t"
385 "psrlq $16, %%mm0 \n\t"
386 "pcmpeqd bFF, %%mm0 \n\t"
387// "movd %%mm0, (%1, %2, 4)\n\t"
388 "movd %%mm0, %0 \n\t"
389 : "=r" (isOk)
390 : "r" (src), "r" (stride)
391 );
ac0b0b2f 392 return isOk;
3057fa66
A
393#else
394
d5a1a995
MN
395 int isOk2= 1;
396 int x;
acced553 397 src+= stride*3;
d5a1a995 398 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 399 {
d5a1a995 400 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
401 }
402/* if(isOk && !isOk2 || !isOk && isOk2)
403 {
404 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
405 for(int y=0; y<9; y++)
406 {
407 for(int x=0; x<8; x++)
408 {
409 printf("%d ", src[x + y*stride]);
410 }
411 printf("\n");
412 }
413 } */
414
415 return isOk2;
416#endif
417
418}
419
420/**
acced553 421 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 422 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66
A
423 */
424static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425{
13e00528 426#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 427 src+= stride*3;
3057fa66 428 asm volatile( //"movv %0 %1 %2\n\t"
3057fa66 429 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
430
431 "movq (%0), %%mm6 \n\t"
432 "movq (%0, %1), %%mm5 \n\t"
433 "movq %%mm5, %%mm1 \n\t"
434 "movq %%mm6, %%mm2 \n\t"
435 "psubusb %%mm6, %%mm5 \n\t"
436 "psubusb %%mm1, %%mm2 \n\t"
437 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
438 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
439 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
440
441 "pand %%mm2, %%mm6 \n\t"
442 "pandn %%mm1, %%mm2 \n\t"
443 "por %%mm2, %%mm6 \n\t"// First Line to Filter
444
445 "movq (%0, %1, 8), %%mm5 \n\t"
446 "leal (%0, %1, 4), %%eax \n\t"
447 "leal (%0, %1, 8), %%ebx \n\t"
448 "subl %1, %%ebx \n\t"
449 "addl %1, %0 \n\t" // %0 points to line 1 not 0
450 "movq (%0, %1, 8), %%mm7 \n\t"
451 "movq %%mm5, %%mm1 \n\t"
452 "movq %%mm7, %%mm2 \n\t"
453 "psubusb %%mm7, %%mm5 \n\t"
454 "psubusb %%mm1, %%mm2 \n\t"
455 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
456 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
457 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
458
459 "pand %%mm2, %%mm7 \n\t"
460 "pandn %%mm1, %%mm2 \n\t"
461 "por %%mm2, %%mm7 \n\t" // First Line to Filter
462
463
464 // 1 2 3 4 5 6 7 8
465 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
466 // 6 4 2 2 1 1
467 // 6 4 4 2
468 // 6 8 2
acced553 469
3057fa66
A
470 "movq (%0, %1), %%mm0 \n\t" // 1
471 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
472 PAVGB(%%mm6, %%mm0) //1 1 /2
473 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
474
475 "movq (%0, %1, 4), %%mm2 \n\t" // 1
476 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
477 PAVGB((%%eax), %%mm2) // 11 /2
478 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
479 "movq %%mm2, %%mm3 \n\t" // 211 /4
480 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
481 PAVGB(%%mm4, %%mm3) // 4 211 /8
482 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
483 "movq %%mm3, (%0) \n\t" // X
484 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
485 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 486 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 487 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
488 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
489 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
490 PAVGB((%%eax), %%mm5) // 211 /4
491 PAVGB(%%mm5, %%mm3) // 2 2211 /8
492 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
493 "movq %%mm3, (%0,%1) \n\t" // X
494 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 495 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 496 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 497 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 498 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
499 PAVGB(%%mm1, %%mm0) // 2 11/4
500 PAVGB(%%mm6, %%mm0) //222 11/8
501 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
502 "movq (%0, %1, 2), %%mm2 \n\t" // 1
503 "movq %%mm0, (%0, %1, 2) \n\t" // X
504 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
505 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
506 PAVGB((%%ebx), %%mm0) // 11 /2
507 PAVGB(%%mm0, %%mm6) //11 11 /4
508 PAVGB(%%mm1, %%mm4) // 11 /2
509 PAVGB(%%mm2, %%mm1) // 11 /2
510 PAVGB(%%mm1, %%mm6) //1122 11 /8
511 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
512 "movq (%%eax), %%mm5 \n\t" // 1
513 "movq %%mm6, (%%eax) \n\t" // X
514 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
515 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
516 PAVGB(%%mm7, %%mm6) // 11 /2
517 PAVGB(%%mm4, %%mm6) // 11 11 /4
518 PAVGB(%%mm3, %%mm6) // 11 2211 /8
519 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 520 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
521 PAVGB(%%mm4, %%mm2) // 112 /4
522 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
523 "movq %%mm6, (%0, %1, 4) \n\t" // X
524 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
525 PAVGB(%%mm7, %%mm1) // 11 2 /4
526 PAVGB(%%mm4, %%mm5) // 11 /2
527 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 528 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
529 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
530 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
531 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
532 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 533 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 534 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
535 PAVGB(%%mm0, %%mm6) // 1 1 /2
536 PAVGB(%%mm7, %%mm6) // 1 12 /4
537 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
538 "movq %%mm6, (%%ebx) \n\t" // X
539 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
540 PAVGB(%%mm7, %%mm5) // 11 2 /4
541 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 542
13e00528
A
543 PAVGB(%%mm3, %%mm0) // 112 /4
544 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 545 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 546 "subl %1, %0 \n\t"
3057fa66
A
547
548 :
549 : "r" (src), "r" (stride)
550 : "%eax", "%ebx"
551 );
3057fa66
A
552#else
553 const int l1= stride;
554 const int l2= stride + l1;
555 const int l3= stride + l2;
556 const int l4= stride + l3;
557 const int l5= stride + l4;
558 const int l6= stride + l5;
559 const int l7= stride + l6;
560 const int l8= stride + l7;
561 const int l9= stride + l8;
d5a1a995 562 int x;
acced553 563 src+= stride*3;
d5a1a995 564 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
565 {
566 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
567 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
568
569 int sums[9];
570 sums[0] = first + src[l1];
571 sums[1] = src[l1] + src[l2];
572 sums[2] = src[l2] + src[l3];
573 sums[3] = src[l3] + src[l4];
574 sums[4] = src[l4] + src[l5];
575 sums[5] = src[l5] + src[l6];
576 sums[6] = src[l6] + src[l7];
577 sums[7] = src[l7] + src[l8];
578 sums[8] = src[l8] + last;
579
580 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
581 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
582 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
583 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
584 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
585 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
586 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
587 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
588
589 src++;
590 }
591
592#endif
593}
594
13e00528
A
595/**
596 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
597 * values are correctly clipped (MMX2)
598 * values are wraparound (C)
599 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
600 0 8 16 24
601 x = 8
602 x/2 = 4
603 x/8 = 1
604 1 12 12 23
605 */
9f45d04d 606static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 607{
d5a1a995 608#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 609 src+= stride*3;
13e00528
A
610// FIXME rounding
611 asm volatile(
612 "pxor %%mm7, %%mm7 \n\t" // 0
613 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
614 "leal (%0, %1), %%eax \n\t"
615 "leal (%%eax, %1, 4), %%ebx \n\t"
616// 0 1 2 3 4 5 6 7 8 9
617// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
618 "movq pQPb, %%mm0 \n\t" // QP,..., QP
619 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
620 "paddusb b02, %%mm0 \n\t"
621 "psrlw $2, %%mm0 \n\t"
622 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
623 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
624 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
625 "movq (%%ebx), %%mm3 \n\t" // line 5
626 "movq %%mm2, %%mm4 \n\t" // line 4
627 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
628 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 629 PAVGB(%%mm3, %%mm5)
13e00528
A
630 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
631 "psubusb %%mm3, %%mm4 \n\t"
632 "psubusb %%mm2, %%mm3 \n\t"
633 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
634 "psubusb %%mm0, %%mm4 \n\t"
635 "pcmpeqb %%mm7, %%mm4 \n\t"
636 "pand %%mm4, %%mm5 \n\t" // d/2
637
638// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
639 "paddb %%mm5, %%mm2 \n\t"
640// "psubb %%mm6, %%mm2 \n\t"
641 "movq %%mm2, (%0,%1, 4) \n\t"
642
643 "movq (%%ebx), %%mm2 \n\t"
644// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
645 "psubb %%mm5, %%mm2 \n\t"
646// "psubb %%mm6, %%mm2 \n\t"
647 "movq %%mm2, (%%ebx) \n\t"
648
649 "paddb %%mm6, %%mm5 \n\t"
650 "psrlw $2, %%mm5 \n\t"
651 "pand b3F, %%mm5 \n\t"
652 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
653
654 "movq (%%eax, %1, 2), %%mm2 \n\t"
655 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
656 "paddsb %%mm5, %%mm2 \n\t"
657 "psubb %%mm6, %%mm2 \n\t"
658 "movq %%mm2, (%%eax, %1, 2) \n\t"
659
660 "movq (%%ebx, %1), %%mm2 \n\t"
661 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
662 "psubsb %%mm5, %%mm2 \n\t"
663 "psubb %%mm6, %%mm2 \n\t"
664 "movq %%mm2, (%%ebx, %1) \n\t"
665
666 :
667 : "r" (src), "r" (stride)
668 : "%eax", "%ebx"
669 );
670#else
671 const int l1= stride;
672 const int l2= stride + l1;
673 const int l3= stride + l2;
674 const int l4= stride + l3;
675 const int l5= stride + l4;
676 const int l6= stride + l5;
e5c30e06
MN
677// const int l7= stride + l6;
678// const int l8= stride + l7;
679// const int l9= stride + l8;
d5a1a995 680 int x;
3407a972 681 const int QP15= QP + (QP>>2);
acced553 682 src+= stride*3;
d5a1a995 683 for(x=0; x<BLOCK_SIZE; x++)
13e00528 684 {
3407a972
MN
685 const int v = (src[x+l5] - src[x+l4]);
686 if(ABS(v) < QP15)
13e00528 687 {
3407a972
MN
688 src[x+l3] +=v>>3;
689 src[x+l4] +=v>>1;
690 src[x+l5] -=v>>1;
691 src[x+l6] -=v>>3;
13e00528 692
13e00528 693 }
13e00528
A
694 }
695
696#endif
697}
698
699/**
700 * Experimental Filter 1
9f45d04d
MN
701 * will not damage linear gradients
702 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
703 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
704 * MMX2 version does correct clipping C version doesnt
13e00528
A
705 */
706static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707{
d5a1a995 708#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
709 src+= stride*3;
710
13e00528 711 asm volatile(
d5a1a995
MN
712 "pxor %%mm7, %%mm7 \n\t" // 0
713// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
714 "leal (%0, %1), %%eax \n\t"
715 "leal (%%eax, %1, 4), %%ebx \n\t"
716// 0 1 2 3 4 5 6 7 8 9
717// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
718 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
719 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
720 "movq %%mm1, %%mm2 \n\t" // line 4
721 "psubusb %%mm0, %%mm1 \n\t"
722 "psubusb %%mm2, %%mm0 \n\t"
723 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
724 "movq (%%ebx), %%mm3 \n\t" // line 5
725 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
726 "movq %%mm3, %%mm5 \n\t" // line 5
727 "psubusb %%mm4, %%mm3 \n\t"
728 "psubusb %%mm5, %%mm4 \n\t"
729 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
730 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
731 "movq %%mm2, %%mm1 \n\t" // line 4
732 "psubusb %%mm5, %%mm2 \n\t"
733 "movq %%mm2, %%mm4 \n\t"
734 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
735 "psubusb %%mm1, %%mm5 \n\t"
736 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
737 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
738 "movq %%mm4, %%mm3 \n\t" // d
739 "psubusb pQPb, %%mm4 \n\t"
740 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 741 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
742 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
743
744 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
745 "movq %%mm3, %%mm1 \n\t" // d/2
746 PAVGB(%%mm7, %%mm3) // d/4
747 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
748
749 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
750 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
751 "psubusb %%mm3, %%mm0 \n\t"
752 "pxor %%mm2, %%mm0 \n\t"
753 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
754
755 "movq (%%ebx), %%mm0 \n\t" // line 5
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
757 "paddusb %%mm3, %%mm0 \n\t"
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%ebx) \n\t" // line 5
760
9f45d04d 761 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
762
763 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
764 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
9f45d04d 765 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
766 "pxor %%mm2, %%mm0 \n\t"
767 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
768
769 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
9f45d04d 771 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
774
9f45d04d 775 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
776
777 "movq (%%eax, %1), %%mm0 \n\t" // line 2
778 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 779 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
780 "pxor %%mm2, %%mm0 \n\t"
781 "movq %%mm0, (%%eax, %1) \n\t" // line 2
782
783 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
784 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 785 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
786 "pxor %%mm2, %%mm0 \n\t"
787 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
788
789 :
790 : "r" (src), "r" (stride)
791 : "%eax", "%ebx"
792 );
793#else
d5a1a995
MN
794
795 const int l1= stride;
796 const int l2= stride + l1;
797 const int l3= stride + l2;
798 const int l4= stride + l3;
799 const int l5= stride + l4;
800 const int l6= stride + l5;
801 const int l7= stride + l6;
e5c30e06
MN
802// const int l8= stride + l7;
803// const int l9= stride + l8;
d5a1a995 804 int x;
acced553
MN
805
806 src+= stride*3;
d5a1a995
MN
807 for(x=0; x<BLOCK_SIZE; x++)
808 {
809 int a= src[l3] - src[l4];
810 int b= src[l4] - src[l5];
9f45d04d 811 int c= src[l5] - src[l6];
d5a1a995 812
3407a972
MN
813 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
814 d= MAX(d, 0);
d5a1a995
MN
815
816 if(d < QP)
817 {
818 int v = d * SIGN(-b);
819
3407a972
MN
820 src[l2] +=v>>3;
821 src[l3] +=v>>2;
822 src[l4] +=(3*v)>>3;
823 src[l5] -=(3*v)>>3;
824 src[l6] -=v>>2;
825 src[l7] -=v>>3;
d5a1a995
MN
826
827 }
828 src++;
829 }
830 /*
13e00528
A
831 const int l1= stride;
832 const int l2= stride + l1;
833 const int l3= stride + l2;
834 const int l4= stride + l3;
835 const int l5= stride + l4;
836 const int l6= stride + l5;
837 const int l7= stride + l6;
838 const int l8= stride + l7;
839 const int l9= stride + l8;
840 for(int x=0; x<BLOCK_SIZE; x++)
841 {
842 int v2= src[l2];
843 int v3= src[l3];
844 int v4= src[l4];
845 int v5= src[l5];
846 int v6= src[l6];
847 int v7= src[l7];
848
849 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
850 {
851 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
852 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
853 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
854 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
855 }
856 src++;
857 }
d5a1a995 858*/
13e00528
A
859#endif
860}
861
cf5ec61d
MN
862/**
863 * Experimental Filter 1 (Horizontal)
864 * will not damage linear gradients
865 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
866 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
867 * MMX2 version does correct clipping C version doesnt
868 * not identical with the vertical one
869 */
870static inline void horizX1Filter(uint8_t *src, int stride, int QP)
871{
872 int y;
873//FIXME (has little in common with the mmx2 version)
874 for(y=0; y<BLOCK_SIZE; y++)
875 {
876 int a= src[1] - src[2];
877 int b= src[3] - src[4];
878 int c= src[5] - src[6];
879
880 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
881
882 if(d < QP)
883 {
884 int v = d * SIGN(-b);
885
886 src[1] +=v/8;
887 src[2] +=v/4;
888 src[3] +=3*v/8;
889 src[4] -=3*v/8;
890 src[5] -=v/4;
891 src[6] -=v/8;
892
893 }
894 src+=stride;
895 }
896}
897
898
3057fa66
A
/**
 * Default deblocking filter applied across two vertically adjacent rows.
 *
 * All three variants below end up rewriting the same two rows, rows 7 and 8
 * counted from the incoming src pointer:
 *  - the C variant advances src by 3*stride and writes src[l4] / src[l5]
 *  - both asm variants advance src by 4*stride and store to (%%eax, %1, 2)
 *    and (%0, %1, 4), i.e. rows 3 and 4 relative to the advanced pointer
 *
 * The C variant is the readable reference: for every column it computes
 *   middleEnergy = 2*r3 - 5*r4 + 5*r5 - 2*r6   (rows l3..l6)
 * and, when |middleEnergy| < 8*QP, moves rows l4 / l5 towards each other by
 *   d = (5*max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0) + 32) >> 6
 * limited so that the correction never exceeds q = (r4 - r5)/2 and never has
 * the opposite sign of q.
 *
 * @param src    top-left pixel of the area; see the row offsets above
 * @param stride distance in bytes between two consecutive rows
 * @param QP     threshold; used directly by the C and the plain MMX variant.
 *               The MMX2/3DNow asm has no QP operand and loads the symbol
 *               pQPb instead (presumably filled from QP by the caller - TODO confirm).
 */
899static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
900{
7f16f6e6
MN
/* Variant 1: MMX2 / 3DNow, byte-wise approximation built from PAVGB.
 * Processes 8 pixels of every row at once (movq loads/stores). */
901#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
902/*
903 uint8_t tmp[16];
904 const int l1= stride;
905 const int l2= stride + l1;
906 const int l3= stride + l2;
907 const int l4= (int)tmp - (int)src - stride*3;
908 const int l5= (int)tmp - (int)src - stride*3 + 8;
909 const int l6= stride*3 + l3;
910 const int l7= stride + l6;
911 const int l8= stride + l7;
912
913 memcpy(tmp, src+stride*7, 8);
914 memcpy(tmp+8, src+stride*8, 8);
915*/
916 src+= stride*4;
917 asm volatile(
918
919#if 0 //slightly more accurate and slightly slower (disabled alternative, never assembled)
920 "pxor %%mm7, %%mm7 \n\t" // 0
921 "leal (%0, %1), %%eax \n\t"
922 "leal (%%eax, %1, 4), %%ebx \n\t"
923// 0 1 2 3 4 5 6 7
924// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
925// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
926
927
928 "movq (%0, %1, 2), %%mm0 \n\t" // l2
929 "movq (%0), %%mm1 \n\t" // l0
930 "movq %%mm0, %%mm2 \n\t" // l2
931 PAVGB(%%mm7, %%mm0) // ~l2/2
932 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
933 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
934
935 "movq (%%eax), %%mm1 \n\t" // l1
936 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
937 "movq %%mm1, %%mm4 \n\t" // l1
938 PAVGB(%%mm7, %%mm1) // ~l1/2
939 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
940 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
941
942 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
943 "psubusb %%mm1, %%mm0 \n\t"
944 "psubusb %%mm4, %%mm1 \n\t"
945 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
946// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
947
948 "movq (%0, %1, 4), %%mm0 \n\t" // l4
949 "movq %%mm0, %%mm4 \n\t" // l4
950 PAVGB(%%mm7, %%mm0) // ~l4/2
951 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
952 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
953
954 "movq (%%ebx), %%mm2 \n\t" // l5
955 "movq %%mm3, %%mm5 \n\t" // l3
956 PAVGB(%%mm7, %%mm3) // ~l3/2
957 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
958 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
959
960 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
961 "psubusb %%mm3, %%mm0 \n\t"
962 "psubusb %%mm6, %%mm3 \n\t"
963 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
964 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
965// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
966
967 "movq (%%ebx, %1), %%mm6 \n\t" // l6
968 "movq %%mm6, %%mm5 \n\t" // l6
969 PAVGB(%%mm7, %%mm6) // ~l6/2
970 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
971 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
972
973 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
974 "movq %%mm2, %%mm4 \n\t" // l5
975 PAVGB(%%mm7, %%mm2) // ~l5/2
976 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
977 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
978
979 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
980 "psubusb %%mm2, %%mm6 \n\t"
981 "psubusb %%mm4, %%mm2 \n\t"
982 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
983// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
984
985
986 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
987 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
988 "paddusb b01, %%mm4 \n\t"
989 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
990 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
991 "pand %%mm4, %%mm3 \n\t"
992
993 "movq %%mm3, %%mm1 \n\t"
994// "psubusb b01, %%mm3 \n\t"
995 PAVGB(%%mm7, %%mm3)
996 PAVGB(%%mm7, %%mm3)
997 "paddusb %%mm1, %%mm3 \n\t"
998// "paddusb b01, %%mm3 \n\t"
999
1000 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
1001 "movq (%0, %1, 4), %%mm5 \n\t" //l4
1002 "movq (%0, %1, 4), %%mm4 \n\t" //l4
1003 "psubusb %%mm6, %%mm5 \n\t"
1004 "psubusb %%mm4, %%mm6 \n\t"
1005 "por %%mm6, %%mm5 \n\t" // |l3-l4|
1006 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
1007 "pxor %%mm6, %%mm0 \n\t"
1008 "pand %%mm0, %%mm3 \n\t"
1009 PMINUB(%%mm5, %%mm3, %%mm0)
1010
1011 "psubusb b01, %%mm3 \n\t"
1012 PAVGB(%%mm7, %%mm3)
1013
1014 "movq (%%eax, %1, 2), %%mm0 \n\t"
1015 "movq (%0, %1, 4), %%mm2 \n\t"
1016 "pxor %%mm6, %%mm0 \n\t"
1017 "pxor %%mm6, %%mm2 \n\t"
1018 "psubb %%mm3, %%mm0 \n\t"
1019 "paddb %%mm3, %%mm2 \n\t"
1020 "pxor %%mm6, %%mm0 \n\t"
1021 "pxor %%mm6, %%mm2 \n\t"
1022 "movq %%mm0, (%%eax, %1, 2) \n\t"
1023 "movq %%mm2, (%0, %1, 4) \n\t"
1024#endif
1025
// Active MMX2/3DNow code. Differences are formed as PAVGB(x, ~y), which
// yields (x - y + 256)/2, so all intermediate values carry a +128 bias
// (see the per-line register comments below).
1026 "leal (%0, %1), %%eax \n\t"
1027 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
1028// 0 1 2 3 4 5 6 7
1029// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1030// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1031
1032
1033 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
1034 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1035 "pxor %%mm6, %%mm1 \n\t" // -l3-1
1036 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1037// mm1=-l3-1, mm0=128-q
1038
1039 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1040 "movq (%%eax, %1), %%mm3 \n\t" // l2
1041 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1042 "movq %%mm2, %%mm5 \n\t" // -l5-1
1043 "movq b80, %%mm4 \n\t" // 128
1044 "leal (%%eax, %1, 4), %%ebx \n\t"
1045 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1046 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1047 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1048 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1049// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1050
1051 "movq (%%eax), %%mm2 \n\t" // l1
1052 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1053 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1054 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1055 "movq b80, %%mm3 \n\t" // 128
1056 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1057 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1058 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1059// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1060
1061 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1062 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1063 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1064 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1065 "movq b80, %%mm2 \n\t" // 128
1066 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1067 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1068 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1069// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1070
1071 "movq b00, %%mm1 \n\t" // 0
1072 "movq b00, %%mm5 \n\t" // 0
1073 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1074 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1075 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1076 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1077 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1078
1079// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1080
1081 "movq b00, %%mm7 \n\t" // 0
1082 "movq pQPb, %%mm2 \n\t" // QP
1083 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1084 "psubb %%mm6, %%mm2 \n\t"
1085
1086 "movq %%mm4, %%mm1 \n\t"
1087 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1088 "pxor %%mm1, %%mm4 \n\t"
1089 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1090 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1091 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1092// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1093
1094 "movq %%mm4, %%mm3 \n\t" // d
1095 "psubusb b01, %%mm4 \n\t"
1096 PAVGB(%%mm7, %%mm4) // d/32
1097 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1098 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1099 "pand %%mm2, %%mm4 \n\t"
1100
1101 "movq b80, %%mm5 \n\t" // 128
1102 "psubb %%mm0, %%mm5 \n\t" // q
1103 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1104 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1105 "pxor %%mm7, %%mm5 \n\t"
1106
1107 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1108 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1109
1110 "pand %%mm7, %%mm4 \n\t"
1111 "movq (%%eax, %1, 2), %%mm0 \n\t"
1112 "movq (%0, %1, 4), %%mm2 \n\t"
1113 "pxor %%mm1, %%mm0 \n\t"
1114 "pxor %%mm1, %%mm2 \n\t"
1115 "paddb %%mm4, %%mm0 \n\t"
1116 "psubb %%mm4, %%mm2 \n\t"
1117 "pxor %%mm1, %%mm0 \n\t"
1118 "pxor %%mm1, %%mm2 \n\t"
1119 "movq %%mm0, (%%eax, %1, 2) \n\t"
1120 "movq %%mm2, (%0, %1, 4) \n\t"
1121
// operands: %0 = src (already advanced by 4 rows), %1 = stride; QP is not passed in
1122 :
1123 : "r" (src), "r" (stride)
1124 : "%eax", "%ebx"
1125 );
1126
// The block comment below is disabled debugging code: a C re-computation plus
// statistics that compare the result against the copy kept in tmp.
1127/*
1128 {
1129 int x;
1130 src-= stride;
1131 for(x=0; x<BLOCK_SIZE; x++)
1132 {
1133 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1134 if(ABS(middleEnergy)< 8*QP)
1135 {
1136 const int q=(src[l4] - src[l5])/2;
1137 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1138 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1139
1140 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1141 d= MAX(d, 0);
1142
1143 d= (5*d + 32) >> 6;
1144 d*= SIGN(-middleEnergy);
1145
1146 if(q>0)
1147 {
1148 d= d<0 ? 0 : d;
1149 d= d>q ? q : d;
1150 }
1151 else
1152 {
1153 d= d>0 ? 0 : d;
1154 d= d<q ? q : d;
1155 }
1156
1157 src[l4]-= d;
1158 src[l5]+= d;
1159 }
1160 src++;
1161 }
1162src-=8;
1163 for(x=0; x<8; x++)
1164 {
1165 int y;
1166 for(y=4; y<6; y++)
1167 {
1168 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1169 int ad= ABS(d);
1170 static int max=0;
1171 static int sum=0;
1172 static int num=0;
1173 static int bias=0;
1174
1175 if(max<ad) max=ad;
1176 sum+= ad>3 ? 1 : 0;
1177 if(ad>3)
1178 {
1179 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1180 }
1181 if(y==4) bias+=d;
1182 num++;
1183 if(num%1000000 == 0)
1184 {
1185 printf(" %d %d %d %d\n", num, sum, max, bias);
1186 }
1187 }
1188 }
1189}
1190*/
1191#elif defined (HAVE_MMX)
/* Variant 2: plain MMX. Pixels are unpacked to 16 bit words (low half "L",
 * high half "H" of each 8 pixel row) and the energies are computed exactly.
 * Intermediate results are parked in the symbols temp0..temp3.
 * NOTE(review): the "#ifdef HAVE_MMX2" sub-branches inside this variant look
 * unreachable, because HAVE_MMX2 already selects variant 1 above. */
acced553 1192 src+= stride*4;
7f16f6e6 1193
3057fa66
A
1194 asm volatile(
1195 "pxor %%mm7, %%mm7 \n\t"
1196 "leal (%0, %1), %%eax \n\t"
1197 "leal (%%eax, %1, 4), %%ebx \n\t"
1198// 0 1 2 3 4 5 6 7
1199// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1200// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1201
1202 "movq (%0), %%mm0 \n\t"
1203 "movq %%mm0, %%mm1 \n\t"
1204 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1205 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1206
1207 "movq (%%eax), %%mm2 \n\t"
1208 "movq %%mm2, %%mm3 \n\t"
1209 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1210 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1211
1212 "movq (%%eax, %1), %%mm4 \n\t"
1213 "movq %%mm4, %%mm5 \n\t"
1214 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1215 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1216
1217 "paddw %%mm0, %%mm0 \n\t" // 2L0
1218 "paddw %%mm1, %%mm1 \n\t" // 2H0
1219 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1220 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1221 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1222 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1223
1224 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1225 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1226 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1227 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1228
1229 "movq (%%eax, %1, 2), %%mm2 \n\t"
1230 "movq %%mm2, %%mm3 \n\t"
1231 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1232 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1233
1234 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1235 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1236 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1237 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1238 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1239 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1240
1241 "movq (%0, %1, 4), %%mm0 \n\t"
1242 "movq %%mm0, %%mm1 \n\t"
1243 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1244 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1245
1246 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1247 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1248 "movq %%mm2, temp2 \n\t" // L3 - L4
1249 "movq %%mm3, temp3 \n\t" // H3 - H4
1250 "paddw %%mm4, %%mm4 \n\t" // 2L2
1251 "paddw %%mm5, %%mm5 \n\t" // 2H2
1252 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1253 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1254
1255 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1256 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1257 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1258 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1259//50 opcodes so far
1260 "movq (%%ebx), %%mm2 \n\t"
1261 "movq %%mm2, %%mm3 \n\t"
1262 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1263 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1264 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1265 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1266 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1267 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1268
1269 "movq (%%ebx, %1), %%mm6 \n\t"
1270 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1271 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1272 "movq (%%ebx, %1), %%mm6 \n\t"
1273 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1274 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1275
1276 "paddw %%mm0, %%mm0 \n\t" // 2L4
1277 "paddw %%mm1, %%mm1 \n\t" // 2H4
1278 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1279 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1280
1281 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1282 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1283 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1284 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1285
1286 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1287 "movq %%mm2, %%mm3 \n\t"
1288 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1289 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1290
1291 "paddw %%mm2, %%mm2 \n\t" // 2L7
1292 "paddw %%mm3, %%mm3 \n\t" // 2H7
1293 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1294 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1295
1296 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1297 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
8405b3fd
MN
1298
// absolute values of the right (mm0/mm1) and left (mm2/mm3) energies
1299#ifdef HAVE_MMX2
1300 "movq %%mm7, %%mm6 \n\t" // 0
1301 "psubw %%mm0, %%mm6 \n\t"
1302 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1303 "movq %%mm7, %%mm6 \n\t" // 0
1304 "psubw %%mm1, %%mm6 \n\t"
1305 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1306 "movq %%mm7, %%mm6 \n\t" // 0
1307 "psubw %%mm2, %%mm6 \n\t"
1308 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1309 "movq %%mm7, %%mm6 \n\t" // 0
1310 "psubw %%mm3, %%mm6 \n\t"
1311 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1312#else
3057fa66
A
1313 "movq %%mm7, %%mm6 \n\t" // 0
1314 "pcmpgtw %%mm0, %%mm6 \n\t"
1315 "pxor %%mm6, %%mm0 \n\t"
1316 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1317 "movq %%mm7, %%mm6 \n\t" // 0
1318 "pcmpgtw %%mm1, %%mm6 \n\t"
1319 "pxor %%mm6, %%mm1 \n\t"
1320 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3057fa66
A
1321 "movq %%mm7, %%mm6 \n\t" // 0
1322 "pcmpgtw %%mm2, %%mm6 \n\t"
1323 "pxor %%mm6, %%mm2 \n\t"
1324 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1325 "movq %%mm7, %%mm6 \n\t" // 0
1326 "pcmpgtw %%mm3, %%mm6 \n\t"
1327 "pxor %%mm6, %%mm3 \n\t"
1328 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
8405b3fd 1329#endif
3057fa66
A
1330
// mm0/mm1 = MIN(|left energy|, |right energy|)
1331#ifdef HAVE_MMX2
1332 "pminsw %%mm2, %%mm0 \n\t"
1333 "pminsw %%mm3, %%mm1 \n\t"
1334#else
1335 "movq %%mm0, %%mm6 \n\t"
1336 "psubusw %%mm2, %%mm6 \n\t"
1337 "psubw %%mm6, %%mm0 \n\t"
1338 "movq %%mm1, %%mm6 \n\t"
1339 "psubusw %%mm3, %%mm6 \n\t"
1340 "psubw %%mm6, %%mm1 \n\t"
1341#endif
1342
1343 "movq %%mm7, %%mm6 \n\t" // 0
1344 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1345 "pxor %%mm6, %%mm4 \n\t"
1346 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1347 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1348 "pxor %%mm7, %%mm5 \n\t"
1349 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1350// 100 opcodes
1351 "movd %2, %%mm2 \n\t" // QP
3057fa66
A
1352 "punpcklwd %%mm2, %%mm2 \n\t"
1353 "punpcklwd %%mm2, %%mm2 \n\t"
1354 "psllw $3, %%mm2 \n\t" // 8QP
1355 "movq %%mm2, %%mm3 \n\t" // 8QP
1356 "pcmpgtw %%mm4, %%mm2 \n\t"
1357 "pcmpgtw %%mm5, %%mm3 \n\t"
1358 "pand %%mm2, %%mm4 \n\t"
1359 "pand %%mm3, %%mm5 \n\t"
1360
1361
1362 "psubusw %%mm0, %%mm4 \n\t" // hd
1363 "psubusw %%mm1, %%mm5 \n\t" // ld
1364
1365
// d = (5*d + 32) >> 6, same rounding as the C variant
1366 "movq w05, %%mm2 \n\t" // 5
1367 "pmullw %%mm2, %%mm4 \n\t"
1368 "pmullw %%mm2, %%mm5 \n\t"
1369 "movq w20, %%mm2 \n\t" // 32
1370 "paddw %%mm2, %%mm4 \n\t"
1371 "paddw %%mm2, %%mm5 \n\t"
1372 "psrlw $6, %%mm4 \n\t"
1373 "psrlw $6, %%mm5 \n\t"
1374
1375/*
1376 "movq w06, %%mm2 \n\t" // 6
1377 "paddw %%mm2, %%mm4 \n\t"
1378 "paddw %%mm2, %%mm5 \n\t"
1379 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1380//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1381 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1382 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1383*/
1384
1385 "movq temp2, %%mm0 \n\t" // L3 - L4
1386 "movq temp3, %%mm1 \n\t" // H3 - H4
1387
1388 "pxor %%mm2, %%mm2 \n\t"
1389 "pxor %%mm3, %%mm3 \n\t"
1390
3057fa66
A
1391 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1392 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1393 "pxor %%mm2, %%mm0 \n\t"
1394 "pxor %%mm3, %%mm1 \n\t"
1395 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1396 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
e5c30e06
MN
1397 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1398 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3057fa66
A
1399
1400 "pxor %%mm6, %%mm2 \n\t"
1401 "pxor %%mm7, %%mm3 \n\t"
1402 "pand %%mm2, %%mm4 \n\t"
1403 "pand %%mm3, %%mm5 \n\t"
1404
// limit the correction to |L3 - L4|/2
1405#ifdef HAVE_MMX2
1406 "pminsw %%mm0, %%mm4 \n\t"
1407 "pminsw %%mm1, %%mm5 \n\t"
1408#else
1409 "movq %%mm4, %%mm2 \n\t"
1410 "psubusw %%mm0, %%mm2 \n\t"
1411 "psubw %%mm2, %%mm4 \n\t"
1412 "movq %%mm5, %%mm2 \n\t"
1413 "psubusw %%mm1, %%mm2 \n\t"
1414 "psubw %%mm2, %%mm5 \n\t"
1415#endif
1416 "pxor %%mm6, %%mm4 \n\t"
1417 "pxor %%mm7, %%mm5 \n\t"
1418 "psubw %%mm6, %%mm4 \n\t"
1419 "psubw %%mm7, %%mm5 \n\t"
1420 "packsswb %%mm5, %%mm4 \n\t"
1421 "movq (%%eax, %1, 2), %%mm0 \n\t"
1422 "paddb %%mm4, %%mm0 \n\t"
1423 "movq %%mm0, (%%eax, %1, 2) \n\t"
1424 "movq (%0, %1, 4), %%mm0 \n\t"
1425 "psubb %%mm4, %%mm0 \n\t"
3057fa66
A
1426 "movq %%mm0, (%0, %1, 4) \n\t"
1427
// operands: %0 = src (already advanced by 4 rows), %1 = stride, %2 = QP
1428 :
1429 : "r" (src), "r" (stride), "r" (QP)
1430 : "%eax", "%ebx"
1431 );
1432#else
/* Variant 3: portable C reference. l1..l8 are the byte offsets of rows 1..8
 * relative to src after it has been advanced by 3 rows below. */
1433 const int l1= stride;
1434 const int l2= stride + l1;
1435 const int l3= stride + l2;
1436 const int l4= stride + l3;
1437 const int l5= stride + l4;
1438 const int l6= stride + l5;
1439 const int l7= stride + l6;
1440 const int l8= stride + l7;
1441// const int l9= stride + l8;
d5a1a995 1442 int x;
acced553 1443 src+= stride*3;
d5a1a995 1444 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
1445 {
// 2*r3 - 5*r4 + 5*r5 - 2*r6 for this column
1446 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1447 if(ABS(middleEnergy) < 8*QP)
1448 {
// q: half of the step between the two rows that get modified
1449 const int q=(src[l4] - src[l5])/2;
// the same 2,-5,5,-2 measure taken over rows 1..4 and rows 5..8
1450 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1451 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1452
1453 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1454 d= MAX(d, 0);
1455
// scale by 5/64 with rounding, then give d the sign opposite to middleEnergy
1456 d= (5*d + 32) >> 6;
1457 d*= SIGN(-middleEnergy);
1458
// clamp d into the range between 0 and q (q may be negative)
1459 if(q>0)
1460 {
1461 d= d<0 ? 0 : d;
1462 d= d>q ? q : d;
1463 }
1464 else
1465 {
1466 d= d>0 ? 0 : d;
1467 d= d<q ? q : d;
1468 }
1469
1470 src[l4]-= d;
1471 src[l5]+= d;
1472 }
1473 src++;
1474 }
1475#endif
1476}
1477
cf5ec61d
MN
1478/**
1479 * Check if the given 8x8 Block is mostly "flat"
1480 */
1481static inline int isHorizDC(uint8_t src[], int stride)
1482{
1483 int numEq= 0;
1484 int y;
1485 for(y=0; y<BLOCK_SIZE; y++)
1486 {
1487 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1488 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1489 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1490 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1491 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1492 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1493 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1494 src+= stride;
1495 }
1496 return numEq > hFlatnessThreshold;
1497}
1498
/**
 * Cheap range check used before horizontal low pass filtering.
 *
 * Only the first and the last pixel of the first row are compared; stride is
 * accepted for signature compatibility but not used.
 *
 * @return 1 if |src[0] - src[7]| <= 2*QP, otherwise 0
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int span= abs(src[0] - src[7]);

	return span <= 2*QP;
}
1505
1506static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1507{
1508 int y;
1509 for(y=0; y<BLOCK_SIZE; y++)
1510 {
1511 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1512
1513 if(ABS(middleEnergy) < 8*QP)
1514 {
1515 const int q=(dst[3] - dst[4])/2;
1516 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1517 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1518
1519 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1520 d= MAX(d, 0);
1521
1522 d= (5*d + 32) >> 6;
1523 d*= SIGN(-middleEnergy);
1524
1525 if(q>0)
1526 {
1527 d= d<0 ? 0 : d;
1528 d= d>q ? q : d;
1529 }
1530 else
1531 {
1532 d= d>0 ? 0 : d;
1533 d= d<q ? q : d;
1534 }
1535
1536 dst[3]-= d;
1537 dst[4]+= d;
1538 }
1539 dst+= stride;
1540 }
1541}
1542
1543/**
1544 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1545 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1546 */
1547static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1548{
1549
1550 int y;
1551 for(y=0; y<BLOCK_SIZE; y++)
1552 {
1553 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1554 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1555
1556 int sums[9];
1557 sums[0] = first + dst[0];
1558 sums[1] = dst[0] + dst[1];
1559 sums[2] = dst[1] + dst[2];
1560 sums[3] = dst[2] + dst[3];
1561 sums[4] = dst[3] + dst[4];
1562 sums[5] = dst[4] + dst[5];
1563 sums[6] = dst[5] + dst[6];
1564 sums[7] = dst[6] + dst[7];
1565 sums[8] = dst[7] + last;
1566
1567 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1568 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1569 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1570 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1571 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1572 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1573 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1574 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1575
1576 dst+= stride;
1577 }
1578}
1579
1580
3057fa66
A
/**
 * Deringing filter.
 *
 * C variant (used when neither HAVE_MMX2 nor HAVE_3DNOW is defined):
 *  1. min and max are searched over rows 1..8, columns 1..8 relative to src;
 *     avg = (min + max + 1)/2.
 *  2. For each of the 10 rows 0..9 a bit mask over columns 0..9 records which
 *     pixels are above avg (low 16 bits) and which are not (high 16 bits);
 *     a bit survives only if its left and right neighbour bits are set too.
 *  3. A pixel in rows 1..8, columns 1..8 is filtered when all 9 pixels of its
 *     3x3 neighbourhood lie on the same side of avg. The filter is the
 *     (1 2 1 / 2 4 2 / 1 2 1)/16 kernel; the change is limited to +-2*QP.
 *     Filtering is done in place, so pixels above and to the left have
 *     already been filtered when they are used as neighbours.
 *
 * Asm variant: takes the quantiser from the symbol pQPb (the QP operand is
 * passed to the asm statement but never referenced), stores 2*pQPb in pQPb2
 * and uses temp0 / temp1 as scratch. It additionally reads the 4 bytes before
 * and the 4 bytes after the 8 pixel row it is given (movd -4(...) / movd 8(...)).
 *
 * @param src    top-left pixel of the 10x10 area used by the C variant
 * @param stride distance in bytes between two consecutive rows
 * @param QP     quantiser; limits how far a pixel may be changed (C variant)
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq pQPb, %%mm0 \n\t"
		"paddusb %%mm0, %%mm0 \n\t"
		"movq %%mm0, pQPb2 \n\t"

		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"pcmpeqb %%mm6, %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0 \n\t"\
		"pminub %%mm0, %%mm6 \n\t"\
		"pmaxub %%mm0, %%mm7 \n\t"
#else
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0 \n\t"\
		"movq %%mm6, %%mm1 \n\t"\
		"psubusb %%mm0, %%mm7 \n\t"\
		"paddb %%mm0, %%mm7 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"psubb %%mm1, %%mm6 \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

		"movq %%mm6, %%mm4 \n\t"
		"psrlq $8, %%mm6 \n\t"
#ifdef HAVE_MMX2
		"pminub %%mm4, %%mm6 \n\t" // min of pixels
		"pshufw $0xF9, %%mm6, %%mm4 \n\t"
		"pminub %%mm4, %%mm6 \n\t" // min of pixels
		"pshufw $0xFE, %%mm6, %%mm4 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
#else
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $16, %%mm6 \n\t"
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $32, %%mm6 \n\t"
		"movq %%mm6, %%mm1 \n\t"
		"psubusb %%mm4, %%mm1 \n\t"
		"psubb %%mm1, %%mm6 \n\t"
#endif


		"movq %%mm7, %%mm4 \n\t"
		"psrlq $8, %%mm7 \n\t"
#ifdef HAVE_MMX2
		"pmaxub %%mm4, %%mm7 \n\t" // max of pixels
		"pshufw $0xF9, %%mm7, %%mm4 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"pshufw $0xFE, %%mm7, %%mm4 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
#else
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $16, %%mm7 \n\t"
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $32, %%mm7 \n\t"
		"psubusb %%mm4, %%mm7 \n\t"
		"paddb %%mm4, %%mm7 \n\t"
#endif
		PAVGB(%%mm6, %%mm7) // a=(max + min)/2
		"punpcklbw %%mm7, %%mm7 \n\t"
		"punpcklbw %%mm7, %%mm7 \n\t"
		"punpcklbw %%mm7, %%mm7 \n\t"
		"movq %%mm7, temp0 \n\t"

		"movq (%0), %%mm0 \n\t" // L10
		"movq %%mm0, %%mm1 \n\t" // L10
		"movq %%mm0, %%mm2 \n\t" // L10
		"psllq $8, %%mm1 \n\t"
		"psrlq $8, %%mm2 \n\t"
		"movd -4(%0), %%mm3 \n\t"
		"movd 8(%0), %%mm4 \n\t"
		"psrlq $24, %%mm3 \n\t"
		"psllq $56, %%mm4 \n\t"
		"por %%mm3, %%mm1 \n\t" // L00
		"por %%mm4, %%mm2 \n\t" // L20
		"movq %%mm1, %%mm3 \n\t" // L00
		PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
		PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
		"psubusb %%mm7, %%mm0 \n\t"
		"psubusb %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm3 \n\t"
		"pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
		"pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
		"pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
		"paddb %%mm2, %%mm0 \n\t"
		"paddb %%mm3, %%mm0 \n\t"

		"movq (%%eax), %%mm2 \n\t" // L11
		"movq %%mm2, %%mm3 \n\t" // L11
		"movq %%mm2, %%mm4 \n\t" // L11
		"psllq $8, %%mm3 \n\t"
		"psrlq $8, %%mm4 \n\t"
		"movd -4(%%eax), %%mm5 \n\t"
		"movd 8(%%eax), %%mm6 \n\t"
		"psrlq $24, %%mm5 \n\t"
		"psllq $56, %%mm6 \n\t"
		"por %%mm5, %%mm3 \n\t" // L01
		"por %%mm6, %%mm4 \n\t" // L21
		"movq %%mm3, %%mm5 \n\t" // L01
		PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
		PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
		"psubusb %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm4 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
		"pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
		"pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
		"paddb %%mm4, %%mm2 \n\t"
		"paddb %%mm5, %%mm2 \n\t"
// 0, 2, 3, 1
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
		"movq " #src ", " #sx " \n\t" /* src[0] */\
		"movq " #sx ", " #lx " \n\t" /* src[0] */\
		"movq " #sx ", " #t0 " \n\t" /* src[0] */\
		"psllq $8, " #lx " \n\t"\
		"psrlq $8, " #t0 " \n\t"\
		"movd -4" #src ", " #t1 " \n\t"\
		"psrlq $24, " #t1 " \n\t"\
		"por " #t1 ", " #lx " \n\t" /* src[-1] */\
		"movd 8" #src ", " #t1 " \n\t"\
		"psllq $56, " #t1 " \n\t"\
		"por " #t1 ", " #t0 " \n\t" /* src[+1] */\
		"movq " #lx ", " #t1 " \n\t" /* src[-1] */\
		PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
		PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
		PAVGB(lx, pplx) \
		"movq " #lx ", temp1 \n\t"\
		"movq temp0, " #lx " \n\t"\
		"psubusb " #lx ", " #t1 " \n\t"\
		"psubusb " #lx ", " #t0 " \n\t"\
		"psubusb " #lx ", " #sx " \n\t"\
		"movq b00, " #lx " \n\t"\
		"pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
		"paddb " #t1 ", " #t0 " \n\t"\
		"paddb " #t0 ", " #sx " \n\t"\
\
		PAVGB(plx, pplx) /* filtered */\
		"movq " #dst ", " #t0 " \n\t" /* dst */\
		"movq " #t0 ", " #t1 " \n\t" /* dst */\
		"psubusb pQPb2, " #t0 " \n\t"\
		"paddusb pQPb2, " #t1 " \n\t"\
		PMAXUB(t0, pplx)\
		PMINUB(t1, pplx, t0)\
		"paddb " #sx ", " #ppsx " \n\t"\
		"paddb " #psx ", " #ppsx " \n\t"\
		"#paddb b02, " #ppsx " \n\t"\
		"pand b08, " #ppsx " \n\t"\
		"pcmpeqb " #lx ", " #ppsx " \n\t"\
		"pand " #ppsx ", " #pplx " \n\t"\
		"pandn " #dst ", " #ppsx " \n\t"\
		"por " #pplx ", " #ppsx " \n\t"\
		"movq " #ppsx ", " #dst " \n\t"\
		"movq temp1, " #lx " \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	int y;
	int min=255;
	int max=0;
	int avg;
	uint8_t *p;
	/* per-row masks; unsigned so that the shifts below are well defined
	   (the signed version left-shifted a negative value) */
	uint32_t s[10];

	/* 1. value range of the inner 8x8 pixels */
	for(y=1; y<9; y++)
	{
		int x;
		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(*p > max) max= *p;
			if(*p < min) min= *p;
		}
	}
	avg= (min + max + 1)/2;

	/* 2. bit x: pixel x is above avg, bit 16+x: pixel x is not above avg;
	      afterwards only bits whose both horizontal neighbours are set remain */
	for(y=0; y<10; y++)
	{
		int x;
		uint32_t t = 0;
		p= src + stride*y;
		for(x=0; x<10; x++)
		{
			if(*p > avg) t |= (uint32_t)1<<x;
			p++;
		}
		t |= (~t)<<16;
		t &= (t<<1) & (t>>1);
		s[y] = t;
	}

	/* 3. filter every inner pixel whose whole 3x3 neighbourhood is on one side of avg */
	for(y=1; y<9; y++)
	{
		int x;
		uint32_t t = s[y-1] & s[y] & s[y+1];
		t|= t>>16; /* merge the "all above" and the "all not above" case */

		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(t & ((uint32_t)1<<x))
			{
				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
				f= (f + 8)>>4;

				/* never move a pixel by more than 2*QP */
				if     (*p + 2*QP < f) *p= *p + 2*QP;
				else if(*p - 2*QP > f) *p= *p - 2*QP;
				else *p=f;
			}
		}
	}

#endif
}
1854
3b58b885
MN
/**
 * Deinterlaces the given block by linear interpolation.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * Counted from the incoming src pointer, rows 5, 7, 9 and 11 are replaced by
 * the average of the row above and the row below them; rows 4..12 are read.
 * The C version rounds down ((a+b)>>1), the asm version uses PAVGB.
 *
 * @param src    top-left pixel of the block; 8 columns are processed
 * @param stride distance in bytes between two consecutive rows
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ebx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int col;
	uint8_t *p= src + 4*stride;
	for(col=0; col<8; col++, p++)
	{
		int k;
		/* odd rows 1,3,5,7 (relative to p) = mean of their even neighbours */
		for(k=1; k<8; k+=2)
			p[stride*k]= (p[stride*(k-1)] + p[stride*(k+1)])>>1;
	}
#endif
}
1901
/**
 * Deinterlaces the given block by cubic interpolation.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * Counted from the incoming src pointer, rows 3..15 are read and rows
 * 6, 8, 10 and 12 are replaced by (-a + 9b + 9d - e)/16, where b/d are the
 * rows directly above/below and a/e the rows three above/below.
 * The result is clamped to 0..255 in both versions (packuswb in the asm
 * version, explicit clamp in the C version).
 *
 * @param src    top-left pixel of the block; 8 columns are processed
 * @param stride distance in bytes between two consecutive rows
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
		"leal (%%ebx, %1, 4), %%ecx \n\t"
		"addl %1, %%ecx \n\t"
		"pxor %%mm7, %%mm7 \n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1	ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm1 \n\t"\
		"movq " #d ", %%mm2 \n\t"\
		"movq " #e ", %%mm3 \n\t"\
		PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
		"movq %%mm0, %%mm2 \n\t"\
		"punpcklbw %%mm7, %%mm0 \n\t"\
		"punpckhbw %%mm7, %%mm2 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm1 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1 \n\t"\
		"movq %%mm1, " #c " \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx", "ecx"
	);
#else
	int x;
	src+= stride*3;
	for(x=0; x<8; x++)
	{
		int k;
		/* output rows 3,5,7,9 (relative to src); only even rows are read,
		   so the four results do not depend on each other */
		for(k=3; k<=9; k+=2)
		{
			int v= -src[stride*(k-3)] + 9*src[stride*(k-1)]
			       + 9*src[stride*(k+1)] - src[stride*(k+3)];

			/* clamp to the pixel range instead of letting the uint8_t store
			   wrap around; the sign is tested before shifting so that no
			   negative value is ever right-shifted */
			if(v < 0) v= 0;
			else
			{
				v>>= 4;
				if(v > 255) v= 255;
			}
			src[stride*k]= v;
		}
		src++;
	}
#endif
}
1966
/**
 * Deinterlaces the given block by blending every line with its two neighbours (1 2 1 kernel)
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * will shift the image up by 1 line (FIXME if this is a problem)
 * this filter will read lines 4-13 and write 4-11
 * the asm version is built from 2 cascaded PAVGB per line, so its rounding differs slightly
 * from the C version
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm1 \n\t" // L2
		PAVGB(%%mm1, %%mm0) // L0+L2
		"movq (%%eax), %%mm2 \n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0) \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t" // L3
		PAVGB(%%mm0, %%mm2) // L1+L3
		PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
		"movq %%mm2, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm2 \n\t" // L4
		PAVGB(%%mm2, %%mm1) // L2+L4
		PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1) \n\t"
		"movq (%%ebx), %%mm1 \n\t" // L5
		PAVGB(%%mm1, %%mm0) // L3+L5
		PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm0 \n\t" // L6
		PAVGB(%%mm0, %%mm2) // L4+L6
		PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4) \n\t"
		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		PAVGB(%%mm2, %%mm1) // L5+L7
		PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm1 \n\t" // L8
		PAVGB(%%mm1, %%mm0) // L6+L8
		PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1) \n\t"
		"movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
		PAVGB(%%mm0, %%mm2) // L7+L9
		PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		// "memory": the asm stores into the picture through src, which is only an input operand
		: "%eax", "%ebx", "memory"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		// every output only depends on its own line and the 2 lines below it,
		// so working top-down in place never reads an already filtered line
		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
		src++;
	}
#endif
}
2041
/**
 * Deinterlaces the given block with a vertical 3 tap median
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-12 and replace each of the lines 5, 7, 9 and 11 by the median
 * of itself and the line above and below it
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
	src+= 4*stride;
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// median(a,b,c) = min( max(a,b), max( min(a,b), c ) )
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm2 \n\t" // L2
		"movq (%%eax), %%mm1 \n\t" // L1
		"movq %%mm0, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm0 \n\t" // max(L0, L1)
		"pminub %%mm3, %%mm1 \n\t" // min(L0, L1)
		"pmaxub %%mm2, %%mm1 \n\t" // max(min(L0, L1), L2)
		"pminub %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%%eax) \n\t" // -> L1

		"movq (%0, %1, 4), %%mm0 \n\t" // L4
		"movq (%%eax, %1, 2), %%mm1 \n\t" // L3
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm1, %%mm2 \n\t" // max(L2, L3)
		"pminub %%mm3, %%mm1 \n\t" // min(L2, L3)
		"pmaxub %%mm0, %%mm1 \n\t" // max(min(L2, L3), L4)
		"pminub %%mm1, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t" // -> L3

		"movq (%%ebx), %%mm2 \n\t" // L5
		"movq (%%ebx, %1), %%mm1 \n\t" // L6
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" // max(L5, L4)
		"pminub %%mm3, %%mm0 \n\t" // min(L5, L4)
		"pmaxub %%mm1, %%mm0 \n\t" // max(min(L5, L4), L6)
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t" // -> L5

		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		"movq (%0, %1, 8), %%mm0 \n\t" // L8
		"movq %%mm2, %%mm3 \n\t"
		"pmaxub %%mm0, %%mm2 \n\t" // max(L7, L8)
		"pminub %%mm3, %%mm0 \n\t" // min(L7, L8)
		"pmaxub %%mm1, %%mm0 \n\t" // max(min(L7, L8), L6)
		"pminub %%mm0, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1, 2) \n\t" // -> L7


		: : "r" (src), "r" (stride)
		// "memory": the asm stores into the picture through src, which is only an input operand
		: "%eax", "%ebx", "memory"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7 \n\t"

// plain MMX has no pminub/pmaxub, so the result is selected with compare masks;
// a and c are the neighbouring lines, b is the line which is overwritten
#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"movq " #c ", %%mm1 \n\t"\
		"movq %%mm0, %%mm3 \n\t"\
		"movq %%mm1, %%mm4 \n\t"\
		"movq %%mm2, %%mm5 \n\t"\
		"psubusb %%mm1, %%mm3 \n\t"\
		"psubusb %%mm2, %%mm4 \n\t"\
		"psubusb %%mm0, %%mm5 \n\t"\
		"pcmpeqb %%mm7, %%mm3 \n\t"\
		"pcmpeqb %%mm7, %%mm4 \n\t"\
		"pcmpeqb %%mm7, %%mm5 \n\t"\
		"movq %%mm3, %%mm6 \n\t"\
		"pxor %%mm4, %%mm3 \n\t"\
		"pxor %%mm5, %%mm4 \n\t"\
		"pxor %%mm6, %%mm5 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		"por %%mm5, %%mm0 \n\t"\
		"pand %%mm2, %%mm0 \n\t"\
		"pand %%mm1, %%mm0 \n\t"\
		"movq %%mm0, " #b " \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		// "memory": the asm stores into the picture through src, which is only an input operand
		: "%eax", "%ebx", "memory"
	);
#endif // MMX
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		int y;
		// same 4 medians as the asm versions: the even lines are only read, the odd ones replaced
		for(y=1; y<8; y+=2)
		{
			const int above= src[stride*(y-1)];
			const int cur  = src[stride* y   ];
			const int below= src[stride*(y+1)];
			const int lo= above < cur ? above : cur;
			const int hi= above < cur ? cur : above;

			// clamp the line below into [lo, hi] -> median of the 3
			if(below < lo)      src[stride*y]= lo;
			else if(below > hi) src[stride*y]= hi;
			else                src[stride*y]= below;
		}
		src++;
	}
#endif
}
2160
e5c30e06 2161#ifdef HAVE_MMX
4e4dcbc5
MN
2162/**
2163 * transposes and shift the given 8x8 Block into dst1 and dst2
2164 */
2165static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2166{
2167 asm(
2168 "leal (%0, %1), %%eax \n\t"
2169 "leal (%%eax, %1, 4), %%ebx \n\t"
2170// 0 1 2 3 4 5 6 7 8 9
2171// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2172 "movq (%0), %%mm0 \n\t" // 12345678
2173 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2174 "movq %%mm0, %%mm2 \n\t" // 12345678
2175 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2176 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2177
2178 "movq (%%eax, %1), %%mm1 \n\t"
2179 "movq (%%eax, %1, 2), %%mm3 \n\t"
2180 "movq %%mm1, %%mm4 \n\t"
2181 "punpcklbw %%mm3, %%mm1 \n\t"
2182 "punpckhbw %%mm3, %%mm4 \n\t"
2183
2184 "movq %%mm0, %%mm3 \n\t"
2185 "punpcklwd %%mm1, %%mm0 \n\t"
2186 "punpckhwd %%mm1, %%mm3 \n\t"
2187 "movq %%mm2, %%mm1 \n\t"
2188 "punpcklwd %%mm4, %%mm2 \n\t"
2189 "punpckhwd %%mm4, %%mm1 \n\t"
2190
2191 "movd %%mm0, 128(%2) \n\t"
2192 "psrlq $32, %%mm0 \n\t"
2193 "movd %%mm0, 144(%2) \n\t"
2194 "movd %%mm3, 160(%2) \n\t"
2195 "psrlq $32, %%mm3 \n\t"
2196 "movd %%mm3, 176(%2) \n\t"
2197 "movd %%mm3, 48(%3) \n\t"
2198 "movd %%mm2, 192(%2) \n\t"
2199 "movd %%mm2, 64(%3) \n\t"
2200 "psrlq $32, %%mm2 \n\t"
2201 "movd %%mm2, 80(%3) \n\t"
2202 "movd %%mm1, 96(%3) \n\t"
2203 "psrlq $32, %%mm1 \n\t"
2204 "movd %%mm1, 112(%3) \n\t"
2205
2206 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2207 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2208 "movq %%mm0, %%mm2 \n\t" // 12345678
2209 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2210 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2211
2212 "movq (%%ebx, %1), %%mm1 \n\t"
2213 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2214 "movq %%mm1, %%mm4 \n\t"
2215 "punpcklbw %%mm3, %%mm1 \n\t"
2216 "punpckhbw %%mm3, %%mm4 \n\t"
2217
2218 "movq %%mm0, %%mm3 \n\t"
2219 "punpcklwd %%mm1, %%mm0 \n\t"
2220 "punpckhwd %%mm1, %%mm3 \n\t"
2221 "movq %%mm2, %%mm1 \n\t"
2222 "punpcklwd %%mm4, %%mm2 \n\t"
2223 "punpckhwd %%mm4, %%mm1 \n\t"
2224
2225 "movd %%mm0, 132(%2) \n\t"
2226 "psrlq $32, %%mm0 \n\t"
2227 "movd %%mm0, 148(%2) \n\t"
2228 "movd %%mm3, 164(%2) \n\t"
2229 "psrlq $32, %%mm3 \n\t"
2230 "movd %%mm3, 180(%2) \n\t"
2231 "movd %%mm3, 52(%3) \n\t"
2232 "movd %%mm2, 196(%2) \n\t"
2233 "movd %%mm2, 68(%3) \n\t"
2234 "psrlq $32, %%mm2 \n\t"
2235 "movd %%mm2, 84(%3) \n\t"
2236 "movd %%mm1, 100(%3) \n\t"
2237 "psrlq $32, %%mm1 \n\t"
2238 "movd %%mm1, 116(%3) \n\t"
2239
2240
2241 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2242 : "%eax", "%ebx"
2243 );
2244}
2245
2246/**
2247 * transposes the given 8x8 block
2248 */
2249static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2250{
2251 asm(
2252 "leal (%0, %1), %%eax \n\t"
2253 "leal (%%eax, %1, 4), %%ebx \n\t"
2254// 0 1 2 3 4 5 6 7 8 9
2255// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2256 "movq (%2), %%mm0 \n\t" // 12345678
2257 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2258 "movq %%mm0, %%mm2 \n\t" // 12345678
2259 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2260 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2261
2262 "movq 32(%2), %%mm1 \n\t"
2263 "movq 48(%2), %%mm3 \n\t"
2264 "movq %%mm1, %%mm4 \n\t"
2265 "punpcklbw %%mm3, %%mm1 \n\t"
2266 "punpckhbw %%mm3, %%mm4 \n\t"
2267
2268 "movq %%mm0, %%mm3 \n\t"
2269 "punpcklwd %%mm1, %%mm0 \n\t"
2270 "punpckhwd %%mm1, %%mm3 \n\t"
2271 "movq %%mm2, %%mm1 \n\t"
2272 "punpcklwd %%mm4, %%mm2 \n\t"
2273 "punpckhwd %%mm4, %%mm1 \n\t"
2274
2275 "movd %%mm0, (%0) \n\t"
2276 "psrlq $32, %%mm0 \n\t"
2277 "movd %%mm0, (%%eax) \n\t"
2278 "movd %%mm3, (%%eax, %1) \n\t"
2279 "psrlq $32, %%mm3 \n\t"
2280 "movd %%mm3, (%%eax, %1, 2) \n\t"
2281 "movd %%mm2, (%0, %1, 4) \n\t"
2282 "psrlq $32, %%mm2 \n\t"
2283 "movd %%mm2, (%%ebx) \n\t"
2284 "movd %%mm1, (%%ebx, %1) \n\t"
2285 "psrlq $32, %%mm1 \n\t"
2286 "movd %%mm1, (%%ebx, %1, 2) \n\t"
2287
2288
2289 "movq 64(%2), %%mm0 \n\t" // 12345678
2290 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2291 "movq %%mm0, %%mm2 \n\t" // 12345678
2292 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2293 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2294
2295 "movq 96(%2), %%mm1 \n\t"
2296 "movq 112(%2), %%mm3 \n\t"
2297 "movq %%mm1, %%mm4 \n\t"
2298 "punpcklbw %%mm3, %%mm1 \n\t"
2299 "punpckhbw %%mm3, %%mm4 \n\t"
2300
2301 "movq %%mm0, %%mm3 \n\t"
2302 "punpcklwd %%mm1, %%mm0 \n\t"
2303 "punpckhwd %%mm1, %%mm3 \n\t"
2304 "movq %%mm2, %%mm1 \n\t"
2305 "punpcklwd %%mm4, %%mm2 \n\t"
2306 "punpckhwd %%mm4, %%mm1 \n\t"
2307
2308 "movd %%mm0, 4(%0) \n\t"
2309 "psrlq $32, %%mm0 \n\t"
2310 "movd %%mm0, 4(%%eax) \n\t"
2311 "movd %%mm3, 4(%%eax, %1) \n\t"
2312 "psrlq $32, %%mm3 \n\t"
2313 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2314 "movd %%mm2, 4(%0, %1, 4) \n\t"
2315 "psrlq $32, %%mm2 \n\t"
2316 "movd %%mm2, 4(%%ebx) \n\t"
2317 "movd %%mm1, 4(%%ebx, %1) \n\t"
2318 "psrlq $32, %%mm1 \n\t"
2319 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2320
2321 :: "r" (dst), "r" (dstStride), "r" (src)
2322 : "%eax", "%ebx"
2323 );
2324}
e5c30e06 2325#endif
be44a4d7 2326//static int test=0;
4e4dcbc5 2327
/**
 * Temporal noise reducer for one 8x8 block
 * src is the current block, tempBlured the matching block of the temporally blurred picture,
 * both use the same stride
 * the squared difference of the 2 blocks is summed up, smoothed with the values stored for the
 * 4 neighbouring blocks (tempBluredPast -1, +1, -256, +256) and then compared against the
 * 3 thresholds to pick how strongly the old picture is kept:
 *   d <  threshold 0             : 7 parts old + 1 part new
 *   threshold 0 <= d, below 1    : 3 parts old + 1 part new
 *   above threshold 1, d < thr. 2: 1 part old + 1 part new
 *   otherwise                    : the new block replaces the old one, src is left untouched
 * NOTE(review): the asm version takes the thresholds from the global maxTmpNoise instead of the
 * maxNoise argument and stores the smoothed value in *tempBluredPast, the C version stores the
 * unsmoothed one - confirm which of the two is intended
 */
static void inline tempNoiseReducer(uint8_t *src, int stride,
				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
#define FAST_L2_DIFF
//#define L1_DIFF // the thresholds have to be changed too if this one is tried
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%2, %2, 2), %%eax \n\t" // 3*stride
		"leal (%2, %2, 4), %%ebx \n\t" // 5*stride
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
//	0	1	2	3	4	5	6	7	8	9
//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+ebx	%x+2eax	%x+ecx	%x+8%2
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
		"movq (%0), %%mm0 \n\t" // L0
		"psadbw (%1), %%mm0 \n\t" // |L0-R0|
		"movq (%0, %2), %%mm1 \n\t" // L1
		"psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|

		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"paddw %%mm1, %%mm0 \n\t"
		"psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"paddw %%mm2, %%mm0 \n\t"
		"psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5|
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"paddw %%mm3, %%mm0 \n\t"
		"psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"paddw %%mm4, %%mm0 \n\t"
		"psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
		"paddw %%mm5, %%mm6 \n\t"
		"paddw %%mm7, %%mm6 \n\t"
		"paddw %%mm6, %%mm0 \n\t"
#elif defined (FAST_L2_DIFF)
		"pcmpeqb %%mm7, %%mm7 \n\t"
		"movq b80, %%mm6 \n\t"
		"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"pxor %%mm7, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm5)\
		"paddb %%mm6, %%mm5 \n\t"\
		"movq %%mm5, %%mm2 \n\t"\
		"psllw $8, %%mm5 \n\t"\
		"pmaddwd %%mm5, %%mm5 \n\t"\
		"pmaddwd %%mm2, %%mm2 \n\t"\
		"paddd %%mm2, %%mm5 \n\t"\
		"psrld $14, %%mm5 \n\t"\
		"paddd %%mm5, %%mm0 \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#else
		"pxor %%mm7, %%mm7 \n\t"
		"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5 \n\t"\
		"movq " #b ", %%mm2 \n\t"\
		"movq %%mm5, %%mm1 \n\t"\
		"movq %%mm2, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm5 \n\t"\
		"punpckhbw %%mm7, %%mm1 \n\t"\
		"punpcklbw %%mm7, %%mm2 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm2, %%mm5 \n\t"\
		"psubw %%mm3, %%mm1 \n\t"\
		"pmaddwd %%mm5, %%mm5 \n\t"\
		"pmaddwd %%mm1, %%mm1 \n\t"\
		"paddd %%mm1, %%mm5 \n\t"\
		"paddd %%mm5, %%mm0 \n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#endif

		// add both halves of mm0 and smooth with the values of the 4 neighbouring blocks
		"movq %%mm0, %%mm4 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddd %%mm0, %%mm4 \n\t"
		"movd %%mm4, %%ecx \n\t"
		"shll $2, %%ecx \n\t"
		"movl %3, %%ebx \n\t"
		"addl -4(%%ebx), %%ecx \n\t"
		"addl 4(%%ebx), %%ecx \n\t"
		"addl -1024(%%ebx), %%ecx \n\t"
		"addl $4, %%ecx \n\t"
		"addl 1024(%%ebx), %%ecx \n\t"
		"shrl $3, %%ecx \n\t"
		"movl %%ecx, (%%ebx) \n\t"
		"leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride

		"cmpl 4+maxTmpNoise, %%ecx \n\t"
		" jb 2f \n\t"
		"cmpl 8+maxTmpNoise, %%ecx \n\t"
		" jb 1f \n\t"

		// too different: copy the current block into the blurred picture
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"movq %%mm0, (%1) \n\t" // L0
		"movq %%mm1, (%1, %2) \n\t" // L1
		"movq %%mm2, (%1, %2, 2) \n\t" // L2
		"movq %%mm3, (%1, %%eax) \n\t" // L3
		"movq %%mm4, (%1, %2, 4) \n\t" // L4
		"movq %%mm5, (%1, %%ebx) \n\t" // L5
		"movq %%mm6, (%1, %%eax, 2) \n\t" // L6
		"movq %%mm7, (%1, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		// one averaging step
		"1: \n\t"
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"pavgb (%1), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"pavgb (%1, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"pavgb (%1, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"pavgb (%1, %%eax), %%mm3 \n\t" // L3
		"movq (%0, %2, 4), %%mm4 \n\t" // L4
		"pavgb (%1, %2, 4), %%mm4 \n\t" // L4
		"movq (%0, %%ebx), %%mm5 \n\t" // L5
		"pavgb (%1, %%ebx), %%mm5 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
		"pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6
		"movq (%0, %%ecx), %%mm7 \n\t" // L7
		"pavgb (%1, %%ecx), %%mm7 \n\t" // L7
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm4, (%1, %2, 4) \n\t" // R4
		"movq %%mm5, (%1, %%ebx) \n\t" // R5
		"movq %%mm6, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm7, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3
		"movq %%mm4, (%0, %2, 4) \n\t" // L4
		"movq %%mm5, (%0, %%ebx) \n\t" // L5
		"movq %%mm6, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm7, (%0, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		"2: \n\t"
		"cmpl maxTmpNoise, %%ecx \n\t"
		" jb 3f \n\t"

		// two averaging steps against the old picture
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%1), %%mm4 \n\t" // R0
		"movq (%1, %2), %%mm5 \n\t" // R1
		"movq (%1, %2, 2), %%mm6 \n\t" // R2
		"movq (%1, %%eax), %%mm7 \n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3

		"movq (%0, %2, 4), %%mm0 \n\t" // L4
		"movq (%0, %%ebx), %%mm1 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
		"movq (%0, %%ecx), %%mm3 \n\t" // L7
		"movq (%1, %2, 4), %%mm4 \n\t" // R4
		"movq (%1, %%ebx), %%mm5 \n\t" // R5
		"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
		"movq (%1, %%ecx), %%mm7 \n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4) \n\t" // R4
		"movq %%mm1, (%1, %%ebx) \n\t" // R5
		"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm3, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0, %2, 4) \n\t" // L4
		"movq %%mm1, (%0, %%ebx) \n\t" // L5
		"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm3, (%0, %%ecx) \n\t" // L7
		"jmp 4f \n\t"

		// three averaging steps against the old picture
		"3: \n\t"
		"leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
		"movq (%0), %%mm0 \n\t" // L0
		"movq (%0, %2), %%mm1 \n\t" // L1
		"movq (%0, %2, 2), %%mm2 \n\t" // L2
		"movq (%0, %%eax), %%mm3 \n\t" // L3
		"movq (%1), %%mm4 \n\t" // R0
		"movq (%1, %2), %%mm5 \n\t" // R1
		"movq (%1, %2, 2), %%mm6 \n\t" // R2
		"movq (%1, %%eax), %%mm7 \n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1) \n\t" // R0
		"movq %%mm1, (%1, %2) \n\t" // R1
		"movq %%mm2, (%1, %2, 2) \n\t" // R2
		"movq %%mm3, (%1, %%eax) \n\t" // R3
		"movq %%mm0, (%0) \n\t" // L0
		"movq %%mm1, (%0, %2) \n\t" // L1
		"movq %%mm2, (%0, %2, 2) \n\t" // L2
		"movq %%mm3, (%0, %%eax) \n\t" // L3

		"movq (%0, %2, 4), %%mm0 \n\t" // L4
		"movq (%0, %%ebx), %%mm1 \n\t" // L5
		"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
		"movq (%0, %%ecx), %%mm3 \n\t" // L7
		"movq (%1, %2, 4), %%mm4 \n\t" // R4
		"movq (%1, %%ebx), %%mm5 \n\t" // R5
		"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
		"movq (%1, %%ecx), %%mm7 \n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4) \n\t" // R4
		"movq %%mm1, (%1, %%ebx) \n\t" // R5
		"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
		"movq %%mm3, (%1, %%ecx) \n\t" // R7
		"movq %%mm0, (%0, %2, 4) \n\t" // L4
		"movq %%mm1, (%0, %%ebx) \n\t" // L5
		"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
		"movq %%mm3, (%0, %%ecx) \n\t" // L7

		"4: \n\t"

		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
		: "%eax", "%ebx", "%ecx", "memory"
	);
#else
	int row, col;
	int sqrDiff= 0;		// sum of the squared differences of all 64 pixels
	int d;			// sqrDiff smoothed with the 4 neighbouring blocks
	int shift;		// log2 of the blend denominator, 0 = no blend, just copy

	for(row=0; row<8; row++)
		for(col=0; col<8; col++)
		{
			const int pos= col + row*stride;
			const int delta= tempBlured[pos] - src[pos];
			sqrDiff+= delta*delta;
		}

	// the neighbours are read before the own entry is replaced by the unsmoothed sum
	d= (	 4*sqrDiff
		+(*(tempBluredPast-256))
		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
		+(*(tempBluredPast+256))
		+4)>>3;
	*tempBluredPast= sqrDiff;

/*
Switch between
 1  0  0  0  0  0  0 (0)
64 32 16  8  4  2  1 (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
	if(d > maxNoise[1])	shift= (d < maxNoise[2]) ? 1 : 0;
	else			shift= (d < maxNoise[0]) ? 3 : 2;

	if(shift == 0)
	{
		// the block changed too much, restart the blurred picture from it
		for(row=0; row<8; row++)
			for(col=0; col<8; col++)
				tempBlured[col + row*stride]= src[col + row*stride];
	}
	else
	{
		// (old*(2^shift - 1) + new + rounding) / 2^shift, written to both pictures
		const int oldWeight= (1<<shift) - 1;
		const int rounding = 1<<(shift-1);

		for(row=0; row<8; row++)
			for(col=0; col<8; col++)
			{
				const int pos= col + row*stride;
				const int old= tempBlured[pos];
				const int cur= src[pos];

				src[pos]= (old*oldWeight + cur + rounding)>>shift;
				tempBlured[pos]= src[pos];
			}
	}
#endif
}
2726
9a722af7
A
2727#ifdef HAVE_ODIVX_POSTPROCESS
2728#include "../opendivx/postprocess.h"
2729int use_old_pp=0;
2730#endif
13e00528 2731
9a722af7 2732static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 2733 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
13e00528 2734
911879d1
MN
/* -pp Command line Help
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?

-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...

long form example:
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint		-pp default,-vdeblock
short form example:
-pp vb:a,hb:a,lb					-pp de,-vb
more examples:
-pp tn:64:128:256

Filters			Options
short	long name	short	long option	Description
*	*		a	autoq		cpu power dependent enabler
			c	chrom		chrominance filtering enabled
			y	nochrom		chrominance filtering disabled
hb	hdeblock				horizontal deblocking filter
vb	vdeblock				vertical deblocking filter
vr	rkvdeblock
h1	x1hdeblock				Experimental horizontal deblock filter 1
v1	x1vdeblock				Experimental vertical deblock filter 1
dr	dering					not implemented yet
al	autolevels				automatic brightness / contrast fixer
			f	fullyrange	stretch luminance range to (0..255)
lb	linblenddeint				linear blend deinterlacer
li	linipoldeint				linear interpolating deinterlacer
ci	cubicipoldeint				cubic interpolating deinterlacer
md	mediandeint				median deinterlacer
de	default					hdeblock:a,vdeblock:a,dering:a,autolevels
fa	fast					x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer
*/
2768
2769/**
2770 * returns a PPMode struct which will have a non 0 error variable if an error occured
2771 * name is the string after "-pp" on the command line
2772 * quality is a number from 0 to GET_PP_QUALITY_MAX
2773 */
2774struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2775{
2776 char temp[GET_MODE_BUFFER_SIZE];
2777 char *p= temp;
2778 char *filterDelimiters= ",";
2779 char *optionDelimiters= ":";
117e45b0 2780 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
911879d1
MN
2781 char *filterToken;
2782
2783 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2784
117e45b0
MN
2785 printf("%s\n", name);
2786
911879d1 2787 for(;;){
911879d1 2788 char *filterName;
117e45b0 2789 int q= 1000000; //GET_PP_QUALITY_MAX;
911879d1
MN
2790 int chrom=-1;
2791 char *option;
2792 char *options[OPTIONS_ARRAY_SIZE];
2793 int i;
2794 int filterNameOk=0;
2795 int numOfUnknownOptions=0;
2796 int enable=1; //does the user want us to enabled or disabled the filter
2797
2798 filterToken= strtok(p, filterDelimiters);
2799 if(filterToken == NULL) break;
117e45b0 2800 p+= strlen(filterToken) + 1; // p points to next filterToken
911879d1
MN
2801 filterName= strtok(filterToken, optionDelimiters);
2802 printf("%s::%s\n", filterToken, filterName);
2803
2804 if(*filterName == '-')
2805 {
2806 enable=0;
2807 filterName++;
2808 }
117e45b0 2809
911879d1
MN
2810 for(;;){ //for all options
2811 option= strtok(NULL, optionDelimiters);
2812 if(option == NULL) break;
2813
2814 printf("%s\n", option);
2815 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2816 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2817 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2818 else
2819 {
2820 options[numOfUnknownOptions] = option;
2821 numOfUnknownOptions++;
911879d1
MN
2822 }
2823 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2824 }
117e45b0 2825 options[numOfUnknownOptions] = NULL;
911879d1
MN
2826
2827 /* replace stuff from the replace Table */
2828 for(i=0; replaceTable[2*i]!=NULL; i++)
2829 {
2830 if(!strcmp(replaceTable[2*i], filterName))
2831 {
2832 int newlen= strlen(replaceTable[2*i + 1]);
2833 int plen;
2834 int spaceLeft;
2835
2836 if(p==NULL) p= temp, *p=0; //last filter
2837 else p--, *p=','; //not last filter
2838
2839 plen= strlen(p);
2840 spaceLeft= (int)p - (int)temp + plen;
2841 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
2842 {
2843 ppMode.error++;
2844 break;
2845 }
2846 memmove(p + newlen, p, plen+1);
2847 memcpy(p, replaceTable[2*i + 1], newlen);
2848 filterNameOk=1;
2849 }
2850 }
2851
2852 for(i=0; filters[i].shortName!=NULL; i++)
2853 {
117e45b0 2854// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
911879d1
MN
2855 if( !strcmp(filters[i].longName, filterName)
2856 || !strcmp(filters[i].shortName, filterName))
2857 {
2858 ppMode.lumMode &= ~filters[i].mask;
2859 ppMode.chromMode &= ~filters[i].mask;
2860
2861 filterNameOk=1;
2862 if(!enable) break; // user wants to disable it
2863
2864 if(q >= filters[i].minLumQuality)
2865 ppMode.lumMode|= filters[i].mask;
2866 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2867 if(q >= filters[i].minChromQuality)
2868 ppMode.chromMode|= filters[i].mask;
2869
2870 if(filters[i].mask == LEVEL_FIX)
2871 {
2872 int o;
2873 ppMode.minAllowedY= 16;
2874 ppMode.maxAllowedY= 234;
2875 for(o=0; options[o]!=NULL; o++)
2876 if( !strcmp(options[o],"fullyrange")
2877 ||!strcmp(options[o],"f"))
2878 {
2879 ppMode.minAllowedY= 0;
2880 ppMode.maxAllowedY= 255;
2881 numOfUnknownOptions--;
2882 }
2883 }
117e45b0
MN
2884 else if(filters[i].mask == TEMP_NOISE_FILTER)
2885 {
2886 int o;
2887 int numOfNoises=0;
2888 ppMode.maxTmpNoise[0]= 150;
2889 ppMode.maxTmpNoise[1]= 200;
2890 ppMode.maxTmpNoise[2]= 400;
2891
2892 for(o=0; options[o]!=NULL; o++)
2893 {
2894 char *tail;
2895 ppMode.maxTmpNoise[numOfNoises]=
2896 strtol(options[o], &tail, 0);
2897 if(tail!=options[o])
2898 {
2899 numOfNoises++;
2900 numOfUnknownOptions--;
2901 if(numOfNoises >= 3) break;
2902 }
2903 }
2904 }
911879d1
MN
2905 }
2906 }
2907 if(!filterNameOk) ppMode.error++;
2908 ppMode.error += numOfUnknownOptions;
2909 }
2910
815cbfe7 2911#ifdef HAVE_ODIVX_POSTPROCESS
911879d1
MN
2912 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2913 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2914 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2915 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2916 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2917 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
815cbfe7 2918#endif
911879d1
MN
2919
2920 return ppMode;
2921}
2922
3057fa66 2923/**
117e45b0 2924 * Obsolete, dont use it, use postprocess2() instead
3057fa66 2925 */
3057fa66
A
2926void postprocess(unsigned char * src[], int src_stride,
2927 unsigned char * dst[], int dst_stride,
2928 int horizontal_size, int vertical_size,
2929 QP_STORE_T *QP_store, int QP_stride,
2930 int mode)
2931{
117e45b0
MN
2932 struct PPMode ppMode;
2933 static QP_STORE_T zeroArray[2048/8];
911879d1
MN
2934/*
2935 static int qual=0;
2936
117e45b0
MN
2937 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
2938 printf("OK\n");
911879d1
MN
2939 qual++;
2940 qual%=7;
117e45b0
MN
2941 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
2942 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
911879d1
MN
2943 postprocess2(src, src_stride, dst, dst_stride,
2944 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2945
2946 return;
2947*/
815cbfe7
MN
2948 if(QP_store==NULL)
2949 {
2950 QP_store= zeroArray;
2951 QP_stride= 0;
2952 }
13e00528 2953
117e45b0
MN
2954 ppMode.lumMode= mode;
2955 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2956 ppMode.chromMode= mode;
be44a4d7
MN
2957 ppMode.maxTmpNoise[0]= 700;
2958 ppMode.maxTmpNoise[1]= 1500;
2959 ppMode.maxTmpNoise[2]= 3000;
117e45b0 2960
9a722af7
A
2961#ifdef HAVE_ODIVX_POSTPROCESS
2962// Note: I could make this shit outside of this file, but it would mean one
2963// more function call...
2964 if(use_old_pp){
2965 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2966 return;
2967 }
2968#endif
2969
13e00528 2970 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 2971 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
3057fa66
A
2972
2973 horizontal_size >>= 1;
2974 vertical_size >>= 1;
2975 src_stride >>= 1;
2976 dst_stride >>= 1;
2977
2978 if(1)
2979 {
13e00528 2980 postProcess(src[1], src_stride, dst[1], dst_stride,
117e45b0 2981 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
13e00528 2982 postProcess(src[2], src_stride, dst[2], dst_stride,
117e45b0 2983 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
3057fa66
A
2984 }
2985 else
2986 {
117e45b0
MN
2987 memset(dst[1], 128, dst_stride*vertical_size);
2988 memset(dst[2], 128, dst_stride*vertical_size);
2989// memcpy(dst[1], src[1], src_stride*horizontal_size);
2990// memcpy(dst[2], src[2], src_stride*horizontal_size);
3057fa66
A
2991 }
2992}
9a722af7 2993
911879d1
MN
2994void postprocess2(unsigned char * src[], int src_stride,
2995 unsigned char * dst[], int dst_stride,
2996 int horizontal_size, int vertical_size,
2997 QP_STORE_T *QP_store, int QP_stride,
2998 struct PPMode *mode)
2999{
3000
815cbfe7
MN
3001 static QP_STORE_T zeroArray[2048/8];
3002 if(QP_store==NULL)
3003 {
3004 QP_store= zeroArray;
3005 QP_stride= 0;
3006 }
3007
911879d1
MN
3008#ifdef HAVE_ODIVX_POSTPROCESS
3009// Note: I could make this shit outside of this file, but it would mean one
3010// more function call...
3011 if(use_old_pp){
3012 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
3013 mode->oldMode);
3014 return;
3015 }
3016#endif
3017
3018 postProcess(src[0], src_stride, dst[0], dst_stride,
117e45b0 3019 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
911879d1
MN
3020
3021 horizontal_size >>= 1;
3022 vertical_size >>= 1;
3023 src_stride >>= 1;
3024 dst_stride >>= 1;
3025
3026 postProcess(src[1], src_stride, dst[1], dst_stride,
117e45b0 3027 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
911879d1 3028 postProcess(src[2], src_stride, dst[2], dst_stride,
117e45b0 3029 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
911879d1
MN
3030}
3031
3032
13e00528
A
3033/**
3034 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
9a722af7 3035 * 0 <= quality <= 6
13e00528 3036 */
9a722af7
A
3037int getPpModeForQuality(int quality){
3038 int modes[1+GET_PP_QUALITY_MAX]= {
3039 0,
3040#if 1
3041 // horizontal filters first
3042 LUM_H_DEBLOCK,
3043 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3044 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3045 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3046 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3047 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3048#else
3049 // vertical filters first
13e00528
A
3050 LUM_V_DEBLOCK,
3051 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3052 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3053 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3054 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3055 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
9a722af7
A
3056#endif
3057 };
3058
3059#ifdef HAVE_ODIVX_POSTPROCESS
3060 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3061 0,
3062 PP_DEBLOCK_Y_H,
3063 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3064 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3065 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3066 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3067 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3068 };
3069 if(use_old_pp) return odivx_modes[quality];
3070#endif
3071 return modes[quality];
3057fa66
A
3072}
3073
3074/**
3075 * Copies a block from src to dst and fixes the blacklevel
d5a1a995
MN
3076 * always copies 8 lines; there is no numLines parameter
3077 * levelFix == 0 -> don't touch the brightness & contrast
3057fa66 3078 */
d5a1a995 3079static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
c09dc465 3080 int levelFix)
3057fa66 3081{
/*
 * Copies 8 rows from src to dst. Row n is read at src + n*srcStride and
 * written at dst + n*dstStride. The MMX paths move 8 bytes per row (one movq),
 * the C paths copy BLOCK_SIZE bytes per row.
 *
 * levelFix != 0 with HAVE_MMX: every byte is rescaled while it is copied.
 * levelFix != 0 without HAVE_MMX: the rows are copied unchanged, exactly as
 * in the levelFix == 0 case, i.e. the C fallback performs no level fix.
 */
e5c30e06 3082#ifndef HAVE_MMX
d5a1a995 3083 int i;
e5c30e06 3084#endif
d5a1a995
MN
3085 if(levelFix)
3086 {
3057fa66
A
3087#ifdef HAVE_MMX
/*
 * eax = src + srcStride and ebx = dst + dstStride are scratch pointers so that
 * all 8 rows can be addressed with base + index*scale operands.
 * mm2 = packedYOffset, mm3 = packedYScale, mm4 = 0 (used to unpack bytes).
 * SCALED_CPY processes two rows: each byte is widened to a 16 bit word, mm2 is
 * subtracted, the result is shifted left by 6, multiplied by mm3 keeping the
 * high 16 bits of the signed product (pmulhw), and packed back to bytes with
 * unsigned saturation (packuswb).
 * The four invocations cover rows 0/1, 2/3, 4/5 and, after eax/ebx have been
 * advanced by 4 rows, 6/7.
 * NOTE(review): the asm stores through %1 / ebx but lists only eax and ebx as
 * clobbers (no "memory" clobber) -- confirm this is safe for the compilers in use.
 */
3088 asm volatile(
043ba56f
MN
3089 "leal (%0,%2), %%eax \n\t"
3090 "leal (%1,%3), %%ebx \n\t"
3057fa66
A
3091 "movq packedYOffset, %%mm2 \n\t"
3092 "movq packedYScale, %%mm3 \n\t"
5b65f0df 3093 "pxor %%mm4, %%mm4 \n\t"
3057fa66 3094
043ba56f
MN
3095#define SCALED_CPY(src1, src2, dst1, dst2) \
3096 "movq " #src1 ", %%mm0 \n\t"\
3097 "movq " #src1 ", %%mm5 \n\t"\
5b65f0df
MN
3098 "punpcklbw %%mm4, %%mm0 \n\t"\
3099 "punpckhbw %%mm4, %%mm5 \n\t"\
57d04d3f
MN
3100 "psubw %%mm2, %%mm0 \n\t"\
3101 "psubw %%mm2, %%mm5 \n\t"\
043ba56f 3102 "movq " #src2 ", %%mm1 \n\t"\
57d04d3f
MN
3103 "psllw $6, %%mm0 \n\t"\
3104 "psllw $6, %%mm5 \n\t"\
5b65f0df 3105 "pmulhw %%mm3, %%mm0 \n\t"\
043ba56f 3106 "movq " #src2 ", %%mm6 \n\t"\
5b65f0df 3107 "pmulhw %%mm3, %%mm5 \n\t"\
5b65f0df 3108 "punpcklbw %%mm4, %%mm1 \n\t"\
534a602d 3109 "punpckhbw %%mm4, %%mm6 \n\t"\
57d04d3f 3110 "psubw %%mm2, %%mm1 \n\t"\
534a602d 3111 "psubw %%mm2, %%mm6 \n\t"\
57d04d3f 3112 "psllw $6, %%mm1 \n\t"\
534a602d 3113 "psllw $6, %%mm6 \n\t"\
5b65f0df 3114 "pmulhw %%mm3, %%mm1 \n\t"\
534a602d 3115 "pmulhw %%mm3, %%mm6 \n\t"\
534a602d
MN
3116 "packuswb %%mm5, %%mm0 \n\t"\
3117 "packuswb %%mm6, %%mm1 \n\t"\
043ba56f
MN
3118 "movq %%mm0, " #dst1 " \n\t"\
3119 "movq %%mm1, " #dst2 " \n\t"\
3120
3121SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3122SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3123SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3124 "leal (%%eax,%2,4), %%eax \n\t"
3125 "leal (%%ebx,%3,4), %%ebx \n\t"
3126SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3127
3128
3129 : : "r"(src),
3130 "r"(dst),
3131 "r" (srcStride),
534a602d 3132 "r" (dstStride)
d5a1a995
MN
3133 : "%eax", "%ebx"
3134 );
3135#else
/* C fallback: plain row by row copy, the levels are NOT adjusted here */
c09dc465 3136 for(i=0; i<8; i++)
d5a1a995
MN
3137 memcpy( &(dst[dstStride*i]),
3138 &(src[srcStride*i]), BLOCK_SIZE);
3139#endif
3140 }
3141 else
3142 {
3143#ifdef HAVE_MMX
/*
 * Unmodified copy of the 8 rows, two rows per SIMPLE_CPY, using the same
 * eax/ebx addressing scheme as the level fixing path above.
 */
3144 asm volatile(
043ba56f
MN
3145 "leal (%0,%2), %%eax \n\t"
3146 "leal (%1,%3), %%ebx \n\t"
3147
3148#define SIMPLE_CPY(src1, src2, dst1, dst2) \
3149 "movq " #src1 ", %%mm0 \n\t"\
3150 "movq " #src2 ", %%mm1 \n\t"\
3151 "movq %%mm0, " #dst1 " \n\t"\
3152 "movq %%mm1, " #dst2 " \n\t"\
3153
3154SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3155SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3156SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3157 "leal (%%eax,%2,4), %%eax \n\t"
3158 "leal (%%ebx,%3,4), %%ebx \n\t"
3159SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3160
3057fa66
A
3161 : : "r" (src),
3162 "r" (dst),
3163 "r" (srcStride),
c09dc465 3164 "r" (dstStride)
3057fa66
A
3165 : "%eax", "%ebx"
3166 );
3167#else
/* C fallback: copy 8 rows of BLOCK_SIZE bytes each */
c09dc465 3168 for(i=0; i<8; i++)
3057fa66
A
3169 memcpy( &(dst[dstStride*i]),
3170 &(src[srcStride*i]), BLOCK_SIZE);
3171#endif
d5a1a995 3172 }
3057fa66
A
3173}
3174
3175
3176/**
3177 * Filters array of bytes (Y or U or V values)
3178 */
9a722af7 3179static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
117e45b0 3180 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3057fa66 3181{
d5a1a995 3182 int x,y;
117e45b0
MN
3183 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3184
d5a1a995
MN
3185