- Preliminary RTP friendly mode for H.263.
[libav.git] / postproc / postprocess_template.c
CommitLineData
3057fa66
A
1/*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
19/*
3b58b885 20 C MMX MMX2 3DNow
3057fa66
A
21isVertDC Ec Ec
22isVertMinMaxOk Ec Ec
3b58b885 23doVertLowPass E e e
3057fa66
A
24doVertDefFilter Ec Ec Ec
25isHorizDC Ec Ec
4e4dcbc5
MN
26isHorizMinMaxOk a E
27doHorizLowPass E e e
e5c30e06 28doHorizDefFilter Ec Ec Ec
2e212618 29deRing E e e*
3b58b885 30Vertical RKAlgo1 E a a
e5c30e06 31Horizontal RKAlgo1 a a
117e45b0
MN
32Vertical X1# a E E
33Horizontal X1# a E E
acced553
MN
34LinIpolDeinterlace e E E*
35CubicIpolDeinterlace a e e*
36LinBlendDeinterlace e E E*
117e45b0 37MedianDeinterlace# Ec Ec
be44a4d7 38TempDeNoiser# E e e
d5a1a995 39
117e45b0
MN
40* I don't have a 3DNow! CPU -> it's untested, but no one has said it doesn't work, so it seems to work
41# more or less self-invented filters, so the exactness isn't too meaningful
3057fa66 42E = Exact implementation
acced553 43e = almost exact implementation (slightly different rounding,...)
3057fa66
A
44a = alternative / approximate impl
45c = checked against the other implementations (-vo md5)
46*/
47
48/*
49TODO:
9f45d04d 50verify that everything works as it should (how?)
3057fa66 51reduce the time wasted on the mem transfer
13e00528 52implement everything in C at least (done at the moment but ...)
3057fa66
A
53unroll stuff if instructions depend too much on the prior one
54we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55move YScale thing to the end instead of fixing QP
13e00528 56write a faster and higher quality deblocking filter :)
d5a1a995
MN
57make the mainloop more flexible (variable number of blocks at once)
58 (the if/else stuff per block is slowing things down)
9f45d04d 59compare the quality & speed of all filters
9f45d04d 60split this huge file
e5c30e06 61border remover
8405b3fd 62optimize c versions
117e45b0 63try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
be44a4d7 64smart blur
3057fa66
A
65...
66
67Notes:
13e00528
A
68*/
69
a6be8111 70//Changelog: use the CVS log
3057fa66 71
6c426cff 72#include "../config.h"
3057fa66
A
73#include <inttypes.h>
74#include <stdio.h>
d5a1a995 75#include <stdlib.h>
911879d1 76#include <string.h>
dda87e9f
PL
77#ifdef HAVE_MALLOC_H
78#include <malloc.h>
79#endif
3057fa66 80//#undef HAVE_MMX2
13e00528 81//#define HAVE_3DNOW
3057fa66 82//#undef HAVE_MMX
13e00528 83#include "postprocess.h"
3057fa66 84
e939e1c3
A
/*
 * Small arithmetic helper macros.
 * Every argument can be expanded (and therefore evaluated) more than once,
 * so do not pass expressions with side effects.
 * ABS(0) is 0; SIGN(0) is -1 because only strictly positive values map to 1.
 */
85#define MIN(a,b) ((a) > (b) ? (b) : (a))
86#define MAX(a,b) ((a) < (b) ? (b) : (a))
87#define ABS(a) ((a) > 0 ? (a) : (-(a)))
88#define SIGN(a) ((a) > 0 ? 1 : -1)
89
/*
 * PAVGB(a,b): emits the packed byte average instruction as an asm string,
 * "pavgb" with MMX2 and "pavgusb" with 3DNow. It is not defined when neither
 * HAVE_MMX2 nor HAVE_3DNOW is set.
 */
90#ifdef HAVE_MMX2
91#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
92#elif defined (HAVE_3DNOW)
93#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
94#endif
3057fa66 95
2e212618
MN
/*
 * PMINUB(x,y,t): packed unsigned byte minimum, result left in the second
 * argument. With MMX2 this is a single "pminub" and t is unused. The plain
 * MMX fallback names its parameters (b,a,t): it copies the second argument
 * to the scratch register t, subtracts the first with unsigned saturation
 * and then subtracts t from the second argument.
 */
96#ifdef HAVE_MMX2
97#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
98#elif defined (HAVE_MMX)
99#define PMINUB(b,a,t) \
100 "movq " #a ", " #t " \n\t"\
101 "psubusb " #b ", " #t " \n\t"\
102 "psubb " #t ", " #a " \n\t"
103#endif
104
/*
 * PMAXUB(a,b): packed unsigned byte maximum, result left in b.
 * The plain MMX fallback uses a saturating subtract of a from b followed by
 * adding a back.
 */
105#ifdef HAVE_MMX2
106#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
107#elif defined (HAVE_MMX)
108#define PMAXUB(a,b) \
109 "psubusb " #a ", " #b " \n\t"\
110 "paddb " #a ", " #b " \n\t"
111#endif
112
113
911879d1
MN
/* Size limits; the code that uses them is outside this part of the file. */
114#define GET_MODE_BUFFER_SIZE 500
115#define OPTIONS_ARRAY_SIZE 10
116
/*
 * File-scope 64-bit constants and scratch slots. With HAVE_MMX they are
 * 8-byte aligned and are referenced by symbol name from inside the inline
 * assembly strings in this file (e.g. b7E, b7C, b00, bFF, pQPb).
 * Naming, as can be read off the initialisers:
 *   wNN        - the 16-bit value 0xNN(NN) repeated in all four words
 *   bNN        - the byte 0xNN repeated in all eight bytes
 *   bmXXXXXXXX - byte mask, one digit per byte (most significant byte first),
 *                1 = 0xFF, 0 = 0x00
 * Without HAVE_MMX only packedYOffset, packedYScale and tempBlocks exist.
 */
b28daef8 117#ifdef HAVE_MMX
3fe8e8f0
MN
118static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
119static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
b28daef8
MN
120static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
121static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
122static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
123static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
124static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
125static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
126static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
127static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
128static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
129static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
130static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
131static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
132static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
133static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
134static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
135static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
136static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
137static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
138static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
139static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
140static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
141static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
142static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
143static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
144static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
145static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
146static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
147static uint64_t __attribute__((aligned(8))) temp0=0;
148static uint64_t __attribute__((aligned(8))) temp1=0;
149static uint64_t __attribute__((aligned(8))) temp2=0;
150static uint64_t __attribute__((aligned(8))) temp3=0;
151static uint64_t __attribute__((aligned(8))) temp4=0;
152static uint64_t __attribute__((aligned(8))) temp5=0;
// pQPb is read by the deblocking asm as "QP in every byte"; it is set outside this part of the file
153static uint64_t __attribute__((aligned(8))) pQPb=0;
154static uint64_t __attribute__((aligned(8))) pQPb2=0;
155static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
a9c77978 156static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
b28daef8 157#else
3057fa66
A
158static uint64_t packedYOffset= 0x0000000000000000LL;
159static uint64_t packedYScale= 0x0100010001000100LL;
4e4dcbc5 160static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
b28daef8 161#endif
3057fa66
A
162
// Flatness thresholds (non-static, so visible to other translation units).
// isVertDC() reports a block as flat when its count of near-equal vertical
// neighbour pairs exceeds vFlatnessThreshold; hFlatnessThreshold is
// presumably the horizontal counterpart (its user is not in this part of the file).
163int hFlatnessThreshold= 56 - 16;
164int vFlatnessThreshold= 56 - 16;
165
166//amount of "black" you are willing to lose to get a brightness-corrected picture
167double maxClippedThreshold= 0.01;
168
// Allowed luma range; presumably consumed by the level-fix ("autolevels") code -- TODO confirm, the user is not visible here
911879d1 169int maxAllowedY=234;
658a85f2 170int minAllowedY=16;
3057fa66 171
911879d1
MN
/*
 * Table of the available filters, terminated by an all-NULL/0 entry.
 * Each entry starts with a short name and a long name, followed by three
 * integers and a filter flag. struct PPFilter is declared elsewhere, so the
 * meaning of the three integers is not visible here -- see its declaration.
 * NOTE(review): "vr"/"rkvdeblock" reads like a vertical filter but is mapped
 * to H_RK1_FILTER -- confirm whether a vertical flag was intended.
 */
172static struct PPFilter filters[]=
173{
174 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
175 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
176 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
177 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
178 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
179 {"dr", "dering", 1, 5, 6, DERING},
180 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
181 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
117e45b0 185 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
911879d1
MN
186 {NULL, NULL,0,0,0,0} //End Marker
187};
188
/*
 * Alias table stored as consecutive (name, expansion) string pairs and
 * terminated by a single NULL. "de" has the same expansion as "default",
 * and "fa" the same as "fast". The code that applies the table is outside
 * this part of the file.
 */
189static char *replaceTable[]=
190{
117e45b0
MN
191 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
192 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
193 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
194 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
911879d1
MN
195 NULL //End Marker
196};
197
b28daef8 198#ifdef HAVE_MMX
e5c30e06
MN
/**
 * Dummy function that mentions the file-scope MMX constants in C code.
 * Many of them are otherwise only named inside inline-asm strings, so, as the
 * function name says, this exists to silence "unused variable" warnings.
 * The only side effect is that b00 is stored as 0 if the sum happens to be 0.
 */
199static inline void unusedVariableWarningFixer()
200{
201if(
202 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
203 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
204 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
b28daef8 205 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
e5c30e06
MN
206 + temp5 + pQPb== 0) b00=0;
207}
b28daef8 208#endif
e5c30e06 209
a6be8111 210#ifdef TIMING
3057fa66
A
/**
 * Read the CPU time-stamp counter with the RDTSC instruction (only compiled
 * when TIMING is defined).
 * The "=A" output constraint is what GCC uses on 32-bit x86 for the EDX:EAX
 * register pair -- NOTE(review): confirm behaviour before building for x86-64.
 * @return the counter value
 */
211static inline long long rdtsc()
212{
213 long long l;
214 asm volatile( "rdtsc\n\t"
215 : "=A" (l)
216 );
217// printf("%d\n", int(l/1000));
218 return l;
219}
9a722af7 220#endif
3057fa66 221
9a722af7 222#ifdef HAVE_MMX2
3057fa66
A
/**
 * Issue a PREFETCHNTA hint for the address p (MMX2 builds only).
 * @param p address to prefetch; it is only used as the instruction's operand
 */
223static inline void prefetchnta(void *p)
224{
225 asm volatile( "prefetchnta (%0)\n\t"
226 : : "r" (p)
227 );
228}
229
/**
 * Issue a PREFETCHT0 hint for the address p (MMX2 builds only).
 * @param p address to prefetch; it is only used as the instruction's operand
 */
230static inline void prefetcht0(void *p)
231{
232 asm volatile( "prefetcht0 (%0)\n\t"
233 : : "r" (p)
234 );
235}
236
/**
 * Issue a PREFETCHT1 hint for the address p (MMX2 builds only).
 * @param p address to prefetch; it is only used as the instruction's operand
 */
237static inline void prefetcht1(void *p)
238{
239 asm volatile( "prefetcht1 (%0)\n\t"
240 : : "r" (p)
241 );
242}
243
/**
 * Issue a PREFETCHT2 hint for the address p (MMX2 builds only).
 * @param p address to prefetch; it is only used as the instruction's operand
 */
244static inline void prefetcht2(void *p)
245{
246 asm volatile( "prefetcht2 (%0)\n\t"
247 : : "r" (p)
248 );
249}
9a722af7 250#endif
3057fa66
A
251
252//FIXME? in the MMX path the byte subtraction wraps, so |255-0| counts as 1 (shouldn't be a problem ...)
253/**
 * Check if the middle 8x8 block in the given 8x16 block is flat.
 *
 * src is first advanced by 4 rows. Then each pixel is compared with the pixel
 * one row below it (7 row pairs in the MMX path, BLOCK_SIZE-1 in the C path,
 * 8 columns each) and the pairs whose difference is -1, 0 or +1 are counted.
 *
 * @param src    start of the 8x16 block
 * @param stride offset in bytes from one row to the next
 * @return 1 if the count is greater than vFlatnessThreshold, otherwise 0
 */
d5a1a995 256static inline int isVertDC(uint8_t src[], int stride){
3057fa66 257 int numEq= 0;
e5c30e06 258#ifndef HAVE_MMX
d5a1a995 259 int y;
e5c30e06 260#endif
acced553 261 src+= stride*4; // src points to begin of the 8x8 Block
3057fa66 262#ifdef HAVE_MMX
37da00fc
MN
263asm volatile(
264 "leal (%1, %2), %%eax \n\t"
265 "leal (%%eax, %2, 4), %%ebx \n\t"
266// 0 1 2 3 4 5 6 7 8 9
267// %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
3057fa66
A
// For every row pair: (upper - lower) + 0x7E is signed-greater than 0x7C
// exactly when the byte difference is -1, 0 or +1; pcmpgtb then yields 0xFF
// (-1) in that byte, and these -1 values are accumulated per byte in mm0.
268 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
269 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
270 "movq (%1), %%mm0 \n\t"
37da00fc 271 "movq (%%eax), %%mm1 \n\t"
3057fa66
A
272 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
273 "paddb %%mm7, %%mm0 \n\t"
274 "pcmpgtb %%mm6, %%mm0 \n\t"
275
37da00fc 276 "movq (%%eax,%2), %%mm2 \n\t"
3057fa66
A
277 "psubb %%mm2, %%mm1 \n\t"
278 "paddb %%mm7, %%mm1 \n\t"
279 "pcmpgtb %%mm6, %%mm1 \n\t"
280 "paddb %%mm1, %%mm0 \n\t"
281
37da00fc 282 "movq (%%eax, %2, 2), %%mm1 \n\t"
3057fa66
A
283 "psubb %%mm1, %%mm2 \n\t"
284 "paddb %%mm7, %%mm2 \n\t"
285 "pcmpgtb %%mm6, %%mm2 \n\t"
286 "paddb %%mm2, %%mm0 \n\t"
287
37da00fc 288 "movq (%1, %2, 4), %%mm2 \n\t"
3057fa66
A
289 "psubb %%mm2, %%mm1 \n\t"
290 "paddb %%mm7, %%mm1 \n\t"
291 "pcmpgtb %%mm6, %%mm1 \n\t"
292 "paddb %%mm1, %%mm0 \n\t"
293
37da00fc 294 "movq (%%ebx), %%mm1 \n\t"
3057fa66
A
295 "psubb %%mm1, %%mm2 \n\t"
296 "paddb %%mm7, %%mm2 \n\t"
297 "pcmpgtb %%mm6, %%mm2 \n\t"
298 "paddb %%mm2, %%mm0 \n\t"
299
37da00fc 300 "movq (%%ebx, %2), %%mm2 \n\t"
3057fa66
A
301 "psubb %%mm2, %%mm1 \n\t"
302 "paddb %%mm7, %%mm1 \n\t"
303 "pcmpgtb %%mm6, %%mm1 \n\t"
304 "paddb %%mm1, %%mm0 \n\t"
305
37da00fc 306 "movq (%%ebx, %2, 2), %%mm1 \n\t"
3057fa66
A
307 "psubb %%mm1, %%mm2 \n\t"
308 "paddb %%mm7, %%mm2 \n\t"
309 "pcmpgtb %%mm6, %%mm2 \n\t"
310 "paddb %%mm2, %%mm0 \n\t"
311
// fold the eight per-column byte counters together; only the low byte of the
// value moved to %0 is used afterwards
312 " \n\t"
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlw $8, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
e5c30e06
MN
316#ifdef HAVE_MMX2
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
318 "paddb %%mm1, %%mm0 \n\t"
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
320#else
3057fa66
A
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
e5c30e06 326#endif
3057fa66 327 "paddb %%mm1, %%mm0 \n\t"
3057fa66
A
328 "movd %%mm0, %0 \n\t"
329 : "=r" (numEq)
330 : "r" (src), "r" (stride)
4e4dcbc5 331 : "%eax", "%ebx"
3057fa66 332 );
3057fa66 333
// every match contributed -1 (0xFF), so negate the low byte to get the count
37da00fc 334 numEq= (256 - numEq) &0xFF;
3057fa66
A
335
336#else
d5a1a995 337 for(y=0; y<BLOCK_SIZE-1; y++)
3057fa66
A
338 {
// (diff + 1) masked to 16 bits is < 3 exactly for diff = -1, 0, +1
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
346 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
347 src+= stride;
348 }
349#endif
350/* if(abs(numEq - asmEq) > 0)
351 {
352 printf("\nasm:%d c:%d\n", asmEq, numEq);
353 for(int y=0; y<8; y++)
354 {
355 for(int x=0; x<8; x++)
356 {
357 printf("%d ", temp[x + y*stride]);
358 }
359 printf("\n");
360 }
361 }
362*/
d5a1a995
MN
363// for(int i=0; i<numEq/8; i++) src[i]=255;
364 return (numEq > vFlatnessThreshold) ? 1 : 0;
3057fa66
A
365}
366
/**
 * Check that the two rows around the middle 8x8 block do not differ too much.
 *
 * After advancing src by 3 rows, the row at +1*stride is compared with the
 * row at +8*stride; the check passes when every column's absolute difference
 * is at most 2*QP.
 *
 * The MMX path takes QP from the global pQPb (QP in every byte) instead of
 * the QP argument, and returns all-ones (non-zero) rather than 1 on success.
 *
 * @param src    start of the 8x16 block
 * @param stride offset in bytes from one row to the next
 * @param QP     quantiser value used as threshold (C path only)
 * @return non-zero if the check passes, 0 otherwise
 */
d5a1a995 367static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
3057fa66
A
368{
369#ifdef HAVE_MMX
370 int isOk;
acced553 371 src+= stride*3;
3057fa66
A
372 asm volatile(
373// "int $3 \n\t"
374 "movq (%1, %2), %%mm0 \n\t"
375 "movq (%1, %2, 8), %%mm1 \n\t"
376 "movq %%mm0, %%mm2 \n\t"
377 "psubusb %%mm1, %%mm0 \n\t"
378 "psubusb %%mm2, %%mm1 \n\t"
379 "por %%mm1, %%mm0 \n\t" // ABS Diff
380
381 "movq pQPb, %%mm7 \n\t" // QP,..., QP
382 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
383 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// both dwords must be zero: compare each dword with 0, then shift so that the
// low dword holds half of each result and compare it with all-ones
384 "pcmpeqd b00, %%mm0 \n\t"
385 "psrlq $16, %%mm0 \n\t"
386 "pcmpeqd bFF, %%mm0 \n\t"
387// "movd %%mm0, (%1, %2, 4)\n\t"
388 "movd %%mm0, %0 \n\t"
389 : "=r" (isOk)
390 : "r" (src), "r" (stride)
391 );
ac0b0b2f 392 return isOk;
3057fa66
A
393#else
394
d5a1a995
MN
395 int isOk2= 1;
396 int x;
acced553 397 src+= stride*3;
d5a1a995 398 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 399 {
d5a1a995 400 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
3057fa66
A
401 }
402/* if(isOk && !isOk2 || !isOk && isOk2)
403 {
404 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
405 for(int y=0; y<9; y++)
406 {
407 for(int x=0; x<8; x++)
408 {
409 printf("%d ", src[x + y*stride]);
410 }
411 printf("\n");
412 }
413 } */
414
415 return isOk2;
416#endif
417
418}
419
420/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8
 * block in the middle) using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16.
 *
 * src is first advanced by 3 rows; rows 1..8 relative to that position are
 * rewritten in place, rows 0 and 9 are only read. At both ends the outer row
 * (0 or 9) is used as the edge sample only where it is close to its inner
 * neighbour (C path: difference < QP); otherwise the inner neighbour is
 * repeated.
 *
 * The MMX2/3DNow path takes QP from the global pQPb and builds the weights
 * from chains of PAVGB averages; the C path computes the weighted sums
 * directly with +8 rounding.
 *
 * @param src    start of the 8x16 block
 * @param stride offset in bytes from one row to the next
 * @param QP     quantiser value used as edge threshold (C path only)
 */
424static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425{
13e00528 426#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 427 src+= stride*3;
3057fa66 428 asm volatile( //"movv %0 %1 %2\n\t"
3057fa66 429 "movq pQPb, %%mm0 \n\t" // QP,..., QP
3057fa66
A
430
431 "movq (%0), %%mm6 \n\t"
432 "movq (%0, %1), %%mm5 \n\t"
433 "movq %%mm5, %%mm1 \n\t"
434 "movq %%mm6, %%mm2 \n\t"
435 "psubusb %%mm6, %%mm5 \n\t"
436 "psubusb %%mm1, %%mm2 \n\t"
437 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
438 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
439 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
440
441 "pand %%mm2, %%mm6 \n\t"
442 "pandn %%mm1, %%mm2 \n\t"
443 "por %%mm2, %%mm6 \n\t"// First Line to Filter
444
445 "movq (%0, %1, 8), %%mm5 \n\t"
446 "leal (%0, %1, 4), %%eax \n\t"
447 "leal (%0, %1, 8), %%ebx \n\t"
448 "subl %1, %%ebx \n\t"
// NOTE(review): %0 is declared as an input operand but is modified here; it is
// restored by the "subl %1, %0" at the end of the block
449 "addl %1, %0 \n\t" // %0 points to line 1 not 0
450 "movq (%0, %1, 8), %%mm7 \n\t"
451 "movq %%mm5, %%mm1 \n\t"
452 "movq %%mm7, %%mm2 \n\t"
453 "psubusb %%mm7, %%mm5 \n\t"
454 "psubusb %%mm1, %%mm2 \n\t"
455 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
456 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
457 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
458
459 "pand %%mm2, %%mm7 \n\t"
460 "pandn %%mm1, %%mm2 \n\t"
461 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
462
463
464 // 1 2 3 4 5 6 7 8
465 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
466 // 6 4 2 2 1 1
467 // 6 4 4 2
468 // 6 8 2
acced553 469
3057fa66
A
470 "movq (%0, %1), %%mm0 \n\t" // 1
471 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
472 PAVGB(%%mm6, %%mm0) //1 1 /2
473 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
474
475 "movq (%0, %1, 4), %%mm2 \n\t" // 1
476 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
477 PAVGB((%%eax), %%mm2) // 11 /2
478 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
479 "movq %%mm2, %%mm3 \n\t" // 211 /4
480 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
481 PAVGB(%%mm4, %%mm3) // 4 211 /8
482 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
483 "movq %%mm3, (%0) \n\t" // X
484 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
485 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 486 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 487 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
488 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
489 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
490 PAVGB((%%eax), %%mm5) // 211 /4
491 PAVGB(%%mm5, %%mm3) // 2 2211 /8
492 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
493 "movq %%mm3, (%0,%1) \n\t" // X
494 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 495 PAVGB(%%mm4, %%mm6) //11 /2
3057fa66 496 "movq (%%ebx), %%mm0 \n\t" // 1
13e00528 497 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 498 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
499 PAVGB(%%mm1, %%mm0) // 2 11/4
500 PAVGB(%%mm6, %%mm0) //222 11/8
501 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
502 "movq (%0, %1, 2), %%mm2 \n\t" // 1
503 "movq %%mm0, (%0, %1, 2) \n\t" // X
504 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
505 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
506 PAVGB((%%ebx), %%mm0) // 11 /2
507 PAVGB(%%mm0, %%mm6) //11 11 /4
508 PAVGB(%%mm1, %%mm4) // 11 /2
509 PAVGB(%%mm2, %%mm1) // 11 /2
510 PAVGB(%%mm1, %%mm6) //1122 11 /8
511 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
512 "movq (%%eax), %%mm5 \n\t" // 1
513 "movq %%mm6, (%%eax) \n\t" // X
514 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
515 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
516 PAVGB(%%mm7, %%mm6) // 11 /2
517 PAVGB(%%mm4, %%mm6) // 11 11 /4
518 PAVGB(%%mm3, %%mm6) // 11 2211 /8
519 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 520 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
521 PAVGB(%%mm4, %%mm2) // 112 /4
522 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
523 "movq %%mm6, (%0, %1, 4) \n\t" // X
524 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
525 PAVGB(%%mm7, %%mm1) // 11 2 /4
526 PAVGB(%%mm4, %%mm5) // 11 /2
527 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 528 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
529 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
530 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
531 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
532 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
13e00528 533 PAVGB((%%ebx), %%mm2) // 112 4 /8
3057fa66 534 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
535 PAVGB(%%mm0, %%mm6) // 1 1 /2
536 PAVGB(%%mm7, %%mm6) // 1 12 /4
537 PAVGB(%%mm2, %%mm6) // 1122424 /4
3057fa66
A
538 "movq %%mm6, (%%ebx) \n\t" // X
539 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
540 PAVGB(%%mm7, %%mm5) // 11 2 /4
541 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 542
13e00528
A
543 PAVGB(%%mm3, %%mm0) // 112 /4
544 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 545 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 546 "subl %1, %0 \n\t"
3057fa66
A
547
548 :
549 : "r" (src), "r" (stride)
550 : "%eax", "%ebx"
551 );
3057fa66
A
552#else
// l1..l9: offsets of rows 1..9 relative to src (after the +3 row adjustment)
553 const int l1= stride;
554 const int l2= stride + l1;
555 const int l3= stride + l2;
556 const int l4= stride + l3;
557 const int l5= stride + l4;
558 const int l6= stride + l5;
559 const int l7= stride + l6;
560 const int l8= stride + l7;
561 const int l9= stride + l8;
d5a1a995 562 int x;
acced553 563 src+= stride*3;
d5a1a995 564 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
565 {
// edge samples: take the row outside the block only if it is within QP of its
// inner neighbour, otherwise repeat the inner row
566 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
567 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
568
// sums[i] = sum of two vertically adjacent samples, shared by several taps
569 int sums[9];
570 sums[0] = first + src[l1];
571 sums[1] = src[l1] + src[l2];
572 sums[2] = src[l2] + src[l3];
573 sums[3] = src[l3] + src[l4];
574 sums[4] = src[l4] + src[l5];
575 sums[5] = src[l5] + src[l6];
576 sums[6] = src[l6] + src[l7];
577 sums[7] = src[l7] + src[l8];
578 sums[8] = src[l8] + last;
579
// each output is a weighted sum with total weight 16; +8 rounds to nearest
580 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
e5c30e06
MN
581 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
582 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
583 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
584 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
585 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
586 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
587 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
3057fa66
A
588
589 src++;
590 }
591
592#endif
593}
594
13e00528
A
595/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
 0 8 16 24
 x = 8
 x/2 = 4
 x/8 = 1
 1 12 12 23
 *
 * src is first advanced by 3 rows. For each of the 8 columns, with
 * v = line5 - line4 (rows relative to the adjusted src), the C path modifies
 * lines 3..6 only where |v| < QP + QP/4:
 *   line3 += v>>3, line4 += v>>1, line5 -= v>>1, line6 -= v>>3.
 * The MMX2/3DNow path takes QP from the global pQPb instead of the argument.
 *
 * @param src    start of the 8x16 block
 * @param stride offset in bytes from one row to the next
 * @param QP     quantiser value used for the threshold (C path only)
 */
9f45d04d 606static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
13e00528 607{
d5a1a995 608#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 609 src+= stride*3;
13e00528
A
610// FIXME rounding
611 asm volatile(
612 "pxor %%mm7, %%mm7 \n\t" // 0
613 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
614 "leal (%0, %1), %%eax \n\t"
615 "leal (%%eax, %1, 4), %%ebx \n\t"
616// 0 1 2 3 4 5 6 7 8 9
617// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
618 "movq pQPb, %%mm0 \n\t" // QP,..., QP
619 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
620 "paddusb b02, %%mm0 \n\t"
621 "psrlw $2, %%mm0 \n\t"
622 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
623 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
624 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
625 "movq (%%ebx), %%mm3 \n\t" // line 5
626 "movq %%mm2, %%mm4 \n\t" // line 4
627 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
628 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 629 PAVGB(%%mm3, %%mm5)
13e00528
A
630 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
631 "psubusb %%mm3, %%mm4 \n\t"
632 "psubusb %%mm2, %%mm3 \n\t"
633 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
634 "psubusb %%mm0, %%mm4 \n\t"
635 "pcmpeqb %%mm7, %%mm4 \n\t"
636 "pand %%mm4, %%mm5 \n\t" // d/2
637
638// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
639 "paddb %%mm5, %%mm2 \n\t"
640// "psubb %%mm6, %%mm2 \n\t"
641 "movq %%mm2, (%0,%1, 4) \n\t"
642
643 "movq (%%ebx), %%mm2 \n\t"
644// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
645 "psubb %%mm5, %%mm2 \n\t"
646// "psubb %%mm6, %%mm2 \n\t"
647 "movq %%mm2, (%%ebx) \n\t"
648
649 "paddb %%mm6, %%mm5 \n\t"
650 "psrlw $2, %%mm5 \n\t"
651 "pand b3F, %%mm5 \n\t"
652 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
653
654 "movq (%%eax, %1, 2), %%mm2 \n\t"
655 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
656 "paddsb %%mm5, %%mm2 \n\t"
657 "psubb %%mm6, %%mm2 \n\t"
658 "movq %%mm2, (%%eax, %1, 2) \n\t"
659
660 "movq (%%ebx, %1), %%mm2 \n\t"
661 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
662 "psubsb %%mm5, %%mm2 \n\t"
663 "psubb %%mm6, %%mm2 \n\t"
664 "movq %%mm2, (%%ebx, %1) \n\t"
665
666 :
667 : "r" (src), "r" (stride)
668 : "%eax", "%ebx"
669 );
670#else
// l1..l6: offsets of rows 1..6 relative to src (after the +3 row adjustment)
671 const int l1= stride;
672 const int l2= stride + l1;
673 const int l3= stride + l2;
674 const int l4= stride + l3;
675 const int l5= stride + l4;
676 const int l6= stride + l5;
e5c30e06
MN
677// const int l7= stride + l6;
678// const int l8= stride + l7;
679// const int l9= stride + l8;
d5a1a995 680 int x;
// threshold is QP + QP/4, i.e. about 1.25*QP despite the "QP15" name
3407a972 681 const int QP15= QP + (QP>>2);
acced553 682 src+= stride*3;
d5a1a995 683 for(x=0; x<BLOCK_SIZE; x++)
13e00528 684 {
3407a972
MN
685 const int v = (src[x+l5] - src[x+l4]);
686 if(ABS(v) < QP15)
13e00528 687 {
3407a972
MN
688 src[x+l3] +=v>>3;
689 src[x+l4] +=v>>1;
690 src[x+l5] -=v>>1;
691 src[x+l6] -=v>>3;
13e00528 692
13e00528 693 }
13e00528
A
694 }
695
696#endif
697}
698
699/**
 * Experimental Filter 1
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping, C version doesn't
 *
 * src is first advanced by 3 rows. For each of the 8 columns the C path
 * computes, with rows relative to the adjusted src,
 *   d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
 * and, where d < QP, moves lines 2..7 towards each other across the l4/l5
 * boundary by v/8, v/4 and 3v/8 (v = d with the sign of l5-l4).
 * The MMX2/3DNow path takes QP from the global pQPb instead of the argument.
 *
 * @param src    start of the 8x16 block
 * @param stride offset in bytes from one row to the next
 * @param QP     quantiser value used as threshold (C path only)
 */
706static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707{
d5a1a995 708#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
709 src+= stride*3;
710
13e00528 711 asm volatile(
d5a1a995
MN
712 "pxor %%mm7, %%mm7 \n\t" // 0
713// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
714 "leal (%0, %1), %%eax \n\t"
715 "leal (%%eax, %1, 4), %%ebx \n\t"
716// 0 1 2 3 4 5 6 7 8 9
717// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
718 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
719 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
720 "movq %%mm1, %%mm2 \n\t" // line 4
721 "psubusb %%mm0, %%mm1 \n\t"
722 "psubusb %%mm2, %%mm0 \n\t"
723 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
724 "movq (%%ebx), %%mm3 \n\t" // line 5
725 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
726 "movq %%mm3, %%mm5 \n\t" // line 5
727 "psubusb %%mm4, %%mm3 \n\t"
728 "psubusb %%mm5, %%mm4 \n\t"
729 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
730 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
731 "movq %%mm2, %%mm1 \n\t" // line 4
732 "psubusb %%mm5, %%mm2 \n\t"
733 "movq %%mm2, %%mm4 \n\t"
734 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
735 "psubusb %%mm1, %%mm5 \n\t"
736 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
737 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
738 "movq %%mm4, %%mm3 \n\t" // d
739 "psubusb pQPb, %%mm4 \n\t"
740 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
9f45d04d 741 "psubusb b01, %%mm3 \n\t"
d5a1a995
MN
742 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
743
744 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
745 "movq %%mm3, %%mm1 \n\t" // d/2
746 PAVGB(%%mm7, %%mm3) // d/4
747 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
748
// mm2 is a per-byte sign mask: XOR-ing with it before and after the saturating
// add/sub turns the operation into its opposite where (l4 - l5) <= 0
749 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
750 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
751 "psubusb %%mm3, %%mm0 \n\t"
752 "pxor %%mm2, %%mm0 \n\t"
753 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
754
755 "movq (%%ebx), %%mm0 \n\t" // line 5
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
757 "paddusb %%mm3, %%mm0 \n\t"
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%ebx) \n\t" // line 5
760
9f45d04d 761 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
762
763 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
764 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
9f45d04d 765 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
766 "pxor %%mm2, %%mm0 \n\t"
767 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
768
769 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
9f45d04d 771 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
774
9f45d04d 775 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
776
777 "movq (%%eax, %1), %%mm0 \n\t" // line 2
778 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 779 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
780 "pxor %%mm2, %%mm0 \n\t"
781 "movq %%mm0, (%%eax, %1) \n\t" // line 2
782
783 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
784 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 785 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
786 "pxor %%mm2, %%mm0 \n\t"
787 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
13e00528
A
788
789 :
790 : "r" (src), "r" (stride)
791 : "%eax", "%ebx"
792 );
793#else
d5a1a995
MN
794
// l1..l7: offsets of rows 1..7 relative to src (after the +3 row adjustment)
795 const int l1= stride;
796 const int l2= stride + l1;
797 const int l3= stride + l2;
798 const int l4= stride + l3;
799 const int l5= stride + l4;
800 const int l6= stride + l5;
801 const int l7= stride + l6;
e5c30e06
MN
802// const int l8= stride + l7;
803// const int l9= stride + l8;
d5a1a995 804 int x;
acced553
MN
805
806 src+= stride*3;
d5a1a995
MN
807 for(x=0; x<BLOCK_SIZE; x++)
808 {
// a, c: steps on either side of the block boundary; b: step across it
809 int a= src[l3] - src[l4];
810 int b= src[l4] - src[l5];
9f45d04d 811 int c= src[l5] - src[l6];
d5a1a995 812
// d: how much the boundary step exceeds the average of its neighbouring steps
3407a972
MN
813 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
814 d= MAX(d, 0);
d5a1a995
MN
815
816 if(d < QP)
817 {
818 int v = d * SIGN(-b);
819
3407a972
MN
820 src[l2] +=v>>3;
821 src[l3] +=v>>2;
822 src[l4] +=(3*v)>>3;
823 src[l5] -=(3*v)>>3;
824 src[l6] -=v>>2;
825 src[l7] -=v>>3;
d5a1a995
MN
826
827 }
828 src++;
829 }
830 /*
13e00528
A
831 const int l1= stride;
832 const int l2= stride + l1;
833 const int l3= stride + l2;
834 const int l4= stride + l3;
835 const int l5= stride + l4;
836 const int l6= stride + l5;
837 const int l7= stride + l6;
838 const int l8= stride + l7;
839 const int l9= stride + l8;
840 for(int x=0; x<BLOCK_SIZE; x++)
841 {
842 int v2= src[l2];
843 int v3= src[l3];
844 int v4= src[l4];
845 int v5= src[l5];
846 int v6= src[l6];
847 int v7= src[l7];
848
849 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
850 {
851 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
852 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
853 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
854 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
855 }
856 src++;
857 }
d5a1a995 858*/
13e00528
A
859#endif
860}
861
9f45d04d
MN
862/**
863 * Experimental Filter 1 (Horizontal)
864 * will not damage linear gradients
865 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
866 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
867 * MMX2 version does correct clipping, C version doesn't
868 * not identical with the vertical one
869 */
870static inline void horizX1Filter(uint8_t *src, int stride, int QP)
871{
872 int y;
873 static uint64_t *lut= NULL;
874 if(lut==NULL)
875 {
876 int i;
877 lut= (uint64_t*)memalign(8, 256*8);
878 for(i=0; i<256; i++)
879 {
880 int v= i < 128 ? 2*i : 2*(i-256);
881/*
882//Simulate 112242211 9-Tap filter
883 uint64_t a= (v/16) & 0xFF;
884 uint64_t b= (v/8) & 0xFF;
885 uint64_t c= (v/4) & 0xFF;
886 uint64_t d= (3*v/8) & 0xFF;
887*/
888//Simulate piecewise linear interpolation
889 uint64_t a= (v/16) & 0xFF;
890 uint64_t b= (v*3/16) & 0xFF;
891 uint64_t c= (v*5/16) & 0xFF;
892 uint64_t d= (7*v/16) & 0xFF;
893 uint64_t A= (0x100 - a)&0xFF;
894 uint64_t B= (0x100 - b)&0xFF;
895 uint64_t C= (0x100 - c)&0xFF;
896 uint64_t D= (0x100 - c)&0xFF;
897
898 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
899 (D<<24) | (C<<16) | (B<<8) | (A);
900 //lut[i] = (v<<32) | (v<<24);
901 }
902 }
903
4e4dcbc5 904#if 0
9f45d04d
MN
905 asm volatile(
906 "pxor %%mm7, %%mm7 \n\t" // 0
907// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
908 "leal (%0, %1), %%eax \n\t"
909 "leal (%%eax, %1, 4), %%ebx \n\t"
910
911 "movq b80, %%mm6 \n\t"
79cccf70 912 "movd pQPb, %%mm5 \n\t" // QP
9f45d04d
MN
913 "movq %%mm5, %%mm4 \n\t"
914 "paddusb %%mm5, %%mm5 \n\t" // 2QP
915 "paddusb %%mm5, %%mm4 \n\t" // 3QP
916 "pxor %%mm5, %%mm5 \n\t" // 0
917 "psubb %%mm4, %%mm5 \n\t" // -3QP
918 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
919 "psllq $24, %%mm5 \n\t"
920
921// 0 1 2 3 4 5 6 7 8 9
922// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
923
924#define HX1old(a) \
925 "movd " #a ", %%mm0 \n\t"\
926 "movd 4" #a ", %%mm1 \n\t"\
927 "punpckldq %%mm1, %%mm0 \n\t"\
928 "movq %%mm0, %%mm1 \n\t"\
929 "movq %%mm0, %%mm2 \n\t"\
930 "psrlq $8, %%mm1 \n\t"\
931 "psubusb %%mm1, %%mm2 \n\t"\
932 "psubusb %%mm0, %%mm1 \n\t"\
933