use native timebase for seeking
[libav.git] / libavcodec / libpostproc / postprocess_template.c
CommitLineData
3057fa66 1/*
bdd677ac 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3057fa66
A
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
b304569a
MN
19/**
20 * @file postprocess_template.c
21 * mmx/mmx2/3dnow postprocess code.
22 */
23
24
cc9b0679
MN
/*
 * Instruction-text helper macros used inside the asm blocks of this file.
 * Any earlier definitions are dropped first (presumably because this template
 * is included more than once with different HAVE_* settings -- TODO confirm).
 * In every macro the LAST register argument before any temp is the destination.
 */
25#undef PAVGB
26#undef PMINUB
27#undef PMAXUB
e939e1c3
A
28
/* PAVGB(a,b): emits "pavgb a, b" (MMX2) or "pavgusb a, b" (3DNow).
   Left undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set. */
29#ifdef HAVE_MMX2
30#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
31#elif defined (HAVE_3DNOW)
32#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
33#endif
3057fa66 34
2e212618
MN
/* PMINUB(x,y,t): unsigned byte minimum, result in the second argument.
   MMX2 emits "pminub x, y" and ignores t.
   The plain-MMX fallback names its parameters (b,a,t), so "a" is still the
   second (destination) argument: t = a; t = sat(t - b); a -= t, which leaves
   min(a,b) in a and clobbers the temp register t. */
35#ifdef HAVE_MMX2
36#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
37#elif defined (HAVE_MMX)
38#define PMINUB(b,a,t) \
39 "movq " #a ", " #t " \n\t"\
40 "psubusb " #b ", " #t " \n\t"\
41 "psubb " #t ", " #a " \n\t"
42#endif
43
/* PMAXUB(a,b): unsigned byte maximum, result in b.
   MMX2 emits "pmaxub a, b"; the plain-MMX fallback computes
   b = sat(b - a) + a, which equals max(a,b). */
44#ifdef HAVE_MMX2
45#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
46#elif defined (HAVE_MMX)
47#define PMAXUB(a,b) \
48 "psubusb " #a ", " #b " \n\t"\
49 "paddb " #a ", " #b " \n\t"
50#endif
51
3057fa66 52//FIXME? |255-0| = 1 (shouldn't be a problem ...)
9c9e467d 53#ifdef HAVE_MMX
3057fa66 54/**
acced553 55 * Check if the middle 8x8 Block in the given 8x16 block is flat
3057fa66 56 */
cb482d25
MN
/*
 * Reviewer notes for vertClassify (derived from the code below):
 *
 * Examines the 8 rows src+4*stride .. src+11*stride, 8 bytes per row.
 *  - numEq: for each of the 7 adjacent row pairs and each of the 8 columns,
 *    counts the cases where the signed byte compare
 *    (row[i] - row[i+1] + c->mmxDcOffset[c->nonBQP]) > c->mmxDcThreshold[c->nonBQP]
 *    is true (the count is reduced modulo 256).
 *  - dcOk: despite its name it is NON-ZERO when, for at least one column,
 *    (max - min) over the 8 rows exceeds twice the per-byte value loaded from
 *    c->pQPb (presumably QP replicated into every byte -- see the "QP,..., QP"
 *    comment below).
 *
 * Returns 2 if numEq <= c->ppMode.flatnessThreshold,
 *         0 if numEq is above the threshold and dcOk is non-zero,
 *         1 if numEq is above the threshold and dcOk is zero.
 *
 * NOTE(review): the second asm statement relies on mm6/mm7 still holding the
 * values loaded by the first asm statement.
 */
57static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
58 int numEq= 0, dcOk;
acced553 59 src+= stride*4; // src points to begin of the 8x8 Block
37da00fc 60asm volatile(
1e79606d
MN
61 "movq %0, %%mm7 \n\t" // mm7 = c->mmxDcOffset[c->nonBQP]
62 "movq %1, %%mm6 \n\t" // mm6 = c->mmxDcThreshold[c->nonBQP]
63 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
64 );
65
66asm volatile(
cb482d25 67 "leal (%2, %3), %%eax \n\t"
// NOTE(review): the row map below uses an older operand numbering; in this
// statement src is %2 and stride is %3, and ecx is not used at all.
37da00fc 68// 0 1 2 3 4 5 6 7 8 9
9c9e467d 69// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
ec487e5d 70

// mm4 tracks the running per-column maximum, mm3 the running minimum,
// mm0 accumulates the compare masks (0 or -1 per byte) of each row pair.
cb482d25 71 "movq (%2), %%mm0 \n\t"
37da00fc 72 "movq (%%eax), %%mm1 \n\t"
cb482d25
MN
73 "movq %%mm0, %%mm3 \n\t"
74 "movq %%mm0, %%mm4 \n\t"
75 PMAXUB(%%mm1, %%mm4)
76 PMINUB(%%mm1, %%mm3, %%mm5)
3057fa66
A
77 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
78 "paddb %%mm7, %%mm0 \n\t"
79 "pcmpgtb %%mm6, %%mm0 \n\t"
80
cb482d25
MN
81 "movq (%%eax,%3), %%mm2 \n\t"
82 PMAXUB(%%mm2, %%mm4)
83 PMINUB(%%mm2, %%mm3, %%mm5)
3057fa66
A
84 "psubb %%mm2, %%mm1 \n\t"
85 "paddb %%mm7, %%mm1 \n\t"
86 "pcmpgtb %%mm6, %%mm1 \n\t"
87 "paddb %%mm1, %%mm0 \n\t"
88
cb482d25
MN
89 "movq (%%eax, %3, 2), %%mm1 \n\t"
90 PMAXUB(%%mm1, %%mm4)
91 PMINUB(%%mm1, %%mm3, %%mm5)
3057fa66
A
92 "psubb %%mm1, %%mm2 \n\t"
93 "paddb %%mm7, %%mm2 \n\t"
94 "pcmpgtb %%mm6, %%mm2 \n\t"
95 "paddb %%mm2, %%mm0 \n\t"
9c9e467d 96
cb482d25 97 "leal (%%eax, %3, 4), %%eax \n\t" // eax now points at src + 5*stride
3057fa66 98
cb482d25
MN
99 "movq (%2, %3, 4), %%mm2 \n\t"
100 PMAXUB(%%mm2, %%mm4)
101 PMINUB(%%mm2, %%mm3, %%mm5)
3057fa66
A
102 "psubb %%mm2, %%mm1 \n\t"
103 "paddb %%mm7, %%mm1 \n\t"
104 "pcmpgtb %%mm6, %%mm1 \n\t"
105 "paddb %%mm1, %%mm0 \n\t"
106
9c9e467d 107 "movq (%%eax), %%mm1 \n\t"
cb482d25
MN
108 PMAXUB(%%mm1, %%mm4)
109 PMINUB(%%mm1, %%mm3, %%mm5)
3057fa66
A
110 "psubb %%mm1, %%mm2 \n\t"
111 "paddb %%mm7, %%mm2 \n\t"
112 "pcmpgtb %%mm6, %%mm2 \n\t"
113 "paddb %%mm2, %%mm0 \n\t"
114
cb482d25
MN
115 "movq (%%eax, %3), %%mm2 \n\t"
116 PMAXUB(%%mm2, %%mm4)
117 PMINUB(%%mm2, %%mm3, %%mm5)
3057fa66
A
118 "psubb %%mm2, %%mm1 \n\t"
119 "paddb %%mm7, %%mm1 \n\t"
120 "pcmpgtb %%mm6, %%mm1 \n\t"
121 "paddb %%mm1, %%mm0 \n\t"
122
cb482d25
MN
123 "movq (%%eax, %3, 2), %%mm1 \n\t"
124 PMAXUB(%%mm1, %%mm4)
125 PMINUB(%%mm1, %%mm3, %%mm5)
3057fa66
A
126 "psubb %%mm1, %%mm2 \n\t"
127 "paddb %%mm7, %%mm2 \n\t"
128 "pcmpgtb %%mm6, %%mm2 \n\t"
129 "paddb %%mm2, %%mm0 \n\t"
cb482d25 130 "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min per column
3057fa66
A
131
132 " \n\t"
// Horizontal sum of the 8 mask-count bytes of mm0 into its low byte
// (psadbw against zero on MMX2, shift-and-add otherwise).
cd38e322
MN
133#ifdef HAVE_MMX2
134 "pxor %%mm7, %%mm7 \n\t"
135 "psadbw %%mm7, %%mm0 \n\t"
136#else
3057fa66
A
137 "movq %%mm0, %%mm1 \n\t"
138 "psrlw $8, %%mm0 \n\t"
139 "paddb %%mm1, %%mm0 \n\t"
140 "movq %%mm0, %%mm1 \n\t"
141 "psrlq $16, %%mm0 \n\t"
142 "paddb %%mm1, %%mm0 \n\t"
143 "movq %%mm0, %%mm1 \n\t"
144 "psrlq $32, %%mm0 \n\t"
145 "paddb %%mm1, %%mm0 \n\t"
cd38e322 146#endif
1e79606d 147 "movq %4, %%mm7 \n\t" // QP,..., QP
cb482d25
MN
148 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
149 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
150 "packssdw %%mm4, %%mm4 \n\t" // squeeze both dwords into the low dword; non-zero stays non-zero
3057fa66 151 "movd %%mm0, %0 \n\t"
cb482d25
MN
152 "movd %%mm4, %1 \n\t"
153
154 : "=r" (numEq), "=r" (dcOk)
1e79606d 155 : "r" (src), "r" (stride), "m" (c->pQPb)
9c9e467d 156 : "%eax"
3057fa66 157 );
cb482d25 158
// The masks were accumulated as -1 per hit, so negate and keep the low byte.
cd38e322 159 numEq= (-numEq) &0xFF;
cb482d25
MN
160 if(numEq > c->ppMode.flatnessThreshold){
161 if(dcOk) return 0; // non-zero dcOk: some column's range exceeded 2QP
162 else return 1;
163 }else{
164 return 2;
165 }
3057fa66 166}
9c9e467d 167#endif
3057fa66 168
3057fa66 169/**
acced553 170 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
a6be8111 171 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
3057fa66 172 */
b0ac780a 173#ifndef HAVE_ALTIVEC
/*
 * Reviewer notes for doVertLowPass (derived from the code below):
 *
 * After src += stride*3, rows l1..l8 (src+stride .. src+8*stride) are
 * overwritten with low-pass filtered values; rows 0 and l9 are only read.
 * The edge values are chosen per column:
 *   first = row 0 if |row0 - row1| is within QP, else row 1
 *   last  = row 9 if |row8 - row9| is within QP, else row 8
 * The C path processes BLOCK_SIZE columns and uses c->QP; the MMX2/3DNow
 * path processes 8 bytes at once and uses c->pQPb.
 *
 * NOTE(review): the asm path tests "diff <= QP" while the C path tests
 * "diff < QP", and the asm path builds its result from chained PAVGB
 * averages, so the two paths are not guaranteed to be bit-identical.
 * NOTE(review): the asm modifies %0 (addl/subl) although src is declared as
 * an input-only operand, and it stores to memory without a "memory" clobber.
 */
9c9e467d 174static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
3057fa66 175{
13e00528 176#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 177 src+= stride*3;
3057fa66 178 asm volatile( //"movv %0 %1 %2\n\t"
9c9e467d
MN
179 "movq %2, %%mm0 \n\t" // QP,..., QP
180 "pxor %%mm4, %%mm4 \n\t"
3057fa66
A
181
182 "movq (%0), %%mm6 \n\t"
183 "movq (%0, %1), %%mm5 \n\t"
184 "movq %%mm5, %%mm1 \n\t"
185 "movq %%mm6, %%mm2 \n\t"
186 "psubusb %%mm6, %%mm5 \n\t"
187 "psubusb %%mm1, %%mm2 \n\t"
188 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
189 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
9c9e467d 190 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
3057fa66
A
191
192 "pand %%mm2, %%mm6 \n\t"
193 "pandn %%mm1, %%mm2 \n\t"
194 "por %%mm2, %%mm6 \n\t"// First Line to Filter
195
196 "movq (%0, %1, 8), %%mm5 \n\t"
197 "leal (%0, %1, 4), %%eax \n\t"
9c9e467d
MN
198 "leal (%0, %1, 8), %%ecx \n\t"
199 "subl %1, %%ecx \n\t"
3057fa66
A
200 "addl %1, %0 \n\t" // %0 points to line 1 not 0
201 "movq (%0, %1, 8), %%mm7 \n\t"
202 "movq %%mm5, %%mm1 \n\t"
203 "movq %%mm7, %%mm2 \n\t"
204 "psubusb %%mm7, %%mm5 \n\t"
205 "psubusb %%mm1, %%mm2 \n\t"
206 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
207 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
9c9e467d 208 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
3057fa66
A
209
210 "pand %%mm2, %%mm7 \n\t"
211 "pandn %%mm1, %%mm2 \n\t"
212 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
213
214
215 // 1 2 3 4 5 6 7 8
9c9e467d 216 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
3057fa66
A
217 // 6 4 2 2 1 1
218 // 6 4 4 2
219 // 6 8 2
acced553 220

3057fa66
A
221 "movq (%0, %1), %%mm0 \n\t" // 1
222 "movq %%mm0, %%mm1 \n\t" // 1
13e00528
A
223 PAVGB(%%mm6, %%mm0) //1 1 /2
224 PAVGB(%%mm6, %%mm0) //3 1 /4
3057fa66
A
225
226 "movq (%0, %1, 4), %%mm2 \n\t" // 1
227 "movq %%mm2, %%mm5 \n\t" // 1
13e00528
A
228 PAVGB((%%eax), %%mm2) // 11 /2
229 PAVGB((%0, %1, 2), %%mm2) // 211 /4
3057fa66
A
230 "movq %%mm2, %%mm3 \n\t" // 211 /4
231 "movq (%0), %%mm4 \n\t" // 1
13e00528
A
232 PAVGB(%%mm4, %%mm3) // 4 211 /8
233 PAVGB(%%mm0, %%mm3) //642211 /16
3057fa66
A
234 "movq %%mm3, (%0) \n\t" // X
235 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
236 "movq %%mm1, %%mm0 \n\t" // 1
13e00528 237 PAVGB(%%mm6, %%mm0) //1 1 /2
3057fa66 238 "movq %%mm4, %%mm3 \n\t" // 1
13e00528
A
239 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
240 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
241 PAVGB((%%eax), %%mm5) // 211 /4
242 PAVGB(%%mm5, %%mm3) // 2 2211 /8
243 PAVGB(%%mm0, %%mm3) //4242211 /16
3057fa66
A
244 "movq %%mm3, (%0,%1) \n\t" // X
245 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
13e00528 246 PAVGB(%%mm4, %%mm6) //11 /2
9c9e467d 247 "movq (%%ecx), %%mm0 \n\t" // 1
13e00528 248 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
3057fa66 249 "movq %%mm0, %%mm3 \n\t" // 11/2
13e00528
A
250 PAVGB(%%mm1, %%mm0) // 2 11/4
251 PAVGB(%%mm6, %%mm0) //222 11/8
252 PAVGB(%%mm2, %%mm0) //22242211/16
3057fa66
A
253 "movq (%0, %1, 2), %%mm2 \n\t" // 1
254 "movq %%mm0, (%0, %1, 2) \n\t" // X
255 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
256 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
9c9e467d 257 PAVGB((%%ecx), %%mm0) // 11 /2
13e00528
A
258 PAVGB(%%mm0, %%mm6) //11 11 /4
259 PAVGB(%%mm1, %%mm4) // 11 /2
260 PAVGB(%%mm2, %%mm1) // 11 /2
261 PAVGB(%%mm1, %%mm6) //1122 11 /8
262 PAVGB(%%mm5, %%mm6) //112242211 /16
3057fa66
A
263 "movq (%%eax), %%mm5 \n\t" // 1
264 "movq %%mm6, (%%eax) \n\t" // X
265 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
266 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
13e00528
A
267 PAVGB(%%mm7, %%mm6) // 11 /2
268 PAVGB(%%mm4, %%mm6) // 11 11 /4
269 PAVGB(%%mm3, %%mm6) // 11 2211 /8
270 PAVGB(%%mm5, %%mm2) // 11 /2
3057fa66 271 "movq (%0, %1, 4), %%mm4 \n\t" // 1
13e00528
A
272 PAVGB(%%mm4, %%mm2) // 112 /4
273 PAVGB(%%mm2, %%mm6) // 112242211 /16
3057fa66
A
274 "movq %%mm6, (%0, %1, 4) \n\t" // X
275 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
13e00528
A
276 PAVGB(%%mm7, %%mm1) // 11 2 /4
277 PAVGB(%%mm4, %%mm5) // 11 /2
278 PAVGB(%%mm5, %%mm0) // 11 11 /4
3057fa66 279 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
13e00528
A
280 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
281 PAVGB(%%mm0, %%mm1) // 11224222 /16
3057fa66
A
282 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
283 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
9c9e467d 284 PAVGB((%%ecx), %%mm2) // 112 4 /8
3057fa66 285 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
13e00528
A
286 PAVGB(%%mm0, %%mm6) // 1 1 /2
287 PAVGB(%%mm7, %%mm6) // 1 12 /4
288 PAVGB(%%mm2, %%mm6) // 1122424 /4
9c9e467d 289 "movq %%mm6, (%%ecx) \n\t" // X
3057fa66 290 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
13e00528
A
291 PAVGB(%%mm7, %%mm5) // 11 2 /4
292 PAVGB(%%mm7, %%mm5) // 11 6 /8
3057fa66 293
13e00528
A
294 PAVGB(%%mm3, %%mm0) // 112 /4
295 PAVGB(%%mm0, %%mm5) // 112246 /16
3057fa66 296 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
8405b3fd 297 "subl %1, %0 \n\t" // undo the earlier "addl %1, %0"
3057fa66
A
298
299 :
9c9e467d
MN
300 : "r" (src), "r" (stride), "m" (c->pQPb)
301 : "%eax", "%ecx"
3057fa66 302 );
3057fa66
A
303#else
// Byte offsets of rows 1..9 relative to src (after the += stride*3 below).
304 const int l1= stride;
305 const int l2= stride + l1;
306 const int l3= stride + l2;
307 const int l4= stride + l3;
308 const int l5= stride + l4;
309 const int l6= stride + l5;
310 const int l7= stride + l6;
311 const int l8= stride + l7;
312 const int l9= stride + l8;
d5a1a995 313 int x;
acced553 314 src+= stride*3;
d5a1a995 315 for(x=0; x<BLOCK_SIZE; x++)
3057fa66 316 {
9c9e467d
MN
317 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
318 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
3057fa66 319

// sums[] is a sliding window: each entry drops the oldest sample of the
// previous one and adds the next; sums[0] also carries the +4 rounding term.
8c8bbd10
MN
320 int sums[10];
321 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
322 sums[1] = sums[0] - first + src[l4];
323 sums[2] = sums[1] - first + src[l5];
324 sums[3] = sums[2] - first + src[l6];
325 sums[4] = sums[3] - first + src[l7];
326 sums[5] = sums[4] - src[l1] + src[l8];
327 sums[6] = sums[5] - src[l2] + last;
328 sums[7] = sums[6] - src[l3] + last;
329 sums[8] = sums[7] - src[l4] + last;
330 sums[9] = sums[8] - src[l5] + last;
331
// All sums[] were computed from the unmodified rows above, so writing the
// rows in place here is safe.
332 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
333 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
334 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
335 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
336 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
337 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
338 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
339 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
3057fa66
A
340
341 src++;
342 }
3057fa66
A
343#endif
344}
b0ac780a 345#endif //HAVE_ALTIVEC
3057fa66 346
9c9e467d 347#if 0
13e00528
A
348/**
349 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
350 * values are correctly clipped (MMX2)
351 * values are wraparound (C)
352 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
353 0 8 16 24
354 x = 8
355 x/2 = 4
356 x/8 = 1
357 1 12 12 23
358 */
/*
 * Reviewer notes for vertRK1Filter (derived from the code below):
 *
 * After src += stride*3, the difference v = row5 - row4 is computed per
 * column; where |v| is below about 1.25*QP, rows 4/5 are moved towards each
 * other by v/2 and rows 3/6 are adjusted by v/8.
 *
 * NOTE(review): the asm path reads the threshold from the global
 * MANGLE(pQPb) and never uses the QP parameter; only the C path uses QP.
 * NOTE(review): the asm stores to memory without a "memory" clobber.
 */
cc9b0679 359static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
13e00528 360{
d5a1a995 361#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553 362 src+= stride*3;
13e00528
A
363// FIXME rounding
364 asm volatile(
365 "pxor %%mm7, %%mm7 \n\t" // 0
9b464428 366 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
13e00528 367 "leal (%0, %1), %%eax \n\t"
9c9e467d 368 "leal (%%eax, %1, 4), %%ecx \n\t"
13e00528 369// 0 1 2 3 4 5 6 7 8 9
9c9e467d 370// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
9b464428 371 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
13e00528 372 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
9b464428 373 "paddusb "MANGLE(b02)", %%mm0 \n\t"
13e00528 374 "psrlw $2, %%mm0 \n\t"
9b464428 375 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
13e00528
A
376 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
377 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
9c9e467d 378 "movq (%%ecx), %%mm3 \n\t" // line 5
13e00528
A
379 "movq %%mm2, %%mm4 \n\t" // line 4
380 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
381 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
d5a1a995 382 PAVGB(%%mm3, %%mm5)
13e00528
A
383 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
384 "psubusb %%mm3, %%mm4 \n\t"
385 "psubusb %%mm2, %%mm3 \n\t"
386 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
387 "psubusb %%mm0, %%mm4 \n\t"
388 "pcmpeqb %%mm7, %%mm4 \n\t" // mask: |l4 - l5| <= QP*1.25
389 "pand %%mm4, %%mm5 \n\t" // d/2
390
391// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
392 "paddb %%mm5, %%mm2 \n\t"
393// "psubb %%mm6, %%mm2 \n\t"
394 "movq %%mm2, (%0,%1, 4) \n\t"
395
9c9e467d 396 "movq (%%ecx), %%mm2 \n\t"
13e00528
A
397// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
398 "psubb %%mm5, %%mm2 \n\t"
399// "psubb %%mm6, %%mm2 \n\t"
9c9e467d 400 "movq %%mm2, (%%ecx) \n\t"
13e00528
A
401
402 "paddb %%mm6, %%mm5 \n\t"
403 "psrlw $2, %%mm5 \n\t"
9b464428
FB
404 "pand "MANGLE(b3F)", %%mm5 \n\t"
405 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
13e00528
A
406
407 "movq (%%eax, %1, 2), %%mm2 \n\t"
408 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
409 "paddsb %%mm5, %%mm2 \n\t"
410 "psubb %%mm6, %%mm2 \n\t"
411 "movq %%mm2, (%%eax, %1, 2) \n\t"
412
9c9e467d 413 "movq (%%ecx, %1), %%mm2 \n\t"
13e00528
A
414 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
415 "psubsb %%mm5, %%mm2 \n\t"
416 "psubb %%mm6, %%mm2 \n\t"
9c9e467d 417 "movq %%mm2, (%%ecx, %1) \n\t"
13e00528
A
418
419 :
420 : "r" (src), "r" (stride)
9c9e467d 421 : "%eax", "%ecx"
13e00528
A
422 );
423#else
// Byte offsets of rows 1..6 relative to src (after the += stride*3 below).
424 const int l1= stride;
425 const int l2= stride + l1;
426 const int l3= stride + l2;
427 const int l4= stride + l3;
428 const int l5= stride + l4;
429 const int l6= stride + l5;
e5c30e06
MN
430// const int l7= stride + l6;
431// const int l8= stride + l7;
432// const int l9= stride + l8;
d5a1a995 433 int x;
3407a972 434 const int QP15= QP + (QP>>2); // QP + QP/4, i.e. about 1.25*QP
acced553 435 src+= stride*3;
// Unlike the sibling filters this loop indexes with x instead of advancing src.
d5a1a995 436 for(x=0; x<BLOCK_SIZE; x++)
13e00528 437 {
3407a972
MN
438 const int v = (src[x+l5] - src[x+l4]);
439 if(ABS(v) < QP15)
13e00528 440 {
3407a972
MN
441 src[x+l3] +=v>>3;
442 src[x+l4] +=v>>1;
443 src[x+l5] -=v>>1;
444 src[x+l6] -=v>>3;
13e00528 445
13e00528 446 }
13e00528
A
447 }
448
449#endif
450}
9c9e467d 451#endif
13e00528
A
452
453/**
454 * Experimental Filter 1
9f45d04d
MN
455 * will not damage linear gradients
456 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
d5a1a995
MN
457 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
458 * MMX2 version does correct clipping, C version doesn't
13e00528 459 */
/*
 * Reviewer notes for vertX1Filter (derived from the code below):
 *
 * After src += stride*3, per column:
 *   d = MAX(0, |l4 - l5| - (|l3 - l4| + |l5 - l6|)/2)
 * and, if d is within 2*QP, rows 2..7 are nudged across the l4/l5 boundary by
 * d/8, d/4, 3d/8 (rows 2,3,4) and the mirrored amounts (rows 5,6,7), with the
 * sign chosen from (l4 - l5).
 * The C path uses co->QP and processes BLOCK_SIZE columns; the MMX2/3DNow
 * path uses co->pQPb and processes 8 bytes at once with saturating arithmetic.
 *
 * NOTE(review): the asm path additionally subtracts MANGLE(b01) from d and
 * compares with "<= 2QP" whereas the C path compares with "< 2QP".
 * NOTE(review): the asm stores to memory without a "memory" clobber.
 */
9c9e467d 460static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
13e00528 461{
d5a1a995 462#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
acced553
MN
463 src+= stride*3;
464
13e00528 465 asm volatile(
d5a1a995 466 "pxor %%mm7, %%mm7 \n\t" // 0
d5a1a995 467 "leal (%0, %1), %%eax \n\t"
9c9e467d 468 "leal (%%eax, %1, 4), %%ecx \n\t"
d5a1a995 469// 0 1 2 3 4 5 6 7 8 9
9c9e467d 470// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
d5a1a995
MN
471 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
472 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
473 "movq %%mm1, %%mm2 \n\t" // line 4
474 "psubusb %%mm0, %%mm1 \n\t"
475 "psubusb %%mm2, %%mm0 \n\t"
476 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
9c9e467d
MN
477 "movq (%%ecx), %%mm3 \n\t" // line 5
478 "movq (%%ecx, %1), %%mm4 \n\t" // line 6
d5a1a995
MN
479 "movq %%mm3, %%mm5 \n\t" // line 5
480 "psubusb %%mm4, %%mm3 \n\t"
481 "psubusb %%mm5, %%mm4 \n\t"
482 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
483 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
484 "movq %%mm2, %%mm1 \n\t" // line 4
485 "psubusb %%mm5, %%mm2 \n\t"
486 "movq %%mm2, %%mm4 \n\t"
487 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
488 "psubusb %%mm1, %%mm5 \n\t"
489 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
490 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
491 "movq %%mm4, %%mm3 \n\t" // d
9c9e467d 492 "movq %2, %%mm0 \n\t" // co->pQPb
dc16b332
MN
493 "paddusb %%mm0, %%mm0 \n\t" // 2 * pQPb (saturating)
494 "psubusb %%mm0, %%mm4 \n\t"
d5a1a995 495 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
9b464428 496 "psubusb "MANGLE(b01)", %%mm3 \n\t"
d5a1a995
MN
497 "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0
498
499 PAVGB(%%mm7, %%mm3) // d/2
9f45d04d
MN
500 "movq %%mm3, %%mm1 \n\t" // d/2
501 PAVGB(%%mm7, %%mm3) // d/4
502 PAVGB(%%mm1, %%mm3) // 3*d/8
d5a1a995
MN
503
// Each row below is conditionally complemented with the sign mask in mm2,
// adjusted with a saturating add/sub, and complemented back.
504 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
505 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
506 "psubusb %%mm3, %%mm0 \n\t"
507 "pxor %%mm2, %%mm0 \n\t"
508 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
509
9c9e467d 510 "movq (%%ecx), %%mm0 \n\t" // line 5
d5a1a995
MN
511 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
512 "paddusb %%mm3, %%mm0 \n\t"
513 "pxor %%mm2, %%mm0 \n\t"
9c9e467d 514 "movq %%mm0, (%%ecx) \n\t" // line 5
d5a1a995 515
9f45d04d 516 PAVGB(%%mm7, %%mm1) // d/4
d5a1a995
MN
517
518 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
519 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
9f45d04d 520 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
521 "pxor %%mm2, %%mm0 \n\t"
522 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
523
9c9e467d 524 "movq (%%ecx, %1), %%mm0 \n\t" // line 6
d5a1a995 525 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
9f45d04d 526 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995 527 "pxor %%mm2, %%mm0 \n\t"
9c9e467d 528 "movq %%mm0, (%%ecx, %1) \n\t" // line 6
d5a1a995 529
9f45d04d 530 PAVGB(%%mm7, %%mm1) // d/8
d5a1a995
MN
531
532 "movq (%%eax, %1), %%mm0 \n\t" // line 2
533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
9f45d04d 534 "psubusb %%mm1, %%mm0 \n\t"
d5a1a995
MN
535 "pxor %%mm2, %%mm0 \n\t"
536 "movq %%mm0, (%%eax, %1) \n\t" // line 2
537
9c9e467d 538 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7
d5a1a995 539 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
9f45d04d 540 "paddusb %%mm1, %%mm0 \n\t"
d5a1a995 541 "pxor %%mm2, %%mm0 \n\t"
9c9e467d 542 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7
13e00528
A
543
544 :
9c9e467d
MN
545 : "r" (src), "r" (stride), "m" (co->pQPb)
546 : "%eax", "%ecx"
13e00528
A
547 );
548#else
d5a1a995
MN
549
// Byte offsets of rows 1..7 relative to src (after the += stride*3 below).
550 const int l1= stride;
551 const int l2= stride + l1;
552 const int l3= stride + l2;
553 const int l4= stride + l3;
554 const int l5= stride + l4;
555 const int l6= stride + l5;
556 const int l7= stride + l6;
e5c30e06
MN
557// const int l8= stride + l7;
558// const int l9= stride + l8;
d5a1a995 559 int x;
acced553
MN
560
561 src+= stride*3;
d5a1a995
MN
562 for(x=0; x<BLOCK_SIZE; x++)
563 {
564 int a= src[l3] - src[l4];
565 int b= src[l4] - src[l5];
9f45d04d 566 int c= src[l5] - src[l6];
d5a1a995 567
// d: how much the step across the l4/l5 boundary exceeds the average
// of the neighbouring steps, clamped at 0.
3407a972
MN
568 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
569 d= MAX(d, 0);
d5a1a995 570
9c9e467d 571 if(d < co->QP*2)
d5a1a995
MN
572 {
573 int v = d * SIGN(-b);
574
3407a972
MN
575 src[l2] +=v>>3;
576 src[l3] +=v>>2;
577 src[l4] +=(3*v)>>3;
578 src[l5] -=(3*v)>>3;
579 src[l6] -=v>>2;
580 src[l7] -=v>>3;
d5a1a995
MN
581
582 }
583 src++;
584 }
13e00528
A
585#endif
586}
587
b0ac780a 588#ifndef HAVE_ALTIVEC
9c9e467d 589static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
3057fa66 590{
7f16f6e6
MN
591#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
592/*
593 uint8_t tmp[16];
594 const int l1= stride;
595 const int l2= stride + l1;
596 const int l3= stride + l2;
597 const int l4= (int)tmp - (int)src - stride*3;
598 const int l5= (int)tmp - (int)src - stride*3 + 8;
599 const int l6= stride*3 + l3;
600 const int l7= stride + l6;
601 const int l8= stride + l7;
602
603 memcpy(tmp, src+stride*7, 8);
604 memcpy(tmp+8, src+stride*8, 8);
605*/
606 src+= stride*4;
607 asm volatile(
608
609#if 0 //sligtly more accurate and slightly slower
610 "pxor %%mm7, %%mm7 \n\t" // 0
611 "leal (%0, %1), %%eax \n\t"
9c9e467d 612 "leal (%%eax, %1, 4), %%ecx \n\t"
7f16f6e6 613// 0 1 2 3 4 5 6 7
9c9e467d
MN
614// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
615// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
7f16f6e6
MN
616
617
618 "movq (%0, %1, 2), %%mm0 \n\t" // l2
619 "movq (%0), %%mm1 \n\t" // l0
620 "movq %%mm0, %%mm2 \n\t" // l2
621 PAVGB(%%mm7, %%mm0) // ~l2/2
622 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
623 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
624
625 "movq (%%eax), %%mm1 \n\t" // l1
626 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
627 "movq %%mm1, %%mm4 \n\t" // l1
628 PAVGB(%%mm7, %%mm1) // ~l1/2
629 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
630 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
631
632 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
633 "psubusb %%mm1, %%mm0 \n\t"
634 "psubusb %%mm4, %%mm1 \n\t"
635 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
636// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
637
638 "movq (%0, %1, 4), %%mm0 \n\t" // l4
639 "movq %%mm0, %%mm4 \n\t" // l4
640 PAVGB(%%mm7, %%mm0) // ~l4/2
641 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
642 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
643
9c9e467d 644 "movq (%%ecx), %%mm2 \n\t" // l5
7f16f6e6
MN
645 "movq %%mm3, %%mm5 \n\t" // l3
646 PAVGB(%%mm7, %%mm3) // ~l3/2
647 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
648 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
649
650 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
651 "psubusb %%mm3, %%mm0 \n\t"
652 "psubusb %%mm6, %%mm3 \n\t"
653 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
654 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
655// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
656
9c9e467d 657 "movq (%%ecx, %1), %%mm6 \n\t" // l6
7f16f6e6
MN
658 "movq %%mm6, %%mm5 \n\t" // l6
659 PAVGB(%%mm7, %%mm6) // ~l6/2
660 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
661 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
662
9c9e467d 663 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7
7f16f6e6
MN
664 "movq %%mm2, %%mm4 \n\t" // l5
665 PAVGB(%%mm7, %%mm2) // ~l5/2
666 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
667 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
668
669 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
670 "psubusb %%mm2, %%mm6 \n\t"
671 "psubusb %%mm4, %%mm2 \n\t"
672 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
673// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
674
675
676 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
9c9e467d 677 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
9b464428 678 "paddusb "MANGLE(b01)", %%mm4 \n\t"
7f16f6e6
MN
679 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
680 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
681 "pand %%mm4, %%mm3 \n\t"
682
683 "movq %%mm3, %%mm1 \n\t"
9b464428 684// "psubusb "MANGLE(b01)", %%mm3 \n\t"
7f16f6e6
MN
685 PAVGB(%%mm7, %%mm3)
686 PAVGB(%%mm7, %%mm3)
687 "paddusb %%mm1, %%mm3 \n\t"
9b464428 688// "paddusb "MANGLE(b01)", %%mm3 \n\t"
7f16f6e6
MN
689
690 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
691 "movq (%0, %1, 4), %%mm5 \n\t" //l4
692 "movq (%0, %1, 4), %%mm4 \n\t" //l4
693 "psubusb %%mm6, %%mm5 \n\t"
694 "psubusb %%mm4, %%mm6 \n\t"
695 "por %%mm6, %%mm5 \n\t" // |l3-l4|
696 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
697 "pxor %%mm6, %%mm0 \n\t"
698 "pand %%mm0, %%mm3 \n\t"
699 PMINUB(%%mm5, %%mm3, %%mm0)
700
9b464428 701 "psubusb "MANGLE(b01)", %%mm3 \n\t"
7f16f6e6
MN
702 PAVGB(%%mm7, %%mm3)
703
704 "movq (%%eax, %1, 2), %%mm0 \n\t"
705 "movq (%0, %1, 4), %%mm2 \n\t"
706 "pxor %%mm6, %%mm0 \n\t"
707 "pxor %%mm6, %%mm2 \n\t"
708 "psubb %%mm3, %%mm0 \n\t"
709 "paddb %%mm3, %%mm2 \n\t"
710 "pxor %%mm6, %%mm0 \n\t"
711 "pxor %%mm6, %%mm2 \n\t"
712 "movq %%mm0, (%%eax, %1, 2) \n\t"
713 "movq %%mm2, (%0, %1, 4) \n\t"
714#endif
715
716 "leal (%0, %1), %%eax \n\t"
717 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
718// 0 1 2 3 4 5 6 7
9c9e467d
MN
719// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
720// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
7f16f6e6
MN
721
722
723 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
724 "movq (%0, %1, 4), %%mm0 \n\t" // l4
725 "pxor %%mm6, %%mm1 \n\t" // -l3-1
726 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
727// mm1=-l3-1, mm0=128-q
728
729 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
730 "movq (%%eax, %1), %%mm3 \n\t" // l2
731 "pxor %%mm6, %%mm2 \n\t" // -l5-1
732 "movq %%mm2, %%mm5 \n\t" // -l5-1
9b464428 733 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
9c9e467d 734 "leal (%%eax, %1, 4), %%ecx \n\t"
7f16f6e6
MN
735 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
736 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
737 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
738 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
739// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
740
741 "movq (%%eax), %%mm2 \n\t" // l1
742 "pxor %%mm6, %%mm2 \n\t" // -l1-1
743 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
744 PAVGB((%0), %%mm1) // (l0-l3+256)/2
9b464428 745 "movq "MANGLE(b80)", %%mm3 \n\t" // 128
7f16f6e6
MN
746 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
747 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
748 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
749// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
750
9c9e467d
MN
751 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2
752 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7
7f16f6e6
MN
753 "pxor %%mm6, %%mm1 \n\t" // -l7-1
754 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
9b464428 755 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
7f16f6e6
MN
756 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
757 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
758 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
759// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
760
9b464428
FB
761 "movq "MANGLE(b00)", %%mm1 \n\t" // 0
762 "movq "MANGLE(b00)", %%mm5 \n\t" // 0
7f16f6e6
MN
763 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
764 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
765 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
766 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
767 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
768
769// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
770
9b464428 771 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
9c9e467d 772 "movq %2, %%mm2 \n\t" // QP
7f16f6e6
MN
773 PAVGB(%%mm6, %%mm2) // 128 + QP/2
774 "psubb %%mm6, %%mm2 \n\t"
775
776 "movq %%mm4, %%mm1 \n\t"
777 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
778 "pxor %%mm1, %%mm4 \n\t"
779 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
780 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
781 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
782// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
783
784 "movq %%mm4, %%mm3 \n\t" // d
9b464428 785 "psubusb "MANGLE(b01)", %%mm4 \n\t"
7f16f6e6
MN
786 PAVGB(%%mm7, %%mm4) // d/32
787 PAVGB(%%mm7, %%mm4) // (d + 32)/64
788 "paddb %%mm3, %%mm4 \n\t" // 5d/64
789 "pand %%mm2, %%mm4 \n\t"
790
9b464428 791 "movq "MANGLE(b80)", %%mm5 \n\t" // 128
7f16f6e6
MN
792 "psubb %%mm0, %%mm5 \n\t" // q
793 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
794 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
795 "pxor %%mm7, %%mm5 \n\t"
796
797 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
798 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
799
800 "pand %%mm7, %%mm4 \n\t"
801 "movq (%%eax, %1, 2), %%mm0 \n\t"
802 "movq (%0, %1, 4), %%mm2 \n\t"
803 "pxor %%mm1, %%mm0 \n\t"
804 "pxor %%mm1, %%mm2 \n\t"
805 "paddb %%mm4, %%mm0 \n\t"
806 "psubb %%mm4, %%mm2 \n\t"
807 "pxor %%mm1, %%mm0 \n\t"
808 "pxor %%mm1, %%mm2 \n\t"
809 "movq %%mm0, (%%eax, %1, 2) \n\t"
810 "movq %%mm2, (%0, %1, 4) \n\t"
811
812 :
9c9e467d
MN
813 : "r" (src), "r" (stride), "m" (c->pQPb)
814 : "%eax", "%ecx"
7f16f6e6
MN
815 );
816
817/*
818 {
819 int x;
820 src-= stride;
821 for(x=0; x<BLOCK_SIZE; x++)
822 {
823 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
824 if(ABS(middleEnergy)< 8*QP)
825 {
826 const int q=(src[l4] - src[l5])/2;
827 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
828 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
829
830 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
831 d= MAX(d, 0);
832
833 d= (5*d + 32) >> 6;
834 d*= SIGN(-middleEnergy);
835
836 if(q>0)
837 {
838 d= d<0 ? 0 : d;
839 d= d>q ? q : d;
840 }
841 else
842 {
843 d= d>0 ? 0 : d;
844 d= d<q ? q : d;
845 }
846
847 src[l4]-= d;
848 src[l5]+= d;
849 }
850 src++;
851 }
852src-=8;
853 for(x=0; x<8; x++)
854 {
855 int y;
856 for(y=4; y<6; y++)
857 {
858 int d= src[x+y*stride] - tmp[x+(y-4)*8];
859 int ad= ABS(d);
860 static int max=0;
861 static int sum=0;
862 static int num=0;
863 static int bias=0;
864
865 if(max<ad) max=ad;
866 sum+= ad>3 ? 1 : 0;
867 if(ad>3)
868 {
869 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
870 }
871 if(y==4) bias+=d;
872 num++;
873 if(num%1000000 == 0)
874 {
875 printf(" %d %d %d %d\n", num, sum, max, bias);
876 }
877 }
878 }
879}
880*/
881#elif defined (HAVE_MMX)
acced553 882 src+= stride*4;
3057fa66
A
883 asm volatile(
884 "pxor %%mm7, %%mm7 \n\t"
9c9e467d
MN
885 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
886 "andl $0xFFFFFFF8, %%ecx \n\t" // align
3057fa66 887// 0 1 2 3 4 5 6 7
9c9e467d
MN
888// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
889// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
3057fa66
A
890
891 "movq (%0), %%mm0 \n\t"
892 "movq %%mm0, %%mm1 \n\t"
893 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
894 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
895
01dc3aa4
MN
896 "movq (%0, %1), %%mm2 \n\t"
897 "leal (%0, %1, 2), %%eax \n\t"
3057fa66
A
898 "movq %%mm2, %%mm3 \n\t"
899 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
900 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
901
01dc3aa4 902 "movq (%%eax), %%mm4 \n\t"
3057fa66
A
903 "movq %%mm4, %%mm5 \n\t"
904 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
905 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
906
907 "paddw %%mm0, %%mm0 \n\t" // 2L0
908 "paddw %%mm1, %%mm1 \n\t" // 2H0
909 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
910 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
911 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
912 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
913
914 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
915 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
916 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
917 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
918
01dc3aa4 919 "movq (%%eax, %1), %%mm2 \n\t"
3057fa66
A
920 "movq %%mm2, %%mm3 \n\t"
921 "punpcklbw %%mm7, %%mm2 \n\t" // L3
922 "punpckhbw %%mm7, %%mm3 \n\t" // H3
923
924 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
925 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
926 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
927 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
9c9e467d
MN
928 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
929 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3057fa66 930
01dc3aa4 931 "movq (%%eax, %1, 2), %%mm0 \n\t"
3057fa66
A
932 "movq %%mm0, %%mm1 \n\t"
933 "punpcklbw %%mm7, %%mm0 \n\t" // L4
934 "punpckhbw %%mm7, %%mm1 \n\t" // H4
935
936 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
937 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
9c9e467d
MN
938 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4
939 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4
3057fa66
A
940 "paddw %%mm4, %%mm4 \n\t" // 2L2
941 "paddw %%mm5, %%mm5 \n\t" // 2H2
942 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
943 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
944
01dc3aa4 945 "leal (%%eax, %1), %0 \n\t"
3057fa66
A
946 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
947 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
950//50 opcodes so far
01dc3aa4 951 "movq (%0, %1, 2), %%mm2 \n\t"
3057fa66
A
952 "movq %%mm2, %%mm3 \n\t"
953 "punpcklbw %%mm7, %%mm2 \n\t" // L5
954 "punpckhbw %%mm7, %%mm3 \n\t" // H5
955 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
956 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
957 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
958 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
959
01dc3aa4 960 "movq (%%eax, %1, 4), %%mm6 \n\t"
3057fa66
A
961 "punpcklbw %%mm7, %%mm6 \n\t" // L6
962 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
01dc3aa4 963 "movq (%%eax, %1, 4), %%mm6 \n\t"
3057fa66
A
964 "punpckhbw %%mm7, %%mm6 \n\t" // H6
965 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
966
967 "paddw %%mm0, %%mm0 \n\t" // 2L4
968 "paddw %%mm1, %%mm1 \n\t" // 2H4
969 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
970 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
971
972 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
973 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
974 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
975 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
976
01dc3aa4 977 "movq (%0, %1, 4), %%mm2 \n\t"
3057fa66
A
978 "movq %%mm2, %%mm3 \n\t"
979 "punpcklbw %%mm7, %%mm2 \n\t" // L7
980 "punpckhbw %%mm7, %%mm3 \n\t" // H7
981
982 "paddw %%mm2, %%mm2 \n\t" // 2L7
983 "paddw %%mm3, %%mm3 \n\t" // 2H7
984 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
985 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
986
9c9e467d
MN
987 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
988 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
8405b3fd
MN
989
990#ifdef HAVE_MMX2
991 "movq %%mm7, %%mm6 \n\t" // 0
992 "psubw %%mm0, %%mm6 \n\t"
993 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
994 "movq %%mm7, %%mm6 \n\t" // 0
995 "psubw %%mm1, %%mm6 \n\t"
996 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
997 "movq %%mm7, %%mm6 \n\t" // 0
998 "psubw %%mm2, %%mm6 \n\t"
999 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1000 "movq %%mm7, %%mm6 \n\t" // 0
1001 "psubw %%mm3, %%mm6 \n\t"
1002 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1003#else
3057fa66
A
1004 "movq %%mm7, %%mm6 \n\t" // 0
1005 "pcmpgtw %%mm0, %%mm6 \n\t"
1006 "pxor %%mm6, %%mm0 \n\t"
1007 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1008 "movq %%mm7, %%mm6 \n\t" // 0
1009 "pcmpgtw %%mm1, %%mm6 \n\t"
1010 "pxor %%mm6, %%mm1 \n\t"
1011 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3057fa66
A
1012 "movq %%mm7, %%mm6 \n\t" // 0
1013 "pcmpgtw %%mm2, %%mm6 \n\t"
1014 "pxor %%mm6, %%mm2 \n\t"
1015 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1016 "movq %%mm7, %%mm6 \n\t" // 0
1017 "pcmpgtw %%mm3, %%mm6 \n\t"
1018 "pxor %%mm6, %%mm3 \n\t"
1019 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
8405b3fd 1020#endif
3057fa66
A
1021
1022#ifdef HAVE_MMX2
1023 "pminsw %%mm2, %%mm0 \n\t"
1024 "pminsw %%mm3, %%mm1 \n\t"
1025#else
1026 "movq %%mm0, %%mm6 \n\t"
1027 "psubusw %%mm2, %%mm6 \n\t"
1028 "psubw %%mm6, %%mm0 \n\t"
1029 "movq %%mm1, %%mm6 \n\t"
1030 "psubusw %%mm3, %%mm6 \n\t"
1031 "psubw %%mm6, %%mm1 \n\t"
1032#endif
1033
792a5a7c
MN
1034 "movd %2, %%mm2 \n\t" // QP
1035 "punpcklbw %%mm7, %%mm2 \n\t"
1036
3057fa66
A
1037 "movq %%mm7, %%mm6 \n\t" // 0
1038 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1039 "pxor %%mm6, %%mm4 \n\t"
1040 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1041 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1042 "pxor %%mm7, %%mm5 \n\t"
1043 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1044// 100 opcodes
3057fa66
A
1045 "psllw $3, %%mm2 \n\t" // 8QP
1046 "movq %%mm2, %%mm3 \n\t" // 8QP
1047 "pcmpgtw %%mm4, %%mm2 \n\t"
1048 "pcmpgtw %%mm5, %%mm3 \n\t"
1049 "pand %%mm2, %%mm4 \n\t"
1050 "pand %%mm3, %%mm5 \n\t"
1051
1052
1053 "psubusw %%mm0, %%mm4 \n\t" // hd
1054 "psubusw %%mm1, %%mm5 \n\t" // ld
1055
1056
bf1595c4 1057 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3057fa66
A
1058 "pmullw %%mm2, %%mm4 \n\t"
1059 "pmullw %%mm2, %%mm5 \n\t"
bf1595c4 1060 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3057fa66
A
1061 "paddw %%mm2, %%mm4 \n\t"
1062 "paddw %%mm2, %%mm5 \n\t"
1063 "psrlw $6, %%mm4 \n\t"
1064 "psrlw $6, %%mm5 \n\t"
1065
9c9e467d
MN
1066 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4
1067 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4
3057fa66
A
1068
1069 "pxor %%mm2, %%mm2 \n\t"
1070 "pxor %%mm3, %%mm3 \n\t"
1071
3057fa66
A
1072 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1073 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1074 "pxor %%mm2, %%mm0 \n\t"
1075 "pxor %%mm3, %%mm1 \n\t"
1076 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1077 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
e5c30e06
MN
1078 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1079 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3057fa66
A
1080
1081 "pxor %%mm6, %%mm2 \n\t"
1082 "pxor %%mm7, %%mm3 \n\t"
1083 "pand %%mm2, %%mm4 \n\t"
1084 "pand %%mm3, %%mm5 \n\t"
1085
1086#ifdef HAVE_MMX2
1087 "pminsw %%mm0, %%mm4 \n\t"
1088 "pminsw %%mm1, %%mm5 \n\t"
1089#else
1090 "movq %%mm4, %%mm2 \n\t"
1091 "psubusw %%mm0, %%mm2 \n\t"
1092 "psubw %%mm2, %%mm4 \n\t"
1093 "movq %%mm5, %%mm2 \n\t"
1094 "psubusw %%mm1, %%mm2 \n\t"
1095 "psubw %%mm2, %%mm5 \n\t"
1096#endif
1097 "pxor %%mm6, %%mm4 \n\t"
1098 "pxor %%mm7, %%mm5 \n\t"
1099 "psubw %%mm6, %%mm4 \n\t"
1100 "psubw %%mm7, %%mm5 \n\t"
1101 "packsswb %%mm5, %%mm4 \n\t"
01dc3aa4 1102 "movq (%0), %%mm0 \n\t"
3057fa66 1103 "paddb %%mm4, %%mm0 \n\t"
01dc3aa4
MN
1104 "movq %%mm0, (%0) \n\t"
1105 "movq (%0, %1), %%mm0 \n\t"
3057fa66 1106 "psubb %%mm4, %%mm0 \n\t"
01dc3aa4 1107 "movq %%mm0, (%0, %1) \n\t"
3057fa66 1108
01dc3aa4
MN
1109 : "+r" (src)
1110 : "r" (stride), "m" (c->pQPb)
1111 : "%eax", "%ecx"
3057fa66
A
1112 );
1113#else
1114 const int l1= stride;
1115 const int l2= stride + l1;
1116 const int l3= stride + l2;
1117 const int l4= stride + l3;
1118 const int l5= stride + l4;
1119 const int l6= stride + l5;
1120 const int l7= stride + l6;
1121 const int l8= stride + l7;
1122// const int l9= stride + l8;
d5a1a995 1123 int x;
acced553 1124 src+= stride*3;
d5a1a995 1125 for(x=0; x<BLOCK_SIZE; x++)
3057fa66
A
1126 {
1127 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
9c9e467d 1128 if(ABS(middleEnergy) < 8*c->QP)
3057fa66
A
1129 {
1130 const int q=(src[l4] - src[l5])/2;
1131 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1132 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1133
1134 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1135 d= MAX(d, 0);
1136
1137 d= (5*d + 32) >> 6;
1138 d*= SIGN(-middleEnergy);
1139
1140 if(q>0)
1141 {
1142 d= d<0 ? 0 : d;
1143 d= d>q ? q : d;
1144 }
1145 else
1146 {
1147 d= d>0 ? 0 : d;
1148 d= d<q ? q : d;
1149 }
1150
1151 src[l4]-= d;
1152 src[l5]+= d;
1153 }
1154 src++;
1155 }
1156#endif
1157}
b0ac780a 1158#endif //HAVE_ALTIVEC
3057fa66 1159
b0ac780a 1160#ifndef HAVE_ALTIVEC
9c9e467d 1161static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
3057fa66 1162{
e0f8ffae 1163#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
3057fa66 1164 asm volatile(
9c9e467d
MN
1165 "pxor %%mm6, %%mm6 \n\t"
1166 "pcmpeqb %%mm7, %%mm7 \n\t"
1167 "movq %2, %%mm0 \n\t"
1168 "punpcklbw %%mm6, %%mm0 \n\t"
1169 "psrlw $1, %%mm0 \n\t"
1170 "psubw %%mm7, %%mm0 \n\t"
1171 "packuswb %%mm0, %%mm0 \n\t"
1172 "movq %%mm0, %3 \n\t"
70c5ae87 1173
3057fa66 1174 "leal (%0, %1), %%eax \n\t"
9c9e467d
MN
1175 "leal (%%eax, %1, 4), %%edx \n\t"
1176
3057fa66 1177// 0 1 2 3 4 5 6 7 8 9
9c9e467d 1178// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
3057fa66 1179
cc9b0679 1180#undef FIND_MIN_MAX
e0f8ffae 1181#ifdef HAVE_MMX2
3057fa66 1182#define FIND_MIN_MAX(addr)\
70c5ae87 1183 "movq " #addr ", %%mm0 \n\t"\
cd38e322
MN
1184 "pminub %%mm0, %%mm7 \n\t"\
1185 "pmaxub %%mm0, %%mm6 \n\t"
e0f8ffae
MN
1186#else
1187#define FIND_MIN_MAX(addr)\
1188 "movq " #addr ", %%mm0 \n\t"\
cd38e322
MN
1189 "movq %%mm7, %%mm1 \n\t"\
1190 "psubusb %%mm0, %%mm6 \n\t"\
1191 "paddb %%mm0, %%mm6 \n\t"\
e0f8ffae 1192 "psubusb %%mm0, %%mm1 \n\t"\
cd38e322 1193 "psubb %%mm1, %%mm7 \n\t"
e0f8ffae 1194#endif
3057fa66 1195
70c5ae87
MN
1196FIND_MIN_MAX((%%eax))
1197FIND_MIN_MAX((%%eax, %1))
1198FIND_MIN_MAX((%%eax, %1, 2))
1199FIND_MIN_MAX((%0, %1, 4))
9c9e467d
MN
1200FIND_MIN_MAX((%%edx))
1201FIND_MIN_MAX((%%edx, %1))
1202FIND_MIN_MAX((%%edx, %1, 2))
70c5ae87 1203FIND_MIN_MAX((%0, %1, 8))
3057fa66 1204
3057fa66 1205 "movq %%mm7, %%mm4 \n\t"
e5c30e06 1206 "psrlq $8, %%mm7 \n\t"
e5c30e06 1207#ifdef HAVE_MMX2
cd38e322 1208 "pminub %%mm4, %%mm7 \n\t" // min of pixels
e5c30e06 1209 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
cd38e322 1210 "pminub %%mm4, %%mm7 \n\t" // min of pixels
e5c30e06 1211 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
cd38e322 1212 "pminub %%mm4, %%mm7 \n\t"
e5c30e06 1213#else
cd38e322
MN
1214 "movq %%mm7, %%mm1 \n\t"
1215 "psubusb %%mm4, %%mm1 \n\t"
1216 "psubb %%mm1, %%mm7 \n\t"
3057fa66
A
1217 "movq %%mm7, %%mm4 \n\t"
1218 "psrlq $16, %%mm7 \n\t"
cd38e322
MN
1219 "movq %%mm7, %%mm1 \n\t"
1220 "psubusb %%mm4, %%mm1 \n\t"
1221 "psubb %%mm1, %%mm7 \n\t"
3057fa66 1222 "movq %%mm7, %%mm4 \n\t"
e5c30e06 1223 "psrlq $32, %%mm7 \n\t"
cd38e322
MN
1224 "movq %%mm7, %%mm1 \n\t"
1225 "psubusb %%mm4, %%mm1 \n\t"
1226 "psubb %%mm1, %%mm7 \n\t"
e5c30e06 1227#endif
cd38e322
MN
1228
1229
1230 "movq %%mm6, %%mm4 \n\t"
1231 "psrlq $8, %%mm6 \n\t"
1232#ifdef HAVE_MMX2
1233 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1234 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1235 "pmaxub %%mm4, %%mm6 \n\t"
1236 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1237 "pmaxub %%mm4, %%mm6 \n\t"
1238#else
1239 "psubusb %%mm4, %%mm6 \n\t"
1240 "paddb %%mm4, %%mm6 \n\t"
1241 "movq %%mm6, %%mm4 \n\t"
1242 "psrlq $16, %%mm6 \n\t"
1243 "psubusb %%mm4, %%mm6 \n\t"
1244 "paddb %%mm4, %%mm6 \n\t"
1245 "movq %%mm6, %%mm4 \n\t"
1246 "psrlq $32, %%mm6 \n\t"
1247 "psubusb %%mm4, %%mm6 \n\t"
1248 "paddb %%mm4, %%mm6 \n\t"
1249#endif
1250 "movq %%mm6, %%mm0 \n\t" // max
1251 "psubb %%mm7, %%mm6 \n\t" // max - min
1252 "movd %%mm6, %%ecx \n\t"
9b464428 1253 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
cd38e322 1254 " jb 1f \n\t"
9c9e467d
MN
1255 "leal -24(%%esp), %%ecx \n\t"
1256 "andl $0xFFFFFFF8, %%ecx \n\t"
cd38e322 1257 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
e5c30e06
MN
1258 "punpcklbw %%mm7, %%mm7 \n\t"
1259 "punpcklbw %%mm7, %%mm7 \n\t"
1260 "punpcklbw %%mm7, %%mm7 \n\t"
9c9e467d 1261 "movq %%mm7, (%%ecx) \n\t"
70c5ae87
MN
1262
1263 "movq (%0), %%mm0 \n\t" // L10
1264 "movq %%mm0, %%mm1 \n\t" // L10
1265 "movq %%mm0, %%mm2 \n\t" // L10
1266 "psllq $8, %%mm1 \n\t"
1267 "psrlq $8, %%mm2 \n\t"
1268 "movd -4(%0), %%mm3 \n\t"
1269 "movd 8(%0), %%mm4 \n\t"
1270 "psrlq $24, %%mm3 \n\t"
1271 "psllq $56, %%mm4 \n\t"
1272 "por %%mm3, %%mm1 \n\t" // L00
1273 "por %%mm4, %%mm2 \n\t" // L20
1274 "movq %%mm1, %%mm3 \n\t" // L00
1275 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1276 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1277 "psubusb %%mm7, %%mm0 \n\t"
1278 "psubusb %%mm7, %%mm2 \n\t"
1279 "psubusb %%mm7, %%mm3 \n\t"
9b464428
FB
1280 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1281 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1282 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
70c5ae87
MN
1283 "paddb %%mm2, %%mm0 \n\t"
1284 "paddb %%mm3, %%mm0 \n\t"
1285
1286 "movq (%%eax), %%mm2 \n\t" // L11
1287 "movq %%mm2, %%mm3 \n\t" // L11
1288 "movq %%mm2, %%mm4 \n\t" // L11
1289 "psllq $8, %%mm3 \n\t"
1290 "psrlq $8, %%mm4 \n\t"
1291 "movd -4(%%eax), %%mm5 \n\t"
1292 "movd 8(%%eax), %%mm6 \n\t"
1293 "psrlq $24, %%mm5 \n\t"
1294 "psllq $56, %%mm6 \n\t"
1295 "por %%mm5, %%mm3 \n\t" // L01
1296 "por %%mm6, %%mm4 \n\t" // L21
1297 "movq %%mm3, %%mm5 \n\t" // L01
1298 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1299 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1300 "psubusb %%mm7, %%mm2 \n\t"
1301 "psubusb %%mm7, %%mm4 \n\t"
1302 "psubusb %%mm7, %%mm5 \n\t"
9b464428
FB
1303 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1304 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1305 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
70c5ae87
MN
1306 "paddb %%mm4, %%mm2 \n\t"
1307 "paddb %%mm5, %%mm2 \n\t"
1308// 0, 2, 3, 1
1309#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1310 "movq " #src ", " #sx " \n\t" /* src[0] */\
1311 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1312 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1313 "psllq $8, " #lx " \n\t"\
1314 "psrlq $8, " #t0 " \n\t"\
1315 "movd -4" #src ", " #t1 " \n\t"\
1316 "psrlq $24, " #t1 " \n\t"\
1317 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1318 "movd 8" #src ", " #t1 " \n\t"\
1319 "psllq $56, " #t1 " \n\t"\
1320 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1321 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1322 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1323 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
9927c7ee 1324 PAVGB(lx, pplx) \
9c9e467d
MN
1325 "movq " #lx ", 8(%%ecx) \n\t"\
1326 "movq (%%ecx), " #lx " \n\t"\
8405b3fd
MN
1327 "psubusb " #lx ", " #t1 " \n\t"\
1328 "psubusb " #lx ", " #t0 " \n\t"\
1329 "psubusb " #lx ", " #sx " \n\t"\
9b464428 1330 "movq "MANGLE(b00)", " #lx " \n\t"\
8405b3fd
MN
1331 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1332 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1333 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
70c5ae87
MN
1334 "paddb " #t1 ", " #t0 " \n\t"\
1335 "paddb " #t0 ", " #sx " \n\t"\
1336\
70c5ae87
MN
1337 PAVGB(plx, pplx) /* filtered */\
1338 "movq " #dst ", " #t0 " \n\t" /* dst */\
2e212618 1339 "movq " #t0 ", " #t1 " \n\t" /* dst */\
9c9e467d
MN
1340 "psubusb %3, " #t0 " \n\t"\
1341 "paddusb %3, " #t1 " \n\t"\
2e212618
MN
1342 PMAXUB(t0, pplx)\
1343 PMINUB(t1, pplx, t0)\
70c5ae87
MN
1344 "paddb " #sx ", " #ppsx " \n\t"\
1345 "paddb " #psx ", " #ppsx " \n\t"\
9b464428
FB
1346 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1347 "pand "MANGLE(b08)", " #ppsx " \n\t"\
8405b3fd 1348 "pcmpeqb " #lx ", " #ppsx " \n\t"\
2e212618 1349 "pand " #ppsx ", " #pplx " \n\t"\
70c5ae87 1350 "pandn " #dst ", " #ppsx " \n\t"\
8405b3fd 1351 "por " #pplx ", " #ppsx " \n\t"\
9927c7ee 1352 "movq " #ppsx ", " #dst " \n\t"\
9c9e467d 1353 "movq 8(%%ecx), " #lx " \n\t"
2e212618 1354
70c5ae87
MN
1355/*
13560000000
13571111111
e5c30e06 1358
70c5ae87
MN
13591111110
13601111101
13611111100
13621111011
13631111010
13641111001
e5c30e06 1365
70c5ae87
MN
13661111000
13671110111
e5c30e06 1368
70c5ae87
MN
1369*/
1370//DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1371DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1372DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1373DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
9c9e467d
MN
1374DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1375DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1376DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1377DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1378DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
3057fa66 1379
cd38e322 1380 "1: \n\t"
9c9e467d
MN
1381 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1382 : "%eax", "%edx", "%ecx"
3057fa66
A
1383 );
1384#else
2e212618
MN
1385 int y;
1386 int min=255;
1387 int max=0;
1388 int avg;
1389 uint8_t *p;
1390 int s[10];
9c9e467d 1391 const int QP2= c->QP/2 + 1;
2e212618
MN
1392
1393 for(y=1; y<9; y++)
1394 {
1395 int x;
1396 p= src + stride*y;
1397 for(x=1; x<9; x++)
1398 {
1399 p++;
1400 if(*p > max) max= *p;
1401 if(*p < min) min= *p;
1402 }
1403 }
9c9e467d 1404 avg= (min + max + 1)>>1;
2e212618 1405
cd38e322
MN
1406 if(max - min <deringThreshold) return;
1407
2e212618
MN
1408 for(y=0; y<10; y++)
1409 {
2e212618 1410 int t = 0;
9c9e467d
MN
1411
1412 if(src[stride*y + 0] > avg) t+= 1;
1413 if(src[stride*y + 1] > avg) t+= 2;
1414 if(src[stride*y + 2] > avg) t+= 4;
1415 if(src[stride*y + 3] > avg) t+= 8;
1416 if(src[stride*y + 4] > avg) t+= 16;
1417 if(src[stride*y + 5] > avg) t+= 32;
1418 if(src[stride*y + 6] > avg) t+= 64;
1419 if(src[stride*y + 7] > avg) t+= 128;
1420 if(src[stride*y + 8] > avg) t+= 256;
1421 if(src[stride*y + 9] > avg) t+= 512;
1422
2e212618
MN
1423 t |= (~t)<<16;
1424 t &= (t<<1) & (t>>1);
1425 s[y] = t;
1426 }
9c9e467d 1427
2e212618
MN
1428 for(y=1; y<9; y++)
1429 {
2e212618
MN
1430 int t = s[y-1] & s[y] & s[y+1];
1431 t|= t>>16;
9c9e467d
MN
1432 s[y-1]= t;
1433 }
1434
1435 for(y=1; y<9; y++)
1436 {
1437 int x;
1438 int t = s[y-1];
2e212618
MN
1439
1440 p= src + stride*y;
1441 for(x=1; x<9; x++)
1442 {
1443 p++;
1444 if(t & (1<<x))
1445 {
1446 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1447 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1448 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1449 f= (f + 8)>>4;
1450
cd38e322
MN
1451#ifdef DEBUG_DERING_THRESHOLD
1452 asm volatile("emms\n\t":);
1453 {
1454 static long long numPixels=0;
1455 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1456// if((max-min)<20 || (max-min)*QP<200)
1457// if((max-min)*QP < 500)
1458// if(max-min<QP/2)
1459 if(max-min < 20)
1460 {
1461 static int numSkiped=0;
1462 static int errorSum=0;
1463 static int worstQP=0;
1464 static int worstRange=0;
1465 static int worstDiff=0;
1466 int diff= (f - *p);
1467 int absDiff= ABS(diff);
1468 int error= diff*diff;
1469
1470 if(x==1 || x==8 || y==1 || y==8) continue;
1471
1472 numSkiped++;
1473 if(absDiff > worstDiff)
1474 {
1475 worstDiff= absDiff;
1476 worstQP= QP;
1477 worstRange= max-min;
1478 }
1479 errorSum+= error;
1480
1481 if(1024LL*1024LL*1024LL % numSkiped == 0)
1482 {
1483 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1484 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1485 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1486 worstDiff, (float)numSkiped/numPixels);
1487 }
1488 }
1489 }
1490#endif
9c9e467d
MN
1491 if (*p + QP2 < f) *p= *p + QP2;
1492 else if(*p - QP2 > f) *p= *p - QP2;
2e212618
MN
1493 else *p=f;
1494 }
1495 }
1496 }
cd38e322
MN
1497#ifdef DEBUG_DERING_THRESHOLD
1498 if(max-min < 20)
1499 {
1500 for(y=1; y<9; y++)
1501 {
1502 int x;
1503 int t = 0;
1504 p= src + stride*y;
1505 for(x=1; x<9; x++)
1506 {
1507 p++;
1508 *p = MIN(*p + 20, 255);
1509 }
1510 }
1511// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1512 }
1513#endif
3057fa66
A
1514#endif
1515}
b0ac780a 1516#endif //HAVE_ALTIVEC
3057fa66 1517
3b58b885 1518/**
b304569a 1519 * Deinterlaces the given block by linearly interpolating every second line.
7fb36f6c
MN
1520 * will be called for every 8x8 block and can read & write from line 4-15
1521 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1522 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
3b58b885 1523 */
cc9b0679 1524static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
3b58b885
MN
1525{
1526#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
7fb36f6c 1527 src+= 4*stride;
3b58b885
MN
1528 asm volatile(
1529 "leal (%0, %1), %%eax \n\t"
9c9e467d 1530 "leal (%%eax, %1, 4), %%ecx \n\t"
3b58b885 1531// 0 1 2 3 4 5 6 7 8 9
9c9e467d 1532// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
3b58b885
MN
1533
1534 "movq (%0), %%mm0 \n\t"
1535 "movq (%%eax, %1), %%mm1 \n\t"
acced553 1536 PAVGB(%%mm1, %%mm0)
3b58b885
MN
1537 "movq %%mm0, (%%eax) \n\t"
1538 "movq (%0, %1, 4), %%mm0 \n\t"
acced553 1539 PAVGB(%%mm0, %%mm1)
3b58b885 1540 "movq %%mm1, (%%eax, %1, 2) \n\t"
9c9e467d 1541 "movq (%%ecx, %1), %%mm1 \n\t"
acced553 1542 PAVGB(%%mm1, %%mm0)
9c9e467d 1543 "movq %%mm0, (%%ecx) \n\t"
3b58b885 1544 "movq (%0, %1, 8), %%mm0 \n\t"
acced553 1545 PAVGB(%%mm0, %%mm1)
9c9e467d 1546 "movq %%mm1, (%%ecx, %1, 2) \n\t"
3b58b885
MN
1547
1548 : : "r" (src), "r" (stride)
9c9e467d 1549 : "%eax", "%ecx"
3b58b885
MN
1550 );
1551#else
99d33fa3 1552 int a, b, x;
7fb36f6c 1553 src+= 4*stride;
99d33fa3
MN
1554
1555 for(x=0; x<2; x++){
1556 a= *(uint32_t*)&src[stride*0];
1557 b= *(uint32_t*)&src[stride*2];
1558 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1559 a= *(uint32_t*)&src[stride*4];
1560 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1561 b= *(uint32_t*)&src[stride*6];
1562 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1563 a= *(uint32_t*)&src[stride*8];
1564 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1565 src += 4;
3b58b885
MN
1566 }
1567#endif
1568}
1569
1570/**
b304569a 1571 * Deinterlaces the given block by cubic interpolating every second line.
7fb36f6c
MN
1572 * will be called for every 8x8 block and can read & write from line 4-15
1573 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1574 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1575 * this filter will read lines 3-15 and write 7-13
3b58b885 1576 */
cc9b0679 1577static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
3b58b885
MN
1578{
1579#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
7fb36f6c 1580 src+= stride*3;
3b58b885
MN
1581 asm volatile(
1582 "leal (%0, %1), %%eax \n\t"
9c9e467d
MN
1583 "leal (%%eax, %1, 4), %%edx \n\t"
1584 "leal (%%edx, %1, 4), %%ecx \n\t"
acced553
MN
1585 "addl %1, %%ecx \n\t"
1586 "pxor %%mm7, %%mm7 \n\t"
1587// 0 1 2 3 4 5 6 7 8 9 10
9c9e467d 1588// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
3b58b885 1589
acced553
MN
1590#define DEINT_CUBIC(a,b,c,d,e)\
1591 "movq " #a ", %%mm0 \n\t"\
1592 "movq " #b ", %%mm1 \n\t"\
1593 "movq " #d ", %%mm2 \n\t"\
1594 "movq " #e ", %%mm3 \n\t"\
1595 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1596 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
1597 "movq %%mm0, %%mm2 \n\t"\
1598 "punpcklbw %%mm7, %%mm0 \n\t"\
1599 "punpckhbw %%mm7, %%mm2 \n\t"\
1600 "movq %%mm1, %%mm3 \n\t"\
1601 "punpcklbw %%mm7, %%mm1 \n\t"\
1602 "punpckhbw %%mm7, %%mm3 \n\t"\
1603 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1604 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1605 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1606 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1607 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1608 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1609 "packuswb %%mm3, %%mm1 \n\t"\
1610 "movq %%mm1, " #c " \n\t"
1611
9c9e467d
MN
1612DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1613DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1614DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1615DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
3b58b885
MN
1616
1617 : : "r" (src), "r" (stride)
9c9e467d 1618 : "%eax", "%edx", "ecx"
3b58b885
MN
1619 );
1620#else
1621 int x;
7fb36f6c 1622 src+= stride*3;
3b58b885
MN
1623 for(x=0; x<8; x++)
1624 {
134eb1e5
MN
1625 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1626 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1627 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1628 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
3b58b885
MN
1629 src++;
1630 }
1631#endif
1632}
1633
1634/**
b304569a 1635 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
7fb36f6c
MN
1636 * will be called for every 8x8 block and can read & write from line 4-15
1637 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1638 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
9c9e467d 1639 * this filter will read lines 4-13 and write 5-11
9c9e467d
MN
1640 */
1641static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1642{
1643#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1644 src+= stride*4;
1645 asm volatile(
1646 "leal (%0, %1), %%eax \n\t"
1647 "leal (%%eax, %1, 4), %%edx \n\t"
1648 "pxor %%mm7, %%mm7 \n\t"
1649 "movq (%2), %%mm0 \n\t"
1650// 0 1 2 3 4 5 6 7 8 9 10
1651// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1652
1653#define DEINT_FF(a,b,c,d)\
1654 "movq " #a ", %%mm1 \n\t"\
1655 "movq " #b ", %%mm2 \n\t"\
1656 "movq " #c ", %%mm3 \n\t"\
1657 "movq " #d ", %%mm4 \n\t"\
1658 PAVGB(%%mm3, %%mm1) \
1659 PAVGB(%%mm4, %%mm0) \
1660 "movq %%mm0, %%mm3 \n\t"\
1661 "punpcklbw %%mm7, %%mm0 \n\t"\
1662 "punpckhbw %%mm7, %%mm3 \n\t"\
1663 "movq %%mm1, %%mm4 \n\t"\
1664 "punpcklbw %%mm7, %%mm1 \n\t"\
1665 "punpckhbw %%mm7, %%mm4 \n\t"\
1666 "psllw $2, %%mm1 \n\t"\
1667 "psllw $2, %%mm4 \n\t"\
1668 "psubw %%mm0, %%mm1 \n\t"\
1669 "psubw %%mm3, %%mm4 \n\t"\
1670 "movq %%mm2, %%mm5 \n\t"\
1671 "movq %%mm2, %%mm0 \n\t"\
1672 "punpcklbw %%mm7, %%mm2 \n\t"\
1673 "punpckhbw %%mm7, %%mm5 \n\t"\
1674 "paddw %%mm2, %%mm1 \n\t"\
1675 "paddw %%mm5, %%mm4 \n\t"\
1676 "psraw $2, %%mm1 \n\t"\
1677 "psraw $2, %%mm4 \n\t"\
1678 "packuswb %%mm4, %%mm1 \n\t"\
1679 "movq %%mm1, " #b " \n\t"\
1680
1681DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2))
1682DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) )
1683DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2))
1684DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1685
1686 "movq %%mm0, (%2) \n\t"
1687 : : "r" (src), "r" (stride), "r"(tmp)
1688 : "%eax", "%edx"
1689 );
1690#else
1691 int x;
1692 src+= stride*4;
1693 for(x=0; x<8; x++)
1694 {
1695 int t1= tmp[x];
1696 int t2= src[stride*1];
1697
134eb1e5 1698 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
9c9e467d 1699 t1= src[stride*4];
134eb1e5 1700 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
9c9e467d 1701 t2= src[stride*6];
134eb1e5 1702 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
9c9e467d 1703 t1= src[stride*8];
134eb1e5 1704 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
9c9e467d
MN
1705 tmp[x]= t1;
1706
1707 src++;
1708 }
1709#endif
1710}
1711
1712/**
134eb1e5
MN
1713 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1714 * will be called for every 8x8 block and can read & write from line 4-15
1715 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1716 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1717 * this filter will read lines 4-13 and write 4-11
1718 */
1719static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1720{
1721#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1722 src+= stride*4;
1723 asm volatile(
1724 "leal (%0, %1), %%eax \n\t"
1725 "leal (%%eax, %1, 4), %%edx \n\t"
1726 "pxor %%mm7, %%mm7 \n\t"
1727 "movq (%2), %%mm0 \n\t"
1728 "movq (%3), %%mm1 \n\t"
1729// 0 1 2 3 4 5 6 7 8 9 10
1730// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1731
1732#define DEINT_L5(t1,t2,a,b,c)\
1733 "movq " #a ", %%mm2 \n\t"\
1734 "movq " #b ", %%mm3 \n\t"\
1735 "movq " #c ", %%mm4 \n\t"\
1736 PAVGB(t2, %%mm3) \
1737 PAVGB(t1, %%mm4) \
1738 "movq %%mm2, %%mm5 \n\t"\
1739 "movq %%mm2, " #t1 " \n\t"\
1740 "punpcklbw %%mm7, %%mm2 \n\t"\
1741 "punpckhbw %%mm7, %%mm5 \n\t"\
1742 "movq %%mm2, %%mm6 \n\t"\
1743 "paddw %%mm2, %%mm2 \n\t"\
1744 "paddw %%mm6, %%mm2 \n\t"\
1745 "movq %%mm5, %%mm6 \n\t"\
1746 "paddw %%mm5, %%mm5 \n\t"\
1747 "paddw %%mm6, %%mm5 \n\t"\
1748 "movq %%mm3, %%mm6 \n\t"\
1749 "punpcklbw %%mm7, %%mm3 \n\t"\
1750 "punpckhbw %%mm7, %%mm6 \n\t"\
1751 "paddw %%mm3, %%mm3 \n\t"\
1752 "paddw %%mm6, %%mm6 \n\t"\
1753 "paddw %%mm3, %%mm2 \n\t"\
1754 "paddw %%mm6, %%mm5 \n\t"\
1755 "movq %%mm4, %%mm6 \n\t"\
1756 "punpcklbw %%mm7, %%mm4 \n\t"\
1757 "punpckhbw %%mm7, %%mm6 \n\t"\
1758 "psubw %%mm4, %%mm2 \n\t"\
1759 "psubw %%mm6, %%mm5 \n\t"\
1760 "psraw $2, %%mm2 \n\t"\
1761 "psraw $2, %%mm5 \n\t"\
1762 "packuswb %%mm5, %%mm2 \n\t"\
1763 "movq %%mm2, " #a " \n\t"\
1764
1765DEINT_L5(%%mm0, %%mm1, (%0) , (%%eax) , (%%eax, %1) )
1766DEINT_L5(%%mm1, %%mm0, (%%eax) , (%%eax, %1) , (%%eax, %1, 2))
1767DEINT_L5(%%mm0, %%mm1, (%%eax, %1) , (%%eax, %1, 2), (%0, %1, 4) )
1768DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4) , (%%edx) )
1769DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%edx) , (%%edx, %1) )
1770DEINT_L5(%%mm1, %%mm0, (%%edx) , (%%edx, %1) , (%%edx, %1, 2))
1771DEINT_L5(%%mm0, %%mm1, (%%edx, %1) , (%%edx, %1, 2), (%0, %1, 8) )
1772DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8) , (%%edx, %1, 4))
1773
1774 "movq %%mm0, (%2) \n\t"
1775 "movq %%mm1, (%3) \n\t"
1776 : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2)
1777 : "%eax", "%edx"
1778 );
1779#else
1780 int x;
1781 src+= stride*4;
1782 for(x=0; x<8; x++)
1783 {
1784 int t1= tmp[x];
1785 int t2= tmp2[x];
1786 int t3= src[0];
1787
1788 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1789 t1= src[stride*1];
1790 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1791 t2= src[stride*2];
1792 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1793 t3= src[stride*3];
1794 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1795 t1= src[stride*4];
1796 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1797 t2= src[stride*5];
1798 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1799 t3= src[stride*6];
1800 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1801 t1= src[stride*7];
1802 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1803
1804 tmp[x]= t3;
1805 tmp2[x]= t1;
1806
1807 src++;
1808 }
1809#endif
1810}
1811
1812/**
b304569a 1813 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
9c9e467d
MN
1814 * will be called for every 8x8 block and can read & write from line 4-15
1815 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1816 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
7fb36f6c 1817 * this filter will read lines 4-13 and write 4-11
3b58b885 1818 */
13ba9ae4 1819static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
3b58b885
MN
1820{
1821#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
7fb36f6c 1822 src+= 4*stride;
3b58b885
MN
1823 asm volatile(
1824 "leal (%0, %1), %%eax \n\t"
9c9e467d 1825 "leal (%%eax, %1, 4), %%edx \n\t"
3b58b885 1826// 0 1 2 3 4 5 6 7 8 9
9c9e467d 1827// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
3b58b885 1828
13ba9ae4
MN
1829 "movq (%2), %%mm0 \n\t" // L0
1830 "movq (%%eax), %%mm1 \n\t" // L2
3b58b885 1831 PAVGB(%%mm1, %%mm0) // L0+L2
13ba9ae4 1832 "movq (%0), %%mm2 \n\t" // L1
3b58b885
MN
1833 PAVGB(%%mm2, %%mm0)
1834 "movq %%mm0, (%0) \n\t"
13ba9ae4 1835 "movq (%%eax, %1), %%mm0 \n\t" // L3
3b58b885
MN
1836 PAVGB(%%mm0, %%mm2) // L1+L3
1837 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1838 "movq %%mm2, (%%eax) \n\t"
13ba9ae4 1839 "movq (%%eax, %1, 2), %%mm2 \n\t" // L4
3b58b885
MN
1840 PAVGB(%%mm2, %%mm1) // L2+L4
1841 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1842 "movq %%mm1, (%%eax, %1) \n\t"
13ba9ae4 1843 "movq (%0, %1, 4), %%mm1 \n\t" // L5
3b58b885
MN
1844 PAVGB(%%mm1, %%mm0) // L3+L5
1845 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1846 "movq %%mm0, (%%eax, %1, 2) \n\t"
13ba9ae4 1847 "movq (%%edx), %%mm0 \n\t" // L6
3b58b885
MN
1848 PAVGB(%%mm0, %%mm2) // L4+L6
1849 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1850 "movq %%mm2, (%0, %1, 4) \n\t"
13ba9ae4 1851 "movq (%%edx, %1), %%mm2 \n\t" // L7
3b58b885
MN
1852 PAVGB(%%mm2, %%mm1) // L5+L7
1853 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
9c9e467d 1854 "movq %%mm1, (%%edx) \n\t"
13ba9ae4 1855 "movq (%%edx, %1, 2), %%mm1 \n\t" // L8
3b58b885
MN
1856 PAVGB(%%mm1, %%mm0) // L6+L8
1857 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
9c9e467d 1858 "movq %%mm0, (%%edx, %1) \n\t"
13ba9ae4 1859 "movq (%0, %1, 8), %%mm0 \n\t" // L9
3b58b885
MN
1860 PAVGB(%%mm0, %%mm2) // L7+L9
1861 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
9c9e467d 1862 "movq %%mm2, (%%edx, %1, 2) \n\t"
13ba9ae4 1863 "movq %%mm1, (%2) \n\t"
3b58b885 1864
13ba9ae4 1865 : : "r" (src), "r" (stride), "r" (tmp)
9c9e467d 1866 : "%eax", "%edx"
3b58b885
MN
1867 );
1868#else
99d33fa3 1869 int a, b, c, x;
7fb36f6c 1870 src+= 4*stride;
99d33fa3
MN
1871
1872 for(x=0; x<2; x++){
13ba9ae4
MN
1873 a= *(uint32_t*)&tmp[stride*0];
1874 b= *(uint32_t*)&src[stride*0];
1875 c= *(uint32_t*)&src[stride*1];
99d33fa3
MN
1876 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1877 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1878
13ba9ae4 1879 a= *(uint32_t*)&src[stride*2];
99d33fa3
MN
1880 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1881 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1882
13ba9ae4 1883 b= *(uint32_t*)&src[stride*3];
99d33fa3
MN
1884 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1885 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1886
13ba9ae4 1887 c= *(uint32_t*)&src[stride*4];
99d33fa3
MN
1888 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1889 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1890
13ba9ae4 1891 a= *(uint32_t*)&src[stride*5];
99d33fa3
MN
1892 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1893 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1894
13ba9ae4 1895 b= *(uint32_t*)&src[stride*6];
99d33fa3
MN
1896 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1897 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1898
13ba9ae4 1899 c= *(uint32_t*)&src[stride*7];
99d33fa3
MN
1900 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1901 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1902
13ba9ae4 1903 a= *(uint32_t*)&src[stride*8];
99d33fa3
MN
1904 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1905 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1906
13ba9ae4 1907 *(uint32_t*)&tmp[stride*0]= c;
99d33fa3 1908 src += 4;
13ba9ae4 1909 tmp += 4;
3b58b885
MN
1910 }
1911#endif
1912}
1913
1914/**
b304569a 1915 * Deinterlaces the given block by applying a median filter to every second line.
7fb36f6c
MN
1916 * will be called for every 8x8 block and can read & write from line 4-15,
1917 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1918 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
3b58b885 1919 */
cc9b0679 1920static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
3b58b885 1921{
a6be8111 1922#ifdef HAVE_MMX
7fb36f6c 1923 src+= 4*stride;
a6be8111 1924#ifdef HAVE_MMX2
3b58b885
MN
1925 asm volatile(
1926 "leal (%0, %1), %%eax \n\t"
9c9e467d 1927 "leal (%%eax, %1, 4), %%edx \n\t"
3b58b885 1928// 0 1 2 3 4 5 6 7 8 9
9c9e467d 1929// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
3b58b885
MN
1930
1931 "movq (%0), %%mm0 \n\t" //
1932 "movq (%%eax, %1), %%mm2 \n\t" //
1933 "movq (%%eax), %%mm1 \n\t" //
1934 "movq %%mm0, %%mm3 \n\t"
1935 "pmaxub %%mm1, %%mm0 \n\t" //
1936 "pminub %%mm3, %%mm1 \n\t" //
1937 "pmaxub %%mm2, %%mm1 \n\t" //
1938 "pminub %%mm1, %%mm0 \n\t"
1939 "movq %%mm0, (%%eax) \n\t"
1940
1941 "movq (%0, %1, 4), %%mm0 \n\t" //
1942 "movq (%%eax, %1, 2), %%mm1 \n\t" //
1943 "movq %%mm2, %%mm3 \n\t"
1944 "pmaxub %%mm1, %%mm2 \n\t" //
1945 "pminub %%mm3, %%mm1 \n\t" //
1946 "pmaxub %%mm0, %%mm1 \n\t" //
1947 "pminub %%mm1, %%mm2 \n\t"
1948 "movq %%mm2, (%%eax, %1, 2) \n\t"
1949
9c9e467d
MN
1950 "movq (%%edx), %%mm2 \n\t" //
1951 "movq (%%edx, %1), %%mm1 \n\t" //
3b58b885
MN
1952 "movq %%mm2, %%mm3 \n\t"
1953 "pmaxub %%mm0, %%mm2 \n\t" //
1954 "pminub %%mm3, %%mm0 \n\t" //
1955 "pmaxub %%mm1, %%mm0 \n\t" //
1956 "pminub %%mm0, %%mm2 \n\t"
9c9e467d 1957 "movq %%mm2, (%%edx) \n\t"
3b58b885 1958
9c9e467d 1959 "movq (%%edx, %1, 2), %%mm2 \n\t" //
3b58b885
MN
1960 "movq (%0, %1, 8), %%mm0 \n\t" //
1961 "movq %%mm2, %%mm3 \n\t"
1962 "pmaxub %%mm0, %%mm2 \n\t" //
1963 "pminub %%mm3, %%mm0 \n\t" //
1964 "pmaxub %%mm1, %%mm0 \n\t" //
1965 "pminub %%mm0, %%mm2 \n\t"
9c9e467d 1966 "movq %%mm2, (%%edx, %1, 2) \n\t"
3b58b885
MN
1967
1968
1969 : : "r" (src), "r" (stride)
9c9e467d 1970 : "%eax", "%edx"
3b58b885 1971 );
a6be8111
MN
1972
1973#else // MMX without MMX2
1974 asm volatile(
1975 "leal (%0, %1), %%eax \n\t"
9c9e467d 1976 "leal (%%eax, %1, 4), %%edx \n\t"
a6be8111 1977// 0 1 2 3 4 5 6 7 8 9
9c9e467d 1978// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
a6be8111
MN
1979 "pxor %%mm7, %%mm7 \n\t"
1980
1981#define MEDIAN(a,b,c)\
1982 "movq " #a ", %%mm0 \n\t"\
1983 "movq " #b ", %%mm2 \n\t"\
1984 "movq " #c ", %%mm1 \n\t"\
1985 "movq %%mm0, %%mm3 \n\t"\
1986 "movq %%mm1, %%mm4 \n\t"\
1987 "movq %%mm2, %%mm5 \n\t"\
1988 "psubusb %%mm1, %%mm3 \n\t"\
1989 "psubusb %%mm2, %%mm4 \n\t"\
1990 "psubusb %%mm0, %%mm5 \n\t"\
1991 "pcmpeqb %%mm7, %%mm3 \n\t"\
1992 "pcmpeqb %%mm7, %%mm4 \n\t"\
1993 "pcmpeqb %%mm7, %%mm5 \n\t"\
1994 "movq %%mm3, %%mm6 \n\t"\
1995 "pxor %%mm4, %%mm3 \n\t"\
1996 "pxor %%mm5, %%mm4 \n\t"\
1997 "pxor %%mm6, %%mm5 \n\t"\
1998 "por %%mm3, %%mm1 \n\t"\
1999 "por %%mm4, %%mm2 \n\t"\
2000 "por %%mm5, %%mm0 \n\t"\
2001 "pand %%mm2, %%mm0 \n\t"\
2002 "pand %%mm1, %%mm0 \n\t"\
2003 "movq %%mm0, " #b " \n\t"
2004
2005MEDIAN((%0), (%%eax), (%%eax, %1))
2006MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
9c9e467d
MN
2007MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
2008MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
a6be8111
MN
2009
2010 : : "r" (src), "r" (stride)
9c9e467d 2011 : "%eax", "%edx"
a6be8111
MN
2012 );
2013#endif // MMX
3b58b885 2014#else
9b1663fc 2015 int x, y;
7fb36f6c 2016 src+= 4*stride;
9b1663fc 2017 // FIXME - there should be a way to do a few columns in parallel like w/mmx
3b58b885
MN
2018 for(x=0; x<8; x++)
2019 {
9b1663fc
RFI
2020 uint8_t *colsrc = src;
2021 for (y=0; y<4; y++)
2022 {
2023 int a, b, c, d, e, f;
2024 a = colsrc[0 ];
2025 b = colsrc[stride ];
2026 c = colsrc[stride*2];
2027 d = (a-b)>>31;
2028 e = (b-c)>>31;
2029 f = (c-a)>>31;
2030 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2031 colsrc += stride*2;
2032 }
3b58b885
MN
2033 src++;
2034 }
2035#endif
2036}
2037
e5c30e06 2038#ifdef HAVE_MMX
4e4dcbc5
MN
2039/**
2040 * transposes and shift the given 8x8 Block into dst1 and dst2
2041 */
cc9b0679 2042static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
4e4dcbc5
MN
2043{
2044 asm(
2045 "leal (%0, %1), %%eax \n\t"
4e4dcbc5 2046// 0 1 2 3 4 5 6 7 8 9
9c9e467d 2047// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
4e4dcbc5
MN
2048 "movq (%0), %%mm0 \n\t" // 12345678
2049 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2050 "movq %%mm0, %%mm2 \n\t" // 12345678
2051 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2052 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2053
2054 "movq (%%eax, %1), %%mm1 \n\t"
2055 "movq (%%eax, %1, 2), %%mm3 \n\t"
2056 "movq %%mm1, %%mm4 \n\t"
2057 "punpcklbw %%mm3, %%mm1 \n\t"
2058 "punpckhbw %%mm3, %%mm4 \n\t"
2059
2060 "movq %%mm0, %%mm3 \n\t"
2061 "punpcklwd %%mm1, %%mm0 \n\t"
2062 "punpckhwd %%mm1, %%mm3 \n\t"
2063 "movq %%mm2, %%mm1 \n\t"
2064 "punpcklwd %%mm4, %%mm2 \n\t"
2065 "punpckhwd %%mm4, %%mm1 \n\t"
2066
2067 "movd %%mm0, 128(%2) \n\t"
2068 "psrlq $32, %%mm0 \n\t"
2069 "movd %%mm0, 144(%2) \n\t"
2070 "movd %%mm3, 160(%2) \n\t"
2071 "psrlq $32, %%mm3 \n\t"
2072 "movd %%mm3, 176(%2) \n\t"
2073 "movd %%mm3, 48(%3) \n\t"
2074 "movd %%mm2, 192(%2) \n\t"
2075 "movd %%mm2, 64(%3) \n\t"
2076 "psrlq $32, %%mm2 \n\t"
2077 "movd %%mm2, 80(%3) \n\t"
2078 "movd %%mm1, 96(%3) \n\t"
2079 "psrlq $32, %%mm1 \n\t"
2080 "movd %%mm1, 112(%3) \n\t"
2081
abd140db
MN
2082 "leal (%%eax, %1, 4), %%eax \n\t"
2083
4e4dcbc5 2084 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
abd140db 2085 "movq (%%eax), %%mm1 \n\t" // abcdefgh
4e4dcbc5
MN
2086 "movq %%mm0, %%mm2 \n\t" // 12345678
2087 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2088 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2089
abd140db
MN
2090 "movq (%%eax, %1), %%mm1 \n\t"
2091 "movq (%%eax, %1, 2), %%mm3 \n\t"
4e4dcbc5
MN
2092 "movq %%mm1, %%mm4 \n\t"
2093 "punpcklbw %%mm3, %%mm1 \n\t"
2094 "punpckhbw %%mm3, %%mm4 \n\t"
2095
2096 "movq %%mm0, %%mm3 \n\t"
2097 "punpcklwd %%mm1, %%mm0 \n\t"
2098 "punpckhwd %%mm1, %%mm3 \n\t"
2099 "movq %%mm2, %%mm1 \n\t"
2100 "punpcklwd %%mm4, %%mm2 \n\t"
2101 "punpckhwd %%mm4, %%mm1 \n\t"
2102
2103 "movd %%mm0, 132(%2) \n\t"
2104 "psrlq $32, %%mm0 \n\t"
2105 "movd %%mm0, 148(%2) \n\t"
2106 "movd %%mm3, 164(%2) \n\t"
2107 "psrlq $32, %%mm3 \n\t"
2108 "movd %%mm3, 180(%2) \n\t"
2109 "movd %%mm3, 52(%3) \n\t"
2110 "movd %%mm2, 196(%2) \n\t"
2111 "movd %%mm2, 68(%3) \n\t"
2112 "psrlq $32, %%mm2 \n\t"
2113 "movd %%mm2, 84(%3) \n\t"
2114 "movd %%mm1, 100(%3) \n\t"
2115 "psrlq $32, %%mm1 \n\t"
2116 "movd %%mm1, 116(%3) \n\t"
2117
2118
2119 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
abd140db 2120 : "%eax"
4e4dcbc5
MN
2121 );
2122}
2123
2124/**
2125 * transposes the given 8x8 block
2126 */
cc9b0679 2127static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
4e4dcbc5
MN
2128{
2129 asm(
2130 "leal (%0, %1), %%eax \n\t"
9c9e467d 2131 "leal (%%eax, %1, 4), %%edx \n\t"
4e4dcbc5 2132// 0 1 2 3 4 5 6 7 8 9
9c9e467d 2133// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
4e4dcbc5
MN
2134 "movq (%2), %%mm0 \n\t" // 12345678
2135 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2136 "movq %%mm0, %%mm2 \n\t" // 12345678
2137 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2138 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2139
2140 "movq 32(%2), %%mm1 \n\t"
2141 "movq 48(%2), %%mm3 \n\t"
2142 "movq %%mm1, %%mm4 \n\t"
2143 "punpcklbw %%mm3, %%mm1 \n\t"
2144 "punpckhbw %%mm3, %%mm4 \n\t"
2145
2146 "movq %%mm0, %%mm3 \n\t"
2147 "punpcklwd %%mm1, %%mm0 \n\t"
2148 "punpckhwd %%mm1, %%mm3 \n\t"
2149 "movq %%mm2, %%mm1 \n\t"
2150 "punpcklwd %%mm4, %%mm2 \n\t"
2151 "punpckhwd %%mm4, %%mm1 \n\t"
2152
2153 "movd %%mm0, (%0) \n\t"
2154 "psrlq $32, %%mm0 \n\t"
2155 "movd %%mm0, (%%eax) \n\t"
2156 "movd %%mm3, (%%eax, %1) \n\t"
2157 "psrlq $32, %%mm3 \n\t"
2158 "movd %%mm3, (%%eax, %1, 2) \n\t"
2159 "movd %%mm2, (%0, %1, 4) \n\t"
2160 "psrlq $32, %%mm2 \n\t"
9c9e467d
MN
2161 "movd %%mm2, (%%edx) \n\t"
2162 "movd %%mm1, (%%edx, %1) \n\t"
4e4dcbc5 2163 "psrlq $32, %%mm1 \n\t"
9c9e467d 2164 "movd %%mm1, (%%edx, %1, 2) \n\t"
4e4dcbc5
MN
2165
2166
2167 "movq 64(%2), %%mm0 \n\t" // 12345678
2168 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2169 "movq %%mm0, %%mm2 \n\t" // 12345678
2170 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2171 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2172
2173 "movq 96(%2), %%mm1 \n\t"
2174 "movq 112(%2), %%mm3 \n\t"
2175 "movq %%mm1, %%mm4 \n\t"
2176 "punpcklbw %%mm3, %%mm1 \n\t"
2177 "punpckhbw %%mm3, %%mm4 \n\t"
2178
2179 "movq %%mm0, %%mm3 \n\t"
2180 "punpcklwd %%mm1, %%mm0 \n\t"
2181 "punpckhwd %%mm1, %%mm3 \n\t"
2182 "movq %%mm2, %%mm1 \n\t"
2183 "punpcklwd %%mm4, %%mm2 \n\t"
2184 "punpckhwd %%mm4, %%mm1 \n\t"
2185
2186 "movd %%mm0, 4(%0) \n\t"
2187 "psrlq $32, %%mm0 \n\t"
2188 "movd %%mm0, 4(%%eax) \n\t"
2189 "movd %%mm3, 4(%%eax, %1) \n\t"
2190 "psrlq $32, %%mm3 \n\t"
2191 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2192 "movd %%mm2, 4(%0, %1, 4) \n\t"
2193 "psrlq $32, %%mm2 \n\t"
9c9e467d
MN
2194 "movd %%mm2, 4(%%edx) \n\t"
2195 "movd %%mm1, 4(%%edx, %1) \n\t"
4e4dcbc5 2196 "psrlq $32, %%mm1 \n\t"
9c9e467d 2197 "movd %%mm1, 4(%%edx, %1, 2) \n\t"
4e4dcbc5
MN
2198
2199 :: "r" (dst), "r" (dstStride), "r" (src)
9c9e467d 2200 : "%eax", "%edx"
4e4dcbc5
MN
2201 );
2202}
e5c30e06 2203#endif
be44a4d7 2204//static int test=0;
4e4dcbc5 2205
a7b2871c 2206#ifndef HAVE_ALTIVEC
a2596758 2207static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
a9c77978 2208 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
117e45b0 2209{
9c9e467d
MN
2210 // to save a register (FIXME do this outside of the loops)
2211 tempBluredPast[127]= maxNoise[0];
2212 tempBluredPast[128]= maxNoise[1];
2213 tempBluredPast[129]= maxNoise[2];
2214
be44a4d7
MN
2215#define FAST_L2_DIFF
2216//#define L1_DIFF //u should change the thresholds too if u try that one
2217#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2218 asm volatile(
2219 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
9c9e467d
MN
2220 "leal (%2, %2, 4), %%edx \n\t" // 5*stride
2221 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
be44a4d7 2222// 0 1 2 3 4 5 6 7 8 9
9c9e467d 2223// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
be44a4d7
MN
2224//FIXME reorder?
2225#ifdef L1_DIFF //needs mmx2
2226 "movq (%0), %%mm0 \n\t" // L0
2227 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2228 "movq (%0, %2), %%mm1 \n\t" // L1
2229 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2230 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2231 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2232 "movq (%0, %%eax), %%mm3 \n\t" // L3
2233 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
2234
2235 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2236 "paddw %%mm1, %%mm0 \n\t"
2237 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
9c9e467d 2238 "movq (%0, %%edx), %%mm5 \n\t" // L5
be44a4d7 2239 "paddw %%mm2, %%mm0 \n\t"
9c9e467d 2240 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5|
be44a4d7
MN
2241 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2242 "paddw %%mm3, %%mm0 \n\t"
2243 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
2244 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2245 "paddw %%mm4, %%mm0 \n\t"
2246 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
2247 "paddw %%mm5, %%mm6 \n\t"
2248 "paddw %%mm7, %%mm6 \n\t"
2249 "paddw %%mm6, %%mm0 \n\t"
2250#elif defined (FAST_L2_DIFF)
2251 "pcmpeqb %%mm7, %%mm7 \n\t"
9b464428 2252 "movq "MANGLE(b80)", %%mm6 \n\t"
be44a4d7
MN
2253 "pxor %%mm0, %%mm0 \n\t"
2254#define L2_DIFF_CORE(a, b)\
2255 "movq " #a ", %%mm5 \n\t"\
2256 "movq " #b ", %%mm2 \n\t"\
2257 "pxor %%mm7, %%mm2 \n\t"\
2258 PAVGB(%%mm2, %%mm5)\
2259 "paddb %%mm6, %%mm5 \n\t"\
2260 "movq %%mm5, %%mm2 \n\t"\
2261 "psllw $8, %%mm5 \n\t"\
2262 "pmaddwd %%mm5, %%mm5 \n\t"\
2263 "pmaddwd %%mm2, %%mm2 \n\t"\
2264 "paddd %%mm2, %%mm5 \n\t"\
2265 "psrld $14, %%mm5 \n\t"\
2266 "paddd %%mm5, %%mm0 \n\t"
2267
2268L2_DIFF_CORE((%0), (%1))
2269L2_DIFF_CORE((%0, %2), (%1, %2))
2270L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2271L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2272L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
9c9e467d 2273L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
be44a4d7
MN
2274L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2275L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2276
2277#else
2278 "pxor %%mm7, %%mm7 \n\t"
2279 "pxor %%mm0, %%mm0 \n\t"
2280#define L2_DIFF_CORE(a, b)\
2281 "movq " #a ", %%mm5 \n\t"\
2282 "movq " #b ", %%mm2 \n\t"\
2283 "movq %%mm5, %%mm1 \n\t"\
2284 "movq %%mm2, %%mm3 \n\t"\
2285 "punpcklbw %%mm7, %%mm5 \n\t"\
2286 "punpckhbw %%mm7, %%mm1 \n\t"\
2287 "punpcklbw %%mm7, %%mm2 \n\t"\
2288 "punpckhbw %%mm7, %%mm3 \n\t"\
2289 "psubw %%mm2, %%mm5 \n\t"\
2290 "psubw %%mm3, %%mm1 \n\t"\
2291 "pmaddwd %%mm5, %%mm5 \n\t"\
2292 "pmaddwd %%mm1, %%mm1 \n\t"\
2293 "paddd %%mm1, %%mm5 \n\t"\
2294 "paddd %%mm5, %%mm0 \n\t"
2295
2296L2_DIFF_CORE((%0), (%1))
2297L2_DIFF_CORE((%0, %2), (%1, %2))
2298L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2299L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2300L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
9c9e467d 2301L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
be44a4d7
MN
2302L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2303L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2304
2305#endif
2306
2307 "movq %%mm0, %%mm4 \n\t"
2308 "psrlq $32, %%mm0 \n\t"
2309 "paddd %%mm0, %%mm4 \n\t"
2310 "movd %%mm4, %%ecx \n\t"
a9c77978 2311 "shll $2, %%ecx \n\t"
9c9e467d
MN
2312 "movl %3, %%edx \n\t"
2313 "addl -4(%%edx), %%ecx \n\t"
2314 "addl 4(%%edx), %%ecx \n\t"
2315 "addl -1024(%%edx), %%ecx \n\t"
a9c77978 2316 "addl $4, %%ecx \n\t"
9c9e467d 2317 "addl 1024(%%edx), %%ecx \n\t"
a9c77978 2318 "shrl $3, %%ecx \n\t"
9c9e467d 2319 "movl %%ecx, (%%edx) \n\t"
a9c77978 2320
9b464428 2321// "movl %3, %%ecx \n\t"
be44a4d7
MN
2322// "movl %%ecx, test \n\t"
2323// "jmp 4f \n\t"
9c9e467d 2324 "cmpl 512(%%edx), %%ecx \n\t"
be44a4d7 2325 " jb 2f \n\t"
9c9e467d 2326 "cmpl 516(%%edx), %%ecx \n\t"
be44a4d7
MN
2327 " jb 1f \n\t"
2328
9c9e467d
MN
2329 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2330 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
be44a4d7
MN
2331 "movq (%0), %%mm0 \n\t" // L0
2332 "movq (%0, %2), %%mm1 \n\t" // L1
2333 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2334 "movq (%0, %%eax), %%mm3 \n\t" // L3
2335 "movq (%0, %2, 4), %%mm4 \n\t" // L4
9c9e467d 2336 "movq (%0, %%edx), %%mm5 \n\t" // L5
be44a4d7
MN
2337 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2338 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2339 "movq %%mm0, (%1) \n\t" // L0
2340 "movq %%mm1, (%1, %2) \n\t" // L1
2341 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2342 "movq %%mm3, (%1, %%eax) \n\t" // L3
2343 "movq %%mm4, (%1, %2, 4) \n\t" // L4
9c9e467d 2344 "movq %%mm5, (%1, %%edx) \n\t" // L5
be44a4d7
MN
2345 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2346 "movq %%mm7, (%1, %%ecx) \n\t" // L7
2347 "jmp 4f \n\t"
2348
2349 "1: \n\t"
9c9e467d
MN
2350 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2351 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
be44a4d7 2352 "movq (%0), %%mm0 \n\t" // L0
413dec62 2353 PAVGB((%1), %%mm0) // L0
be44a4d7 2354 "movq (%0, %2), %%mm1 \n\t" // L1
413dec62 2355 PAVGB((%1, %2), %%mm1) // L1
be44a4d7 2356 "movq (%0, %2, 2), %%mm2 \n\t" // L2
413dec62 2357 PAVGB((%1, %2, 2), %%mm2) // L2
be44a4d7 2358 "movq (%0, %%eax), %%mm3 \n\t" // L3
413dec62 2359 PAVGB((%1, %%eax), %%mm3) // L3
be44a4d7 2360 "movq (%0, %2, 4), %%mm4 \n\t" // L4
413dec62 2361 PAVGB((%1, %2, 4), %%mm4) // L4
9c9e467d
MN
2362 "movq (%0, %%edx), %%mm5 \n\t" // L5
2363 PAVGB((%1, %%edx), %%mm5) // L5
be44a4d7 2364 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
413dec62 2365 PAVGB((%1, %%eax, 2), %%mm6) // L6
be44a4d7 2366 "movq (%0, %%ecx), %%mm7 \n\t" // L7
413dec62 2367 PAVGB((%1, %%ecx), %%mm7) // L7
be44a4d7
MN
2368 "movq %%mm0, (%1) \n\t" // R0
2369 "movq %%mm1, (%1, %2) \n\t" // R1
2370 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2371 "movq %%mm3, (%1, %%eax) \n\t" // R3
2372 "movq %%mm4, (%1, %2, 4) \n\t" // R4
9c9e467d 2373 "movq %%mm5, (%1, %%edx) \n\t" // R5
be44a4d7
MN
2374 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2375 "movq %%mm7, (%1, %%ecx) \n\t" // R7
2376 "movq %%mm0, (%0) \n\t" // L0
2377 "movq %%mm1, (%0, %2) \n\t" // L1
2378 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2379 "movq %%mm3, (%0, %%eax) \n\t" // L3
2380 "movq %%mm4, (%0, %2, 4) \n\t" // L4
9c9e467d 2381 "movq %%mm5, (%0, %%edx) \n\t" // L5
be44a4d7
MN
2382 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2383 "movq %%mm7, (%0, %%ecx) \n\t" // L7
2384 "jmp 4f \n\t"
2385
2386 "2: \n\t"
9c9e467d 2387 "cmpl 508(%%edx), %%ecx \n\t"
be44a4d7
MN
2388 " jb 3f \n\t"
2389
9c9e467d
MN
2390 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2391 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
be44a4d7
MN
2392 "movq (%0), %%mm0 \n\t" // L0
2393 "movq (%0, %2), %%mm1 \n\t" // L1
2394 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2395 "movq (%0, %%eax), %%mm3 \n\t" // L3
2396 "movq (%1), %%mm4 \n\t" // R0
2397 "movq (%1, %2), %%mm5 \n\t" // R1
2398 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2399 "movq (%1, %%eax), %%mm7 \n\t" // R3
2400 PAVGB(%%mm4, %%mm0)
2401 PAVGB(%%mm5, %%mm1)
2402 PAVGB(%%mm6, %%mm2)
2403 PAVGB(%%mm7, %%mm3)
2404 PAVGB(%%mm4, %%mm0)
2405 PAVGB(%%mm5, %%mm1)
2406 PAVGB(%%mm6, %%mm2)
2407 PAVGB(%%mm7, %%mm3)
2408 "movq %%mm0, (%1) \n\t" // R0
2409 "movq %%mm1, (%1, %2) \n\t" // R1
2410 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2411 "movq %%mm3, (%1, %%eax) \n\t" // R3
2412 "movq %%mm0, (%0) \n\t" // L0
2413 "movq %%mm1, (%0, %2) \n\t" // L1
2414 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2415 "movq %%mm3, (%0, %%eax) \n\t" // L3
2416
2417 "movq (%0, %2, 4), %%mm0 \n\t" // L4
9c9e467d 2418 "movq (%0, %%edx), %%mm1 \n\t" // L5
be44a4d7
MN
2419 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2420 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2421 "movq (%1, %2, 4), %%mm4 \n\t" // R4
9c9e467d 2422 "movq (%1, %%edx), %%mm5 \n\t" // R5
be44a4d7
MN
2423 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2424 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2425 PAVGB(%%mm4, %%mm0)
2426 PAVGB(%%mm5, %%mm1)
2427 PAVGB(%%mm6, %%mm2)
2428 PAVGB(%%mm7, %%mm3)
2429 PAVGB(%%mm4, %%mm0)
2430 PAVGB(%%mm5, %%mm1)
2431 PAVGB(%%mm6, %%mm2)
2432 PAVGB(%%mm7, %%mm3)
2433 "movq %%mm0, (%1, %2, 4) \n\t" // R4
9c9e467d 2434 "movq %%mm1, (%1, %%edx) \n\t" // R5
be44a4d7
MN
2435 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2436 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2437 "movq %%mm0, (%0, %2, 4) \n\t" // L4
9c9e467d 2438 "movq %%mm1, (%0, %%edx) \n\t" // L5
be44a4d7
MN
2439 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2440 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2441 "jmp 4f \n\t"
2442
2443 "3: \n\t"
9c9e467d
MN
2444 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2445 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
be44a4d7
MN
2446 "movq (%0), %%mm0 \n\t" // L0
2447 "movq (%0, %2), %%mm1 \n\t" // L1
2448 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2449 "movq (%0, %%eax), %%mm3 \n\t" // L3
2450 "movq (%1), %%mm4 \n\t" // R0
2451 "movq (%1, %2), %%mm5 \n\t" // R1
2452 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2453 "movq (%1, %%eax), %%mm7 \n\t" // R3
2454 PAVGB(%%mm4, %%mm0)
2455 PAVGB(%%mm5, %%mm1)
2456 PAVGB(%%mm6, %%mm2)
2457 PAVGB(%%mm7, %%mm3)
2458 PAVGB(%%mm4, %%mm0)
2459 PAVGB(%%mm5, %%mm1)
2460 PAVGB(%%mm6, %%mm2)
2461 PAVGB(%%mm7, %%mm3)
2462 PAVGB(%%mm4, %%mm0)
2463 PAVGB(%%mm5, %%mm1)
2464 PAVGB(%%mm6, %%mm2)
2465 PAVGB(%%mm7, %%mm3)
2466 "movq %%mm0, (%1) \n\t" // R0
2467 "movq %%mm1, (%1, %2) \n\t" // R1
2468 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2469 "movq %%mm3, (%1, %%eax) \n\t" // R3
2470 "movq %%mm0, (%0) \n\t" // L0
2471 "movq %%mm1, (%0, %2) \n\t" // L1
2472 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2473 "movq %%mm3, (%0, %%eax) \n\t" // L3
2474
2475 "movq (%0, %2, 4), %%mm0 \n\t" // L4
9c9e467d 2476 "movq (%0, %%edx), %%mm1 \n\t" // L5
be44a4d7
MN
2477 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2478 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2479 "movq (%1, %2, 4), %%mm4 \n\t" // R4
9c9e467d 2480 "movq (%1, %%edx), %%mm5 \n\t" // R5
be44a4d7
MN
2481 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2482 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2483 PAVGB(%%mm4, %%mm0)
2484 PAVGB(%%mm5, %%mm1)
2485 PAVGB(%%mm6, %%mm2)
2486 PAVGB(%%mm7, %%mm3)
2487 PAVGB(%%mm4, %%mm0)
2488 PAVGB(%%mm5, %%mm1)
2489 PAVGB(%%mm6, %%mm2)
2490 PAVGB(%%mm7, %%mm3)
2491 PAVGB(%%mm4, %%mm0)
2492 PAVGB(%%mm5, %%mm1)
2493 PAVGB(%%mm6, %%mm2)
2494 PAVGB(%%mm7, %%mm3)
2495 "movq %%mm0, (%1, %2, 4) \n\t" // R4
9c9e467d 2496 "movq %%mm1, (%1, %%edx) \n\t" // R5
be44a4d7
MN
2497 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2498 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2499 "movq %%mm0, (%0, %2, 4) \n\t" // L4
9c9e467d 2500 "movq %%mm1, (%0, %%edx) \n\t" // L5
be44a4d7
MN
2501 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2502 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2503
2504 "4: \n\t"
2505
a9c77978 2506 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
9c9e467d 2507 : "%eax", "%edx", "%ecx", "memory"
be44a4d7
MN
2508 );
2509//printf("%d\n", test);
2510#else
5936be4c 2511{
117e45b0
MN
2512 int y;
2513 int d=0;
a7b2871c 2514// int sysd=0;
a9c77978 2515 int i;
117e45b0
MN
2516
2517 for(y=0; y<8; y++)
2518 {
2519 int x;
2520 for(x=0; x<8; x++)
2521 {
2522 int ref= tempBlured[ x + y*stride ];
2523 int cur= src[ x + y*stride ];
2524 int d1=ref - cur;
be44a4d7
MN
2525// if(x==0 || x==7) d1+= d1>>1;
2526// if(y==0 || y==7) d1+= d1>>1;
2527// d+= ABS(d1);
2528 d+= d1*d1;
a7b2871c 2529// sysd+= d1;
117e45b0
MN
2530 }
2531 }
a9c77978
MN
2532 i=d;
2533 d= (
2534 4*d
2535 +(*(tempBluredPast-256))
2536 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2537 +(*(tempBluredPast+256))
2538 +4)>>3;
2539 *tempBluredPast=i;
2540// ((*tempBluredPast)*3 + d + 2)>>2;
2541
117e45b0
MN
2542//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2543/*
2544Switch between
2545 1 0 0 0 0 0 0 (0)
254664 32 16 8 4 2 1 (1)
254764 48 36 27 20 15 11 (33) (approx)
254864 56 49 43 37 33 29 (200) (approx)
2549*/
2550 if(d > maxNoise[1])
2551 {
2552 if(d < maxNoise[2])
2553 {
2554 for(y=0; y<8; y++)
2555 {
2556 int x;
2557 for(x=0; x<8; x++)
2558 {
2559 int ref= tempBlured[ x + y*stride ];
2560 int cur= src[ x + y*stride ];
2561 tempBlured[ x + y*stride ]=
2562 src[ x + y*stride ]=
2563 (ref + cur + 1)>>1;
2564 }
2565 }
2566 }
2567 else
2568 {
2569 for(y=0; y<8; y++)
2570 {
2571 int x;
2572 for(x=0; x<8; x++)
2573 {
2574 tempBlured[ x + y*stride ]= src[ x + y*stride ];
2575 }
2576 }
2577 }
2578 }
2579 else
2580 {
2581 if(d < maxNoise[0])
2582 {
2583 for(y=0; y<8; y++)
2584 {
2585 int x;
2586 for(x=0; x<8; x++)
2587 {
2588 int ref= tempBlured[ x + y*stride ];
2589 int cur= src[ x + y*stride ];
2590 tempBlured[ x + y*stride ]=
2591 src[ x + y*stride ]=
2592 (ref*7 + cur + 4)>>3;
2593 }
2594 }
2595 }
2596 else
2597 {
2598 for(y=0; y<8; y++)
2599 {
2600 int x;
2601 for(x=0; x<8; x++)
2602 {
2603 int ref= tempBlured[ x + y*stride ];
2604 int cur= src[ x + y*stride ];
2605 tempBlured[ x + y*stride ]=
2606 src[ x + y*stride ]=
2607 (ref*3 + cur + 2)>>2;
2608 }
2609 }
2610 }
2611 }
5936be4c 2612}
be44a4d7 2613#endif
117e45b0 2614}
a7b2871c 2615#endif //HAVE_ALTIVEC
117e45b0 2616
792a5a7c
MN
2617#ifdef HAVE_MMX
2618/**
2619 * accurate deblock filter
2620 */
2621static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
792a5a7c 2622 int64_t dc_mask, eq_mask;
39d89b69 2623 int64_t sums[10*8*2];
792a5a7c
MN
2624 src+= step*3; // src points to begin of the 8x8 Block
2625//START_TIMER
2626asm volatile(
2627 "movq %0, %%mm7 \n\t"
2628 "movq %1, %%mm6 \n\t"
2629 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2630 );
2631
2632asm volatile(
2633 "leal (%2, %3), %%eax \n\t"
2634// 0 1 2 3 4 5 6 7 8 9
2635// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2636
2637 "movq (%2), %%mm0 \n\t"
2638 "movq (%%eax), %%mm1 \n\t"
2639 "movq %%mm1, %%mm3 \n\t"
2640 "movq %%mm1, %%mm4 \n\t"
2641 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
2642 "paddb %%mm7, %%mm0 \n\t"
2643 "pcmpgtb %%mm6, %%mm0 \n\t"
2644
2645 "movq (%%eax,%3), %%mm2 \n\t"
2646 PMAXUB(%%mm2, %%mm4)
2647 PMINUB(%%mm2, %%mm3, %%mm5)
2648 "psubb %%mm2, %%mm1 \n\t"
2649 "paddb %%mm7, %%mm1 \n\t"
2650 "pcmpgtb %%mm6, %%mm1 \n\t"
2651 "paddb %%mm1, %%mm0 \n\t"
2652
2653 "movq (%%eax, %3, 2), %%mm1 \n\t"
2654 PMAXUB(%%mm1, %%mm4)
2655 PMINUB(%%mm1, %%mm3, %%mm5)
2656 "psubb %%mm1, %%mm2 \n\t"
2657 "paddb %%mm7, %%mm2 \n\t"
2658 "pcmpgtb %%mm6, %%mm2 \n\t"
2659 "paddb %%mm2, %%mm0 \n\t"
2660
2661 "leal (%%eax, %3, 4), %%eax \n\t"
2662
2663 "movq (%2, %3, 4), %%mm2 \n\t"
2664 PMAXUB(%%mm2, %%mm4)
2665 PMINUB(%%mm2, %%mm3, %%mm5)
2666 "psubb %%mm2, %%mm1 \n\t"
2667 "paddb %%mm7, %%mm1 \n\t"
2668 "pcmpgtb %%mm6, %%mm1 \n\t"
2669 "paddb %%mm1, %%mm0 \n\t"
2670
2671 "movq (%%eax), %%mm1 \n\t"
2672 PMAXUB(%%mm1, %%mm4)
2673 PMINUB(%%mm1, %%mm3, %%mm5)
2674 "psubb %%mm1, %%mm2 \n\t"
2675 "paddb %%mm7, %%mm2 \n\t"
2676 "pcmpgtb %%mm6, %%mm2 \n\t"
2677 "paddb %%mm2, %%mm0 \n\t"
2678
2679 "movq (%%eax, %3), %%mm2 \n\t"
2680 PMAXUB(%%mm2, %%mm4)
2681 PMINUB(%%mm2, %%mm3, %%mm5)
2682 "psubb %%mm2, %%mm1 \n\t"
2683 "paddb %%mm7, %%mm1 \n\t"
2684 "pcmpgtb %%mm6, %%mm1 \n\t"
2685 "paddb %%mm1, %%mm0 \n\t"
2686
2687 "movq (%%eax, %3, 2), %%mm1 \n\t"
2688 PMAXUB(%%mm1, %%mm4)
2689 PMINUB(%%mm1, %%mm3, %%mm5)
2690 "psubb %%mm1, %%mm2 \n\t"
2691 "paddb %%mm7, %%mm2 \n\t"
2692 "pcmpgtb %%mm6, %%mm2 \n\t"
2693 "paddb %%mm2, %%mm0 \n\t"
2694
2695 "movq (%2, %3, 8), %%mm2 \n\t"
2696 PMAXUB(%%mm2, %%mm4)
2697 PMINUB(%%mm2, %%mm3, %%mm5)
2698 "psubb %%mm2, %%mm1 \n\t"
2699 "paddb %%mm7, %%mm1 \n\t"
2700 "pcmpgtb %%mm6, %%mm1 \n\t"
2701 "paddb %%mm1, %%mm0 \n\t"
2702
2703 "movq (%%eax, %3, 4), %%mm1 \n\t"
2704 "psubb %%mm1, %%mm2 \n\t"
2705 "paddb %%mm7, %%mm2 \n\t"
2706 "pcmpgtb %%mm6, %%mm2 \n\t"
2707 "paddb %%mm2, %%mm0 \n\t"
2708 "psubusb %%mm3, %%mm4 \n\t"
2709
a5cd3c48 2710 "pxor %%mm6, %%mm6 \n\t"
792a5a7c
MN
2711 "movq %4, %%mm7 \n\t" // QP,..., QP
2712 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
a5cd3c48
MN
2713 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2714 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2715 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
792a5a7c
MN
2716 "movq %%mm7, %1 \n\t"
2717
792a5a7c
MN
2718 "movq %5, %%mm7 \n\t"
2719 "punpcklbw %%mm7, %%mm7 \n\t"
2720 "punpcklbw %%mm7, %%mm7 \n\t"
2721 "punpcklbw %%mm7, %%mm7 \n\t"
2722 "psubb %%mm0, %%mm6 \n\t"
2723 "pcmpgtb %%mm7, %%mm6 \n\t"
2724 "movq %%mm6, %0 \n\t"
2725
2726 : "=m" (eq_mask), "=m" (dc_mask)
2727 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2728 : "%eax"
2729 );
2730
39d89b69
MN
2731 if(dc_mask & eq_mask){
2732 int offset= -8*step;
2733 int64_t *temp_sums= sums;
2734
2735 asm volatile(
2736 "movq %2, %%mm0 \n\t" // QP,..., QP
2737 "pxor %%mm4, %%mm4 \n\t"
2738
2739 "movq (%0), %%mm6 \n\t"
2740 "movq (%0, %1), %%mm5 \n\t"
2741 "movq %%mm5, %%mm1 \n\t"
2742 "movq %%mm6, %%mm2 \n\t"
2743 "psubusb %%mm6, %%mm5 \n\t"
2744 "psubusb %%mm1, %%mm2 \n\t"
2745 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2746 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2747 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2748
2749 "pxor %%mm6, %%mm1 \n\t"
2750 "pand %%mm0, %%mm1 \n\t"
2751 "pxor %%mm1, %%mm6 \n\t"
2752 // 0:QP 6:First
2753
2754 "movq (%0, %1, 8), %%mm5 \n\t"
2755 "addl %1, %0 \n\t" // %0 points to line 1 not 0
2756 "movq (%0, %1, 8), %%mm7 \n\t"
2757 "movq %%mm5, %%mm1 \n\t"
2758 "movq %%mm7, %%mm2 \n\t"
2759 "psubusb %%mm7, %%mm5 \n\t"
2760 "psubusb %%mm1, %%mm2 \n\t"
2761 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2762 "movq %2, %%mm0 \n\t" // QP,..., QP
2763 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2764 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2765
2766 "pxor %%mm7, %%mm1 \n\t"
2767 "pand %%mm0, %%mm1 \n\t"
2768 "pxor %%mm1, %%mm7 \n\t"
2769
2770 "movq %%mm6, %%mm5 \n\t"
2771 "punpckhbw %%mm4, %%mm6 \n\t"
2772 "punpcklbw %%mm4, %%mm5 \n\t"
2773 // 4:0 5/6:First 7:Last
2774
2775 "movq %%mm5, %%mm0 \n\t"
2776 "movq %%mm6, %%mm1 \n\t"
2777 "psllw $2, %%mm0 \n\t"
2778 "psllw $2, %%mm1 \n\t"
2779 "paddw "MANGLE(w04)", %%mm0 \n\t"
2780 "paddw "MANGLE(w04)", %%mm1 \n\t"
2781
2782#define NEXT\
2783 "movq (%0), %%mm2 \n\t"\
2784 "movq (%0), %%mm3 \n\t"\
2785 "addl %1, %0 \n\t"\
2786 "punpcklbw %%mm4, %%mm2 \n\t"\
2787 "punpckhbw %%mm4, %%mm3 \n\t"\
2788 "paddw %%mm2, %%mm0 \n\t"\
2789 "paddw %%mm3, %%mm1 \n\t"
2790
2791#define PREV\
2792 "movq (%0), %%mm2 \n\t"\
2793 "movq (%0), %%mm3 \n\t"\
2794 "addl %1, %0 \n\t"\
2795 "punpcklbw %%mm4, %%mm2 \n\t"\
2796 "punpckhbw %%mm4, %%mm3 \n\t"\
2797 "psubw %%mm2, %%mm0 \n\t"\
2798 "psubw %%mm3, %%mm1 \n\t"
2799
2800
2801 NEXT //0
2802 NEXT //1
2803 NEXT //2
2804 "movq %%mm0, (%3) \n\t"
2805 "movq %%mm1, 8(%3) \n\t"
2806
2807 NEXT //3
2808 "psubw %%mm5, %%mm0 \n\t"
2809 "psubw %%mm6, %%mm1 \n\t"
2810 "movq %%mm0, 16(%3) \n\t"
2811 "movq %%mm1, 24(%3) \n\t"
2812
2813 NEXT //4
2814 "psubw %%mm5, %%mm0 \n\t"
2815 "psubw %%mm6, %%mm1 \n\t"
2816 "movq %%mm0, 32(%3) \n\t"
2817 "movq %%mm1, 40(%3) \n\t"
2818
2819 NEXT //5
2820 "psubw %%mm5, %%mm0 \n\t"
2821 "psubw %%mm6, %%mm1 \n\t"
2822 "movq %%mm0, 48(%3) \n\t"
2823 "movq %%mm1, 56(%3) \n\t"
2824
2825 NEXT //6
2826 "psubw %%mm5, %%mm0 \n\t"
2827 "psubw %%mm6, %%mm1 \n\t"
2828 "movq %%mm0, 64(%3) \n\t"
2829 "movq %%mm1, 72(%3) \n\t"
2830
2831 "movq %%mm7, %%mm6 \n\t"
2832 "punpckhbw %%mm4, %%mm7 \n\t"
2833 "punpcklbw %%mm4, %%mm6 \n\t"
2834
2835 NEXT //7
2836 "movl %4, %0 \n\t"
2837 "addl %1, %0 \n\t"
2838 PREV //0
2839 "movq %%mm0, 80(%3) \n\t"
2840 "movq %%mm1, 88(%3) \n\t"
2841
2842 PREV //1
2843 "paddw %%mm6, %%mm0 \n\t"
2844 "paddw %%mm7, %%mm1 \n\t"
2845 "movq %%mm0, 96(%3) \n\t"
2846 "movq %%mm1, 104(%3) \n\t"
2847
2848 PREV //2
2849 "paddw %%mm6, %%mm0 \n\t"
2850 "paddw %%mm7, %%mm1 \n\t"
2851 "movq %%mm0, 112(%3) \n\t"
2852 "movq %%mm1, 120(%3) \n\t"
2853
2854 PREV //3
2855 "paddw %%mm6, %%mm0 \n\t"
2856 "paddw %%mm7, %%mm1 \n\t"
2857 "movq %%mm0, 128(%3) \n\t"
2858 "movq %%mm1, 136(%3) \n\t"
2859
2860 PREV //4
2861 "paddw %%mm6, %%mm0 \n\t"
2862 "paddw %%mm7, %%mm1 \n\t"
2863 "movq %%mm0, 144(%3) \n\t"
2864 "movq %%mm1, 152(%3) \n\t"
2865
2866 "movl %4, %0 \n\t" //FIXME
2867
2868 : "+&r"(src)
2869 : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src)
2870 );
2871
2872 src+= step; // src points to begin of the 8x8 Block
2873
2874 asm volatile(
2875 "movq %4, %%mm6 \n\t"
2876 "pcmpeqb %%mm5, %%mm5 \n\t"
2877 "pxor %%mm6, %%mm5 \n\t"
2878 "pxor %%mm7, %%mm7 \n\t"
2879
2880 "1: \n\t"
2881 "movq (%1), %%mm0 \n\t"
2882 "movq 8(%1), %%mm1 \n\t"
2883 "paddw 32(%1), %%mm0 \n\t"
2884 "paddw 40(%1), %%mm1 \n\t"
2885 "movq (%0, %3), %%mm2 \n\t"
2886 "movq %%mm2, %%mm3 \n\t"
2887 "movq %%mm2, %%mm4 \n\t"
2888 "punpcklbw %%mm7, %%mm2 \n\t"
2889 "punpckhbw %%mm7, %%mm3 \n\t"
2890 "paddw %%mm2, %%mm0 \n\t"
2891 "paddw %%mm3, %%mm1 \n\t"
2892 "paddw %%mm2, %%mm0 \n\t"
2893 "paddw %%mm3, %%mm1 \n\t"
2894 "psrlw $4, %%mm0 \n\t"
2895 "psrlw $4, %%mm1 \n\t"
2896 "packuswb %%mm1, %%mm0 \n\t"
2897 "pand %%mm6, %%mm0 \n\t"
2898 "pand %%mm5, %%mm4 \n\t"
2899 "por %%mm4, %%mm0 \n\t"
2900 "movq %%mm0, (%0, %3) \n\t"
2901 "addl $16, %1 \n\t"
2902 "addl %2, %0 \n\t"
2903 " js 1b \n\t"
2904
2905 : "+r"(offset), "+r"(temp_sums)
2906 : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask)
2907 );
2908 }else
2909 src+= step; // src points to begin of the 8x8 Block
792a5a7c
MN
2910
2911 if(eq_mask != -1LL){
39d89b69 2912 uint8_t *temp_src= src;
792a5a7c
MN
2913 asm volatile(
2914 "pxor %%mm7, %%mm7 \n\t"
2915 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
2916 "andl $0xFFFFFFF8, %%ecx \n\t" // align
2917// 0 1 2 3 4 5 6 7 8 9
2918// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2919
2920 "movq (%0), %%mm0 \n\t"
2921 "movq %%mm0, %%mm1 \n\t"
2922 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2923 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2924
2925 "movq (%0, %1), %%mm2 \n\t"
2926 "leal (%0, %1, 2), %%eax \n\t"
2927 "movq %%mm2, %%mm3 \n\t"
2928 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2929 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2930
2931 "movq (%%eax), %%mm4 \n\t"
2932 "movq %%mm4, %%mm5 \n\t"
2933 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2934 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2935
2936 "paddw %%mm0, %%mm0 \n\t" // 2L0
2937 "paddw %%mm1, %%mm1 \n\t" // 2H0
2938 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2939 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2940 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2941 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2942
2943 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2944 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2945 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2946 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2947
2948 "movq (%%eax, %1), %%mm2 \n\t"
2949 "movq %%mm2, %%mm3 \n\t"
2950 "punpcklbw %%mm7, %%mm2 \n\t" // L3
2951 "punpckhbw %%mm7, %%mm3 \n\t" // H3
2952
2953 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2954 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2955 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2956 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2957 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2958 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2959
2960 "movq (%%eax, %1, 2), %%mm0 \n\t"
2961 "movq %%mm0, %%mm1 \n\t"
2962 "punpcklbw %%mm7, %%mm0 \n\t" // L4
2963 "punpckhbw %%mm7, %%mm1 \n\t" // H4
2964
2965 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2966 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2967 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4
2968 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4
2969 "paddw %%mm4, %%mm4 \n\t" // 2L2
2970 "paddw %%mm5, %%mm5 \n\t" // 2H2
2971 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2972 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2973
2974 "leal (%%eax, %1), %0 \n\t"
2975 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2976 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2977 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2978 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2979//50 opcodes so far
2980 "movq (%0, %1, 2), %%mm2 \n\t"
2981 "movq %%mm2, %%mm3 \n\t"
2982 "punpcklbw %%mm7, %%mm2 \n\t" // L5
2983 "punpckhbw %%mm7, %%mm3 \n\t" // H5
2984 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2985 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2986 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2987 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2988
2989 "movq (%%eax, %1, 4), %%mm6 \n\t"
2990 "punpcklbw %%mm7, %%mm6 \n\t" // L6
2991 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2992 "movq (%%eax, %1, 4), %%mm6 \n\t"
2993 "punpckhbw %%mm7, %%mm6 \n\t" // H6
2994 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2995
2996 "paddw %%mm0, %%mm0 \n\t" // 2L4
2997 "paddw %%mm1, %%mm1 \n\t" // 2H4
2998 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2999 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
3000
3001 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
3002 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
3003 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
3004 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
3005
3006 "movq (%0, %1, 4), %%mm2 \n\t"
3007 "movq %%mm2, %%mm3 \n\t"
3008 "punpcklbw %%mm7, %%mm2 \n\t" // L7
3009 "punpckhbw %%mm7, %%mm3 \n\t" // H7
3010
3011 "paddw %%mm2, %%mm2 \n\t" // 2L7
3012 "paddw %%mm3, %%mm3 \n\t" // 2H7
3013 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3014 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3015
3016 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3017 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3018
3019#ifdef HAVE_MMX2
3020 "movq %%mm7, %%mm6 \n\t" // 0
3021 "psubw %%mm0, %%mm6 \n\t"
3022 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3023 "movq %%mm7, %%mm6 \n\t" // 0
3024 "psubw %%mm1, %%mm6 \n\t"
3025 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3026 "movq %%mm7, %%mm6 \n\t" // 0
3027 "psubw %%mm2, %%mm6 \n\t"
3028 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3029 "movq %%mm7, %%mm6 \n\t" // 0
3030 "psubw %%mm3, %%mm6 \n\t"
3031 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3032#else
3033 "movq %%mm7, %%mm6 \n\t" // 0
3034 "pcmpgtw %%mm0, %%mm6 \n\t"
3035 "pxor %%mm6, %%mm0 \n\t"
3036 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3037 "movq %%mm7, %%mm6 \n\t" // 0
3038 "pcmpgtw %%mm1, %%mm6 \n\t"
3039 "pxor %%mm6, %%mm1 \n\t"
3040 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3041 "movq %%mm7, %%mm6 \n\t" // 0
3042 "pcmpgtw %%mm2, %%mm6 \n\t"
3043 "pxor %%mm6, %%mm2 \n\t"
3044 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3045 "movq %%mm7, %%mm6 \n\t" // 0
3046 "pcmpgtw %%mm3, %%mm6 \n\t"
3047 "pxor %%mm6, %%mm3 \n\t"
3048 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3049#endif
3050
3051#ifdef HAVE_MMX2
3052 "pminsw %%mm2, %%mm0 \n\t"
3053 "pminsw %%mm3, %%mm1 \n\t"
3054#else
3055 "movq %%mm0, %%mm6 \n\t"
3056 "psubusw %%mm2, %%mm6 \n\t"
3057 "psubw %%mm6, %%mm0 \n\t"
3058 "movq %%mm1, %%mm6 \n\t"
3059 "psubusw %%mm3, %%mm6 \n\t"
3060 "psubw %%mm6, %%mm1 \n\t"
3061#endif
3062
3063 "movd %2, %%mm2 \n\t" // QP
3064 "punpcklbw %%mm7, %%mm2 \n\t"
3065
3066 "movq %%mm7, %%mm6 \n\t" // 0
3067 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3068 "pxor %%mm6, %%mm4 \n\t"
3069 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3070 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3071 "pxor %%mm7, %%mm5 \n\t"
3072 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3073// 100 opcodes
3074 "psllw $3, %%mm2 \n\t" // 8QP
3075 "movq %%mm2, %%mm3 \n\t" // 8QP
3076 "pcmpgtw %%mm4, %%mm2 \n\t"
3077 "pcmpgtw %%mm5, %%mm3 \n\t"
3078 "pand %%mm2, %%mm4 \n\t"
3079 "pand %%mm3, %%mm5 \n\t"
3080
3081
3082 "psubusw %%mm0, %%mm4 \n\t" // hd
3083 "psubusw %%mm1, %%mm5 \n\t" // ld
3084
3085
3086 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3087 "pmullw %%mm2, %%mm4 \n\t"
3088 "pmullw %%mm2, %%mm5 \n\t"
3089 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3090 "paddw %%mm2, %%mm4 \n\t"
3091 "paddw %%mm2, %%mm5 \n\t"
3092 "psrlw $6, %%mm4 \n\t"
3093 "psrlw $6, %%mm5 \n\t"
3094
3095 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4
3096 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4
3097
3098 "pxor %%mm2, %%mm2 \n\t"
3099 "pxor %%mm3, %%mm3 \n\t"
3100
3101 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3102 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3103 "pxor %%mm2, %%mm0 \n\t"
3104 "pxor %%mm3, %%mm1 \n\t"
3105 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3106 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3107 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3108 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3109
3110 "pxor %%mm6, %%mm2 \n\t"
3111 "pxor %%mm7, %%mm3 \n\t"
3112 "pand %%mm2, %%mm4 \n\t"
3113 "pand %%mm3, %%mm5 \n\t"
3114
3115#ifdef HAVE_MMX2
3116 "pminsw %%mm0, %%mm4 \n\t"
3117 "pminsw %%mm1, %%mm5 \n\t"
3118#else
3119 "movq %%mm4, %%mm2 \n\t"
3120 "psubusw %%mm0, %%mm2 \n\t"
3121 "psubw %%mm2, %%mm4 \n\t"
3122 "movq %%mm5, %%mm2 \n\t"
3123 "psubusw %%mm1, %%mm2 \n\t"
3124 "psubw %%mm2, %%mm5 \n\t"
3125#endif
3126 "pxor %%mm6, %%mm4 \n\t"
3127 "pxor %%mm7, %%mm5 \n\t"
3128 "psubw %%mm6, %%mm4 \n\t"
3129 "psubw %%mm7, %%mm5 \n\t"
3130 "packsswb %%mm5, %%mm4 \n\t"
3131 "movq %3, %%mm1 \n\t"
3132 "pandn %%mm4, %%mm1 \n\t"
3133 "movq (%0), %%mm0 \n\t"
3134 "paddb %%mm1, %%mm0 \n\t"
3135 "movq %%mm0, (%0) \n\t"
3136 "movq (%0, %1), %%mm0 \n\t"
3137 "psubb %%mm1, %%mm0 \n\t"
3138 "movq %%mm0, (%0, %1) \n\t"
3139
39d89b69 3140 : "+r" (temp_src)
792a5a7c
MN
3141 : "r" (step), "m" (c->pQPb), "m"(eq_mask)
3142 : "%eax", "%ecx"
3143 );
792a5a7c
MN
3144 }
3145/*if(step==16){
3146 STOP_TIMER("step16")
3147}else{
3148 STOP_TIMER("stepX")
3149}*/
3150}
3151#endif //HAVE_MMX
3152
/* Forward declaration of the file-local (static) postProcess routine; only the prototype appears here, the body is expected further down in this template. */
cc9b0679 3153static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
9c9e467d 3154 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
13e00528 3155
3057fa66
A
3156/**
3157 * Copies a block from src to dst and fixes the blacklevel
d5a1a995 3158 * levelFix == 0 -> don't touch the brightness & contrast
3057fa66 3159 */
6b791538
A
3160#undef SCALED_CPY
3161
/*
 * blockCopy: copies 8 lines from src to dst, advancing by srcStride /
 * dstStride bytes per line.
 *
 * HAVE_MMX builds move 8 bytes per line with movq. When levelFix is non-zero
 * every pixel is additionally rescaled using two 64-bit values read from
 * packedOffsetAndScale: the first one is subtracted as an offset, the second
 * one is the multiplier (see the two SCALED_CPY variants below for the exact
 * arithmetic, which differs between MMX2 and plain MMX).
 *
 * Without HAVE_MMX both branches are the same memcpy loop of BLOCK_SIZE bytes
 * per line, so levelFix and packedOffsetAndScale have no effect there.
 *
 * NOTE(review): neither asm statement lists a "memory" clobber although both
 * store to dst, and both hard-code the 32-bit registers eax/edx - confirm
 * this is acceptable for the targeted compilers/architectures.
 */
cc9b0679 3162static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
9c9e467d 3163 int levelFix, int64_t *packedOffsetAndScale)
3057fa66 3164{
e5c30e06 3165#ifndef HAVE_MMX
d5a1a995 3166 int i;
e5c30e06 3167#endif
d5a1a995
MN
3168 if(levelFix)
3169 {
3057fa66
A
3170#ifdef HAVE_MMX
3171 asm volatile(
9c9e467d
MN
// eax holds packedOffsetAndScale on entry (operand constraints "=&a" / "0");
// once offset (mm2) and scale (mm3) are loaded, eax is reused as src + srcStride
3172 "movq (%%eax), %%mm2 \n\t" // packedYOffset
3173 "movq 8(%%eax), %%mm3 \n\t" // packedYScale
3174 "leal (%2,%4), %%eax \n\t"
3175 "leal (%3,%5), %%edx \n\t"
// mm4 = 0, used by the plain-MMX SCALED_CPY to zero-extend bytes to words
5b65f0df 3176 "pxor %%mm4, %%mm4 \n\t"
6e9b4840
MN
// MMX2 variant: each byte is duplicated into both halves of a word
// (punpck?bw with itself), multiplied by the scale keeping the unsigned high
// word (pmulhuw), the offset is subtracted and the result is packed back to
// bytes with unsigned saturation (packuswb).
3177#ifdef HAVE_MMX2
3178#define SCALED_CPY(src1, src2, dst1, dst2) \
3179 "movq " #src1 ", %%mm0 \n\t"\
3180 "movq " #src1 ", %%mm5 \n\t"\
3181 "movq " #src2 ", %%mm1 \n\t"\
3182 "movq " #src2 ", %%mm6 \n\t"\
3183 "punpcklbw %%mm0, %%mm0 \n\t"\
3184 "punpckhbw %%mm5, %%mm5 \n\t"\
3185 "punpcklbw %%mm1, %%mm1 \n\t"\
3186 "punpckhbw %%mm6, %%mm6 \n\t"\
3187 "pmulhuw %%mm3, %%mm0 \n\t"\
3188 "pmulhuw %%mm3, %%mm5 \n\t"\
3189 "pmulhuw %%mm3, %%mm1 \n\t"\
3190 "pmulhuw %%mm3, %%mm6 \n\t"\
3191 "psubw %%mm2, %%mm0 \n\t"\
3192 "psubw %%mm2, %%mm5 \n\t"\
3193 "psubw %%mm2, %%mm1 \n\t"\
3194 "psubw %%mm2, %%mm6 \n\t"\
3195 "packuswb %%mm5, %%mm0 \n\t"\
3196 "packuswb %%mm6, %%mm1 \n\t"\
3197 "movq %%mm0, " #dst1 " \n\t"\
3198 "movq %%mm1, " #dst2 " \n\t"\
3057fa66 3199
// Plain MMX variant: bytes are zero-extended to words, the offset is
// subtracted first, the difference is shifted left by 6 and multiplied by the
// scale keeping the signed high word (pmulhw), then packed with unsigned
// saturation.
6e9b4840 3200#else //HAVE_MMX2
043ba56f
MN
3201#define SCALED_CPY(src1, src2, dst1, dst2) \
3202 "movq " #src1 ", %%mm0 \n\t"\
3203 "movq " #src1 ", %%mm5 \n\t"\
5b65f0df
MN
3204 "punpcklbw %%mm4, %%mm0 \n\t"\
3205 "punpckhbw %%mm4, %%mm5 \n\t"\
57d04d3f
MN
3206 "psubw %%mm2, %%mm0 \n\t"\
3207 "psubw %%mm2, %%mm5 \n\t"\
043ba56f 3208 "movq " #src2 ", %%mm1 \n\t"\
57d04d3f
MN
3209 "psllw $6, %%mm0 \n\t"\
3210 "psllw $6, %%mm5 \n\t"\
5b65f0df 3211 "pmulhw %%mm3, %%mm0 \n\t"\
043ba56f 3212 "movq " #src2 ", %%mm6 \n\t"\
5b65f0df 3213 "pmulhw %%mm3, %%mm5 \n\t"\
5b65f0df 3214 "punpcklbw %%mm4, %%mm1 \n\t"\
534a602d 3215 "punpckhbw %%mm4, %%mm6 \n\t"\
57d04d3f 3216 "psubw %%mm2, %%mm1 \n\t"\
534a602d 3217 "psubw %%mm2, %%mm6 \n\t"\
57d04d3f 3218 "psllw $6, %%mm1 \n\t"\
534a602d 3219 "psllw $6, %%mm6 \n\t"\
5b65f0df 3220 "pmulhw %%mm3, %%mm1 \n\t"\
534a602d 3221 "pmulhw %%mm3, %%mm6 \n\t"\
534a602d
MN
3222 "packuswb %%mm5, %%mm0 \n\t"\
3223 "packuswb %%mm6, %%mm1 \n\t"\
043ba56f
MN
3224 "movq %%mm0, " #dst1 " \n\t"\
3225 "movq %%mm1, " #dst2 " \n\t"\
3226
6e9b4840
MN
3227#endif //!HAVE_MMX2
3228
9c9e467d
MN
// Each SCALED_CPY handles two lines: lines 0/1, 2/3 and 4/5 here
// (%2 = src, %3 = dst, %4 = srcStride, %5 = dstStride, eax/edx = line 1).
3229SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3230SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
3231SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
3232 "leal (%%eax,%4,4), %%eax \n\t"
3233 "leal (%%edx,%5,4), %%edx \n\t"
// eax/edx now address line 5, so the last invocation covers lines 6 and 7
3234SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
043ba56f
MN
3235
3236
9c9e467d
MN
// The "=&a" output tied to input "0" places packedOffsetAndScale in eax and
// tells the compiler that eax is overwritten by the asm.
3237 : "=&a" (packedOffsetAndScale)
3238 : "0" (packedOffsetAndScale),
3239 "r"(src),
043ba56f
MN
3240 "r"(dst),
3241 "r" (srcStride),
534a602d 3242 "r" (dstStride)
9c9e467d 3243 : "%edx"
d5a1a995
MN
3244 );
3245#else
// C fallback: plain copy, no level adjustment is applied here
c09dc465 3246 for(i=0; i<8; i++)
d5a1a995
MN
3247 memcpy( &(dst[dstStride*i]),
3248 &(src[srcStride*i]), BLOCK_SIZE);
3249#endif
3250 }
3251 else
3252 {
3253#ifdef HAVE_MMX
3254 asm volatile(
// Unscaled copy: %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride;
// eax/edx are set to line 1 of src/dst and used exactly as in the scaled path
043ba56f 3255 "leal (%0,%2), %%eax \n\t"
9c9e467d 3256 "leal (%1,%3), %%edx \n\t"
043ba56f
MN
3257
3258#define SIMPLE_CPY(src1, src2, dst1, dst2) \
3259 "movq " #src1 ", %%mm0 \n\t"\
3260 "movq " #src2 ", %%mm1 \n\t"\
3261 "movq %%mm0, " #dst1 " \n\t"\
3262 "movq %%mm1, " #dst2 " \n\t"\
3263
3264SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
9c9e467d
MN
3265SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
3266SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
043ba56f 3267 "leal (%%eax,%2,4), %%eax \n\t"
9c9e467d
MN
3268 "leal (%%edx,%3,4), %%edx \n\t"
3269SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
043ba56f 3270
3057fa66
A
3271 : : "r" (src),
3272 "r" (dst),
3273 "r" (srcStride),
c09dc465 3274 "r" (dstStride)
9c9e467d 3275 : "%eax", "%edx"
3057fa66
A
3276 );
3277#else
c09dc465 3278 for(i=0; i<8; i++)
3057fa66
A
3279 memcpy( &(dst[dstStride*i]),
3280 &(src[srcStride*i]), BLOCK_SIZE);
3281#endif
d5a1a995 3282 }
3057fa66
A
3283}
3284
4b6388d1
MN
3285/**
3286 * Duplicates the given 8 src pixels ? times upward
3287 */
3288static inline void RENAME(duplicate)(uint8_t src[], int stride)
3289{
3290#ifdef HAVE_MMX
3291 asm volatile(
3292 "movq (%0), %%mm0 \n\t"
3293 "addl %1, %0 \n\t"
3294 "movq %%mm0, (%0) \n\t"
3295 "movq %%mm0, (%0, %1) \n\t"
3296 "movq %%mm0, (%0, %1, 2) \n\t"
3297 : "+r" (src)
3298 : "r" (-stride)
3299 );
3300#else
3301 int i;
3302 uint8_t *p=src;
3303 for(i=0; i<3; i++)
3304 {
3305 p-= stride;
3306 memcpy(p, src, 8);
3307 }
3308#endif
3309}
3057fa66
A
3310
3311/**