33c8a2692eb54fd4f3ad6e79f59e2da3f0cd9a5a
[libav.git] / libavcodec / i386 / dsputil_mmx_rnd.h
1 /*
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27 /* This header intentionally has no multiple inclusion guards. It is meant to
28 * be included multiple times and generates different code depending on the
29 * value of certain #defines. */
30
31 // put_pixels
/*
 * put, 8 pixels wide, horizontal neighbour pair.
 *
 * For every row the 8 bytes at pixels[0..7] and the 8 bytes at pixels[1..8]
 * are combined by PAVGBP and the result is stored to block.
 * PAVGBP and MOVQ_BFE are supplied by the including file; presumably PAVGBP
 * is a byte-wise average of two register pairs (rounding chosen by the
 * rnd/no_rnd build) and MOVQ_BFE loads the mask it needs into mm6 -- TODO
 * confirm against the macro definitions.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  Four rows are produced per loop pass and the loop only
 *    exits when the counter hits exactly zero, so h is expected to be a
 *    positive multiple of 4.
 * Each source row is read over 9 bytes (offset 1 plus an 8-byte load).
 */
32 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
33 {
34 MOVQ_BFE(mm6);
35 asm volatile(
/* REG_a = 2 * line_size, the step used to advance two rows at a time */
36 "lea (%3, %3), %%"REG_a" \n\t"
37 ASMALIGN(3)
38 "1: \n\t"
/* rows 0 and 1 of this pass */
39 "movq (%1), %%mm0 \n\t"
40 "movq 1(%1), %%mm1 \n\t"
41 "movq (%1, %3), %%mm2 \n\t"
42 "movq 1(%1, %3), %%mm3 \n\t"
43 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
44 "movq %%mm4, (%2) \n\t"
45 "movq %%mm5, (%2, %3) \n\t"
46 "add %%"REG_a", %1 \n\t"
47 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 of this pass */
48 "movq (%1), %%mm0 \n\t"
49 "movq 1(%1), %%mm1 \n\t"
50 "movq (%1, %3), %%mm2 \n\t"
51 "movq 1(%1, %3), %%mm3 \n\t"
52 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
53 "movq %%mm4, (%2) \n\t"
54 "movq %%mm5, (%2, %3) \n\t"
55 "add %%"REG_a", %1 \n\t"
56 "add %%"REG_a", %2 \n\t"
57 "subl $4, %0 \n\t"
58 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block, %3 = line_size */
59 :"+g"(h), "+S"(pixels), "+D"(block)
60 :"r"((long)line_size)
61 :REG_a, "memory");
62 }
63
/*
 * put, 8 pixels wide, combination of two sources.
 *
 * Each destination row is PAVGB/PAVGBP(src1 row, src2 row).  The macros come
 * from the including file; presumably they compute a byte-wise average
 * (rounding chosen by the rnd/no_rnd build) -- TODO confirm.
 *
 * dst:  destination, advanced by dstStride per row.
 * src1: first source, advanced by src1Stride per row.
 * src2: second source, read as tightly packed rows of 8 bytes (offsets 0, 8,
 *       16, 24 inside the loop, then advanced by 32).
 * h:    number of rows.  If h is odd one row is handled up front; the main
 *       loop then produces 4 rows per pass and exits only when the counter
 *       reaches exactly zero.
 *
 * NOTE(review): after the single-row prologue execution falls straight into
 * the 4-row loop without re-testing the counter, so h == 1 would still enter
 * the loop -- confirm callers never pass such a value.
 */
64 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
65 {
66 MOVQ_BFE(mm6);
67 asm volatile(
/* odd row count: emit one row first, otherwise skip to the main loop */
68 "testl $1, %0 \n\t"
69 " jz 1f \n\t"
70 "movq (%1), %%mm0 \n\t"
71 "movq (%2), %%mm1 \n\t"
72 "add %4, %1 \n\t"
73 "add $8, %2 \n\t"
74 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
75 "movq %%mm4, (%3) \n\t"
76 "add %5, %3 \n\t"
77 "decl %0 \n\t"
78 ASMALIGN(3)
79 "1: \n\t"
/* rows 0 and 1 of this pass */
80 "movq (%1), %%mm0 \n\t"
81 "movq (%2), %%mm1 \n\t"
82 "add %4, %1 \n\t"
83 "movq (%1), %%mm2 \n\t"
84 "movq 8(%2), %%mm3 \n\t"
85 "add %4, %1 \n\t"
86 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
87 "movq %%mm4, (%3) \n\t"
88 "add %5, %3 \n\t"
89 "movq %%mm5, (%3) \n\t"
90 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass */
91 "movq (%1), %%mm0 \n\t"
92 "movq 16(%2), %%mm1 \n\t"
93 "add %4, %1 \n\t"
94 "movq (%1), %%mm2 \n\t"
95 "movq 24(%2), %%mm3 \n\t"
96 "add %4, %1 \n\t"
97 "add $32, %2 \n\t"
98 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
99 "movq %%mm4, (%3) \n\t"
100 "add %5, %3 \n\t"
101 "movq %%mm5, (%3) \n\t"
102 "add %5, %3 \n\t"
103 "subl $4, %0 \n\t"
104 "jnz 1b \n\t"
/* %0 = h (kept in memory for PIC builds, in the b register otherwise),
 * %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
105 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
106 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
107 #else
108 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
109 #endif
110 :"S"((long)src1Stride), "D"((long)dstStride)
111 :"memory");
112 }
113
/*
 * put, 16 pixels wide, horizontal neighbour pair.
 *
 * Same scheme as the 8-wide x2 routine, applied to two 8-byte column groups
 * per row: bytes [0..7] are combined with bytes [1..8], and bytes [8..15]
 * with bytes [9..16], using PAVGBP (defined by the including file; presumably
 * a byte-wise average -- TODO confirm).
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  Four rows are produced per loop pass and the loop only
 *    exits when the counter hits exactly zero, so h is expected to be a
 *    positive multiple of 4.
 * Each source row is read over 17 bytes.
 */
114 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
115 {
116 MOVQ_BFE(mm6);
117 asm volatile(
/* REG_a = 2 * line_size */
118 "lea (%3, %3), %%"REG_a" \n\t"
119 ASMALIGN(3)
120 "1: \n\t"
/* rows 0 and 1, columns 0..7 */
121 "movq (%1), %%mm0 \n\t"
122 "movq 1(%1), %%mm1 \n\t"
123 "movq (%1, %3), %%mm2 \n\t"
124 "movq 1(%1, %3), %%mm3 \n\t"
125 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
126 "movq %%mm4, (%2) \n\t"
127 "movq %%mm5, (%2, %3) \n\t"
/* rows 0 and 1, columns 8..15 */
128 "movq 8(%1), %%mm0 \n\t"
129 "movq 9(%1), %%mm1 \n\t"
130 "movq 8(%1, %3), %%mm2 \n\t"
131 "movq 9(%1, %3), %%mm3 \n\t"
132 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
133 "movq %%mm4, 8(%2) \n\t"
134 "movq %%mm5, 8(%2, %3) \n\t"
135 "add %%"REG_a", %1 \n\t"
136 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3, columns 0..7 */
137 "movq (%1), %%mm0 \n\t"
138 "movq 1(%1), %%mm1 \n\t"
139 "movq (%1, %3), %%mm2 \n\t"
140 "movq 1(%1, %3), %%mm3 \n\t"
141 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
142 "movq %%mm4, (%2) \n\t"
143 "movq %%mm5, (%2, %3) \n\t"
/* rows 2 and 3, columns 8..15 */
144 "movq 8(%1), %%mm0 \n\t"
145 "movq 9(%1), %%mm1 \n\t"
146 "movq 8(%1, %3), %%mm2 \n\t"
147 "movq 9(%1, %3), %%mm3 \n\t"
148 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
149 "movq %%mm4, 8(%2) \n\t"
150 "movq %%mm5, 8(%2, %3) \n\t"
151 "add %%"REG_a", %1 \n\t"
152 "add %%"REG_a", %2 \n\t"
153 "subl $4, %0 \n\t"
154 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block, %3 = line_size */
155 :"+g"(h), "+S"(pixels), "+D"(block)
156 :"r"((long)line_size)
157 :REG_a, "memory");
158 }
159
/*
 * put, 16 pixels wide, combination of two sources.
 *
 * Each destination row is PAVGBP(src1 row, src2 row), processed as two
 * 8-byte halves.  PAVGBP comes from the including file; presumably a
 * byte-wise average (rounding chosen by the rnd/no_rnd build) -- TODO confirm.
 *
 * dst:  destination, advanced by dstStride per row.
 * src1: first source, advanced by src1Stride per row.
 * src2: second source, read as tightly packed rows of 16 bytes (offsets
 *       0/8 and 16/24 inside the loop, then advanced by 32).
 * h:    number of rows.  If h is odd one row is handled up front; the main
 *       loop then produces 2 rows per pass and exits only when the counter
 *       reaches exactly zero.
 *
 * NOTE(review): after the single-row prologue execution falls straight into
 * the 2-row loop without re-testing the counter, so h == 1 would still enter
 * the loop -- confirm callers never pass such a value.
 */
160 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
161 {
162 MOVQ_BFE(mm6);
163 asm volatile(
/* odd row count: emit one 16-byte row first, otherwise skip to the loop */
164 "testl $1, %0 \n\t"
165 " jz 1f \n\t"
166 "movq (%1), %%mm0 \n\t"
167 "movq (%2), %%mm1 \n\t"
168 "movq 8(%1), %%mm2 \n\t"
169 "movq 8(%2), %%mm3 \n\t"
170 "add %4, %1 \n\t"
171 "add $16, %2 \n\t"
172 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
173 "movq %%mm4, (%3) \n\t"
174 "movq %%mm5, 8(%3) \n\t"
175 "add %5, %3 \n\t"
176 "decl %0 \n\t"
177 ASMALIGN(3)
178 "1: \n\t"
/* first row of this pass: src2 bytes 0..15 */
179 "movq (%1), %%mm0 \n\t"
180 "movq (%2), %%mm1 \n\t"
181 "movq 8(%1), %%mm2 \n\t"
182 "movq 8(%2), %%mm3 \n\t"
183 "add %4, %1 \n\t"
184 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
185 "movq %%mm4, (%3) \n\t"
186 "movq %%mm5, 8(%3) \n\t"
187 "add %5, %3 \n\t"
/* second row of this pass: src2 bytes 16..31 */
188 "movq (%1), %%mm0 \n\t"
189 "movq 16(%2), %%mm1 \n\t"
190 "movq 8(%1), %%mm2 \n\t"
191 "movq 24(%2), %%mm3 \n\t"
192 "add %4, %1 \n\t"
193 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
194 "movq %%mm4, (%3) \n\t"
195 "movq %%mm5, 8(%3) \n\t"
196 "add %5, %3 \n\t"
197 "add $32, %2 \n\t"
198 "subl $2, %0 \n\t"
199 "jnz 1b \n\t"
/* %0 = h (kept in memory for PIC builds, in the b register otherwise),
 * %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
200 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
201 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
202 #else
203 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
204 #endif
205 :"S"((long)src1Stride), "D"((long)dstStride)
206 :"memory");
207 }
208
/*
 * put, 8 pixels wide, vertical neighbour pair.
 *
 * Output row r is PAVGBP(source row r, source row r + 1).  PAVGBP comes from
 * the including file; presumably a byte-wise average (rounding chosen by the
 * rnd/no_rnd build) -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  Four rows are produced per loop pass and the loop only
 *    exits when the counter hits exactly zero, so h is expected to be a
 *    positive multiple of 4.
 * One more source row than output rows is read: the last row loaded in a
 * pass is kept in mm0 and reused as the first row of the next pass.
 */
209 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
210 {
211 MOVQ_BFE(mm6);
212 asm volatile(
/* REG_a = 2 * line_size; mm0 = source row 0 */
213 "lea (%3, %3), %%"REG_a" \n\t"
214 "movq (%1), %%mm0 \n\t"
215 ASMALIGN(3)
216 "1: \n\t"
/* mm1 = row 1, mm2 = row 2; pairs (row1,row0) -> mm4 and (row2,row1) -> mm5 */
217 "movq (%1, %3), %%mm1 \n\t"
218 "movq (%1, %%"REG_a"),%%mm2 \n\t"
219 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
220 "movq %%mm4, (%2) \n\t"
221 "movq %%mm5, (%2, %3) \n\t"
222 "add %%"REG_a", %1 \n\t"
223 "add %%"REG_a", %2 \n\t"
/* mm1 = row 3, mm0 = row 4; pairs (row3,row2) -> mm4 and (row4,row3) -> mm5 */
224 "movq (%1, %3), %%mm1 \n\t"
225 "movq (%1, %%"REG_a"),%%mm0 \n\t"
226 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
227 "movq %%mm4, (%2) \n\t"
228 "movq %%mm5, (%2, %3) \n\t"
229 "add %%"REG_a", %1 \n\t"
230 "add %%"REG_a", %2 \n\t"
231 "subl $4, %0 \n\t"
232 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block, %3 = line_size */
233 :"+g"(h), "+S"(pixels), "+D"(block)
234 :"r"((long)line_size)
235 :REG_a, "memory");
236 }
237
/*
 * put, 8 pixels wide, 2x2 neighbourhood.
 *
 * Every output byte is (a + b + c + d + rnd) >> 2, where a/b are
 * pixels[x], pixels[x + 1] on one source row, c/d the same columns on the
 * next row, and rnd is the constant SET_RND places in mm6.  The arithmetic
 * is carried out on 16-bit lanes: bytes are widened by interleaving with
 * mm7 (presumably all-zero after MOVQ_ZERO -- TODO confirm) and packed back
 * with packuswb.
 *
 * block:  destination; passed as an input-only operand and never advanced,
 *         rows are addressed as block + REG_a.
 * pixels: source; advanced once by line_size in the prologue, afterwards
 *         addressed as pixels + REG_a.
 * h:      number of rows.  Two rows are produced per loop pass and the loop
 *         only exits when the counter hits exactly zero, so h is expected to
 *         be a positive even number.
 * The horizontal pair sum of each source row is computed once and shared by
 * the two output rows that use it; h + 1 source rows of 9 bytes are read.
 */
238 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
239 {
240 MOVQ_ZERO(mm7);
241 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
242 asm volatile(
/* prologue: mm4/mm5 = low/high word sums of row 0 (pixels[x] + pixels[x+1]) */
243 "movq (%1), %%mm0 \n\t"
244 "movq 1(%1), %%mm4 \n\t"
245 "movq %%mm0, %%mm1 \n\t"
246 "movq %%mm4, %%mm5 \n\t"
247 "punpcklbw %%mm7, %%mm0 \n\t"
248 "punpcklbw %%mm7, %%mm4 \n\t"
249 "punpckhbw %%mm7, %%mm1 \n\t"
250 "punpckhbw %%mm7, %%mm5 \n\t"
251 "paddusw %%mm0, %%mm4 \n\t"
252 "paddusw %%mm1, %%mm5 \n\t"
/* REG_a = running row offset (starts at 0); pixels now points at row 1 */
253 "xor %%"REG_a", %%"REG_a" \n\t"
254 "add %3, %1 \n\t"
255 ASMALIGN(3)
256 "1: \n\t"
/* first output row of the pass: new row sum -> mm0/mm1, result built in mm4/mm5 */
257 "movq (%1, %%"REG_a"), %%mm0 \n\t"
258 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
259 "movq %%mm0, %%mm1 \n\t"
260 "movq %%mm2, %%mm3 \n\t"
261 "punpcklbw %%mm7, %%mm0 \n\t"
262 "punpcklbw %%mm7, %%mm2 \n\t"
263 "punpckhbw %%mm7, %%mm1 \n\t"
264 "punpckhbw %%mm7, %%mm3 \n\t"
265 "paddusw %%mm2, %%mm0 \n\t"
266 "paddusw %%mm3, %%mm1 \n\t"
267 "paddusw %%mm6, %%mm4 \n\t"
268 "paddusw %%mm6, %%mm5 \n\t"
269 "paddusw %%mm0, %%mm4 \n\t"
270 "paddusw %%mm1, %%mm5 \n\t"
271 "psrlw $2, %%mm4 \n\t"
272 "psrlw $2, %%mm5 \n\t"
273 "packuswb %%mm5, %%mm4 \n\t"
274 "movq %%mm4, (%2, %%"REG_a") \n\t"
275 "add %3, %%"REG_a" \n\t"
276
/* second output row: same steps with the register roles exchanged, so the
 * sum left in mm4/mm5 feeds the next pass */
277 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
278 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
279 "movq %%mm2, %%mm3 \n\t"
280 "movq %%mm4, %%mm5 \n\t"
281 "punpcklbw %%mm7, %%mm2 \n\t"
282 "punpcklbw %%mm7, %%mm4 \n\t"
283 "punpckhbw %%mm7, %%mm3 \n\t"
284 "punpckhbw %%mm7, %%mm5 \n\t"
285 "paddusw %%mm2, %%mm4 \n\t"
286 "paddusw %%mm3, %%mm5 \n\t"
287 "paddusw %%mm6, %%mm0 \n\t"
288 "paddusw %%mm6, %%mm1 \n\t"
289 "paddusw %%mm4, %%mm0 \n\t"
290 "paddusw %%mm5, %%mm1 \n\t"
291 "psrlw $2, %%mm0 \n\t"
292 "psrlw $2, %%mm1 \n\t"
293 "packuswb %%mm1, %%mm0 \n\t"
294 "movq %%mm0, (%2, %%"REG_a") \n\t"
295 "add %3, %%"REG_a" \n\t"
296
297 "subl $2, %0 \n\t"
298 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block (input only), %3 = line_size */
299 :"+g"(h), "+S"(pixels)
300 :"D"(block), "r"((long)line_size)
301 :REG_a, "memory");
302 }
303
304 // avg_pixels
/*
 * avg, 4 pixels wide, no interpolation.
 *
 * For every row the 4 bytes already in block and the 4 bytes at pixels are
 * combined with PAVGB and written back to block.  PAVGB and MOVQ_BFE come
 * from the including file; presumably PAVGB is a byte-wise average (rounding
 * chosen by the rnd/no_rnd build) using the mask held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  The do/while body runs before the counter is tested,
 *    so h must be at least 1.
 */
305 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
306 {
307 MOVQ_BFE(mm6);
308 JUMPALIGN();
309 do {
310 asm volatile(
/* %0 = *block (read and written), %1 = *pixels (read only) */
311 "movd %0, %%mm0 \n\t"
312 "movd %1, %%mm1 \n\t"
313 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
314 "movd %%mm2, %0 \n\t"
315 :"+m"(*block)
316 :"m"(*pixels)
317 :"memory");
318 pixels += line_size;
319 block += line_size;
320 }
321 while (--h);
322 }
323
324 // in case more speed is needed - unrolling would certainly help
/*
 * avg, 8 pixels wide, no interpolation.
 *
 * For every row the 8 bytes already in block and the 8 bytes at pixels are
 * combined with PAVGB and written back to block.  PAVGB and MOVQ_BFE come
 * from the including file; presumably PAVGB is a byte-wise average (rounding
 * chosen by the rnd/no_rnd build) using the mask held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  The do/while body runs before the counter is tested,
 *    so h must be at least 1.
 */
325 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
326 {
327 MOVQ_BFE(mm6);
328 JUMPALIGN();
329 do {
330 asm volatile(
/* %0 = *block (read and written), %1 = *pixels (read only) */
331 "movq %0, %%mm0 \n\t"
332 "movq %1, %%mm1 \n\t"
333 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
334 "movq %%mm2, %0 \n\t"
335 :"+m"(*block)
336 :"m"(*pixels)
337 :"memory");
338 pixels += line_size;
339 block += line_size;
340 }
341 while (--h);
342 }
343
/*
 * avg, 16 pixels wide, no interpolation.
 *
 * For every row, block bytes [0..7] and [8..15] are each combined with the
 * matching source bytes by PAVGB and written back.  PAVGB and MOVQ_BFE come
 * from the including file; presumably PAVGB is a byte-wise average (rounding
 * chosen by the rnd/no_rnd build) using the mask held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  The do/while body runs before the counter is tested,
 *    so h must be at least 1.
 *
 * The "8%0" / "8%1" spellings put a displacement of 8 in front of the
 * memory operand text; this depends on the compiler printing the operand in
 * a form that accepts such a prefix.
 */
344 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
345 {
346 MOVQ_BFE(mm6);
347 JUMPALIGN();
348 do {
349 asm volatile(
/* columns 0..7 */
350 "movq %0, %%mm0 \n\t"
351 "movq %1, %%mm1 \n\t"
352 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
353 "movq %%mm2, %0 \n\t"
/* columns 8..15 */
354 "movq 8%0, %%mm0 \n\t"
355 "movq 8%1, %%mm1 \n\t"
356 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
357 "movq %%mm2, 8%0 \n\t"
358 :"+m"(*block)
359 :"m"(*pixels)
360 :"memory");
361 pixels += line_size;
362 block += line_size;
363 }
364 while (--h);
365 }
366
/*
 * avg, 8 pixels wide, horizontal neighbour pair.
 *
 * For every row: t = PAVGB(pixels[0..7], pixels[1..8]), then
 * block = PAVGB(block, t).  PAVGB and MOVQ_BFE come from the including file;
 * presumably PAVGB is a byte-wise average (rounding chosen by the rnd/no_rnd
 * build) using the mask held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  The do/while body runs before the counter is tested,
 *    so h must be at least 1.
 * Each source row is read over 9 bytes.
 */
367 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
368 {
369 MOVQ_BFE(mm6);
370 JUMPALIGN();
371 do {
372 asm volatile(
/* mm0/mm1 = the two shifted source reads, mm3 = current destination */
373 "movq %1, %%mm0 \n\t"
374 "movq 1%1, %%mm1 \n\t"
375 "movq %0, %%mm3 \n\t"
376 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
377 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
378 "movq %%mm0, %0 \n\t"
379 :"+m"(*block)
380 :"m"(*pixels)
381 :"memory");
382 pixels += line_size;
383 block += line_size;
384 } while (--h);
385 }
386
/*
 * avg, 8 pixels wide, combination of two sources.
 *
 * For every row: t = PAVGB(src1 row, src2 row), then dst = PAVGB(dst, t).
 * PAVGB and MOVQ_BFE come from the including file; presumably PAVGB is a
 * byte-wise average (rounding chosen by the rnd/no_rnd build) using the mask
 * held in mm6 -- TODO confirm.
 *
 * dst:  destination, advanced by dstStride per row.
 * src1: first source, advanced by src1Stride per row.
 * src2: second source, advanced by 8 bytes per row (tightly packed rows).
 * h:    number of rows.  The do/while body runs before the counter is
 *       tested, so h must be at least 1.
 */
387 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
388 {
389 MOVQ_BFE(mm6);
390 JUMPALIGN();
391 do {
392 asm volatile(
/* %0 = *dst (read and written), %1 = *src1, %2 = *src2 */
393 "movq %1, %%mm0 \n\t"
394 "movq %2, %%mm1 \n\t"
395 "movq %0, %%mm3 \n\t"
396 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
397 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
398 "movq %%mm0, %0 \n\t"
399 :"+m"(*dst)
400 :"m"(*src1), "m"(*src2)
401 :"memory");
402 dst += dstStride;
403 src1 += src1Stride;
404 src2 += 8;
405 } while (--h);
406 }
407
/*
 * avg, 16 pixels wide, horizontal neighbour pair.
 *
 * For every row and for each 8-byte half: t = PAVGB(pixels[c..c+7],
 * pixels[c+1..c+8]) with c = 0 and c = 8, then block = PAVGB(block, t).
 * PAVGB and MOVQ_BFE come from the including file; presumably PAVGB is a
 * byte-wise average (rounding chosen by the rnd/no_rnd build) using the mask
 * held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  The do/while body runs before the counter is tested,
 *    so h must be at least 1.
 * Each source row is read over 17 bytes.
 */
408 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
409 {
410 MOVQ_BFE(mm6);
411 JUMPALIGN();
412 do {
413 asm volatile(
/* columns 0..7 */
414 "movq %1, %%mm0 \n\t"
415 "movq 1%1, %%mm1 \n\t"
416 "movq %0, %%mm3 \n\t"
417 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
418 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
419 "movq %%mm0, %0 \n\t"
/* columns 8..15 */
420 "movq 8%1, %%mm0 \n\t"
421 "movq 9%1, %%mm1 \n\t"
422 "movq 8%0, %%mm3 \n\t"
423 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
424 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
425 "movq %%mm0, 8%0 \n\t"
426 :"+m"(*block)
427 :"m"(*pixels)
428 :"memory");
429 pixels += line_size;
430 block += line_size;
431 } while (--h);
432 }
433
/*
 * avg, 16 pixels wide, combination of two sources.
 *
 * For every row and for each 8-byte half: t = PAVGB(src1, src2), then
 * dst = PAVGB(dst, t).  PAVGB and MOVQ_BFE come from the including file;
 * presumably PAVGB is a byte-wise average (rounding chosen by the rnd/no_rnd
 * build) using the mask held in mm6 -- TODO confirm.
 *
 * dst:  destination, advanced by dstStride per row.
 * src1: first source, advanced by src1Stride per row.
 * src2: second source, advanced by 16 bytes per row (tightly packed rows).
 * h:    number of rows.  The do/while body runs before the counter is
 *       tested, so h must be at least 1.
 */
434 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
435 {
436 MOVQ_BFE(mm6);
437 JUMPALIGN();
438 do {
439 asm volatile(
/* columns 0..7 */
440 "movq %1, %%mm0 \n\t"
441 "movq %2, %%mm1 \n\t"
442 "movq %0, %%mm3 \n\t"
443 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
444 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
445 "movq %%mm0, %0 \n\t"
/* columns 8..15 */
446 "movq 8%1, %%mm0 \n\t"
447 "movq 8%2, %%mm1 \n\t"
448 "movq 8%0, %%mm3 \n\t"
449 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
450 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
451 "movq %%mm0, 8%0 \n\t"
452 :"+m"(*dst)
453 :"m"(*src1), "m"(*src2)
454 :"memory");
455 dst += dstStride;
456 src1 += src1Stride;
457 src2 += 16;
458 } while (--h);
459 }
460
/*
 * avg, 8 pixels wide, vertical neighbour pair.
 *
 * For output row r: t = PAVGBP(source row r, source row r + 1), then
 * block row r = PAVGB(block row r, t).  The macros come from the including
 * file; presumably they are byte-wise averages (rounding chosen by the
 * rnd/no_rnd build) using the mask held in mm6 -- TODO confirm.
 *
 * block, pixels: destination / source; both advance by line_size per row.
 * h: number of rows.  Four rows are produced per loop pass and the loop only
 *    exits when the counter hits exactly zero, so h is expected to be a
 *    positive multiple of 4.
 * One more source row than output rows is read: the last row loaded in a
 * pass stays in mm0 and is reused as the first row of the next pass.
 */
461 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
462 {
463 MOVQ_BFE(mm6);
464 asm volatile(
/* REG_a = 2 * line_size; mm0 = source row 0 */
465 "lea (%3, %3), %%"REG_a" \n\t"
466 "movq (%1), %%mm0 \n\t"
467 ASMALIGN(3)
468 "1: \n\t"
/* mm1 = row 1, mm2 = row 2; pairs (row1,row0) -> mm4 and (row2,row1) -> mm5 */
469 "movq (%1, %3), %%mm1 \n\t"
470 "movq (%1, %%"REG_a"), %%mm2 \n\t"
471 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
/* blend with the existing destination rows; mm2 (row 2) is left untouched */
472 "movq (%2), %%mm3 \n\t"
473 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
474 "movq (%2, %3), %%mm3 \n\t"
475 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
476 "movq %%mm0, (%2) \n\t"
477 "movq %%mm1, (%2, %3) \n\t"
478 "add %%"REG_a", %1 \n\t"
479 "add %%"REG_a", %2 \n\t"
480
/* mm1 = row 3, mm0 = row 4; pairs (row3,row2) -> mm4 and (row4,row3) -> mm5 */
481 "movq (%1, %3), %%mm1 \n\t"
482 "movq (%1, %%"REG_a"), %%mm0 \n\t"
483 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
/* blend with the destination; results go to mm2/mm1 so mm0 (row 4) survives */
484 "movq (%2), %%mm3 \n\t"
485 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
486 "movq (%2, %3), %%mm3 \n\t"
487 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
488 "movq %%mm2, (%2) \n\t"
489 "movq %%mm1, (%2, %3) \n\t"
490 "add %%"REG_a", %1 \n\t"
491 "add %%"REG_a", %2 \n\t"
492
493 "subl $4, %0 \n\t"
494 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block, %3 = line_size */
495 :"+g"(h), "+S"(pixels), "+D"(block)
496 :"r"((long)line_size)
497 :REG_a, "memory");
498 }
499
500 // this routine is 'slightly' suboptimal but mostly unused
/*
 * avg, 8 pixels wide, 2x2 neighbourhood.
 *
 * For every output byte t = (a + b + c + d + rnd) >> 2 is computed, where
 * a/b are pixels[x], pixels[x + 1] on one source row, c/d the same columns
 * on the next row, and rnd is the constant SET_RND places in mm6; then
 * block = PAVGB(block, t).  The sum is formed on 16-bit lanes: bytes are
 * widened by interleaving with mm7 (presumably all-zero after MOVQ_ZERO --
 * TODO confirm) and packed back with packuswb.  PAVGB comes from the
 * including file; presumably a byte-wise average -- TODO confirm.
 *
 * block:  destination; passed as an input-only operand and never advanced,
 *         rows are addressed as block + REG_a.
 * pixels: source; advanced once by line_size in the prologue, afterwards
 *         addressed as pixels + REG_a.
 * h:      number of rows.  Two rows are produced per loop pass and the loop
 *         only exits when the counter hits exactly zero, so h is expected to
 *         be a positive even number.
 *
 * mm6 is occupied by the rounding constant here, so the mask argument for
 * PAVGB is rebuilt in mm2 before each use.
 */
501 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
502 {
503 MOVQ_ZERO(mm7);
504 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
505 asm volatile(
/* prologue: mm4/mm5 = low/high word sums of row 0 (pixels[x] + pixels[x+1]) */
506 "movq (%1), %%mm0 \n\t"
507 "movq 1(%1), %%mm4 \n\t"
508 "movq %%mm0, %%mm1 \n\t"
509 "movq %%mm4, %%mm5 \n\t"
510 "punpcklbw %%mm7, %%mm0 \n\t"
511 "punpcklbw %%mm7, %%mm4 \n\t"
512 "punpckhbw %%mm7, %%mm1 \n\t"
513 "punpckhbw %%mm7, %%mm5 \n\t"
514 "paddusw %%mm0, %%mm4 \n\t"
515 "paddusw %%mm1, %%mm5 \n\t"
/* REG_a = running row offset (starts at 0); pixels now points at row 1 */
516 "xor %%"REG_a", %%"REG_a" \n\t"
517 "add %3, %1 \n\t"
518 ASMALIGN(3)
519 "1: \n\t"
/* first output row of the pass: new row sum -> mm0/mm1, result built in mm4/mm5 */
520 "movq (%1, %%"REG_a"), %%mm0 \n\t"
521 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
522 "movq %%mm0, %%mm1 \n\t"
523 "movq %%mm2, %%mm3 \n\t"
524 "punpcklbw %%mm7, %%mm0 \n\t"
525 "punpcklbw %%mm7, %%mm2 \n\t"
526 "punpckhbw %%mm7, %%mm1 \n\t"
527 "punpckhbw %%mm7, %%mm3 \n\t"
528 "paddusw %%mm2, %%mm0 \n\t"
529 "paddusw %%mm3, %%mm1 \n\t"
530 "paddusw %%mm6, %%mm4 \n\t"
531 "paddusw %%mm6, %%mm5 \n\t"
532 "paddusw %%mm0, %%mm4 \n\t"
533 "paddusw %%mm1, %%mm5 \n\t"
534 "psrlw $2, %%mm4 \n\t"
535 "psrlw $2, %%mm5 \n\t"
536 "movq (%2, %%"REG_a"), %%mm3 \n\t"
537 "packuswb %%mm5, %%mm4 \n\t"
/* mm2 = 0xFE in every byte: all-ones from pcmpeqd, then each byte doubled */
538 "pcmpeqd %%mm2, %%mm2 \n\t"
539 "paddb %%mm2, %%mm2 \n\t"
540 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
541 "movq %%mm5, (%2, %%"REG_a") \n\t"
542 "add %3, %%"REG_a" \n\t"
543
/* second output row: same steps with the register roles exchanged, so the
 * sum left in mm4/mm5 feeds the next pass */
544 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
545 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
546 "movq %%mm2, %%mm3 \n\t"
547 "movq %%mm4, %%mm5 \n\t"
548 "punpcklbw %%mm7, %%mm2 \n\t"
549 "punpcklbw %%mm7, %%mm4 \n\t"
550 "punpckhbw %%mm7, %%mm3 \n\t"
551 "punpckhbw %%mm7, %%mm5 \n\t"
552 "paddusw %%mm2, %%mm4 \n\t"
553 "paddusw %%mm3, %%mm5 \n\t"
554 "paddusw %%mm6, %%mm0 \n\t"
555 "paddusw %%mm6, %%mm1 \n\t"
556 "paddusw %%mm4, %%mm0 \n\t"
557 "paddusw %%mm5, %%mm1 \n\t"
558 "psrlw $2, %%mm0 \n\t"
559 "psrlw $2, %%mm1 \n\t"
560 "movq (%2, %%"REG_a"), %%mm3 \n\t"
561 "packuswb %%mm1, %%mm0 \n\t"
/* rebuild the 0xFE byte mask in mm2, as above */
562 "pcmpeqd %%mm2, %%mm2 \n\t"
563 "paddb %%mm2, %%mm2 \n\t"
564 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
565 "movq %%mm1, (%2, %%"REG_a") \n\t"
566 "add %3, %%"REG_a" \n\t"
567
568 "subl $2, %0 \n\t"
569 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block (input only), %3 = line_size */
570 :"+g"(h), "+S"(pixels)
571 :"D"(block), "r"((long)line_size)
572 :REG_a, "memory");
573 }
574
575 //FIXME optimize
576 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
577 DEF(put, pixels8_y2)(block , pixels , line_size, h);
578 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
579 }
580
581 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
582 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
583 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
584 }
585
586 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
587 DEF(avg, pixels8_y2)(block , pixels , line_size, h);
588 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
589 }
590
591 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
592 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
593 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
594 }