Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
3 | * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, | |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * GNU General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 | * | |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
d962f6fd | 23 | #include "../simple_idct.h" |
607dce96 | 24 | #include "../mangle.h" |
de6d9b64 | 25 | |
7d650cb5 FB |
26 | int mm_flags; /* multimedia extension flags */ |
27 | ||
ba6802de MN |
/*
 * Prototypes for the pix_abs* routines in their _mmx and _mmx2 variants.
 * Only the declarations are present here; the definitions are not in this
 * part of the file.
 * NOTE(review): presumably absolute-difference block metrics for 16x16 and
 * 8x8 blocks, with _x2/_y2/_xy2 being the half-pel variants and lx the line
 * stride - confirm against the defining file.
 */
28 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
29 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
32 | ||
33 | int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 | int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 | int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
37 | ||
38 | int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 | int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 | int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
42 | ||
43 | int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 | int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 | int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
47 | ||
8def0299 FB |
48 | /* external functions, from idct_mmx.c */ |
/* Both take a pointer to a DCTELEM block and return nothing; only the
 * prototypes are given here, the bodies live in the file named above. */
49 | void ff_mmx_idct(DCTELEM *block); | |
50 | void ff_mmxext_idct(DCTELEM *block); | |
4af7bcc1 | 51 | |
de6d9b64 | 52 | /* pixel operations */ |
a7bd8797 MN |
/*
 * 8-byte aligned 64-bit constants meant to be loaded into an MMX register:
 *   mm_bone - 0x01 in each of the eight bytes
 *   mm_wone - 0x0001 in each of the four 16-bit words
 *   mm_wtwo - 0x0002 in each of the four 16-bit words
 * The word constants are what the MOVQ_WONE / MOVQ_WTWO macros load in the
 * non-PIC build; the pixel routines add them before a right shift.
 */
53 | static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
54 | static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
55 | static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
a9b3f630 NK |
56 | //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
57 | //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
de6d9b64 | 58 | |
d6a4c0b1 ZK |
/* JUMPALIGN: emits a ".balign 8" directive as a stand-alone asm statement.
 * MOVQ_ZERO: clears the named MMX register by xor-ing it with itself. */
59 | #define JUMPALIGN() __asm __volatile (".balign 8"::) |
60 | #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) | |
61 | ||
/* Non-PIC build: MOVQ_WONE / MOVQ_WTWO are complete asm statements that load
 * mm_wone / mm_wtwo from memory. MOVQ_BONE is different: it expands to a
 * string fragment (no asm statement around it) that names mm_bone through
 * MANGLE(). */
62 | #ifndef PIC | |
63 | #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) | |
64 | #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) | |
a7bd8797 | 65 | #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
d6a4c0b1 ZK |
66 | #else |
67 | // for shared library it's better to use this way for accessing constants | |
68 | // pcmpeqd -> -1 | |
/* PIC build: the constants are synthesised in the register instead of being
 * loaded. pcmpeqd sets every bit, psrlw $15 leaves 1 in each 16-bit word. */
69 | #define MOVQ_WONE(regd) \ | |
70 | __asm __volatile ( \ | |
71 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
72 | "psrlw $15, %%" #regd ::) | |
73 | ||
/* Same as above, then psllw $1 doubles each word to 2. */
74 | #define MOVQ_WTWO(regd) \ | |
75 | __asm __volatile ( \ | |
76 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
77 | "psrlw $15, %%" #regd " \n\t" \ | |
78 | "psllw $1, %%" #regd ::) | |
a7bd8797 MN |
79 | |
/* String-fragment form: builds 1 in each word, then packuswb of the register
 * with itself narrows the words to bytes, giving 0x01 in every byte. */
80 | #define MOVQ_BONE(regd) \ | |
81 | "pcmpeqd " #regd ", " #regd " \n\t" \ | |
82 | "psrlw $15, " #regd " \n\t"\ | |
83 | "packuswb " #regd ", " #regd " \n\t" | |
d6a4c0b1 ZK |
84 | #endif |
85 | ||
a7bd8797 | 86 | |
de6d9b64 FB |
87 | /***********************************/ |
88 | /* 3Dnow specific */ | |
89 | ||
90 | #define DEF(x) x ## _3dnow | |
91 | /* for Athlons PAVGUSB is preferred */ | |
92 | #define PAVGB "pavgusb" | |
93 | ||
94 | #include "dsputil_mmx_avg.h" | |
95 | ||
96 | #undef DEF | |
97 | #undef PAVGB | |
98 | ||
99 | /***********************************/ | |
100 | /* MMX2 specific */ | |
101 | ||
607dce96 | 102 | #define DEF(x) x ## _mmx2 |
de6d9b64 FB |
103 | |
104 | /* Introduced only in MMX2 set */ | |
105 | #define PAVGB "pavgb" | |
106 | ||
107 | #include "dsputil_mmx_avg.h" | |
108 | ||
109 | #undef DEF | |
110 | #undef PAVGB | |
111 | ||
112 | /***********************************/ | |
113 | /* standard MMX */ | |
114 | ||
115 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
116 | { | |
607dce96 MN |
117 | asm volatile( |
118 | "movl $-128, %%eax \n\t" | |
119 | "pxor %%mm7, %%mm7 \n\t" | |
120 | ".balign 16 \n\t" | |
121 | "1: \n\t" | |
122 | "movq (%0), %%mm0 \n\t" | |
123 | "movq (%0, %2), %%mm2 \n\t" | |
124 | "movq %%mm0, %%mm1 \n\t" | |
125 | "movq %%mm2, %%mm3 \n\t" | |
126 | "punpcklbw %%mm7, %%mm0 \n\t" | |
127 | "punpckhbw %%mm7, %%mm1 \n\t" | |
128 | "punpcklbw %%mm7, %%mm2 \n\t" | |
129 | "punpckhbw %%mm7, %%mm3 \n\t" | |
130 | "movq %%mm0, (%1, %%eax)\n\t" | |
131 | "movq %%mm1, 8(%1, %%eax)\n\t" | |
132 | "movq %%mm2, 16(%1, %%eax)\n\t" | |
133 | "movq %%mm3, 24(%1, %%eax)\n\t" | |
134 | "addl %3, %0 \n\t" | |
135 | "addl $32, %%eax \n\t" | |
136 | "js 1b \n\t" | |
137 | : "+r" (pixels) | |
138 | : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
139 | : "%eax" | |
140 | ); | |
de6d9b64 FB |
141 | } |
142 | ||
9dbcbd92 MN |
143 | static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
144 | { | |
145 | asm volatile( | |
607dce96 | 146 | "pxor %%mm7, %%mm7 \n\t" |
9dbcbd92 | 147 | "movl $-128, %%eax \n\t" |
607dce96 | 148 | ".balign 16 \n\t" |
9dbcbd92 MN |
149 | "1: \n\t" |
150 | "movq (%0), %%mm0 \n\t" | |
151 | "movq (%1), %%mm2 \n\t" | |
152 | "movq %%mm0, %%mm1 \n\t" | |
153 | "movq %%mm2, %%mm3 \n\t" | |
154 | "punpcklbw %%mm7, %%mm0 \n\t" | |
155 | "punpckhbw %%mm7, %%mm1 \n\t" | |
156 | "punpcklbw %%mm7, %%mm2 \n\t" | |
157 | "punpckhbw %%mm7, %%mm3 \n\t" | |
158 | "psubw %%mm2, %%mm0 \n\t" | |
159 | "psubw %%mm3, %%mm1 \n\t" | |
160 | "movq %%mm0, (%2, %%eax)\n\t" | |
161 | "movq %%mm1, 8(%2, %%eax)\n\t" | |
162 | "addl %3, %0 \n\t" | |
163 | "addl %3, %1 \n\t" | |
164 | "addl $16, %%eax \n\t" | |
165 | "jnz 1b \n\t" | |
166 | : "+r" (s1), "+r" (s2) | |
167 | : "r" (block+64), "r" (stride) | |
168 | : "%eax" | |
169 | ); | |
170 | } | |
171 | ||
de6d9b64 FB |
/*
 * Write an 8x8 block of coefficients to the picture as bytes.
 * packuswb narrows each 16-bit coefficient to a byte with unsigned
 * saturation. The work is done in two asm statements of four rows each:
 * 64 bytes of coefficients are read and four 8-byte rows are stored at
 * pix, pix+line_size, pix+2*line_size and pix+3*line_size.
 * The first statement reads 64 bytes and is followed by p += 32, so the
 * code assumes 2-byte DCTELEM.
 */
172 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
173 | { | |
174 | const DCTELEM *p; | |
175 | UINT8 *pix; | |
de6d9b64 FB |
176 | |
177 | /* read the pixels */ | |
178 | p = block; | |
179 | pix = pixels; | |
d6a4c0b1 | 180 | /* unrolled loop */
/* NOTE(review): "8%3", "16%3", ... paste a displacement in front of the text
 * of memory operand %3; this presumably only assembles to the intended
 * address when the compiler prints that operand without a displacement of
 * its own - confirm. */
de6d9b64 | 181 | __asm __volatile(
a822a479 NK |
182 | "movq %3, %%mm0\n\t" |
183 | "movq 8%3, %%mm1\n\t" | |
184 | "movq 16%3, %%mm2\n\t" | |
185 | "movq 24%3, %%mm3\n\t" | |
186 | "movq 32%3, %%mm4\n\t" | |
187 | "movq 40%3, %%mm5\n\t" | |
188 | "movq 48%3, %%mm6\n\t" | |
189 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
190 | "packuswb %%mm1, %%mm0\n\t" |
191 | "packuswb %%mm3, %%mm2\n\t" | |
192 | "packuswb %%mm5, %%mm4\n\t" | |
193 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
194 | "movq %%mm0, (%0)\n\t" |
195 | "movq %%mm2, (%0, %1)\n\t" | |
196 | "movq %%mm4, (%0, %1, 2)\n\t" | |
197 | "movq %%mm6, (%0, %2)\n\t" | |
198 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
199 | :"memory"); |
/* move on to rows 4..7 */
200 | pix += line_size*4; | |
201 | p += 32; | |
d6a4c0b1 ZK |
202 | |
203 | // if here would be an exact copy of the code above | |
204 | // compiler would generate some very strange code | |
205 | // thus using "r" | |
206 | __asm __volatile( | |
207 | "movq (%3), %%mm0\n\t" | |
208 | "movq 8(%3), %%mm1\n\t" | |
209 | "movq 16(%3), %%mm2\n\t" | |
210 | "movq 24(%3), %%mm3\n\t" | |
211 | "movq 32(%3), %%mm4\n\t" | |
212 | "movq 40(%3), %%mm5\n\t" | |
213 | "movq 48(%3), %%mm6\n\t" | |
214 | "movq 56(%3), %%mm7\n\t" | |
215 | "packuswb %%mm1, %%mm0\n\t" | |
216 | "packuswb %%mm3, %%mm2\n\t" | |
217 | "packuswb %%mm5, %%mm4\n\t" | |
218 | "packuswb %%mm7, %%mm6\n\t" | |
219 | "movq %%mm0, (%0)\n\t" | |
220 | "movq %%mm2, (%0, %1)\n\t" | |
221 | "movq %%mm4, (%0, %1, 2)\n\t" | |
222 | "movq %%mm6, (%0, %2)\n\t" | |
223 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
224 | :"memory"); | |
de6d9b64 FB |
225 | } |
226 | ||
/*
 * Add an 8x8 block of coefficients to the picture in place.
 * The loop runs four times (i = 4), two rows per pass: 32 bytes of
 * coefficients are loaded from p, the two pixel rows at pix and
 * pix+line_size are zero-extended to words (mm7 is cleared before the loop),
 * added with signed saturation (paddsw), narrowed back to bytes with
 * unsigned saturation (packuswb) and stored over the same two rows.
 */
227 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
228 | { | |
229 | const DCTELEM *p; | |
230 | UINT8 *pix; | |
231 | int i; | |
232 | ||
233 | /* read the pixels */ | |
234 | p = block; | |
235 | pix = pixels; | |
d6a4c0b1 ZK |
236 | MOVQ_ZERO(mm7); |
237 | i = 4; | |
cd8e5f96 | 238 | do {
de6d9b64 | 239 | __asm __volatile(
cd8e5f96 ZK |
240 | "movq (%2), %%mm0\n\t" |
241 | "movq 8(%2), %%mm1\n\t" | |
242 | "movq 16(%2), %%mm2\n\t" | |
243 | "movq 24(%2), %%mm3\n\t" | |
de6d9b64 FB |
244 | "movq %0, %%mm4\n\t" |
245 | "movq %1, %%mm6\n\t" | |
246 | "movq %%mm4, %%mm5\n\t" | |
247 | "punpcklbw %%mm7, %%mm4\n\t" | |
248 | "punpckhbw %%mm7, %%mm5\n\t" | |
249 | "paddsw %%mm4, %%mm0\n\t" | |
250 | "paddsw %%mm5, %%mm1\n\t" | |
251 | "movq %%mm6, %%mm5\n\t" | |
252 | "punpcklbw %%mm7, %%mm6\n\t" | |
253 | "punpckhbw %%mm7, %%mm5\n\t" | |
254 | "paddsw %%mm6, %%mm2\n\t" | |
255 | "paddsw %%mm5, %%mm3\n\t" | |
256 | "packuswb %%mm1, %%mm0\n\t" | |
257 | "packuswb %%mm3, %%mm2\n\t" | |
258 | "movq %%mm0, %0\n\t" | |
259 | "movq %%mm2, %1\n\t" | |
a822a479 | 260 | :"+m"(*pix), "+m"(*(pix+line_size))
cd8e5f96 | 261 | :"r"(p)
de6d9b64 FB |
262 | :"memory"); |
/* two picture rows and 16 coefficients consumed per pass */
263 | pix += line_size*2; | |
264 | p += 16; | |
cd8e5f96 | 265 | } while (--i);
de6d9b64 FB |
266 | } |
267 | ||
/*
 * Copy h rows of eight bytes from pixels to block; both use line_size as
 * the row stride (a single offset in %eax indexes source and destination).
 * The active (#else) variant copies four rows per pass and subtracts 4 from
 * the row counter until it reaches zero, so h has to be a positive multiple
 * of 4. The disabled (#if 0) variant is the same loop unrolled to eight rows
 * per pass; its FIXME points at the h==4 case.
 */
268 | static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
269 | { | |
607dce96 MN |
270 | #if 0 //FIXME h==4 case |
271 | asm volatile( | |
272 | "xorl %%eax, %%eax \n\t" | |
273 | "movl %3, %%esi \n\t" | |
274 | "1: \n\t" | |
275 | "movq (%1, %%eax), %%mm0 \n\t" | |
276 | "movq %%mm0, (%0, %%eax) \n\t" | |
277 | "addl %2, %%eax \n\t" | |
278 | "movq (%1, %%eax), %%mm0 \n\t" | |
279 | "movq %%mm0, (%0, %%eax) \n\t" | |
280 | "addl %2, %%eax \n\t" | |
281 | "movq (%1, %%eax), %%mm0 \n\t" | |
282 | "movq %%mm0, (%0, %%eax) \n\t" | |
283 | "addl %2, %%eax \n\t" | |
284 | "movq (%1, %%eax), %%mm0 \n\t" | |
285 | "movq %%mm0, (%0, %%eax) \n\t" | |
286 | "addl %2, %%eax \n\t" | |
287 | "movq (%1, %%eax), %%mm0 \n\t" | |
288 | "movq %%mm0, (%0, %%eax) \n\t" | |
289 | "addl %2, %%eax \n\t" | |
290 | "movq (%1, %%eax), %%mm0 \n\t" | |
291 | "movq %%mm0, (%0, %%eax) \n\t" | |
292 | "addl %2, %%eax \n\t" | |
293 | "movq (%1, %%eax), %%mm0 \n\t" | |
294 | "movq %%mm0, (%0, %%eax) \n\t" | |
295 | "addl %2, %%eax \n\t" | |
296 | "movq (%1, %%eax), %%mm0 \n\t" | |
297 | "movq %%mm0, (%0, %%eax) \n\t" | |
298 | "addl %2, %%eax \n\t" | |
299 | "subl $8, %%esi \n\t" | |
300 | " jnz 1b \n\t" | |
301 | :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
302 | : "%eax", "%esi", "memory" | |
303 | ); | |
d6a4c0b1 | 304 | #else
607dce96 MN |
/* %eax = running byte offset of the current row, %esi = rows left */
305 | asm volatile( |
306 | "xorl %%eax, %%eax \n\t" | |
307 | "movl %3, %%esi \n\t" | |
308 | "1: \n\t" | |
309 | "movq (%1, %%eax), %%mm0 \n\t" | |
310 | "movq %%mm0, (%0, %%eax) \n\t" | |
311 | "addl %2, %%eax \n\t" | |
312 | "movq (%1, %%eax), %%mm0 \n\t" | |
313 | "movq %%mm0, (%0, %%eax) \n\t" | |
314 | "addl %2, %%eax \n\t" | |
315 | "movq (%1, %%eax), %%mm0 \n\t" | |
316 | "movq %%mm0, (%0, %%eax) \n\t" | |
317 | "addl %2, %%eax \n\t" | |
318 | "movq (%1, %%eax), %%mm0 \n\t" | |
319 | "movq %%mm0, (%0, %%eax) \n\t" | |
320 | "addl %2, %%eax \n\t" | |
321 | "subl $4, %%esi \n\t" | |
322 | " jnz 1b \n\t" | |
323 | :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
324 | : "%eax", "%esi", "memory" | |
325 | ); | |
d6a4c0b1 | 326 | #endif
de6d9b64 FB |
327 | } |
328 | ||
/*
 * Horizontal half-pel copy with rounding: for each of h rows and each of
 * eight columns, block[x] = (pixels[x] + pixels[x+1] + 1) >> 1.
 * Nine source bytes are read per row. mm7 (zero) and mm4 (word ones) are set
 * by separate asm statements before the loop and are relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" pastes a displacement in front of the text of memory
 * operand %1; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
329 | static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
330 | { | |
331 | UINT8 *p; | |
332 | const UINT8 *pix; | |
333 | p = block; | |
334 | pix = pixels; | |
d6a4c0b1 ZK |
335 | MOVQ_ZERO(mm7); |
336 | MOVQ_WONE(mm4); | |
337 | JUMPALIGN(); | |
de6d9b64 FB |
338 | do { |
339 | __asm __volatile( | |
340 | "movq %1, %%mm0\n\t" | |
341 | "movq 1%1, %%mm1\n\t" | |
342 | "movq %%mm0, %%mm2\n\t" | |
343 | "movq %%mm1, %%mm3\n\t" | |
344 | "punpcklbw %%mm7, %%mm0\n\t" | |
345 | "punpcklbw %%mm7, %%mm1\n\t" | |
346 | "punpckhbw %%mm7, %%mm2\n\t" | |
347 | "punpckhbw %%mm7, %%mm3\n\t" | |
348 | "paddusw %%mm1, %%mm0\n\t" | |
349 | "paddusw %%mm3, %%mm2\n\t" | |
350 | "paddusw %%mm4, %%mm0\n\t" | |
351 | "paddusw %%mm4, %%mm2\n\t" | |
352 | "psrlw $1, %%mm0\n\t" | |
353 | "psrlw $1, %%mm2\n\t" | |
354 | "packuswb %%mm2, %%mm0\n\t" | |
355 | "movq %%mm0, %0\n\t" | |
356 | :"=m"(*p) | |
357 | :"m"(*pix) | |
358 | :"memory"); | |
359 | pix += line_size; p += line_size; | |
360 | } while (--h); | |
de6d9b64 FB |
361 | } |
362 | ||
/*
 * Vertical half-pel copy with rounding: for each of h rows and each of
 * eight columns, block[x] = (pixels[x] + pixels[x+line_size] + 1) >> 1.
 * One row beyond the last output row is read. mm7 (zero) and mm4 (word ones)
 * are set before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
363 | static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
364 | { | |
365 | UINT8 *p; | |
366 | const UINT8 *pix; | |
367 | p = block; | |
368 | pix = pixels; | |
d6a4c0b1 ZK |
369 | MOVQ_ZERO(mm7); |
370 | MOVQ_WONE(mm4); | |
371 | JUMPALIGN(); | |
de6d9b64 FB |
372 | do { |
373 | __asm __volatile( | |
374 | "movq %1, %%mm0\n\t" | |
375 | "movq %2, %%mm1\n\t" | |
376 | "movq %%mm0, %%mm2\n\t" | |
377 | "movq %%mm1, %%mm3\n\t" | |
378 | "punpcklbw %%mm7, %%mm0\n\t" | |
379 | "punpcklbw %%mm7, %%mm1\n\t" | |
380 | "punpckhbw %%mm7, %%mm2\n\t" | |
381 | "punpckhbw %%mm7, %%mm3\n\t" | |
382 | "paddusw %%mm1, %%mm0\n\t" | |
383 | "paddusw %%mm3, %%mm2\n\t" | |
384 | "paddusw %%mm4, %%mm0\n\t" | |
385 | "paddusw %%mm4, %%mm2\n\t" | |
386 | "psrlw $1, %%mm0\n\t" | |
387 | "psrlw $1, %%mm2\n\t" | |
388 | "packuswb %%mm2, %%mm0\n\t" | |
389 | "movq %%mm0, %0\n\t" | |
390 | :"=m"(*p) | |
391 | :"m"(*pix), | |
392 | "m"(*(pix+line_size)) | |
393 | :"memory"); | |
394 | pix += line_size; | |
395 | p += line_size; | |
396 | } while (--h); | |
de6d9b64 FB |
397 | } |
398 | ||
/*
 * Diagonal half-pel copy with rounding: for each of h rows and each of
 * eight columns,
 *   block[x] = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2.
 * mm7 (zero) and mm6 (word twos) are set before the loop and relied on
 * inside it. The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" / "1%2" paste a displacement in front of the text of a
 * memory operand; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
399 | static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
400 | { | |
401 | UINT8 *p; | |
402 | const UINT8 *pix; | |
403 | p = block; | |
d6a4c0b1 ZK |
404 | pix = pixels; // 1s |
405 | MOVQ_ZERO(mm7); | |
406 | MOVQ_WTWO(mm6); | |
407 | JUMPALIGN(); | |
de6d9b64 FB |
408 | do { |
409 | __asm __volatile( | |
410 | "movq %1, %%mm0\n\t" | |
411 | "movq %2, %%mm1\n\t" | |
412 | "movq 1%1, %%mm4\n\t" | |
413 | "movq 1%2, %%mm5\n\t" | |
414 | "movq %%mm0, %%mm2\n\t" | |
415 | "movq %%mm1, %%mm3\n\t" | |
416 | "punpcklbw %%mm7, %%mm0\n\t" | |
417 | "punpcklbw %%mm7, %%mm1\n\t" | |
418 | "punpckhbw %%mm7, %%mm2\n\t" | |
419 | "punpckhbw %%mm7, %%mm3\n\t" | |
420 | "paddusw %%mm1, %%mm0\n\t" | |
421 | "paddusw %%mm3, %%mm2\n\t" | |
422 | "movq %%mm4, %%mm1\n\t" | |
423 | "movq %%mm5, %%mm3\n\t" | |
424 | "punpcklbw %%mm7, %%mm4\n\t" | |
425 | "punpcklbw %%mm7, %%mm5\n\t" | |
426 | "punpckhbw %%mm7, %%mm1\n\t" | |
427 | "punpckhbw %%mm7, %%mm3\n\t" | |
428 | "paddusw %%mm5, %%mm4\n\t" | |
429 | "paddusw %%mm3, %%mm1\n\t" | |
430 | "paddusw %%mm6, %%mm4\n\t" | |
431 | "paddusw %%mm6, %%mm1\n\t" | |
432 | "paddusw %%mm4, %%mm0\n\t" | |
433 | "paddusw %%mm1, %%mm2\n\t" | |
434 | "psrlw $2, %%mm0\n\t" | |
435 | "psrlw $2, %%mm2\n\t" | |
436 | "packuswb %%mm2, %%mm0\n\t" | |
437 | "movq %%mm0, %0\n\t" | |
438 | :"=m"(*p) | |
439 | :"m"(*pix), | |
440 | "m"(*(pix+line_size)) | |
441 | :"memory"); | |
442 | pix += line_size; | |
443 | p += line_size; | |
444 | } while(--h); | |
de6d9b64 FB |
445 | } |
446 | ||
/*
 * Horizontal half-pel copy without the rounding term: for each of h rows and
 * each of eight columns, block[x] = (pixels[x] + pixels[x+1]) >> 1.
 * mm7 is cleared before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" pastes a displacement in front of the text of memory
 * operand %1; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
447 | static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
448 | { | |
449 | UINT8 *p; | |
450 | const UINT8 *pix; | |
451 | p = block; | |
452 | pix = pixels; | |
d6a4c0b1 | 453 | MOVQ_ZERO(mm7);
de6d9b64 FB |
454 | do { |
455 | __asm __volatile( | |
456 | "movq %1, %%mm0\n\t" | |
457 | "movq 1%1, %%mm1\n\t" | |
458 | "movq %%mm0, %%mm2\n\t" | |
459 | "movq %%mm1, %%mm3\n\t" | |
460 | "punpcklbw %%mm7, %%mm0\n\t" | |
461 | "punpcklbw %%mm7, %%mm1\n\t" | |
462 | "punpckhbw %%mm7, %%mm2\n\t" | |
463 | "punpckhbw %%mm7, %%mm3\n\t" | |
464 | "paddusw %%mm1, %%mm0\n\t" | |
465 | "paddusw %%mm3, %%mm2\n\t" | |
466 | "psrlw $1, %%mm0\n\t" | |
467 | "psrlw $1, %%mm2\n\t" | |
468 | "packuswb %%mm2, %%mm0\n\t" | |
469 | "movq %%mm0, %0\n\t" | |
470 | :"=m"(*p) | |
471 | :"m"(*pix) | |
472 | :"memory"); | |
473 | pix += line_size; | |
474 | p += line_size; | |
475 | } while (--h); | |
de6d9b64 FB |
476 | } |
477 | ||
/*
 * Vertical half-pel copy without the rounding term: for each of h rows and
 * each of eight columns, block[x] = (pixels[x] + pixels[x+line_size]) >> 1.
 * mm7 is cleared before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
478 | static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
479 | { | |
480 | UINT8 *p; | |
481 | const UINT8 *pix; | |
482 | p = block; | |
483 | pix = pixels; | |
d6a4c0b1 ZK |
484 | MOVQ_ZERO(mm7); |
485 | JUMPALIGN(); | |
de6d9b64 FB |
486 | do { |
487 | __asm __volatile( | |
488 | "movq %1, %%mm0\n\t" | |
489 | "movq %2, %%mm1\n\t" | |
490 | "movq %%mm0, %%mm2\n\t" | |
491 | "movq %%mm1, %%mm3\n\t" | |
492 | "punpcklbw %%mm7, %%mm0\n\t" | |
493 | "punpcklbw %%mm7, %%mm1\n\t" | |
494 | "punpckhbw %%mm7, %%mm2\n\t" | |
495 | "punpckhbw %%mm7, %%mm3\n\t" | |
496 | "paddusw %%mm1, %%mm0\n\t" | |
497 | "paddusw %%mm3, %%mm2\n\t" | |
498 | "psrlw $1, %%mm0\n\t" | |
499 | "psrlw $1, %%mm2\n\t" | |
500 | "packuswb %%mm2, %%mm0\n\t" | |
501 | "movq %%mm0, %0\n\t" | |
502 | :"=m"(*p) | |
503 | :"m"(*pix), | |
504 | "m"(*(pix+line_size)) | |
505 | :"memory"); | |
506 | pix += line_size; | |
507 | p += line_size; | |
508 | } while(--h); | |
de6d9b64 FB |
509 | } |
510 | ||
/*
 * Diagonal half-pel copy with the smaller bias: for each of h rows and each
 * of eight columns,
 *   block[x] = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2.
 * mm7 (zero) and mm6 (word ones) are set before the loop and relied on
 * inside it. The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" / "1%2" paste a displacement in front of the text of a
 * memory operand; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
511 | static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
512 | { | |
513 | UINT8 *p; | |
514 | const UINT8 *pix; | |
515 | p = block; | |
516 | pix = pixels; | |
d6a4c0b1 ZK |
517 | MOVQ_ZERO(mm7); |
518 | MOVQ_WONE(mm6); | |
519 | JUMPALIGN(); | |
de6d9b64 FB |
520 | do { |
521 | __asm __volatile( | |
522 | "movq %1, %%mm0\n\t" | |
523 | "movq %2, %%mm1\n\t" | |
524 | "movq 1%1, %%mm4\n\t" | |
525 | "movq 1%2, %%mm5\n\t" | |
526 | "movq %%mm0, %%mm2\n\t" | |
527 | "movq %%mm1, %%mm3\n\t" | |
528 | "punpcklbw %%mm7, %%mm0\n\t" | |
529 | "punpcklbw %%mm7, %%mm1\n\t" | |
530 | "punpckhbw %%mm7, %%mm2\n\t" | |
531 | "punpckhbw %%mm7, %%mm3\n\t" | |
532 | "paddusw %%mm1, %%mm0\n\t" | |
533 | "paddusw %%mm3, %%mm2\n\t" | |
534 | "movq %%mm4, %%mm1\n\t" | |
535 | "movq %%mm5, %%mm3\n\t" | |
536 | "punpcklbw %%mm7, %%mm4\n\t" | |
537 | "punpcklbw %%mm7, %%mm5\n\t" | |
538 | "punpckhbw %%mm7, %%mm1\n\t" | |
539 | "punpckhbw %%mm7, %%mm3\n\t" | |
540 | "paddusw %%mm5, %%mm4\n\t" | |
541 | "paddusw %%mm3, %%mm1\n\t" | |
542 | "paddusw %%mm6, %%mm4\n\t" | |
543 | "paddusw %%mm6, %%mm1\n\t" | |
544 | "paddusw %%mm4, %%mm0\n\t" | |
545 | "paddusw %%mm1, %%mm2\n\t" | |
546 | "psrlw $2, %%mm0\n\t" | |
547 | "psrlw $2, %%mm2\n\t" | |
548 | "packuswb %%mm2, %%mm0\n\t" | |
549 | "movq %%mm0, %0\n\t" | |
550 | :"=m"(*p) | |
551 | :"m"(*pix), | |
552 | "m"(*(pix+line_size)) | |
553 | :"memory"); | |
554 | pix += line_size; | |
555 | p += line_size; | |
556 | } while(--h); | |
de6d9b64 FB |
557 | } |
558 | ||
/*
 * Average the source into the destination with rounding: for each of h rows
 * and each of eight columns, block[x] = (block[x] + pixels[x] + 1) >> 1.
 * The destination is both read and written ("+m"). mm7 (zero) and mm6 (word
 * ones) are set before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
559 | static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
560 | { | |
561 | UINT8 *p; | |
562 | const UINT8 *pix; | |
563 | p = block; | |
564 | pix = pixels; | |
d6a4c0b1 ZK |
565 | MOVQ_ZERO(mm7); |
566 | MOVQ_WONE(mm6); | |
567 | JUMPALIGN(); | |
de6d9b64 FB |
568 | do { |
569 | __asm __volatile( | |
570 | "movq %0, %%mm0\n\t" | |
571 | "movq %1, %%mm1\n\t" | |
572 | "movq %%mm0, %%mm2\n\t" | |
573 | "movq %%mm1, %%mm3\n\t" | |
574 | "punpcklbw %%mm7, %%mm0\n\t" | |
575 | "punpcklbw %%mm7, %%mm1\n\t" | |
576 | "punpckhbw %%mm7, %%mm2\n\t" | |
577 | "punpckhbw %%mm7, %%mm3\n\t" | |
578 | "paddusw %%mm1, %%mm0\n\t" | |
579 | "paddusw %%mm3, %%mm2\n\t" | |
580 | "paddusw %%mm6, %%mm0\n\t" | |
581 | "paddusw %%mm6, %%mm2\n\t" | |
582 | "psrlw $1, %%mm0\n\t" | |
583 | "psrlw $1, %%mm2\n\t" | |
584 | "packuswb %%mm2, %%mm0\n\t" | |
585 | "movq %%mm0, %0\n\t" | |
a822a479 | 586 | :"+m"(*p)
de6d9b64 FB |
587 | :"m"(*pix) |
588 | :"memory"); | |
589 | pix += line_size; | |
590 | p += line_size; | |
591 | } | |
592 | while (--h); | |
de6d9b64 FB |
593 | } |
594 | ||
/*
 * Horizontal half-pel average into the destination, rounding at both steps:
 *   t        = (pixels[x] + pixels[x+1] + 1) >> 1
 *   block[x] = (block[x] + t + 1) >> 1
 * for each of h rows and each of eight columns. mm7 (zero) and mm6 (word
 * ones) are set before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" pastes a displacement in front of the text of memory
 * operand %1; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
595 | static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
596 | { | |
597 | UINT8 *p; | |
598 | const UINT8 *pix; | |
599 | p = block; | |
600 | pix = pixels; | |
d6a4c0b1 ZK |
601 | MOVQ_ZERO(mm7); |
602 | MOVQ_WONE(mm6); | |
603 | JUMPALIGN(); | |
de6d9b64 FB |
604 | do { |
605 | __asm __volatile( | |
606 | "movq %1, %%mm1\n\t" | |
607 | "movq %0, %%mm0\n\t" | |
608 | "movq 1%1, %%mm4\n\t" | |
609 | "movq %%mm0, %%mm2\n\t" | |
610 | "movq %%mm1, %%mm3\n\t" | |
611 | "movq %%mm4, %%mm5\n\t" | |
612 | "punpcklbw %%mm7, %%mm1\n\t" | |
613 | "punpckhbw %%mm7, %%mm3\n\t" | |
614 | "punpcklbw %%mm7, %%mm4\n\t" | |
615 | "punpckhbw %%mm7, %%mm5\n\t" | |
616 | "punpcklbw %%mm7, %%mm0\n\t" | |
617 | "punpckhbw %%mm7, %%mm2\n\t" | |
618 | "paddusw %%mm4, %%mm1\n\t" | |
619 | "paddusw %%mm5, %%mm3\n\t" | |
620 | "paddusw %%mm6, %%mm1\n\t" | |
621 | "paddusw %%mm6, %%mm3\n\t" | |
622 | "psrlw $1, %%mm1\n\t" | |
623 | "psrlw $1, %%mm3\n\t" | |
624 | "paddusw %%mm6, %%mm0\n\t" | |
625 | "paddusw %%mm6, %%mm2\n\t" | |
626 | "paddusw %%mm1, %%mm0\n\t" | |
627 | "paddusw %%mm3, %%mm2\n\t" | |
628 | "psrlw $1, %%mm0\n\t" | |
629 | "psrlw $1, %%mm2\n\t" | |
630 | "packuswb %%mm2, %%mm0\n\t" | |
631 | "movq %%mm0, %0\n\t" | |
a822a479 | 632 | :"+m"(*p)
de6d9b64 FB |
633 | :"m"(*pix) |
634 | :"memory"); | |
635 | pix += line_size; | |
636 | p += line_size; | |
637 | } while (--h); | |
de6d9b64 FB |
638 | } |
639 | ||
/*
 * Vertical half-pel average into the destination, rounding at both steps:
 *   t        = (pixels[x] + pixels[x+line_size] + 1) >> 1
 *   block[x] = (block[x] + t + 1) >> 1
 * for each of h rows and each of eight columns. mm7 (zero) and mm6 (word
 * ones) are set before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
640 | static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
641 | { | |
642 | UINT8 *p; | |
643 | const UINT8 *pix; | |
644 | p = block; | |
645 | pix = pixels; | |
d6a4c0b1 ZK |
646 | MOVQ_ZERO(mm7); |
647 | MOVQ_WONE(mm6); | |
648 | JUMPALIGN(); | |
de6d9b64 FB |
649 | do { |
650 | __asm __volatile( | |
651 | "movq %1, %%mm1\n\t" | |
652 | "movq %0, %%mm0\n\t" | |
653 | "movq %2, %%mm4\n\t" | |
654 | "movq %%mm0, %%mm2\n\t" | |
655 | "movq %%mm1, %%mm3\n\t" | |
656 | "movq %%mm4, %%mm5\n\t" | |
657 | "punpcklbw %%mm7, %%mm1\n\t" | |
658 | "punpckhbw %%mm7, %%mm3\n\t" | |
659 | "punpcklbw %%mm7, %%mm4\n\t" | |
660 | "punpckhbw %%mm7, %%mm5\n\t" | |
661 | "punpcklbw %%mm7, %%mm0\n\t" | |
662 | "punpckhbw %%mm7, %%mm2\n\t" | |
663 | "paddusw %%mm4, %%mm1\n\t" | |
664 | "paddusw %%mm5, %%mm3\n\t" | |
665 | "paddusw %%mm6, %%mm1\n\t" | |
666 | "paddusw %%mm6, %%mm3\n\t" | |
667 | "psrlw $1, %%mm1\n\t" | |
668 | "psrlw $1, %%mm3\n\t" | |
669 | "paddusw %%mm6, %%mm0\n\t" | |
670 | "paddusw %%mm6, %%mm2\n\t" | |
671 | "paddusw %%mm1, %%mm0\n\t" | |
672 | "paddusw %%mm3, %%mm2\n\t" | |
673 | "psrlw $1, %%mm0\n\t" | |
674 | "psrlw $1, %%mm2\n\t" | |
675 | "packuswb %%mm2, %%mm0\n\t" | |
676 | "movq %%mm0, %0\n\t" | |
a822a479 | 677 | :"+m"(*p)
de6d9b64 FB |
678 | :"m"(*pix), "m"(*(pix+line_size)) |
679 | :"memory"); | |
680 | pix += line_size; | |
681 | p += line_size ; | |
682 | } while(--h); | |
de6d9b64 FB |
683 | } |
684 | ||
/*
 * Diagonal half-pel average into the destination, rounding at both steps:
 *   t        = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 *   block[x] = (block[x] + t + 1) >> 1
 * for each of h rows and each of eight columns. mm7 (zero) and mm6 (word
 * twos) are set before the loop; the word-ones constant for the second step
 * is reloaded from mm_wone (operand %3) into mm5 on every row.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" / "1%2" paste a displacement in front of the text of a
 * memory operand; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
685 | static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
686 | { | |
687 | UINT8 *p; | |
688 | const UINT8 *pix; | |
689 | p = block; | |
690 | pix = pixels; | |
d6a4c0b1 ZK |
691 | MOVQ_ZERO(mm7); |
692 | // this doesn't seem to be used often - so | |
693 | // the inside usage of mm_wone is not optimized | |
694 | MOVQ_WTWO(mm6); | |
de6d9b64 FB |
695 | do { |
696 | __asm __volatile( | |
697 | "movq %1, %%mm0\n\t" | |
698 | "movq %2, %%mm1\n\t" | |
699 | "movq 1%1, %%mm4\n\t" | |
700 | "movq 1%2, %%mm5\n\t" | |
701 | "movq %%mm0, %%mm2\n\t" | |
702 | "movq %%mm1, %%mm3\n\t" | |
703 | "punpcklbw %%mm7, %%mm0\n\t" | |
704 | "punpcklbw %%mm7, %%mm1\n\t" | |
705 | "punpckhbw %%mm7, %%mm2\n\t" | |
706 | "punpckhbw %%mm7, %%mm3\n\t" | |
707 | "paddusw %%mm1, %%mm0\n\t" | |
708 | "paddusw %%mm3, %%mm2\n\t" | |
709 | "movq %%mm4, %%mm1\n\t" | |
710 | "movq %%mm5, %%mm3\n\t" | |
711 | "punpcklbw %%mm7, %%mm4\n\t" | |
712 | "punpcklbw %%mm7, %%mm5\n\t" | |
713 | "punpckhbw %%mm7, %%mm1\n\t" | |
714 | "punpckhbw %%mm7, %%mm3\n\t" | |
715 | "paddusw %%mm5, %%mm4\n\t" | |
716 | "paddusw %%mm3, %%mm1\n\t" | |
717 | "paddusw %%mm6, %%mm4\n\t" | |
718 | "paddusw %%mm6, %%mm1\n\t" | |
719 | "paddusw %%mm4, %%mm0\n\t" | |
720 | "paddusw %%mm1, %%mm2\n\t" | |
721 | "movq %3, %%mm5\n\t" | |
722 | "psrlw $2, %%mm0\n\t" | |
723 | "movq %0, %%mm1\n\t" | |
724 | "psrlw $2, %%mm2\n\t" | |
725 | "movq %%mm1, %%mm3\n\t" | |
726 | "punpcklbw %%mm7, %%mm1\n\t" | |
727 | "punpckhbw %%mm7, %%mm3\n\t" | |
728 | "paddusw %%mm1, %%mm0\n\t" | |
729 | "paddusw %%mm3, %%mm2\n\t" | |
730 | "paddusw %%mm5, %%mm0\n\t" | |
731 | "paddusw %%mm5, %%mm2\n\t" | |
732 | "psrlw $1, %%mm0\n\t" | |
733 | "psrlw $1, %%mm2\n\t" | |
734 | "packuswb %%mm2, %%mm0\n\t" | |
735 | "movq %%mm0, %0\n\t" | |
a822a479 | 736 | :"+m"(*p)
de6d9b64 | 737 | :"m"(*pix),
a9b3f630 | 738 | "m"(*(pix+line_size)), "m"(mm_wone)
de6d9b64 FB |
739 | :"memory"); |
740 | pix += line_size; | |
741 | p += line_size ; | |
742 | } while(--h); | |
de6d9b64 FB |
743 | } |
744 | ||
/*
 * Average the source into the destination without a rounding term: for each
 * of h rows and each of eight columns,
 *   block[x] = (block[x] + pixels[x]) >> 1.
 * mm7 is cleared before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
745 | static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
746 | { | |
747 | UINT8 *p; | |
748 | const UINT8 *pix; | |
749 | p = block; | |
750 | pix = pixels; | |
d6a4c0b1 | 751 | MOVQ_ZERO(mm7);
de6d9b64 FB |
752 | do { |
753 | __asm __volatile( | |
754 | "movq %1, %%mm0\n\t" | |
755 | "movq %0, %%mm1\n\t" | |
756 | "movq %%mm0, %%mm2\n\t" | |
757 | "movq %%mm1, %%mm3\n\t" | |
758 | "punpcklbw %%mm7, %%mm0\n\t" | |
759 | "punpcklbw %%mm7, %%mm1\n\t" | |
760 | "punpckhbw %%mm7, %%mm2\n\t" | |
761 | "punpckhbw %%mm7, %%mm3\n\t" | |
762 | "paddusw %%mm1, %%mm0\n\t" | |
763 | "paddusw %%mm3, %%mm2\n\t" | |
764 | "psrlw $1, %%mm0\n\t" | |
765 | "psrlw $1, %%mm2\n\t" | |
766 | "packuswb %%mm2, %%mm0\n\t" | |
767 | "movq %%mm0, %0\n\t" | |
a822a479 | 768 | :"+m"(*p)
de6d9b64 FB |
769 | :"m"(*pix) |
770 | :"memory"); | |
771 | pix += line_size; | |
772 | p += line_size ; | |
773 | } while (--h); | |
de6d9b64 FB |
774 | } |
775 | ||
/*
 * Horizontal half-pel average into the destination without rounding terms:
 *   t        = (pixels[x] + pixels[x+1]) >> 1
 *   block[x] = (block[x] + t) >> 1
 * for each of h rows and each of eight columns. mm7 is cleared before the
 * loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" pastes a displacement in front of the text of memory
 * operand %1; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
776 | static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
777 | { | |
778 | UINT8 *p; | |
779 | const UINT8 *pix; | |
780 | p = block; | |
781 | pix = pixels; | |
d6a4c0b1 | 782 | MOVQ_ZERO(mm7);
de6d9b64 FB |
783 | do { |
784 | __asm __volatile( | |
785 | "movq %1, %%mm0\n\t" | |
786 | "movq 1%1, %%mm1\n\t" | |
787 | "movq %0, %%mm4\n\t" | |
788 | "movq %%mm0, %%mm2\n\t" | |
789 | "movq %%mm1, %%mm3\n\t" | |
790 | "movq %%mm4, %%mm5\n\t" | |
791 | "punpcklbw %%mm7, %%mm0\n\t" | |
792 | "punpcklbw %%mm7, %%mm1\n\t" | |
793 | "punpckhbw %%mm7, %%mm2\n\t" | |
794 | "punpckhbw %%mm7, %%mm3\n\t" | |
795 | "punpcklbw %%mm7, %%mm4\n\t" | |
796 | "punpckhbw %%mm7, %%mm5\n\t" | |
797 | "paddusw %%mm1, %%mm0\n\t" | |
798 | "paddusw %%mm3, %%mm2\n\t" | |
799 | "psrlw $1, %%mm0\n\t" | |
800 | "psrlw $1, %%mm2\n\t" | |
801 | "paddusw %%mm4, %%mm0\n\t" | |
802 | "paddusw %%mm5, %%mm2\n\t" | |
803 | "psrlw $1, %%mm0\n\t" | |
804 | "psrlw $1, %%mm2\n\t" | |
805 | "packuswb %%mm2, %%mm0\n\t" | |
806 | "movq %%mm0, %0\n\t" | |
a822a479 | 807 | :"+m"(*p)
de6d9b64 FB |
808 | :"m"(*pix) |
809 | :"memory"); | |
810 | pix += line_size; | |
811 | p += line_size; | |
812 | } while (--h); | |
de6d9b64 FB |
813 | } |
814 | ||
/*
 * Vertical half-pel average into the destination without rounding terms:
 *   t        = (pixels[x] + pixels[x+line_size]) >> 1
 *   block[x] = (block[x] + t) >> 1
 * for each of h rows and each of eight columns. mm7 is cleared before the
 * loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 */
815 | static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
816 | { | |
817 | UINT8 *p; | |
818 | const UINT8 *pix; | |
819 | p = block; | |
820 | pix = pixels; | |
d6a4c0b1 | 821 | MOVQ_ZERO(mm7);
de6d9b64 FB |
822 | do { |
823 | __asm __volatile( | |
824 | "movq %1, %%mm0\n\t" | |
825 | "movq %2, %%mm1\n\t" | |
826 | "movq %0, %%mm4\n\t" | |
827 | "movq %%mm0, %%mm2\n\t" | |
828 | "movq %%mm1, %%mm3\n\t" | |
829 | "movq %%mm4, %%mm5\n\t" | |
830 | "punpcklbw %%mm7, %%mm0\n\t" | |
831 | "punpcklbw %%mm7, %%mm1\n\t" | |
832 | "punpckhbw %%mm7, %%mm2\n\t" | |
833 | "punpckhbw %%mm7, %%mm3\n\t" | |
834 | "punpcklbw %%mm7, %%mm4\n\t" | |
835 | "punpckhbw %%mm7, %%mm5\n\t" | |
836 | "paddusw %%mm1, %%mm0\n\t" | |
837 | "paddusw %%mm3, %%mm2\n\t" | |
838 | "psrlw $1, %%mm0\n\t" | |
839 | "psrlw $1, %%mm2\n\t" | |
840 | "paddusw %%mm4, %%mm0\n\t" | |
841 | "paddusw %%mm5, %%mm2\n\t" | |
842 | "psrlw $1, %%mm0\n\t" | |
843 | "psrlw $1, %%mm2\n\t" | |
844 | "packuswb %%mm2, %%mm0\n\t" | |
845 | "movq %%mm0, %0\n\t" | |
a822a479 | 846 | :"+m"(*p)
de6d9b64 FB |
847 | :"m"(*pix), "m"(*(pix+line_size)) |
848 | :"memory"); | |
849 | pix += line_size; | |
850 | p += line_size ; | |
851 | } while(--h); | |
de6d9b64 FB |
852 | } |
853 | ||
/*
 * Diagonal half-pel average into the destination with the smaller bias:
 *   t        = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2
 *   block[x] = (block[x] + t) >> 1
 * for each of h rows and each of eight columns. mm7 (zero) and mm6 (word
 * ones) are set before the loop and relied on inside it.
 * The loop is do/while(--h), so h must be at least 1.
 * NOTE(review): "1%1" / "1%2" paste a displacement in front of the text of a
 * memory operand; presumably this needs the operand printed without its own
 * displacement - confirm.
 */
854 | static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
855 | { | |
856 | UINT8 *p; | |
857 | const UINT8 *pix; | |
858 | p = block; | |
859 | pix = pixels; | |
d6a4c0b1 ZK |
860 | MOVQ_ZERO(mm7); |
861 | MOVQ_WONE(mm6); | |
862 | JUMPALIGN(); | |
de6d9b64 FB |
863 | do { |
864 | __asm __volatile( | |
865 | "movq %1, %%mm0\n\t" | |
866 | "movq %2, %%mm1\n\t" | |
867 | "movq 1%1, %%mm4\n\t" | |
868 | "movq 1%2, %%mm5\n\t" | |
869 | "movq %%mm0, %%mm2\n\t" | |
870 | "movq %%mm1, %%mm3\n\t" | |
871 | "punpcklbw %%mm7, %%mm0\n\t" | |
872 | "punpcklbw %%mm7, %%mm1\n\t" | |
873 | "punpckhbw %%mm7, %%mm2\n\t" | |
874 | "punpckhbw %%mm7, %%mm3\n\t" | |
875 | "paddusw %%mm1, %%mm0\n\t" | |
876 | "paddusw %%mm3, %%mm2\n\t" | |
877 | "movq %%mm4, %%mm1\n\t" | |
878 | "movq %%mm5, %%mm3\n\t" | |
879 | "punpcklbw %%mm7, %%mm4\n\t" | |
880 | "punpcklbw %%mm7, %%mm5\n\t" | |
881 | "punpckhbw %%mm7, %%mm1\n\t" | |
882 | "punpckhbw %%mm7, %%mm3\n\t" | |
883 | "paddusw %%mm5, %%mm4\n\t" | |
884 | "paddusw %%mm3, %%mm1\n\t" | |
885 | "paddusw %%mm6, %%mm4\n\t" | |
886 | "paddusw %%mm6, %%mm1\n\t" | |
887 | "paddusw %%mm4, %%mm0\n\t" | |
888 | "paddusw %%mm1, %%mm2\n\t" | |
889 | "movq %0, %%mm1\n\t" | |
890 | "psrlw $2, %%mm0\n\t" | |
891 | "movq %%mm1, %%mm3\n\t" | |
892 | "psrlw $2, %%mm2\n\t" | |
893 | "punpcklbw %%mm7, %%mm1\n\t" | |
894 | "punpckhbw %%mm7, %%mm3\n\t" | |
895 | "paddusw %%mm1, %%mm0\n\t" | |
896 | "paddusw %%mm3, %%mm2\n\t" | |
897 | "psrlw $1, %%mm0\n\t" | |
898 | "psrlw $1, %%mm2\n\t" | |
899 | "packuswb %%mm2, %%mm0\n\t" | |
900 | "movq %%mm0, %0\n\t" | |
a822a479 | 901 | :"+m"(*p)
de6d9b64 FB |
902 | :"m"(*pix), |
903 | "m"(*(pix+line_size)) | |
904 | :"memory"); | |
905 | pix += line_size; | |
906 | p += line_size; | |
907 | } while(--h); | |
de6d9b64 FB |
908 | } |
909 | ||
649c00c9 MN |
910 | static void clear_blocks_mmx(DCTELEM *blocks) |
911 | { | |
912 | asm volatile( | |
913 | "pxor %%mm7, %%mm7 \n\t" | |
914 | "movl $-128*6, %%eax \n\t" | |
915 | "1: \n\t" | |
916 | "movq %%mm7, (%0, %%eax) \n\t" | |
917 | "movq %%mm7, 8(%0, %%eax) \n\t" | |
918 | "movq %%mm7, 16(%0, %%eax) \n\t" | |
919 | "movq %%mm7, 24(%0, %%eax) \n\t" | |
920 | "addl $32, %%eax \n\t" | |
921 | " js 1b \n\t" | |
922 | : : "r" (((int)blocks)+128*6) | |
923 | : "%eax" | |
924 | ); | |
925 | } | |
926 | ||
/* Compiled out by the #if 0: a helper whose body is a bare return. */
61a4e8ae | 927 | #if 0
d6a4c0b1 | 928 | static void just_return() { return; }
61a4e8ae | 929 | #endif
d6a4c0b1 | 930 | |
dcb9cd4b | 931 | #ifndef TESTCPU_MAIN |
de6d9b64 FB |
932 | void dsputil_init_mmx(void) |
933 | { | |
934 | mm_flags = mm_support(); | |
f4470e09 MN |
935 | #if 1 |
936 | printf("libavcodec: CPU flags:"); | |
de6d9b64 FB |
937 | if (mm_flags & MM_MMX) |
938 | printf(" mmx"); | |
939 | if (mm_flags & MM_MMXEXT) | |
940 | printf(" mmxext"); | |
941 | if (mm_flags & MM_3DNOW) | |
942 | printf(" 3dnow"); | |
943 | if (mm_flags & MM_SSE) | |
944 | printf(" sse"); | |
945 | if (mm_flags & MM_SSE2) | |
946 | printf(" sse2"); | |
947 | printf("\n"); | |
948 | #endif | |
949 | ||
950 | if (mm_flags & MM_MMX) { | |
951 | get_pixels = get_pixels_mmx; | |
9dbcbd92 | 952 | diff_pixels = diff_pixels_mmx; |
de6d9b64 FB |
953 | put_pixels_clamped = put_pixels_clamped_mmx; |
954 | add_pixels_clamped = add_pixels_clamped_mmx; | |
649c00c9 | 955 | clear_blocks= clear_blocks_mmx; |
dcb9cd4b | 956 | |
ba6802de MN |
957 | pix_abs16x16 = pix_abs16x16_mmx; |
958 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
959 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
de6d9b64 | 960 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
ba6802de MN |
961 | pix_abs8x8 = pix_abs8x8_mmx; |
962 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
963 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
964 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
de6d9b64 FB |
965 | av_fdct = fdct_mmx; |
966 | ||
967 | put_pixels_tab[0] = put_pixels_mmx; | |
968 | put_pixels_tab[1] = put_pixels_x2_mmx; | |
969 | put_pixels_tab[2] = put_pixels_y2_mmx; | |
970 | put_pixels_tab[3] = put_pixels_xy2_mmx; | |
971 | ||
972 | put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
973 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
974 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
975 | put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
dcb9cd4b | 976 | |
de6d9b64 FB |
977 | avg_pixels_tab[0] = avg_pixels_mmx; |
978 | avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
979 | avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
980 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
981 | ||
982 | avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
983 | avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
984 | avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
985 | avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
607dce96 | 986 | |
de6d9b64 | 987 | if (mm_flags & MM_MMXEXT) { |
ba6802de MN |
988 | pix_abs16x16 = pix_abs16x16_mmx2; |
989 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
990 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
991 | pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
dcb9cd4b | 992 | |
ba6802de MN |
993 | pix_abs8x8 = pix_abs8x8_mmx2; |
994 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
995 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
996 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
607dce96 MN |
997 | |
998 | put_pixels_tab[1] = put_pixels_x2_mmx2; | |
999 | put_pixels_tab[2] = put_pixels_y2_mmx2; | |
1000 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
1001 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
dcb9cd4b | 1002 | |
607dce96 MN |
1003 | avg_pixels_tab[0] = avg_pixels_mmx2; |
1004 | avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
1005 | avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
1006 | avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
de6d9b64 FB |
1007 | } else if (mm_flags & MM_3DNOW) { |
1008 | put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1009 | put_pixels_tab[2] = put_pixels_y2_3dnow; | |
607dce96 MN |
1010 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
1011 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
61a4e8ae | 1012 | |
de6d9b64 FB |
1013 | avg_pixels_tab[0] = avg_pixels_3dnow; |
1014 | avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1015 | avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1016 | avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
de6d9b64 | 1017 | } |
4af7bcc1 | 1018 | |
8def0299 FB |
1019 | /* idct */ |
1020 | if (mm_flags & MM_MMXEXT) { | |
1021 | ff_idct = ff_mmxext_idct; | |
1022 | } else { | |
1023 | ff_idct = ff_mmx_idct; | |
1024 | } | |
d962f6fd A |
1025 | #ifdef SIMPLE_IDCT |
1026 | // ff_idct = simple_idct; | |
1027 | ff_idct = simple_idct_mmx; | |
1028 | #endif | |
de6d9b64 | 1029 | } |
d6a4c0b1 ZK |
1030 | |
1031 | #if 0 | |
1032 | // for speed testing | |
1033 | get_pixels = just_return; | |
1034 | put_pixels_clamped = just_return; | |
1035 | add_pixels_clamped = just_return; | |
1036 | ||
1037 | pix_abs16x16 = just_return; | |
1038 | pix_abs16x16_x2 = just_return; | |
1039 | pix_abs16x16_y2 = just_return; | |
1040 | pix_abs16x16_xy2 = just_return; | |
1041 | ||
1042 | put_pixels_tab[0] = just_return; | |
1043 | put_pixels_tab[1] = just_return; | |
1044 | put_pixels_tab[2] = just_return; | |
1045 | put_pixels_tab[3] = just_return; | |
1046 | ||
1047 | put_no_rnd_pixels_tab[0] = just_return; | |
1048 | put_no_rnd_pixels_tab[1] = just_return; | |
1049 | put_no_rnd_pixels_tab[2] = just_return; | |
1050 | put_no_rnd_pixels_tab[3] = just_return; | |
1051 | ||
1052 | avg_pixels_tab[0] = just_return; | |
1053 | avg_pixels_tab[1] = just_return; | |
1054 | avg_pixels_tab[2] = just_return; | |
1055 | avg_pixels_tab[3] = just_return; | |
1056 | ||
1057 | avg_no_rnd_pixels_tab[0] = just_return; | |
1058 | avg_no_rnd_pixels_tab[1] = just_return; | |
1059 | avg_no_rnd_pixels_tab[2] = just_return; | |
1060 | avg_no_rnd_pixels_tab[3] = just_return; | |
1061 | ||
d6a4c0b1 ZK |
1062 | //av_fdct = just_return; |
1063 | //ff_idct = just_return; | |
1064 | #endif | |
de6d9b64 | 1065 | } |
4f12a497 FB |
1066 | |
1067 | /* remove any non bit exact operation (testing purpose). NOTE that | |
1068 | this function should be kept as small as possible because it is | |
1069 | always difficult to test automatically non bit exact cases. */ | |
1070 | void dsputil_set_bit_exact_mmx(void) | |
1071 | { | |
1072 | if (mm_flags & MM_MMX) { | |
1073 | if (mm_flags & MM_MMXEXT) { | |
1074 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1075 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1076 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1077 | } else if (mm_flags & MM_3DNOW) { | |
1078 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1079 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1080 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1081 | } | |
1082 | } | |
1083 | } | |
dcb9cd4b | 1084 | |
dcb9cd4b | 1085 | #endif |