Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
3 | * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, | |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * GNU General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 | * | |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
d962f6fd | 23 | #include "../simple_idct.h" |
de6d9b64 | 24 | |
7d650cb5 FB |
25 | int mm_flags; /* multimedia extension flags */ |
26 | ||
ba6802de MN |
27 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | ||
32 | int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 | int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 | int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 | int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | ||
37 | int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 | int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 | int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 | int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | ||
42 | int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 | int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 | int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 | int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | ||
8def0299 FB |
47 | /* external functions, from idct_mmx.c */ |
48 | void ff_mmx_idct(DCTELEM *block); | |
49 | void ff_mmxext_idct(DCTELEM *block); | |
4af7bcc1 | 50 | |
de6d9b64 | 51 | /* pixel operations */ |
ba6802de MN |
/* Four packed 16-bit words, each holding 1 (mm_wone) or 2 (mm_wtwo).
 * Used as rounding terms by the averaging routines. */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
de6d9b64 | 56 | |
d6a4c0b1 ZK |
/* Emit an 8-byte alignment directive; used in front of the pixel loops. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear MMX register regd by xor-ing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill every 16-bit word of regd with
 * 1 or 2 respectively.
 */
#ifdef PIC
/* For a shared library the constants are built in the register instead of
 * being loaded from memory: pcmpeqd sets all bits (-1), psrlw $15 leaves 1
 * in each word, and psllw $1 doubles that to 2. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)
#else
/* Non-PIC build: load the constants straight from mm_wone / mm_wtwo. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#endif
77 | ||
de6d9b64 FB |
78 | /***********************************/ |
79 | /* 3Dnow specific */ | |
80 | ||
81 | #define DEF(x) x ## _3dnow | |
82 | /* for Athlons PAVGUSB is preferred */ | |
83 | #define PAVGB "pavgusb" | |
84 | ||
85 | #include "dsputil_mmx_avg.h" | |
86 | ||
87 | #undef DEF | |
88 | #undef PAVGB | |
89 | ||
90 | /***********************************/ | |
91 | /* MMX2 specific */ | |
92 | ||
93 | #define DEF(x) x ## _sse | |
94 | ||
95 | /* Introduced only in MMX2 set */ | |
96 | #define PAVGB "pavgb" | |
97 | ||
98 | #include "dsputil_mmx_avg.h" | |
99 | ||
100 | #undef DEF | |
101 | #undef PAVGB | |
102 | ||
103 | /***********************************/ | |
104 | /* standard MMX */ | |
105 | ||
106 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
107 | { | |
108 | DCTELEM *p; | |
109 | const UINT8 *pix; | |
110 | int i; | |
111 | ||
112 | /* read the pixels */ | |
113 | p = block; | |
114 | pix = pixels; | |
d6a4c0b1 | 115 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
116 | for(i=0;i<4;i++) { |
117 | __asm __volatile( | |
118 | "movq %1, %%mm0\n\t" | |
119 | "movq %2, %%mm1\n\t" | |
120 | "movq %%mm0, %%mm2\n\t" | |
121 | "movq %%mm1, %%mm3\n\t" | |
122 | "punpcklbw %%mm7, %%mm0\n\t" | |
123 | "punpckhbw %%mm7, %%mm2\n\t" | |
124 | "punpcklbw %%mm7, %%mm1\n\t" | |
125 | "punpckhbw %%mm7, %%mm3\n\t" | |
126 | "movq %%mm0, %0\n\t" | |
127 | "movq %%mm2, 8%0\n\t" | |
128 | "movq %%mm1, 16%0\n\t" | |
129 | "movq %%mm3, 24%0\n\t" | |
130 | :"=m"(*p) | |
131 | :"m"(*pix), "m"(*(pix+line_size)) | |
132 | :"memory"); | |
133 | pix += line_size*2; | |
134 | p += 16; | |
135 | } | |
de6d9b64 FB |
136 | } |
137 | ||
9dbcbd92 MN |
138 | static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
139 | { | |
140 | asm volatile( | |
141 | ".balign 16 \n\t" | |
142 | "movl $-128, %%eax \n\t" | |
143 | "1: \n\t" | |
144 | "movq (%0), %%mm0 \n\t" | |
145 | "movq (%1), %%mm2 \n\t" | |
146 | "movq %%mm0, %%mm1 \n\t" | |
147 | "movq %%mm2, %%mm3 \n\t" | |
148 | "punpcklbw %%mm7, %%mm0 \n\t" | |
149 | "punpckhbw %%mm7, %%mm1 \n\t" | |
150 | "punpcklbw %%mm7, %%mm2 \n\t" | |
151 | "punpckhbw %%mm7, %%mm3 \n\t" | |
152 | "psubw %%mm2, %%mm0 \n\t" | |
153 | "psubw %%mm3, %%mm1 \n\t" | |
154 | "movq %%mm0, (%2, %%eax)\n\t" | |
155 | "movq %%mm1, 8(%2, %%eax)\n\t" | |
156 | "addl %3, %0 \n\t" | |
157 | "addl %3, %1 \n\t" | |
158 | "addl $16, %%eax \n\t" | |
159 | "jnz 1b \n\t" | |
160 | : "+r" (s1), "+r" (s2) | |
161 | : "r" (block+64), "r" (stride) | |
162 | : "%eax" | |
163 | ); | |
164 | } | |
165 | ||
de6d9b64 FB |
166 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
167 | { | |
168 | const DCTELEM *p; | |
169 | UINT8 *pix; | |
de6d9b64 FB |
170 | |
171 | /* read the pixels */ | |
172 | p = block; | |
173 | pix = pixels; | |
d6a4c0b1 | 174 | /* unrolled loop */ |
de6d9b64 | 175 | __asm __volatile( |
a822a479 NK |
176 | "movq %3, %%mm0\n\t" |
177 | "movq 8%3, %%mm1\n\t" | |
178 | "movq 16%3, %%mm2\n\t" | |
179 | "movq 24%3, %%mm3\n\t" | |
180 | "movq 32%3, %%mm4\n\t" | |
181 | "movq 40%3, %%mm5\n\t" | |
182 | "movq 48%3, %%mm6\n\t" | |
183 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
184 | "packuswb %%mm1, %%mm0\n\t" |
185 | "packuswb %%mm3, %%mm2\n\t" | |
186 | "packuswb %%mm5, %%mm4\n\t" | |
187 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
188 | "movq %%mm0, (%0)\n\t" |
189 | "movq %%mm2, (%0, %1)\n\t" | |
190 | "movq %%mm4, (%0, %1, 2)\n\t" | |
191 | "movq %%mm6, (%0, %2)\n\t" | |
192 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
193 | :"memory"); |
194 | pix += line_size*4; | |
195 | p += 32; | |
d6a4c0b1 ZK |
196 | |
197 | // if here would be an exact copy of the code above | |
198 | // compiler would generate some very strange code | |
199 | // thus using "r" | |
200 | __asm __volatile( | |
201 | "movq (%3), %%mm0\n\t" | |
202 | "movq 8(%3), %%mm1\n\t" | |
203 | "movq 16(%3), %%mm2\n\t" | |
204 | "movq 24(%3), %%mm3\n\t" | |
205 | "movq 32(%3), %%mm4\n\t" | |
206 | "movq 40(%3), %%mm5\n\t" | |
207 | "movq 48(%3), %%mm6\n\t" | |
208 | "movq 56(%3), %%mm7\n\t" | |
209 | "packuswb %%mm1, %%mm0\n\t" | |
210 | "packuswb %%mm3, %%mm2\n\t" | |
211 | "packuswb %%mm5, %%mm4\n\t" | |
212 | "packuswb %%mm7, %%mm6\n\t" | |
213 | "movq %%mm0, (%0)\n\t" | |
214 | "movq %%mm2, (%0, %1)\n\t" | |
215 | "movq %%mm4, (%0, %1, 2)\n\t" | |
216 | "movq %%mm6, (%0, %2)\n\t" | |
217 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
218 | :"memory"); | |
de6d9b64 FB |
219 | } |
220 | ||
221 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
222 | { | |
223 | const DCTELEM *p; | |
224 | UINT8 *pix; | |
225 | int i; | |
226 | ||
227 | /* read the pixels */ | |
228 | p = block; | |
229 | pix = pixels; | |
d6a4c0b1 ZK |
230 | MOVQ_ZERO(mm7); |
231 | i = 4; | |
cd8e5f96 | 232 | do { |
de6d9b64 | 233 | __asm __volatile( |
cd8e5f96 ZK |
234 | "movq (%2), %%mm0\n\t" |
235 | "movq 8(%2), %%mm1\n\t" | |
236 | "movq 16(%2), %%mm2\n\t" | |
237 | "movq 24(%2), %%mm3\n\t" | |
de6d9b64 FB |
238 | "movq %0, %%mm4\n\t" |
239 | "movq %1, %%mm6\n\t" | |
240 | "movq %%mm4, %%mm5\n\t" | |
241 | "punpcklbw %%mm7, %%mm4\n\t" | |
242 | "punpckhbw %%mm7, %%mm5\n\t" | |
243 | "paddsw %%mm4, %%mm0\n\t" | |
244 | "paddsw %%mm5, %%mm1\n\t" | |
245 | "movq %%mm6, %%mm5\n\t" | |
246 | "punpcklbw %%mm7, %%mm6\n\t" | |
247 | "punpckhbw %%mm7, %%mm5\n\t" | |
248 | "paddsw %%mm6, %%mm2\n\t" | |
249 | "paddsw %%mm5, %%mm3\n\t" | |
250 | "packuswb %%mm1, %%mm0\n\t" | |
251 | "packuswb %%mm3, %%mm2\n\t" | |
252 | "movq %%mm0, %0\n\t" | |
253 | "movq %%mm2, %1\n\t" | |
a822a479 | 254 | :"+m"(*pix), "+m"(*(pix+line_size)) |
cd8e5f96 | 255 | :"r"(p) |
de6d9b64 FB |
256 | :"memory"); |
257 | pix += line_size*2; | |
258 | p += 16; | |
cd8e5f96 | 259 | } while (--i); |
de6d9b64 FB |
260 | } |
261 | ||
262 | static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
263 | { | |
d6a4c0b1 | 264 | int hh; |
de6d9b64 FB |
265 | UINT8 *p; |
266 | const UINT8 *pix; | |
d6a4c0b1 | 267 | |
de6d9b64 | 268 | p = block; |
d6a4c0b1 ZK |
269 | pix = pixels; // 2s |
270 | #if 0 | |
271 | do { | |
272 | __asm __volatile( | |
273 | "movq %1, %%mm0\n\t" | |
274 | "movq %%mm0, %0\n\t" | |
275 | :"=m"(*p) | |
276 | :"m"(*pix) | |
277 | :"memory"); | |
278 | pix += line_size; | |
279 | p += line_size; | |
280 | } while (--h); | |
281 | #else | |
282 | // this optimized code is not very usefull | |
283 | // the above loop is definitely faster | |
284 | // at least on Celeron 500MHz | |
285 | hh = h & 3; | |
286 | while (hh) { | |
287 | __asm __volatile( | |
288 | "movq %1, %%mm0\n\t" | |
289 | "movq %%mm0, %0\n\t" | |
290 | :"=m"(*p) | |
291 | :"m"(*pix) | |
292 | :"memory"); | |
293 | pix += line_size; | |
294 | p += line_size; | |
295 | hh--; | |
296 | } | |
de6d9b64 | 297 | hh=h>>2; |
d6a4c0b1 | 298 | while (hh) { |
de6d9b64 | 299 | __asm __volatile( |
a822a479 NK |
300 | "movq (%1), %%mm0 \n\t" |
301 | "movq (%1, %2), %%mm1 \n\t" | |
302 | "movq (%1, %2, 2), %%mm2 \n\t" | |
303 | "movq (%1, %3), %%mm3 \n\t" | |
304 | "movq %%mm0, (%0) \n\t" | |
305 | "movq %%mm1, (%0, %2) \n\t" | |
306 | "movq %%mm2, (%0, %2, 2) \n\t" | |
307 | "movq %%mm3, (%0, %3) \n\t" | |
308 | ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) | |
de6d9b64 | 309 | :"memory"); |
d6a4c0b1 ZK |
310 | pix += line_size*4; |
311 | p += line_size*4; | |
312 | hh--; | |
de6d9b64 | 313 | } |
d6a4c0b1 | 314 | #endif |
de6d9b64 FB |
315 | } |
316 | ||
317 | static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
318 | { | |
319 | UINT8 *p; | |
320 | const UINT8 *pix; | |
321 | p = block; | |
322 | pix = pixels; | |
d6a4c0b1 ZK |
323 | MOVQ_ZERO(mm7); |
324 | MOVQ_WONE(mm4); | |
325 | JUMPALIGN(); | |
de6d9b64 FB |
326 | do { |
327 | __asm __volatile( | |
328 | "movq %1, %%mm0\n\t" | |
329 | "movq 1%1, %%mm1\n\t" | |
330 | "movq %%mm0, %%mm2\n\t" | |
331 | "movq %%mm1, %%mm3\n\t" | |
332 | "punpcklbw %%mm7, %%mm0\n\t" | |
333 | "punpcklbw %%mm7, %%mm1\n\t" | |
334 | "punpckhbw %%mm7, %%mm2\n\t" | |
335 | "punpckhbw %%mm7, %%mm3\n\t" | |
336 | "paddusw %%mm1, %%mm0\n\t" | |
337 | "paddusw %%mm3, %%mm2\n\t" | |
338 | "paddusw %%mm4, %%mm0\n\t" | |
339 | "paddusw %%mm4, %%mm2\n\t" | |
340 | "psrlw $1, %%mm0\n\t" | |
341 | "psrlw $1, %%mm2\n\t" | |
342 | "packuswb %%mm2, %%mm0\n\t" | |
343 | "movq %%mm0, %0\n\t" | |
344 | :"=m"(*p) | |
345 | :"m"(*pix) | |
346 | :"memory"); | |
347 | pix += line_size; p += line_size; | |
348 | } while (--h); | |
de6d9b64 FB |
349 | } |
350 | ||
351 | static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
352 | { | |
353 | UINT8 *p; | |
354 | const UINT8 *pix; | |
355 | p = block; | |
356 | pix = pixels; | |
d6a4c0b1 ZK |
357 | MOVQ_ZERO(mm7); |
358 | MOVQ_WONE(mm4); | |
359 | JUMPALIGN(); | |
de6d9b64 FB |
360 | do { |
361 | __asm __volatile( | |
362 | "movq %1, %%mm0\n\t" | |
363 | "movq %2, %%mm1\n\t" | |
364 | "movq %%mm0, %%mm2\n\t" | |
365 | "movq %%mm1, %%mm3\n\t" | |
366 | "punpcklbw %%mm7, %%mm0\n\t" | |
367 | "punpcklbw %%mm7, %%mm1\n\t" | |
368 | "punpckhbw %%mm7, %%mm2\n\t" | |
369 | "punpckhbw %%mm7, %%mm3\n\t" | |
370 | "paddusw %%mm1, %%mm0\n\t" | |
371 | "paddusw %%mm3, %%mm2\n\t" | |
372 | "paddusw %%mm4, %%mm0\n\t" | |
373 | "paddusw %%mm4, %%mm2\n\t" | |
374 | "psrlw $1, %%mm0\n\t" | |
375 | "psrlw $1, %%mm2\n\t" | |
376 | "packuswb %%mm2, %%mm0\n\t" | |
377 | "movq %%mm0, %0\n\t" | |
378 | :"=m"(*p) | |
379 | :"m"(*pix), | |
380 | "m"(*(pix+line_size)) | |
381 | :"memory"); | |
382 | pix += line_size; | |
383 | p += line_size; | |
384 | } while (--h); | |
de6d9b64 FB |
385 | } |
386 | ||
387 | static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
388 | { | |
389 | UINT8 *p; | |
390 | const UINT8 *pix; | |
391 | p = block; | |
d6a4c0b1 ZK |
392 | pix = pixels; // 1s |
393 | MOVQ_ZERO(mm7); | |
394 | MOVQ_WTWO(mm6); | |
395 | JUMPALIGN(); | |
de6d9b64 FB |
396 | do { |
397 | __asm __volatile( | |
398 | "movq %1, %%mm0\n\t" | |
399 | "movq %2, %%mm1\n\t" | |
400 | "movq 1%1, %%mm4\n\t" | |
401 | "movq 1%2, %%mm5\n\t" | |
402 | "movq %%mm0, %%mm2\n\t" | |
403 | "movq %%mm1, %%mm3\n\t" | |
404 | "punpcklbw %%mm7, %%mm0\n\t" | |
405 | "punpcklbw %%mm7, %%mm1\n\t" | |
406 | "punpckhbw %%mm7, %%mm2\n\t" | |
407 | "punpckhbw %%mm7, %%mm3\n\t" | |
408 | "paddusw %%mm1, %%mm0\n\t" | |
409 | "paddusw %%mm3, %%mm2\n\t" | |
410 | "movq %%mm4, %%mm1\n\t" | |
411 | "movq %%mm5, %%mm3\n\t" | |
412 | "punpcklbw %%mm7, %%mm4\n\t" | |
413 | "punpcklbw %%mm7, %%mm5\n\t" | |
414 | "punpckhbw %%mm7, %%mm1\n\t" | |
415 | "punpckhbw %%mm7, %%mm3\n\t" | |
416 | "paddusw %%mm5, %%mm4\n\t" | |
417 | "paddusw %%mm3, %%mm1\n\t" | |
418 | "paddusw %%mm6, %%mm4\n\t" | |
419 | "paddusw %%mm6, %%mm1\n\t" | |
420 | "paddusw %%mm4, %%mm0\n\t" | |
421 | "paddusw %%mm1, %%mm2\n\t" | |
422 | "psrlw $2, %%mm0\n\t" | |
423 | "psrlw $2, %%mm2\n\t" | |
424 | "packuswb %%mm2, %%mm0\n\t" | |
425 | "movq %%mm0, %0\n\t" | |
426 | :"=m"(*p) | |
427 | :"m"(*pix), | |
428 | "m"(*(pix+line_size)) | |
429 | :"memory"); | |
430 | pix += line_size; | |
431 | p += line_size; | |
432 | } while(--h); | |
de6d9b64 FB |
433 | } |
434 | ||
435 | static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
436 | { | |
437 | UINT8 *p; | |
438 | const UINT8 *pix; | |
439 | p = block; | |
440 | pix = pixels; | |
d6a4c0b1 | 441 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
442 | do { |
443 | __asm __volatile( | |
444 | "movq %1, %%mm0\n\t" | |
445 | "movq 1%1, %%mm1\n\t" | |
446 | "movq %%mm0, %%mm2\n\t" | |
447 | "movq %%mm1, %%mm3\n\t" | |
448 | "punpcklbw %%mm7, %%mm0\n\t" | |
449 | "punpcklbw %%mm7, %%mm1\n\t" | |
450 | "punpckhbw %%mm7, %%mm2\n\t" | |
451 | "punpckhbw %%mm7, %%mm3\n\t" | |
452 | "paddusw %%mm1, %%mm0\n\t" | |
453 | "paddusw %%mm3, %%mm2\n\t" | |
454 | "psrlw $1, %%mm0\n\t" | |
455 | "psrlw $1, %%mm2\n\t" | |
456 | "packuswb %%mm2, %%mm0\n\t" | |
457 | "movq %%mm0, %0\n\t" | |
458 | :"=m"(*p) | |
459 | :"m"(*pix) | |
460 | :"memory"); | |
461 | pix += line_size; | |
462 | p += line_size; | |
463 | } while (--h); | |
de6d9b64 FB |
464 | } |
465 | ||
466 | static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
467 | { | |
468 | UINT8 *p; | |
469 | const UINT8 *pix; | |
470 | p = block; | |
471 | pix = pixels; | |
d6a4c0b1 ZK |
472 | MOVQ_ZERO(mm7); |
473 | JUMPALIGN(); | |
de6d9b64 FB |
474 | do { |
475 | __asm __volatile( | |
476 | "movq %1, %%mm0\n\t" | |
477 | "movq %2, %%mm1\n\t" | |
478 | "movq %%mm0, %%mm2\n\t" | |
479 | "movq %%mm1, %%mm3\n\t" | |
480 | "punpcklbw %%mm7, %%mm0\n\t" | |
481 | "punpcklbw %%mm7, %%mm1\n\t" | |
482 | "punpckhbw %%mm7, %%mm2\n\t" | |
483 | "punpckhbw %%mm7, %%mm3\n\t" | |
484 | "paddusw %%mm1, %%mm0\n\t" | |
485 | "paddusw %%mm3, %%mm2\n\t" | |
486 | "psrlw $1, %%mm0\n\t" | |
487 | "psrlw $1, %%mm2\n\t" | |
488 | "packuswb %%mm2, %%mm0\n\t" | |
489 | "movq %%mm0, %0\n\t" | |
490 | :"=m"(*p) | |
491 | :"m"(*pix), | |
492 | "m"(*(pix+line_size)) | |
493 | :"memory"); | |
494 | pix += line_size; | |
495 | p += line_size; | |
496 | } while(--h); | |
de6d9b64 FB |
497 | } |
498 | ||
499 | static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
500 | { | |
501 | UINT8 *p; | |
502 | const UINT8 *pix; | |
503 | p = block; | |
504 | pix = pixels; | |
d6a4c0b1 ZK |
505 | MOVQ_ZERO(mm7); |
506 | MOVQ_WONE(mm6); | |
507 | JUMPALIGN(); | |
de6d9b64 FB |
508 | do { |
509 | __asm __volatile( | |
510 | "movq %1, %%mm0\n\t" | |
511 | "movq %2, %%mm1\n\t" | |
512 | "movq 1%1, %%mm4\n\t" | |
513 | "movq 1%2, %%mm5\n\t" | |
514 | "movq %%mm0, %%mm2\n\t" | |
515 | "movq %%mm1, %%mm3\n\t" | |
516 | "punpcklbw %%mm7, %%mm0\n\t" | |
517 | "punpcklbw %%mm7, %%mm1\n\t" | |
518 | "punpckhbw %%mm7, %%mm2\n\t" | |
519 | "punpckhbw %%mm7, %%mm3\n\t" | |
520 | "paddusw %%mm1, %%mm0\n\t" | |
521 | "paddusw %%mm3, %%mm2\n\t" | |
522 | "movq %%mm4, %%mm1\n\t" | |
523 | "movq %%mm5, %%mm3\n\t" | |
524 | "punpcklbw %%mm7, %%mm4\n\t" | |
525 | "punpcklbw %%mm7, %%mm5\n\t" | |
526 | "punpckhbw %%mm7, %%mm1\n\t" | |
527 | "punpckhbw %%mm7, %%mm3\n\t" | |
528 | "paddusw %%mm5, %%mm4\n\t" | |
529 | "paddusw %%mm3, %%mm1\n\t" | |
530 | "paddusw %%mm6, %%mm4\n\t" | |
531 | "paddusw %%mm6, %%mm1\n\t" | |
532 | "paddusw %%mm4, %%mm0\n\t" | |
533 | "paddusw %%mm1, %%mm2\n\t" | |
534 | "psrlw $2, %%mm0\n\t" | |
535 | "psrlw $2, %%mm2\n\t" | |
536 | "packuswb %%mm2, %%mm0\n\t" | |
537 | "movq %%mm0, %0\n\t" | |
538 | :"=m"(*p) | |
539 | :"m"(*pix), | |
540 | "m"(*(pix+line_size)) | |
541 | :"memory"); | |
542 | pix += line_size; | |
543 | p += line_size; | |
544 | } while(--h); | |
de6d9b64 FB |
545 | } |
546 | ||
547 | static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
548 | { | |
549 | UINT8 *p; | |
550 | const UINT8 *pix; | |
551 | p = block; | |
552 | pix = pixels; | |
d6a4c0b1 ZK |
553 | MOVQ_ZERO(mm7); |
554 | MOVQ_WONE(mm6); | |
555 | JUMPALIGN(); | |
de6d9b64 FB |
556 | do { |
557 | __asm __volatile( | |
558 | "movq %0, %%mm0\n\t" | |
559 | "movq %1, %%mm1\n\t" | |
560 | "movq %%mm0, %%mm2\n\t" | |
561 | "movq %%mm1, %%mm3\n\t" | |
562 | "punpcklbw %%mm7, %%mm0\n\t" | |
563 | "punpcklbw %%mm7, %%mm1\n\t" | |
564 | "punpckhbw %%mm7, %%mm2\n\t" | |
565 | "punpckhbw %%mm7, %%mm3\n\t" | |
566 | "paddusw %%mm1, %%mm0\n\t" | |
567 | "paddusw %%mm3, %%mm2\n\t" | |
568 | "paddusw %%mm6, %%mm0\n\t" | |
569 | "paddusw %%mm6, %%mm2\n\t" | |
570 | "psrlw $1, %%mm0\n\t" | |
571 | "psrlw $1, %%mm2\n\t" | |
572 | "packuswb %%mm2, %%mm0\n\t" | |
573 | "movq %%mm0, %0\n\t" | |
a822a479 | 574 | :"+m"(*p) |
de6d9b64 FB |
575 | :"m"(*pix) |
576 | :"memory"); | |
577 | pix += line_size; | |
578 | p += line_size; | |
579 | } | |
580 | while (--h); | |
de6d9b64 FB |
581 | } |
582 | ||
583 | static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
584 | { | |
585 | UINT8 *p; | |
586 | const UINT8 *pix; | |
587 | p = block; | |
588 | pix = pixels; | |
d6a4c0b1 ZK |
589 | MOVQ_ZERO(mm7); |
590 | MOVQ_WONE(mm6); | |
591 | JUMPALIGN(); | |
de6d9b64 FB |
592 | do { |
593 | __asm __volatile( | |
594 | "movq %1, %%mm1\n\t" | |
595 | "movq %0, %%mm0\n\t" | |
596 | "movq 1%1, %%mm4\n\t" | |
597 | "movq %%mm0, %%mm2\n\t" | |
598 | "movq %%mm1, %%mm3\n\t" | |
599 | "movq %%mm4, %%mm5\n\t" | |
600 | "punpcklbw %%mm7, %%mm1\n\t" | |
601 | "punpckhbw %%mm7, %%mm3\n\t" | |
602 | "punpcklbw %%mm7, %%mm4\n\t" | |
603 | "punpckhbw %%mm7, %%mm5\n\t" | |
604 | "punpcklbw %%mm7, %%mm0\n\t" | |
605 | "punpckhbw %%mm7, %%mm2\n\t" | |
606 | "paddusw %%mm4, %%mm1\n\t" | |
607 | "paddusw %%mm5, %%mm3\n\t" | |
608 | "paddusw %%mm6, %%mm1\n\t" | |
609 | "paddusw %%mm6, %%mm3\n\t" | |
610 | "psrlw $1, %%mm1\n\t" | |
611 | "psrlw $1, %%mm3\n\t" | |
612 | "paddusw %%mm6, %%mm0\n\t" | |
613 | "paddusw %%mm6, %%mm2\n\t" | |
614 | "paddusw %%mm1, %%mm0\n\t" | |
615 | "paddusw %%mm3, %%mm2\n\t" | |
616 | "psrlw $1, %%mm0\n\t" | |
617 | "psrlw $1, %%mm2\n\t" | |
618 | "packuswb %%mm2, %%mm0\n\t" | |
619 | "movq %%mm0, %0\n\t" | |
a822a479 | 620 | :"+m"(*p) |
de6d9b64 FB |
621 | :"m"(*pix) |
622 | :"memory"); | |
623 | pix += line_size; | |
624 | p += line_size; | |
625 | } while (--h); | |
de6d9b64 FB |
626 | } |
627 | ||
628 | static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
629 | { | |
630 | UINT8 *p; | |
631 | const UINT8 *pix; | |
632 | p = block; | |
633 | pix = pixels; | |
d6a4c0b1 ZK |
634 | MOVQ_ZERO(mm7); |
635 | MOVQ_WONE(mm6); | |
636 | JUMPALIGN(); | |
de6d9b64 FB |
637 | do { |
638 | __asm __volatile( | |
639 | "movq %1, %%mm1\n\t" | |
640 | "movq %0, %%mm0\n\t" | |
641 | "movq %2, %%mm4\n\t" | |
642 | "movq %%mm0, %%mm2\n\t" | |
643 | "movq %%mm1, %%mm3\n\t" | |
644 | "movq %%mm4, %%mm5\n\t" | |
645 | "punpcklbw %%mm7, %%mm1\n\t" | |
646 | "punpckhbw %%mm7, %%mm3\n\t" | |
647 | "punpcklbw %%mm7, %%mm4\n\t" | |
648 | "punpckhbw %%mm7, %%mm5\n\t" | |
649 | "punpcklbw %%mm7, %%mm0\n\t" | |
650 | "punpckhbw %%mm7, %%mm2\n\t" | |
651 | "paddusw %%mm4, %%mm1\n\t" | |
652 | "paddusw %%mm5, %%mm3\n\t" | |
653 | "paddusw %%mm6, %%mm1\n\t" | |
654 | "paddusw %%mm6, %%mm3\n\t" | |
655 | "psrlw $1, %%mm1\n\t" | |
656 | "psrlw $1, %%mm3\n\t" | |
657 | "paddusw %%mm6, %%mm0\n\t" | |
658 | "paddusw %%mm6, %%mm2\n\t" | |
659 | "paddusw %%mm1, %%mm0\n\t" | |
660 | "paddusw %%mm3, %%mm2\n\t" | |
661 | "psrlw $1, %%mm0\n\t" | |
662 | "psrlw $1, %%mm2\n\t" | |
663 | "packuswb %%mm2, %%mm0\n\t" | |
664 | "movq %%mm0, %0\n\t" | |
a822a479 | 665 | :"+m"(*p) |
de6d9b64 FB |
666 | :"m"(*pix), "m"(*(pix+line_size)) |
667 | :"memory"); | |
668 | pix += line_size; | |
669 | p += line_size ; | |
670 | } while(--h); | |
de6d9b64 FB |
671 | } |
672 | ||
673 | static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
674 | { | |
675 | UINT8 *p; | |
676 | const UINT8 *pix; | |
677 | p = block; | |
678 | pix = pixels; | |
d6a4c0b1 ZK |
679 | MOVQ_ZERO(mm7); |
680 | // this doesn't seem to be used offten - so | |
681 | // the inside usage of mm_wone is not optimized | |
682 | MOVQ_WTWO(mm6); | |
de6d9b64 FB |
683 | do { |
684 | __asm __volatile( | |
685 | "movq %1, %%mm0\n\t" | |
686 | "movq %2, %%mm1\n\t" | |
687 | "movq 1%1, %%mm4\n\t" | |
688 | "movq 1%2, %%mm5\n\t" | |
689 | "movq %%mm0, %%mm2\n\t" | |
690 | "movq %%mm1, %%mm3\n\t" | |
691 | "punpcklbw %%mm7, %%mm0\n\t" | |
692 | "punpcklbw %%mm7, %%mm1\n\t" | |
693 | "punpckhbw %%mm7, %%mm2\n\t" | |
694 | "punpckhbw %%mm7, %%mm3\n\t" | |
695 | "paddusw %%mm1, %%mm0\n\t" | |
696 | "paddusw %%mm3, %%mm2\n\t" | |
697 | "movq %%mm4, %%mm1\n\t" | |
698 | "movq %%mm5, %%mm3\n\t" | |
699 | "punpcklbw %%mm7, %%mm4\n\t" | |
700 | "punpcklbw %%mm7, %%mm5\n\t" | |
701 | "punpckhbw %%mm7, %%mm1\n\t" | |
702 | "punpckhbw %%mm7, %%mm3\n\t" | |
703 | "paddusw %%mm5, %%mm4\n\t" | |
704 | "paddusw %%mm3, %%mm1\n\t" | |
705 | "paddusw %%mm6, %%mm4\n\t" | |
706 | "paddusw %%mm6, %%mm1\n\t" | |
707 | "paddusw %%mm4, %%mm0\n\t" | |
708 | "paddusw %%mm1, %%mm2\n\t" | |
709 | "movq %3, %%mm5\n\t" | |
710 | "psrlw $2, %%mm0\n\t" | |
711 | "movq %0, %%mm1\n\t" | |
712 | "psrlw $2, %%mm2\n\t" | |
713 | "movq %%mm1, %%mm3\n\t" | |
714 | "punpcklbw %%mm7, %%mm1\n\t" | |
715 | "punpckhbw %%mm7, %%mm3\n\t" | |
716 | "paddusw %%mm1, %%mm0\n\t" | |
717 | "paddusw %%mm3, %%mm2\n\t" | |
718 | "paddusw %%mm5, %%mm0\n\t" | |
719 | "paddusw %%mm5, %%mm2\n\t" | |
720 | "psrlw $1, %%mm0\n\t" | |
721 | "psrlw $1, %%mm2\n\t" | |
722 | "packuswb %%mm2, %%mm0\n\t" | |
723 | "movq %%mm0, %0\n\t" | |
a822a479 | 724 | :"+m"(*p) |
de6d9b64 | 725 | :"m"(*pix), |
a9b3f630 | 726 | "m"(*(pix+line_size)), "m"(mm_wone) |
de6d9b64 FB |
727 | :"memory"); |
728 | pix += line_size; | |
729 | p += line_size ; | |
730 | } while(--h); | |
de6d9b64 FB |
731 | } |
732 | ||
733 | static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
734 | { | |
735 | UINT8 *p; | |
736 | const UINT8 *pix; | |
737 | p = block; | |
738 | pix = pixels; | |
d6a4c0b1 | 739 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
740 | do { |
741 | __asm __volatile( | |
742 | "movq %1, %%mm0\n\t" | |
743 | "movq %0, %%mm1\n\t" | |
744 | "movq %%mm0, %%mm2\n\t" | |
745 | "movq %%mm1, %%mm3\n\t" | |
746 | "punpcklbw %%mm7, %%mm0\n\t" | |
747 | "punpcklbw %%mm7, %%mm1\n\t" | |
748 | "punpckhbw %%mm7, %%mm2\n\t" | |
749 | "punpckhbw %%mm7, %%mm3\n\t" | |
750 | "paddusw %%mm1, %%mm0\n\t" | |
751 | "paddusw %%mm3, %%mm2\n\t" | |
752 | "psrlw $1, %%mm0\n\t" | |
753 | "psrlw $1, %%mm2\n\t" | |
754 | "packuswb %%mm2, %%mm0\n\t" | |
755 | "movq %%mm0, %0\n\t" | |
a822a479 | 756 | :"+m"(*p) |
de6d9b64 FB |
757 | :"m"(*pix) |
758 | :"memory"); | |
759 | pix += line_size; | |
760 | p += line_size ; | |
761 | } while (--h); | |
de6d9b64 FB |
762 | } |
763 | ||
764 | static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
765 | { | |
766 | UINT8 *p; | |
767 | const UINT8 *pix; | |
768 | p = block; | |
769 | pix = pixels; | |
d6a4c0b1 | 770 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
771 | do { |
772 | __asm __volatile( | |
773 | "movq %1, %%mm0\n\t" | |
774 | "movq 1%1, %%mm1\n\t" | |
775 | "movq %0, %%mm4\n\t" | |
776 | "movq %%mm0, %%mm2\n\t" | |
777 | "movq %%mm1, %%mm3\n\t" | |
778 | "movq %%mm4, %%mm5\n\t" | |
779 | "punpcklbw %%mm7, %%mm0\n\t" | |
780 | "punpcklbw %%mm7, %%mm1\n\t" | |
781 | "punpckhbw %%mm7, %%mm2\n\t" | |
782 | "punpckhbw %%mm7, %%mm3\n\t" | |
783 | "punpcklbw %%mm7, %%mm4\n\t" | |
784 | "punpckhbw %%mm7, %%mm5\n\t" | |
785 | "paddusw %%mm1, %%mm0\n\t" | |
786 | "paddusw %%mm3, %%mm2\n\t" | |
787 | "psrlw $1, %%mm0\n\t" | |
788 | "psrlw $1, %%mm2\n\t" | |
789 | "paddusw %%mm4, %%mm0\n\t" | |
790 | "paddusw %%mm5, %%mm2\n\t" | |
791 | "psrlw $1, %%mm0\n\t" | |
792 | "psrlw $1, %%mm2\n\t" | |
793 | "packuswb %%mm2, %%mm0\n\t" | |
794 | "movq %%mm0, %0\n\t" | |
a822a479 | 795 | :"+m"(*p) |
de6d9b64 FB |
796 | :"m"(*pix) |
797 | :"memory"); | |
798 | pix += line_size; | |
799 | p += line_size; | |
800 | } while (--h); | |
de6d9b64 FB |
801 | } |
802 | ||
803 | static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
804 | { | |
805 | UINT8 *p; | |
806 | const UINT8 *pix; | |
807 | p = block; | |
808 | pix = pixels; | |
d6a4c0b1 | 809 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
810 | do { |
811 | __asm __volatile( | |
812 | "movq %1, %%mm0\n\t" | |
813 | "movq %2, %%mm1\n\t" | |
814 | "movq %0, %%mm4\n\t" | |
815 | "movq %%mm0, %%mm2\n\t" | |
816 | "movq %%mm1, %%mm3\n\t" | |
817 | "movq %%mm4, %%mm5\n\t" | |
818 | "punpcklbw %%mm7, %%mm0\n\t" | |
819 | "punpcklbw %%mm7, %%mm1\n\t" | |
820 | "punpckhbw %%mm7, %%mm2\n\t" | |
821 | "punpckhbw %%mm7, %%mm3\n\t" | |
822 | "punpcklbw %%mm7, %%mm4\n\t" | |
823 | "punpckhbw %%mm7, %%mm5\n\t" | |
824 | "paddusw %%mm1, %%mm0\n\t" | |
825 | "paddusw %%mm3, %%mm2\n\t" | |
826 | "psrlw $1, %%mm0\n\t" | |
827 | "psrlw $1, %%mm2\n\t" | |
828 | "paddusw %%mm4, %%mm0\n\t" | |
829 | "paddusw %%mm5, %%mm2\n\t" | |
830 | "psrlw $1, %%mm0\n\t" | |
831 | "psrlw $1, %%mm2\n\t" | |
832 | "packuswb %%mm2, %%mm0\n\t" | |
833 | "movq %%mm0, %0\n\t" | |
a822a479 | 834 | :"+m"(*p) |
de6d9b64 FB |
835 | :"m"(*pix), "m"(*(pix+line_size)) |
836 | :"memory"); | |
837 | pix += line_size; | |
838 | p += line_size ; | |
839 | } while(--h); | |
de6d9b64 FB |
840 | } |
841 | ||
842 | static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
843 | { | |
844 | UINT8 *p; | |
845 | const UINT8 *pix; | |
846 | p = block; | |
847 | pix = pixels; | |
d6a4c0b1 ZK |
848 | MOVQ_ZERO(mm7); |
849 | MOVQ_WONE(mm6); | |
850 | JUMPALIGN(); | |
de6d9b64 FB |
851 | do { |
852 | __asm __volatile( | |
853 | "movq %1, %%mm0\n\t" | |
854 | "movq %2, %%mm1\n\t" | |
855 | "movq 1%1, %%mm4\n\t" | |
856 | "movq 1%2, %%mm5\n\t" | |
857 | "movq %%mm0, %%mm2\n\t" | |
858 | "movq %%mm1, %%mm3\n\t" | |
859 | "punpcklbw %%mm7, %%mm0\n\t" | |
860 | "punpcklbw %%mm7, %%mm1\n\t" | |
861 | "punpckhbw %%mm7, %%mm2\n\t" | |
862 | "punpckhbw %%mm7, %%mm3\n\t" | |
863 | "paddusw %%mm1, %%mm0\n\t" | |
864 | "paddusw %%mm3, %%mm2\n\t" | |
865 | "movq %%mm4, %%mm1\n\t" | |
866 | "movq %%mm5, %%mm3\n\t" | |
867 | "punpcklbw %%mm7, %%mm4\n\t" | |
868 | "punpcklbw %%mm7, %%mm5\n\t" | |
869 | "punpckhbw %%mm7, %%mm1\n\t" | |
870 | "punpckhbw %%mm7, %%mm3\n\t" | |
871 | "paddusw %%mm5, %%mm4\n\t" | |
872 | "paddusw %%mm3, %%mm1\n\t" | |
873 | "paddusw %%mm6, %%mm4\n\t" | |
874 | "paddusw %%mm6, %%mm1\n\t" | |
875 | "paddusw %%mm4, %%mm0\n\t" | |
876 | "paddusw %%mm1, %%mm2\n\t" | |
877 | "movq %0, %%mm1\n\t" | |
878 | "psrlw $2, %%mm0\n\t" | |
879 | "movq %%mm1, %%mm3\n\t" | |
880 | "psrlw $2, %%mm2\n\t" | |
881 | "punpcklbw %%mm7, %%mm1\n\t" | |
882 | "punpckhbw %%mm7, %%mm3\n\t" | |
883 | "paddusw %%mm1, %%mm0\n\t" | |
884 | "paddusw %%mm3, %%mm2\n\t" | |
885 | "psrlw $1, %%mm0\n\t" | |
886 | "psrlw $1, %%mm2\n\t" | |
887 | "packuswb %%mm2, %%mm0\n\t" | |
888 | "movq %%mm0, %0\n\t" | |
a822a479 | 889 | :"+m"(*p) |
de6d9b64 FB |
890 | :"m"(*pix), |
891 | "m"(*(pix+line_size)) | |
892 | :"memory"); | |
893 | pix += line_size; | |
894 | p += line_size; | |
895 | } while(--h); | |
de6d9b64 FB |
896 | } |
897 | ||
898 | static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
899 | { | |
900 | DCTELEM *p; | |
901 | const UINT8 *pix; | |
902 | p = block; | |
903 | pix = pixels; | |
d6a4c0b1 | 904 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
905 | do { |
906 | __asm __volatile( | |
907 | "movq %0, %%mm0\n\t" | |
908 | "movq %1, %%mm2\n\t" | |
909 | "movq 8%0, %%mm1\n\t" | |
910 | "movq %%mm2, %%mm3\n\t" | |
911 | "punpcklbw %%mm7, %%mm2\n\t" | |
912 | "punpckhbw %%mm7, %%mm3\n\t" | |
913 | "psubsw %%mm2, %%mm0\n\t" | |
914 | "psubsw %%mm3, %%mm1\n\t" | |
915 | "movq %%mm0, %0\n\t" | |
916 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 917 | :"+m"(*p) |
de6d9b64 FB |
918 | :"m"(*pix) |
919 | :"memory"); | |
920 | pix += line_size; | |
921 | p += 8; | |
922 | } while (--h); | |
de6d9b64 FB |
923 | } |
924 | ||
925 | static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
926 | { | |
927 | DCTELEM *p; | |
928 | const UINT8 *pix; | |
929 | p = block; | |
930 | pix = pixels; | |
d6a4c0b1 ZK |
931 | MOVQ_ZERO(mm7); |
932 | MOVQ_WONE(mm6); | |
933 | JUMPALIGN(); | |
de6d9b64 FB |
934 | do { |
935 | __asm __volatile( | |
936 | "movq %0, %%mm0\n\t" | |
937 | "movq %1, %%mm2\n\t" | |
938 | "movq 8%0, %%mm1\n\t" | |
939 | "movq 1%1, %%mm4\n\t" | |
940 | "movq %%mm2, %%mm3\n\t" | |
941 | "movq %%mm4, %%mm5\n\t" | |
942 | "punpcklbw %%mm7, %%mm2\n\t" | |
943 | "punpckhbw %%mm7, %%mm3\n\t" | |
944 | "punpcklbw %%mm7, %%mm4\n\t" | |
945 | "punpckhbw %%mm7, %%mm5\n\t" | |
946 | "paddusw %%mm4, %%mm2\n\t" | |
947 | "paddusw %%mm5, %%mm3\n\t" | |
948 | "paddusw %%mm6, %%mm2\n\t" | |
949 | "paddusw %%mm6, %%mm3\n\t" | |
950 | "psrlw $1, %%mm2\n\t" | |
951 | "psrlw $1, %%mm3\n\t" | |
952 | "psubsw %%mm2, %%mm0\n\t" | |
953 | "psubsw %%mm3, %%mm1\n\t" | |
954 | "movq %%mm0, %0\n\t" | |
955 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 956 | :"+m"(*p) |
de6d9b64 FB |
957 | :"m"(*pix) |
958 | :"memory"); | |
959 | pix += line_size; | |
960 | p += 8; | |
961 | } while (--h); | |
de6d9b64 FB |
962 | } |
963 | ||
964 | static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
965 | { | |
966 | DCTELEM *p; | |
967 | const UINT8 *pix; | |
968 | p = block; | |
969 | pix = pixels; | |
d6a4c0b1 ZK |
970 | MOVQ_ZERO(mm7); |
971 | MOVQ_WONE(mm6); | |
de6d9b64 FB |
972 | do { |
973 | __asm __volatile( | |
974 | "movq %0, %%mm0\n\t" | |
975 | "movq %1, %%mm2\n\t" | |
976 | "movq 8%0, %%mm1\n\t" | |
977 | "movq %2, %%mm4\n\t" | |
978 | "movq %%mm2, %%mm3\n\t" | |
979 | "movq %%mm4, %%mm5\n\t" | |
980 | "punpcklbw %%mm7, %%mm2\n\t" | |
981 | "punpckhbw %%mm7, %%mm3\n\t" | |
982 | "punpcklbw %%mm7, %%mm4\n\t" | |
983 | "punpckhbw %%mm7, %%mm5\n\t" | |
984 | "paddusw %%mm4, %%mm2\n\t" | |
985 | "paddusw %%mm5, %%mm3\n\t" | |
986 | "paddusw %%mm6, %%mm2\n\t" | |
987 | "paddusw %%mm6, %%mm3\n\t" | |
988 | "psrlw $1, %%mm2\n\t" | |
989 | "psrlw $1, %%mm3\n\t" | |
990 | "psubsw %%mm2, %%mm0\n\t" | |
991 | "psubsw %%mm3, %%mm1\n\t" | |
992 | "movq %%mm0, %0\n\t" | |
993 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 994 | :"+m"(*p) |
de6d9b64 FB |
995 | :"m"(*pix), "m"(*(pix+line_size)) |
996 | :"memory"); | |
997 | pix += line_size; | |
998 | p += 8; | |
999 | } while (--h); | |
de6d9b64 FB |
1000 | } |
1001 | ||
1002 | static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
1003 | { | |
1004 | DCTELEM *p; | |
1005 | const UINT8 *pix; | |
1006 | p = block; | |
1007 | pix = pixels; | |
d6a4c0b1 ZK |
1008 | MOVQ_ZERO(mm7); |
1009 | MOVQ_WTWO(mm6); | |
1010 | JUMPALIGN(); | |
de6d9b64 FB |
1011 | do { |
1012 | __asm __volatile( | |
1013 | "movq %1, %%mm0\n\t" | |
1014 | "movq %2, %%mm1\n\t" | |
1015 | "movq 1%1, %%mm4\n\t" | |
1016 | "movq 1%2, %%mm5\n\t" | |
1017 | "movq %%mm0, %%mm2\n\t" | |
1018 | "movq %%mm1, %%mm3\n\t" | |
1019 | "punpcklbw %%mm7, %%mm0\n\t" | |
1020 | "punpcklbw %%mm7, %%mm1\n\t" | |
1021 | "punpckhbw %%mm7, %%mm2\n\t" | |
1022 | "punpckhbw %%mm7, %%mm3\n\t" | |
1023 | "paddusw %%mm1, %%mm0\n\t" | |
1024 | "paddusw %%mm3, %%mm2\n\t" | |
1025 | "movq %%mm4, %%mm1\n\t" | |
1026 | "movq %%mm5, %%mm3\n\t" | |
1027 | "punpcklbw %%mm7, %%mm4\n\t" | |
1028 | "punpcklbw %%mm7, %%mm5\n\t" | |
1029 | "punpckhbw %%mm7, %%mm1\n\t" | |
1030 | "punpckhbw %%mm7, %%mm3\n\t" | |
1031 | "paddusw %%mm5, %%mm4\n\t" | |
1032 | "paddusw %%mm3, %%mm1\n\t" | |
1033 | "paddusw %%mm6, %%mm4\n\t" | |
1034 | "paddusw %%mm6, %%mm1\n\t" | |
1035 | "paddusw %%mm4, %%mm0\n\t" | |
1036 | "paddusw %%mm1, %%mm2\n\t" | |
1037 | "movq %0, %%mm1\n\t" | |
1038 | "movq 8%0, %%mm3\n\t" | |
1039 | "psrlw $2, %%mm0\n\t" | |
1040 | "psrlw $2, %%mm2\n\t" | |
1041 | "psubsw %%mm0, %%mm1\n\t" | |
1042 | "psubsw %%mm2, %%mm3\n\t" | |
1043 | "movq %%mm1, %0\n\t" | |
1044 | "movq %%mm3, 8%0\n\t" | |
a822a479 | 1045 | :"+m"(*p) |
de6d9b64 FB |
1046 | :"m"(*pix), |
1047 | "m"(*(pix+line_size)) | |
1048 | :"memory"); | |
1049 | pix += line_size; | |
1050 | p += 8 ; | |
1051 | } while(--h); | |
de6d9b64 FB |
1052 | } |
1053 | ||
649c00c9 MN |
1054 | static void clear_blocks_mmx(DCTELEM *blocks) |
1055 | { | |
1056 | asm volatile( | |
1057 | "pxor %%mm7, %%mm7 \n\t" | |
1058 | "movl $-128*6, %%eax \n\t" | |
1059 | "1: \n\t" | |
1060 | "movq %%mm7, (%0, %%eax) \n\t" | |
1061 | "movq %%mm7, 8(%0, %%eax) \n\t" | |
1062 | "movq %%mm7, 16(%0, %%eax) \n\t" | |
1063 | "movq %%mm7, 24(%0, %%eax) \n\t" | |
1064 | "addl $32, %%eax \n\t" | |
1065 | " js 1b \n\t" | |
1066 | : : "r" (((int)blocks)+128*6) | |
1067 | : "%eax" | |
1068 | ); | |
1069 | } | |
1070 | ||
d6a4c0b1 ZK |
/*
 * Do-nothing stub used by the disabled "for speed testing" section of
 * dsputil_init_mmx(), where it is assigned to every dispatch pointer.
 * The parameter list is deliberately left empty (unprototyped) because it
 * is assigned to function pointers with differing argument lists.
 */
static void just_return() { return; }
1072 | ||
de6d9b64 FB |
1073 | void dsputil_init_mmx(void) |
1074 | { | |
1075 | mm_flags = mm_support(); | |
f4470e09 MN |
1076 | #if 1 |
1077 | printf("libavcodec: CPU flags:"); | |
de6d9b64 FB |
1078 | if (mm_flags & MM_MMX) |
1079 | printf(" mmx"); | |
1080 | if (mm_flags & MM_MMXEXT) | |
1081 | printf(" mmxext"); | |
1082 | if (mm_flags & MM_3DNOW) | |
1083 | printf(" 3dnow"); | |
1084 | if (mm_flags & MM_SSE) | |
1085 | printf(" sse"); | |
1086 | if (mm_flags & MM_SSE2) | |
1087 | printf(" sse2"); | |
1088 | printf("\n"); | |
1089 | #endif | |
1090 | ||
1091 | if (mm_flags & MM_MMX) { | |
1092 | get_pixels = get_pixels_mmx; | |
9dbcbd92 | 1093 | diff_pixels = diff_pixels_mmx; |
de6d9b64 FB |
1094 | put_pixels_clamped = put_pixels_clamped_mmx; |
1095 | add_pixels_clamped = add_pixels_clamped_mmx; | |
649c00c9 MN |
1096 | clear_blocks= clear_blocks_mmx; |
1097 | ||
ba6802de MN |
1098 | pix_abs16x16 = pix_abs16x16_mmx; |
1099 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1100 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
de6d9b64 | 1101 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
ba6802de MN |
1102 | pix_abs8x8 = pix_abs8x8_mmx; |
1103 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
1104 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
1105 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
de6d9b64 FB |
1106 | av_fdct = fdct_mmx; |
1107 | ||
1108 | put_pixels_tab[0] = put_pixels_mmx; | |
1109 | put_pixels_tab[1] = put_pixels_x2_mmx; | |
1110 | put_pixels_tab[2] = put_pixels_y2_mmx; | |
1111 | put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1112 | ||
1113 | put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1114 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1115 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1116 | put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1117 | ||
1118 | avg_pixels_tab[0] = avg_pixels_mmx; | |
1119 | avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1120 | avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1121 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1122 | ||
1123 | avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1124 | avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1125 | avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1126 | avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1127 | ||
1128 | sub_pixels_tab[0] = sub_pixels_mmx; | |
1129 | sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1130 | sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1131 | sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1132 | ||
1133 | if (mm_flags & MM_MMXEXT) { | |
ba6802de MN |
1134 | pix_abs16x16 = pix_abs16x16_mmx2; |
1135 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
1136 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
1137 | pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
1138 | ||
1139 | pix_abs8x8 = pix_abs8x8_mmx2; | |
1140 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
1141 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
1142 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
1143 | ||
de6d9b64 FB |
1144 | put_pixels_tab[1] = put_pixels_x2_sse; |
1145 | put_pixels_tab[2] = put_pixels_y2_sse; | |
1146 | ||
1147 | avg_pixels_tab[0] = avg_pixels_sse; | |
1148 | avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1149 | avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1150 | avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1151 | ||
1152 | sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1153 | sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1154 | } else if (mm_flags & MM_3DNOW) { | |
1155 | put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1156 | put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1157 | ||
1158 | avg_pixels_tab[0] = avg_pixels_3dnow; | |
1159 | avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1160 | avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1161 | avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1162 | ||
1163 | sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1164 | sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1165 | } | |
4af7bcc1 | 1166 | |
8def0299 FB |
1167 | /* idct */ |
1168 | if (mm_flags & MM_MMXEXT) { | |
1169 | ff_idct = ff_mmxext_idct; | |
1170 | } else { | |
1171 | ff_idct = ff_mmx_idct; | |
1172 | } | |
d962f6fd A |
1173 | #ifdef SIMPLE_IDCT |
1174 | // ff_idct = simple_idct; | |
1175 | ff_idct = simple_idct_mmx; | |
1176 | #endif | |
de6d9b64 | 1177 | } |
d6a4c0b1 ZK |
1178 | |
1179 | #if 0 | |
1180 | // for speed testing | |
1181 | get_pixels = just_return; | |
1182 | put_pixels_clamped = just_return; | |
1183 | add_pixels_clamped = just_return; | |
1184 | ||
1185 | pix_abs16x16 = just_return; | |
1186 | pix_abs16x16_x2 = just_return; | |
1187 | pix_abs16x16_y2 = just_return; | |
1188 | pix_abs16x16_xy2 = just_return; | |
1189 | ||
1190 | put_pixels_tab[0] = just_return; | |
1191 | put_pixels_tab[1] = just_return; | |
1192 | put_pixels_tab[2] = just_return; | |
1193 | put_pixels_tab[3] = just_return; | |
1194 | ||
1195 | put_no_rnd_pixels_tab[0] = just_return; | |
1196 | put_no_rnd_pixels_tab[1] = just_return; | |
1197 | put_no_rnd_pixels_tab[2] = just_return; | |
1198 | put_no_rnd_pixels_tab[3] = just_return; | |
1199 | ||
1200 | avg_pixels_tab[0] = just_return; | |
1201 | avg_pixels_tab[1] = just_return; | |
1202 | avg_pixels_tab[2] = just_return; | |
1203 | avg_pixels_tab[3] = just_return; | |
1204 | ||
1205 | avg_no_rnd_pixels_tab[0] = just_return; | |
1206 | avg_no_rnd_pixels_tab[1] = just_return; | |
1207 | avg_no_rnd_pixels_tab[2] = just_return; | |
1208 | avg_no_rnd_pixels_tab[3] = just_return; | |
1209 | ||
1210 | sub_pixels_tab[0] = just_return; | |
1211 | sub_pixels_tab[1] = just_return; | |
1212 | sub_pixels_tab[2] = just_return; | |
1213 | sub_pixels_tab[3] = just_return; | |
1214 | ||
1215 | //av_fdct = just_return; | |
1216 | //ff_idct = just_return; | |
1217 | #endif | |
de6d9b64 | 1218 | } |