Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
3 | * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, | |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * GNU General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 | * | |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
d962f6fd | 23 | #include "../simple_idct.h" |
de6d9b64 | 24 | |
7d650cb5 FB |
25 | int mm_flags; /* multimedia extension flags */ |
26 | ||
de6d9b64 FB |
27 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); |
28 | int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
29 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
30 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
31 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
32 | ||
8def0299 FB |
33 | /* external functions, from idct_mmx.c */ |
34 | void ff_mmx_idct(DCTELEM *block); | |
35 | void ff_mmxext_idct(DCTELEM *block); | |
4af7bcc1 | 36 | |
de6d9b64 | 37 | /* pixel operations */ |
a9b3f630 NK |
38 | static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; |
39 | static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | |
40 | //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | |
41 | //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
de6d9b64 | 42 | |
d6a4c0b1 ZK |
/* Emit a ".balign 8" assembler directive at the point of use. */
43 | #define JUMPALIGN() __asm __volatile (".balign 8"::) |
/* Clear MMX register `regd` by XOR-ing it with itself. */
44 | #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) | |
45 | ||
/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill MMX register `regd` with four
 * 16-bit words equal to 1 / 2.  Without PIC the value is loaded from the
 * mm_wone / mm_wtwo constants; with PIC it is built in the register so no
 * memory operand is needed.
 */
46 | #ifndef PIC | |
47 | #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) | |
48 | #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) | |
49 | #else | |
50 | // for a shared library it is better to build the constants this way | |
51 | // pcmpeqd reg,reg -> all bits set (-1); psrlw $15 then leaves 1 in each word | |
52 | #define MOVQ_WONE(regd) \ | |
53 | __asm __volatile ( \ | |
54 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
55 | "psrlw $15, %%" #regd ::) | |
56 | ||
/* Same as above, then shift each word left by one to turn 1 into 2. */
57 | #define MOVQ_WTWO(regd) \ | |
58 | __asm __volatile ( \ | |
59 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
60 | "psrlw $15, %%" #regd " \n\t" \ | |
61 | "psllw $1, %%" #regd ::) | |
62 | #endif | |
63 | ||
de6d9b64 FB |
64 | /***********************************/ |
65 | /* 3Dnow specific */ | |
66 | ||
67 | #define DEF(x) x ## _3dnow | |
68 | /* for Athlons PAVGUSB is preferred */ | |
69 | #define PAVGB "pavgusb" | |
70 | ||
71 | #include "dsputil_mmx_avg.h" | |
72 | ||
73 | #undef DEF | |
74 | #undef PAVGB | |
75 | ||
76 | /***********************************/ | |
77 | /* MMX2 specific */ | |
78 | ||
79 | #define DEF(x) x ## _sse | |
80 | ||
81 | /* Introduced only in MMX2 set */ | |
82 | #define PAVGB "pavgb" | |
83 | ||
84 | #include "dsputil_mmx_avg.h" | |
85 | ||
86 | #undef DEF | |
87 | #undef PAVGB | |
88 | ||
89 | /***********************************/ | |
90 | /* standard MMX */ | |
91 | ||
/*
 * Read 8 rows of 8 bytes from `pixels` (rows are `line_size` bytes apart)
 * and store them, zero-extended to 16-bit words, consecutively in `block`.
 *
 * Each of the 4 loop iterations handles two rows and writes 32 bytes.
 * NOTE(review): this assumes DCTELEM is a 16-bit type (p advances by 16
 * elements per 32 bytes written) - confirm against dsputil.h.
 */
92 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
93 | { | |
94 | DCTELEM *p; | |
95 | const UINT8 *pix; | |
96 | int i; | |
97 | ||
98 | /* read the pixels */ | |
99 | p = block; | |
100 | pix = pixels; | |
/* mm7 = 0: supplies the high byte of every word when unpacking below */
d6a4c0b1 | 101 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
102 | for(i=0;i<4;i++) { |
103 | __asm __volatile( | |
104 | "movq %1, %%mm0\n\t" | |
105 | "movq %2, %%mm1\n\t" | |
106 | "movq %%mm0, %%mm2\n\t" | |
107 | "movq %%mm1, %%mm3\n\t" | |
108 | "punpcklbw %%mm7, %%mm0\n\t" | |
109 | "punpckhbw %%mm7, %%mm2\n\t" | |
110 | "punpcklbw %%mm7, %%mm1\n\t" | |
111 | "punpckhbw %%mm7, %%mm3\n\t" | |
112 | "movq %%mm0, %0\n\t" | |
113 | "movq %%mm2, 8%0\n\t" | |
114 | "movq %%mm1, 16%0\n\t" | |
115 | "movq %%mm3, 24%0\n\t" | |
/* only *p is listed as an output; the "memory" clobber covers the other stores */
116 | :"=m"(*p) | |
117 | :"m"(*pix), "m"(*(pix+line_size)) | |
118 | :"memory"); | |
119 | pix += line_size*2; | |
120 | p += 16; | |
121 | } | |
de6d9b64 FB |
122 | } |
123 | ||
/*
 * Convert 64 16-bit coefficients from `block` to bytes with unsigned
 * saturation (packuswb) and store them as 8 rows of 8 bytes at `pixels`,
 * rows `line_size` bytes apart.
 *
 * The work is done by two asm statements of four rows each.  The first
 * addresses the coefficients through an "m" operand, the second through
 * a pointer in a register.
 */
124 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
125 | { | |
126 | const DCTELEM *p; | |
127 | UINT8 *pix; | |
de6d9b64 FB |
128 | |
129 | /* read the pixels */ | |
130 | p = block; | |
131 | pix = pixels; | |
d6a4c0b1 | 132 | /* unrolled loop */ |
de6d9b64 | 133 | __asm __volatile( |
a822a479 NK |
134 | "movq %3, %%mm0\n\t" |
135 | "movq 8%3, %%mm1\n\t" | |
136 | "movq 16%3, %%mm2\n\t" | |
137 | "movq 24%3, %%mm3\n\t" | |
138 | "movq 32%3, %%mm4\n\t" | |
139 | "movq 40%3, %%mm5\n\t" | |
140 | "movq 48%3, %%mm6\n\t" | |
141 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
142 | "packuswb %%mm1, %%mm0\n\t" |
143 | "packuswb %%mm3, %%mm2\n\t" | |
144 | "packuswb %%mm5, %%mm4\n\t" | |
145 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
146 | "movq %%mm0, (%0)\n\t" |
147 | "movq %%mm2, (%0, %1)\n\t" | |
148 | "movq %%mm4, (%0, %1, 2)\n\t" | |
149 | "movq %%mm6, (%0, %2)\n\t" | |
/* %0 = destination, %1 = line_size, %2 = 3*line_size, %3 = coefficients */
150 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
151 | :"memory"); |
/* advance to the second group of four rows: 32 coefficients further on */
152 | pix += line_size*4; | |
153 | p += 32; | |
d6a4c0b1 ZK |
154 | |
155 | // if an exact copy of the code above were used here, | |
156 | // the compiler would generate some very strange code, | |
157 | // thus "r" is used for the coefficient pointer | |
158 | __asm __volatile( | |
159 | "movq (%3), %%mm0\n\t" | |
160 | "movq 8(%3), %%mm1\n\t" | |
161 | "movq 16(%3), %%mm2\n\t" | |
162 | "movq 24(%3), %%mm3\n\t" | |
163 | "movq 32(%3), %%mm4\n\t" | |
164 | "movq 40(%3), %%mm5\n\t" | |
165 | "movq 48(%3), %%mm6\n\t" | |
166 | "movq 56(%3), %%mm7\n\t" | |
167 | "packuswb %%mm1, %%mm0\n\t" | |
168 | "packuswb %%mm3, %%mm2\n\t" | |
169 | "packuswb %%mm5, %%mm4\n\t" | |
170 | "packuswb %%mm7, %%mm6\n\t" | |
171 | "movq %%mm0, (%0)\n\t" | |
172 | "movq %%mm2, (%0, %1)\n\t" | |
173 | "movq %%mm4, (%0, %1, 2)\n\t" | |
174 | "movq %%mm6, (%0, %2)\n\t" | |
175 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
176 | :"memory"); | |
de6d9b64 FB |
177 | } |
178 | ||
/*
 * Add 64 16-bit coefficients from `block` to the 8 rows of 8 bytes at
 * `pixels` (rows `line_size` bytes apart) and write the result back in
 * place.
 *
 * The pixels are zero-extended to words, added with signed saturation
 * (paddsw) and packed back to bytes with unsigned saturation (packuswb).
 * Each of the 4 iterations handles two rows / 16 coefficients.
 */
179 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
180 | { | |
181 | const DCTELEM *p; | |
182 | UINT8 *pix; | |
183 | int i; | |
184 | ||
185 | /* read the pixels */ | |
186 | p = block; | |
187 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0, used to zero-extend the pixel bytes */
188 | MOVQ_ZERO(mm7); |
189 | i = 4; | |
190 | while (i) { | |
de6d9b64 FB |
191 | __asm __volatile( |
192 | "movq %2, %%mm0\n\t" | |
193 | "movq 8%2, %%mm1\n\t" | |
194 | "movq 16%2, %%mm2\n\t" | |
195 | "movq 24%2, %%mm3\n\t" | |
196 | "movq %0, %%mm4\n\t" | |
197 | "movq %1, %%mm6\n\t" | |
198 | "movq %%mm4, %%mm5\n\t" | |
199 | "punpcklbw %%mm7, %%mm4\n\t" | |
200 | "punpckhbw %%mm7, %%mm5\n\t" | |
201 | "paddsw %%mm4, %%mm0\n\t" | |
202 | "paddsw %%mm5, %%mm1\n\t" | |
203 | "movq %%mm6, %%mm5\n\t" | |
204 | "punpcklbw %%mm7, %%mm6\n\t" | |
205 | "punpckhbw %%mm7, %%mm5\n\t" | |
206 | "paddsw %%mm6, %%mm2\n\t" | |
207 | "paddsw %%mm5, %%mm3\n\t" | |
208 | "packuswb %%mm1, %%mm0\n\t" | |
209 | "packuswb %%mm3, %%mm2\n\t" | |
210 | "movq %%mm0, %0\n\t" | |
211 | "movq %%mm2, %1\n\t" | |
/* both destination rows are read and then rewritten, hence "+m" */
a822a479 | 212 | :"+m"(*pix), "+m"(*(pix+line_size)) |
de6d9b64 FB |
213 | :"m"(*p) |
214 | :"memory"); | |
215 | pix += line_size*2; | |
216 | p += 16; | |
d6a4c0b1 ZK |
217 | i--; |
218 | }; | |
de6d9b64 FB |
219 | } |
220 | ||
/*
 * Copy `h` rows of 8 bytes from `pixels` to `block`.  Source and
 * destination rows are both `line_size` bytes apart.
 *
 * The active (#else) branch first copies the h & 3 leftover rows one at
 * a time, then the remaining rows four per asm statement.  The simple
 * one-row-per-iteration loop is kept under "#if 0".
 */
221 | static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
222 | { | |
d6a4c0b1 | 223 | int hh; |
de6d9b64 FB |
224 | UINT8 *p; |
225 | const UINT8 *pix; | |
d6a4c0b1 | 226 | |
de6d9b64 | 227 | p = block; |
d6a4c0b1 ZK |
228 | pix = pixels; // 2s |
229 | #if 0 | |
230 | do { | |
231 | __asm __volatile( | |
232 | "movq %1, %%mm0\n\t" | |
233 | "movq %%mm0, %0\n\t" | |
234 | :"=m"(*p) | |
235 | :"m"(*pix) | |
236 | :"memory"); | |
237 | pix += line_size; | |
238 | p += line_size; | |
239 | } while (--h); | |
240 | #else | |
241 | // this optimized code is not very useful | |
242 | // the above loop is definitely faster | |
243 | // at least on Celeron 500MHz | |
/* leftover rows (h modulo 4), one row per iteration */
244 | hh = h & 3; | |
245 | while (hh) { | |
246 | __asm __volatile( | |
247 | "movq %1, %%mm0\n\t" | |
248 | "movq %%mm0, %0\n\t" | |
249 | :"=m"(*p) | |
250 | :"m"(*pix) | |
251 | :"memory"); | |
252 | pix += line_size; | |
253 | p += line_size; | |
254 | hh--; | |
255 | } | |
/* remaining rows, four at a time: %2 = line_size, %3 = 3*line_size */
de6d9b64 | 256 | hh=h>>2; |
d6a4c0b1 | 257 | while (hh) { |
de6d9b64 | 258 | __asm __volatile( |
a822a479 NK |
259 | "movq (%1), %%mm0 \n\t" |
260 | "movq (%1, %2), %%mm1 \n\t" | |
261 | "movq (%1, %2, 2), %%mm2 \n\t" | |
262 | "movq (%1, %3), %%mm3 \n\t" | |
263 | "movq %%mm0, (%0) \n\t" | |
264 | "movq %%mm1, (%0, %2) \n\t" | |
265 | "movq %%mm2, (%0, %2, 2) \n\t" | |
266 | "movq %%mm3, (%0, %3) \n\t" | |
267 | ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) | |
de6d9b64 | 268 | :"memory"); |
d6a4c0b1 ZK |
269 | pix += line_size*4; |
270 | p += line_size*4; | |
271 | hh--; | |
de6d9b64 | 272 | } |
d6a4c0b1 | 273 | #endif |
de6d9b64 FB |
274 | } |
275 | ||
/*
 * Horizontal half-pel copy with rounding: for each of `h` rows and each
 * of the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+1] + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
276 | static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
277 | { | |
278 | UINT8 *p; | |
279 | const UINT8 *pix; | |
280 | p = block; | |
281 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm4 = four words of 1 (rounding term) */
282 | MOVQ_ZERO(mm7); |
283 | MOVQ_WONE(mm4); | |
284 | JUMPALIGN(); | |
de6d9b64 FB |
285 | do { |
286 | __asm __volatile( | |
287 | "movq %1, %%mm0\n\t" | |
288 | "movq 1%1, %%mm1\n\t" | |
289 | "movq %%mm0, %%mm2\n\t" | |
290 | "movq %%mm1, %%mm3\n\t" | |
291 | "punpcklbw %%mm7, %%mm0\n\t" | |
292 | "punpcklbw %%mm7, %%mm1\n\t" | |
293 | "punpckhbw %%mm7, %%mm2\n\t" | |
294 | "punpckhbw %%mm7, %%mm3\n\t" | |
295 | "paddusw %%mm1, %%mm0\n\t" | |
296 | "paddusw %%mm3, %%mm2\n\t" | |
297 | "paddusw %%mm4, %%mm0\n\t" | |
298 | "paddusw %%mm4, %%mm2\n\t" | |
299 | "psrlw $1, %%mm0\n\t" | |
300 | "psrlw $1, %%mm2\n\t" | |
301 | "packuswb %%mm2, %%mm0\n\t" | |
302 | "movq %%mm0, %0\n\t" | |
303 | :"=m"(*p) | |
304 | :"m"(*pix) | |
305 | :"memory"); | |
306 | pix += line_size; p += line_size; | |
307 | } while (--h); | |
de6d9b64 FB |
308 | } |
309 | ||
/*
 * Vertical half-pel copy with rounding: for each of `h` rows and each of
 * the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+line_size] + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
310 | static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
311 | { | |
312 | UINT8 *p; | |
313 | const UINT8 *pix; | |
314 | p = block; | |
315 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm4 = four words of 1 (rounding term) */
316 | MOVQ_ZERO(mm7); |
317 | MOVQ_WONE(mm4); | |
318 | JUMPALIGN(); | |
de6d9b64 FB |
319 | do { |
320 | __asm __volatile( | |
321 | "movq %1, %%mm0\n\t" | |
322 | "movq %2, %%mm1\n\t" | |
323 | "movq %%mm0, %%mm2\n\t" | |
324 | "movq %%mm1, %%mm3\n\t" | |
325 | "punpcklbw %%mm7, %%mm0\n\t" | |
326 | "punpcklbw %%mm7, %%mm1\n\t" | |
327 | "punpckhbw %%mm7, %%mm2\n\t" | |
328 | "punpckhbw %%mm7, %%mm3\n\t" | |
329 | "paddusw %%mm1, %%mm0\n\t" | |
330 | "paddusw %%mm3, %%mm2\n\t" | |
331 | "paddusw %%mm4, %%mm0\n\t" | |
332 | "paddusw %%mm4, %%mm2\n\t" | |
333 | "psrlw $1, %%mm0\n\t" | |
334 | "psrlw $1, %%mm2\n\t" | |
335 | "packuswb %%mm2, %%mm0\n\t" | |
336 | "movq %%mm0, %0\n\t" | |
337 | :"=m"(*p) | |
338 | :"m"(*pix), | |
339 | "m"(*(pix+line_size)) | |
340 | :"memory"); | |
341 | pix += line_size; | |
342 | p += line_size; | |
343 | } while (--h); | |
de6d9b64 FB |
344 | } |
345 | ||
/*
 * Diagonal half-pel copy with rounding: for each of `h` rows and each of
 * the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+1] +
 *                 pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
346 | static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
347 | { | |
348 | UINT8 *p; | |
349 | const UINT8 *pix; | |
350 | p = block; | |
d6a4c0b1 ZK |
351 | pix = pixels; // 1s |
/* mm7 = 0 for zero-extension, mm6 = four words of 2 (rounding term) */
352 | MOVQ_ZERO(mm7); | |
353 | MOVQ_WTWO(mm6); | |
354 | JUMPALIGN(); | |
de6d9b64 FB |
355 | do { |
356 | __asm __volatile( | |
357 | "movq %1, %%mm0\n\t" | |
358 | "movq %2, %%mm1\n\t" | |
359 | "movq 1%1, %%mm4\n\t" | |
360 | "movq 1%2, %%mm5\n\t" | |
361 | "movq %%mm0, %%mm2\n\t" | |
362 | "movq %%mm1, %%mm3\n\t" | |
363 | "punpcklbw %%mm7, %%mm0\n\t" | |
364 | "punpcklbw %%mm7, %%mm1\n\t" | |
365 | "punpckhbw %%mm7, %%mm2\n\t" | |
366 | "punpckhbw %%mm7, %%mm3\n\t" | |
367 | "paddusw %%mm1, %%mm0\n\t" | |
368 | "paddusw %%mm3, %%mm2\n\t" | |
369 | "movq %%mm4, %%mm1\n\t" | |
370 | "movq %%mm5, %%mm3\n\t" | |
371 | "punpcklbw %%mm7, %%mm4\n\t" | |
372 | "punpcklbw %%mm7, %%mm5\n\t" | |
373 | "punpckhbw %%mm7, %%mm1\n\t" | |
374 | "punpckhbw %%mm7, %%mm3\n\t" | |
375 | "paddusw %%mm5, %%mm4\n\t" | |
376 | "paddusw %%mm3, %%mm1\n\t" | |
377 | "paddusw %%mm6, %%mm4\n\t" | |
378 | "paddusw %%mm6, %%mm1\n\t" | |
379 | "paddusw %%mm4, %%mm0\n\t" | |
380 | "paddusw %%mm1, %%mm2\n\t" | |
381 | "psrlw $2, %%mm0\n\t" | |
382 | "psrlw $2, %%mm2\n\t" | |
383 | "packuswb %%mm2, %%mm0\n\t" | |
384 | "movq %%mm0, %0\n\t" | |
385 | :"=m"(*p) | |
386 | :"m"(*pix), | |
387 | "m"(*(pix+line_size)) | |
388 | :"memory"); | |
389 | pix += line_size; | |
390 | p += line_size; | |
391 | } while(--h); | |
de6d9b64 FB |
392 | } |
393 | ||
/*
 * Horizontal half-pel copy without a rounding term: for each of `h` rows
 * and each of the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+1]) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
394 | static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
395 | { | |
396 | UINT8 *p; | |
397 | const UINT8 *pix; | |
398 | p = block; | |
399 | pix = pixels; | |
/* mm7 = 0, used to zero-extend bytes to words */
d6a4c0b1 | 400 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
401 | do { |
402 | __asm __volatile( | |
403 | "movq %1, %%mm0\n\t" | |
404 | "movq 1%1, %%mm1\n\t" | |
405 | "movq %%mm0, %%mm2\n\t" | |
406 | "movq %%mm1, %%mm3\n\t" | |
407 | "punpcklbw %%mm7, %%mm0\n\t" | |
408 | "punpcklbw %%mm7, %%mm1\n\t" | |
409 | "punpckhbw %%mm7, %%mm2\n\t" | |
410 | "punpckhbw %%mm7, %%mm3\n\t" | |
411 | "paddusw %%mm1, %%mm0\n\t" | |
412 | "paddusw %%mm3, %%mm2\n\t" | |
413 | "psrlw $1, %%mm0\n\t" | |
414 | "psrlw $1, %%mm2\n\t" | |
415 | "packuswb %%mm2, %%mm0\n\t" | |
416 | "movq %%mm0, %0\n\t" | |
417 | :"=m"(*p) | |
418 | :"m"(*pix) | |
419 | :"memory"); | |
420 | pix += line_size; | |
421 | p += line_size; | |
422 | } while (--h); | |
de6d9b64 FB |
423 | } |
424 | ||
/*
 * Vertical half-pel copy without a rounding term: for each of `h` rows
 * and each of the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+line_size]) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
425 | static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
426 | { | |
427 | UINT8 *p; | |
428 | const UINT8 *pix; | |
429 | p = block; | |
430 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0, used to zero-extend bytes to words */
431 | MOVQ_ZERO(mm7); |
432 | JUMPALIGN(); | |
de6d9b64 FB |
433 | do { |
434 | __asm __volatile( | |
435 | "movq %1, %%mm0\n\t" | |
436 | "movq %2, %%mm1\n\t" | |
437 | "movq %%mm0, %%mm2\n\t" | |
438 | "movq %%mm1, %%mm3\n\t" | |
439 | "punpcklbw %%mm7, %%mm0\n\t" | |
440 | "punpcklbw %%mm7, %%mm1\n\t" | |
441 | "punpckhbw %%mm7, %%mm2\n\t" | |
442 | "punpckhbw %%mm7, %%mm3\n\t" | |
443 | "paddusw %%mm1, %%mm0\n\t" | |
444 | "paddusw %%mm3, %%mm2\n\t" | |
445 | "psrlw $1, %%mm0\n\t" | |
446 | "psrlw $1, %%mm2\n\t" | |
447 | "packuswb %%mm2, %%mm0\n\t" | |
448 | "movq %%mm0, %0\n\t" | |
449 | :"=m"(*p) | |
450 | :"m"(*pix), | |
451 | "m"(*(pix+line_size)) | |
452 | :"memory"); | |
453 | pix += line_size; | |
454 | p += line_size; | |
455 | } while(--h); | |
de6d9b64 FB |
456 | } |
457 | ||
/*
 * Diagonal half-pel copy, "no rounding" variant: for each of `h` rows and
 * each of the 8 bytes in the row,
 *     block[x] = (pixels[x] + pixels[x+1] +
 *                 pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2
 * The bias is 1 here, where put_pixels_xy2_mmx uses 2.
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
458 | static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
459 | { | |
460 | UINT8 *p; | |
461 | const UINT8 *pix; | |
462 | p = block; | |
463 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (bias) */
464 | MOVQ_ZERO(mm7); |
465 | MOVQ_WONE(mm6); | |
466 | JUMPALIGN(); | |
de6d9b64 FB |
467 | do { |
468 | __asm __volatile( | |
469 | "movq %1, %%mm0\n\t" | |
470 | "movq %2, %%mm1\n\t" | |
471 | "movq 1%1, %%mm4\n\t" | |
472 | "movq 1%2, %%mm5\n\t" | |
473 | "movq %%mm0, %%mm2\n\t" | |
474 | "movq %%mm1, %%mm3\n\t" | |
475 | "punpcklbw %%mm7, %%mm0\n\t" | |
476 | "punpcklbw %%mm7, %%mm1\n\t" | |
477 | "punpckhbw %%mm7, %%mm2\n\t" | |
478 | "punpckhbw %%mm7, %%mm3\n\t" | |
479 | "paddusw %%mm1, %%mm0\n\t" | |
480 | "paddusw %%mm3, %%mm2\n\t" | |
481 | "movq %%mm4, %%mm1\n\t" | |
482 | "movq %%mm5, %%mm3\n\t" | |
483 | "punpcklbw %%mm7, %%mm4\n\t" | |
484 | "punpcklbw %%mm7, %%mm5\n\t" | |
485 | "punpckhbw %%mm7, %%mm1\n\t" | |
486 | "punpckhbw %%mm7, %%mm3\n\t" | |
487 | "paddusw %%mm5, %%mm4\n\t" | |
488 | "paddusw %%mm3, %%mm1\n\t" | |
489 | "paddusw %%mm6, %%mm4\n\t" | |
490 | "paddusw %%mm6, %%mm1\n\t" | |
491 | "paddusw %%mm4, %%mm0\n\t" | |
492 | "paddusw %%mm1, %%mm2\n\t" | |
493 | "psrlw $2, %%mm0\n\t" | |
494 | "psrlw $2, %%mm2\n\t" | |
495 | "packuswb %%mm2, %%mm0\n\t" | |
496 | "movq %%mm0, %0\n\t" | |
497 | :"=m"(*p) | |
498 | :"m"(*pix), | |
499 | "m"(*(pix+line_size)) | |
500 | :"memory"); | |
501 | pix += line_size; | |
502 | p += line_size; | |
503 | } while(--h); | |
de6d9b64 FB |
504 | } |
505 | ||
/*
 * Average `pixels` into `block` with rounding: for each of `h` rows and
 * each of the 8 bytes in the row,
 *     block[x] = (block[x] + pixels[x] + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
506 | static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
507 | { | |
508 | UINT8 *p; | |
509 | const UINT8 *pix; | |
510 | p = block; | |
511 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (rounding term) */
512 | MOVQ_ZERO(mm7); |
513 | MOVQ_WONE(mm6); | |
514 | JUMPALIGN(); | |
de6d9b64 FB |
515 | do { |
516 | __asm __volatile( | |
517 | "movq %0, %%mm0\n\t" | |
518 | "movq %1, %%mm1\n\t" | |
519 | "movq %%mm0, %%mm2\n\t" | |
520 | "movq %%mm1, %%mm3\n\t" | |
521 | "punpcklbw %%mm7, %%mm0\n\t" | |
522 | "punpcklbw %%mm7, %%mm1\n\t" | |
523 | "punpckhbw %%mm7, %%mm2\n\t" | |
524 | "punpckhbw %%mm7, %%mm3\n\t" | |
525 | "paddusw %%mm1, %%mm0\n\t" | |
526 | "paddusw %%mm3, %%mm2\n\t" | |
527 | "paddusw %%mm6, %%mm0\n\t" | |
528 | "paddusw %%mm6, %%mm2\n\t" | |
529 | "psrlw $1, %%mm0\n\t" | |
530 | "psrlw $1, %%mm2\n\t" | |
531 | "packuswb %%mm2, %%mm0\n\t" | |
532 | "movq %%mm0, %0\n\t" | |
/* the destination row is read and then rewritten, hence "+m" */
a822a479 | 533 | :"+m"(*p) |
de6d9b64 FB |
534 | :"m"(*pix) |
535 | :"memory"); | |
536 | pix += line_size; | |
537 | p += line_size; | |
538 | } | |
539 | while (--h); | |
de6d9b64 FB |
540 | } |
541 | ||
/*
 * Horizontal half-pel interpolation averaged into `block`, with rounding
 * at both stages.  For each of `h` rows and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+1] + 1) >> 1
 *     block[x] = (block[x] + t + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
542 | static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
543 | { | |
544 | UINT8 *p; | |
545 | const UINT8 *pix; | |
546 | p = block; | |
547 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (rounding term) */
548 | MOVQ_ZERO(mm7); |
549 | MOVQ_WONE(mm6); | |
550 | JUMPALIGN(); | |
de6d9b64 FB |
551 | do { |
552 | __asm __volatile( | |
553 | "movq %1, %%mm1\n\t" | |
554 | "movq %0, %%mm0\n\t" | |
555 | "movq 1%1, %%mm4\n\t" | |
556 | "movq %%mm0, %%mm2\n\t" | |
557 | "movq %%mm1, %%mm3\n\t" | |
558 | "movq %%mm4, %%mm5\n\t" | |
559 | "punpcklbw %%mm7, %%mm1\n\t" | |
560 | "punpckhbw %%mm7, %%mm3\n\t" | |
561 | "punpcklbw %%mm7, %%mm4\n\t" | |
562 | "punpckhbw %%mm7, %%mm5\n\t" | |
563 | "punpcklbw %%mm7, %%mm0\n\t" | |
564 | "punpckhbw %%mm7, %%mm2\n\t" | |
565 | "paddusw %%mm4, %%mm1\n\t" | |
566 | "paddusw %%mm5, %%mm3\n\t" | |
567 | "paddusw %%mm6, %%mm1\n\t" | |
568 | "paddusw %%mm6, %%mm3\n\t" | |
569 | "psrlw $1, %%mm1\n\t" | |
570 | "psrlw $1, %%mm3\n\t" | |
571 | "paddusw %%mm6, %%mm0\n\t" | |
572 | "paddusw %%mm6, %%mm2\n\t" | |
573 | "paddusw %%mm1, %%mm0\n\t" | |
574 | "paddusw %%mm3, %%mm2\n\t" | |
575 | "psrlw $1, %%mm0\n\t" | |
576 | "psrlw $1, %%mm2\n\t" | |
577 | "packuswb %%mm2, %%mm0\n\t" | |
578 | "movq %%mm0, %0\n\t" | |
a822a479 | 579 | :"+m"(*p) |
de6d9b64 FB |
580 | :"m"(*pix) |
581 | :"memory"); | |
582 | pix += line_size; | |
583 | p += line_size; | |
584 | } while (--h); | |
de6d9b64 FB |
585 | } |
586 | ||
/*
 * Vertical half-pel interpolation averaged into `block`, with rounding
 * at both stages.  For each of `h` rows and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+line_size] + 1) >> 1
 *     block[x] = (block[x] + t + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
587 | static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
588 | { | |
589 | UINT8 *p; | |
590 | const UINT8 *pix; | |
591 | p = block; | |
592 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (rounding term) */
593 | MOVQ_ZERO(mm7); |
594 | MOVQ_WONE(mm6); | |
595 | JUMPALIGN(); | |
de6d9b64 FB |
596 | do { |
597 | __asm __volatile( | |
598 | "movq %1, %%mm1\n\t" | |
599 | "movq %0, %%mm0\n\t" | |
600 | "movq %2, %%mm4\n\t" | |
601 | "movq %%mm0, %%mm2\n\t" | |
602 | "movq %%mm1, %%mm3\n\t" | |
603 | "movq %%mm4, %%mm5\n\t" | |
604 | "punpcklbw %%mm7, %%mm1\n\t" | |
605 | "punpckhbw %%mm7, %%mm3\n\t" | |
606 | "punpcklbw %%mm7, %%mm4\n\t" | |
607 | "punpckhbw %%mm7, %%mm5\n\t" | |
608 | "punpcklbw %%mm7, %%mm0\n\t" | |
609 | "punpckhbw %%mm7, %%mm2\n\t" | |
610 | "paddusw %%mm4, %%mm1\n\t" | |
611 | "paddusw %%mm5, %%mm3\n\t" | |
612 | "paddusw %%mm6, %%mm1\n\t" | |
613 | "paddusw %%mm6, %%mm3\n\t" | |
614 | "psrlw $1, %%mm1\n\t" | |
615 | "psrlw $1, %%mm3\n\t" | |
616 | "paddusw %%mm6, %%mm0\n\t" | |
617 | "paddusw %%mm6, %%mm2\n\t" | |
618 | "paddusw %%mm1, %%mm0\n\t" | |
619 | "paddusw %%mm3, %%mm2\n\t" | |
620 | "psrlw $1, %%mm0\n\t" | |
621 | "psrlw $1, %%mm2\n\t" | |
622 | "packuswb %%mm2, %%mm0\n\t" | |
623 | "movq %%mm0, %0\n\t" | |
a822a479 | 624 | :"+m"(*p) |
de6d9b64 FB |
625 | :"m"(*pix), "m"(*(pix+line_size)) |
626 | :"memory"); | |
627 | pix += line_size; | |
628 | p += line_size ; | |
629 | } while(--h); | |
de6d9b64 FB |
630 | } |
631 | ||
/*
 * Diagonal half-pel interpolation averaged into `block`, with rounding
 * at both stages.  For each of `h` rows and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+1] +
 *                 pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 *     block[x] = (block[x] + t + 1) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The second-stage rounding constant is reloaded from mm_wone (operand
 * %3) on every iteration, regardless of PIC.
 * The loop is do/while(--h), so the body always runs at least once.
 */
632 | static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
633 | { | |
634 | UINT8 *p; | |
635 | const UINT8 *pix; | |
636 | p = block; | |
637 | pix = pixels; | |
d6a4c0b1 ZK |
638 | MOVQ_ZERO(mm7); |
639 | // this doesn't seem to be used often - so | |
640 | // the inside usage of mm_wone is not optimized | |
641 | MOVQ_WTWO(mm6); | |
de6d9b64 FB |
642 | do { |
643 | __asm __volatile( | |
644 | "movq %1, %%mm0\n\t" | |
645 | "movq %2, %%mm1\n\t" | |
646 | "movq 1%1, %%mm4\n\t" | |
647 | "movq 1%2, %%mm5\n\t" | |
648 | "movq %%mm0, %%mm2\n\t" | |
649 | "movq %%mm1, %%mm3\n\t" | |
650 | "punpcklbw %%mm7, %%mm0\n\t" | |
651 | "punpcklbw %%mm7, %%mm1\n\t" | |
652 | "punpckhbw %%mm7, %%mm2\n\t" | |
653 | "punpckhbw %%mm7, %%mm3\n\t" | |
654 | "paddusw %%mm1, %%mm0\n\t" | |
655 | "paddusw %%mm3, %%mm2\n\t" | |
656 | "movq %%mm4, %%mm1\n\t" | |
657 | "movq %%mm5, %%mm3\n\t" | |
658 | "punpcklbw %%mm7, %%mm4\n\t" | |
659 | "punpcklbw %%mm7, %%mm5\n\t" | |
660 | "punpckhbw %%mm7, %%mm1\n\t" | |
661 | "punpckhbw %%mm7, %%mm3\n\t" | |
662 | "paddusw %%mm5, %%mm4\n\t" | |
663 | "paddusw %%mm3, %%mm1\n\t" | |
664 | "paddusw %%mm6, %%mm4\n\t" | |
665 | "paddusw %%mm6, %%mm1\n\t" | |
666 | "paddusw %%mm4, %%mm0\n\t" | |
667 | "paddusw %%mm1, %%mm2\n\t" | |
668 | "movq %3, %%mm5\n\t" | |
669 | "psrlw $2, %%mm0\n\t" | |
670 | "movq %0, %%mm1\n\t" | |
671 | "psrlw $2, %%mm2\n\t" | |
672 | "movq %%mm1, %%mm3\n\t" | |
673 | "punpcklbw %%mm7, %%mm1\n\t" | |
674 | "punpckhbw %%mm7, %%mm3\n\t" | |
675 | "paddusw %%mm1, %%mm0\n\t" | |
676 | "paddusw %%mm3, %%mm2\n\t" | |
677 | "paddusw %%mm5, %%mm0\n\t" | |
678 | "paddusw %%mm5, %%mm2\n\t" | |
679 | "psrlw $1, %%mm0\n\t" | |
680 | "psrlw $1, %%mm2\n\t" | |
681 | "packuswb %%mm2, %%mm0\n\t" | |
682 | "movq %%mm0, %0\n\t" | |
a822a479 | 683 | :"+m"(*p) |
de6d9b64 | 684 | :"m"(*pix), |
a9b3f630 | 685 | "m"(*(pix+line_size)), "m"(mm_wone) |
de6d9b64 FB |
686 | :"memory"); |
687 | pix += line_size; | |
688 | p += line_size ; | |
689 | } while(--h); | |
de6d9b64 FB |
690 | } |
691 | ||
/*
 * Average `pixels` into `block` without a rounding term: for each of `h`
 * rows and each of the 8 bytes in the row,
 *     block[x] = (block[x] + pixels[x]) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
692 | static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
693 | { | |
694 | UINT8 *p; | |
695 | const UINT8 *pix; | |
696 | p = block; | |
697 | pix = pixels; | |
/* mm7 = 0, used to zero-extend bytes to words */
d6a4c0b1 | 698 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
699 | do { |
700 | __asm __volatile( | |
701 | "movq %1, %%mm0\n\t" | |
702 | "movq %0, %%mm1\n\t" | |
703 | "movq %%mm0, %%mm2\n\t" | |
704 | "movq %%mm1, %%mm3\n\t" | |
705 | "punpcklbw %%mm7, %%mm0\n\t" | |
706 | "punpcklbw %%mm7, %%mm1\n\t" | |
707 | "punpckhbw %%mm7, %%mm2\n\t" | |
708 | "punpckhbw %%mm7, %%mm3\n\t" | |
709 | "paddusw %%mm1, %%mm0\n\t" | |
710 | "paddusw %%mm3, %%mm2\n\t" | |
711 | "psrlw $1, %%mm0\n\t" | |
712 | "psrlw $1, %%mm2\n\t" | |
713 | "packuswb %%mm2, %%mm0\n\t" | |
714 | "movq %%mm0, %0\n\t" | |
a822a479 | 715 | :"+m"(*p) |
de6d9b64 FB |
716 | :"m"(*pix) |
717 | :"memory"); | |
718 | pix += line_size; | |
719 | p += line_size ; | |
720 | } while (--h); | |
de6d9b64 FB |
721 | } |
722 | ||
/*
 * Horizontal half-pel interpolation averaged into `block`, without any
 * rounding term.  For each of `h` rows and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+1]) >> 1
 *     block[x] = (block[x] + t) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
723 | static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
724 | { | |
725 | UINT8 *p; | |
726 | const UINT8 *pix; | |
727 | p = block; | |
728 | pix = pixels; | |
/* mm7 = 0, used to zero-extend bytes to words */
d6a4c0b1 | 729 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
730 | do { |
731 | __asm __volatile( | |
732 | "movq %1, %%mm0\n\t" | |
733 | "movq 1%1, %%mm1\n\t" | |
734 | "movq %0, %%mm4\n\t" | |
735 | "movq %%mm0, %%mm2\n\t" | |
736 | "movq %%mm1, %%mm3\n\t" | |
737 | "movq %%mm4, %%mm5\n\t" | |
738 | "punpcklbw %%mm7, %%mm0\n\t" | |
739 | "punpcklbw %%mm7, %%mm1\n\t" | |
740 | "punpckhbw %%mm7, %%mm2\n\t" | |
741 | "punpckhbw %%mm7, %%mm3\n\t" | |
742 | "punpcklbw %%mm7, %%mm4\n\t" | |
743 | "punpckhbw %%mm7, %%mm5\n\t" | |
744 | "paddusw %%mm1, %%mm0\n\t" | |
745 | "paddusw %%mm3, %%mm2\n\t" | |
746 | "psrlw $1, %%mm0\n\t" | |
747 | "psrlw $1, %%mm2\n\t" | |
748 | "paddusw %%mm4, %%mm0\n\t" | |
749 | "paddusw %%mm5, %%mm2\n\t" | |
750 | "psrlw $1, %%mm0\n\t" | |
751 | "psrlw $1, %%mm2\n\t" | |
752 | "packuswb %%mm2, %%mm0\n\t" | |
753 | "movq %%mm0, %0\n\t" | |
a822a479 | 754 | :"+m"(*p) |
de6d9b64 FB |
755 | :"m"(*pix) |
756 | :"memory"); | |
757 | pix += line_size; | |
758 | p += line_size; | |
759 | } while (--h); | |
de6d9b64 FB |
760 | } |
761 | ||
/*
 * Vertical half-pel interpolation averaged into `block`, without any
 * rounding term.  For each of `h` rows and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+line_size]) >> 1
 *     block[x] = (block[x] + t) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
762 | static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
763 | { | |
764 | UINT8 *p; | |
765 | const UINT8 *pix; | |
766 | p = block; | |
767 | pix = pixels; | |
/* mm7 = 0, used to zero-extend bytes to words */
d6a4c0b1 | 768 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
769 | do { |
770 | __asm __volatile( | |
771 | "movq %1, %%mm0\n\t" | |
772 | "movq %2, %%mm1\n\t" | |
773 | "movq %0, %%mm4\n\t" | |
774 | "movq %%mm0, %%mm2\n\t" | |
775 | "movq %%mm1, %%mm3\n\t" | |
776 | "movq %%mm4, %%mm5\n\t" | |
777 | "punpcklbw %%mm7, %%mm0\n\t" | |
778 | "punpcklbw %%mm7, %%mm1\n\t" | |
779 | "punpckhbw %%mm7, %%mm2\n\t" | |
780 | "punpckhbw %%mm7, %%mm3\n\t" | |
781 | "punpcklbw %%mm7, %%mm4\n\t" | |
782 | "punpckhbw %%mm7, %%mm5\n\t" | |
783 | "paddusw %%mm1, %%mm0\n\t" | |
784 | "paddusw %%mm3, %%mm2\n\t" | |
785 | "psrlw $1, %%mm0\n\t" | |
786 | "psrlw $1, %%mm2\n\t" | |
787 | "paddusw %%mm4, %%mm0\n\t" | |
788 | "paddusw %%mm5, %%mm2\n\t" | |
789 | "psrlw $1, %%mm0\n\t" | |
790 | "psrlw $1, %%mm2\n\t" | |
791 | "packuswb %%mm2, %%mm0\n\t" | |
792 | "movq %%mm0, %0\n\t" | |
a822a479 | 793 | :"+m"(*p) |
de6d9b64 FB |
794 | :"m"(*pix), "m"(*(pix+line_size)) |
795 | :"memory"); | |
796 | pix += line_size; | |
797 | p += line_size ; | |
798 | } while(--h); | |
de6d9b64 FB |
799 | } |
800 | ||
/*
 * Diagonal half-pel interpolation ("no rounding" variant, bias 1)
 * averaged into `block` without a rounding term.  For each of `h` rows
 * and each of the 8 bytes:
 *     t        = (pixels[x] + pixels[x+1] +
 *                 pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2
 *     block[x] = (block[x] + t) >> 1
 * Rows of both buffers are `line_size` bytes apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
801 | static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
802 | { | |
803 | UINT8 *p; | |
804 | const UINT8 *pix; | |
805 | p = block; | |
806 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (bias) */
807 | MOVQ_ZERO(mm7); |
808 | MOVQ_WONE(mm6); | |
809 | JUMPALIGN(); | |
de6d9b64 FB |
810 | do { |
811 | __asm __volatile( | |
812 | "movq %1, %%mm0\n\t" | |
813 | "movq %2, %%mm1\n\t" | |
814 | "movq 1%1, %%mm4\n\t" | |
815 | "movq 1%2, %%mm5\n\t" | |
816 | "movq %%mm0, %%mm2\n\t" | |
817 | "movq %%mm1, %%mm3\n\t" | |
818 | "punpcklbw %%mm7, %%mm0\n\t" | |
819 | "punpcklbw %%mm7, %%mm1\n\t" | |
820 | "punpckhbw %%mm7, %%mm2\n\t" | |
821 | "punpckhbw %%mm7, %%mm3\n\t" | |
822 | "paddusw %%mm1, %%mm0\n\t" | |
823 | "paddusw %%mm3, %%mm2\n\t" | |
824 | "movq %%mm4, %%mm1\n\t" | |
825 | "movq %%mm5, %%mm3\n\t" | |
826 | "punpcklbw %%mm7, %%mm4\n\t" | |
827 | "punpcklbw %%mm7, %%mm5\n\t" | |
828 | "punpckhbw %%mm7, %%mm1\n\t" | |
829 | "punpckhbw %%mm7, %%mm3\n\t" | |
830 | "paddusw %%mm5, %%mm4\n\t" | |
831 | "paddusw %%mm3, %%mm1\n\t" | |
832 | "paddusw %%mm6, %%mm4\n\t" | |
833 | "paddusw %%mm6, %%mm1\n\t" | |
834 | "paddusw %%mm4, %%mm0\n\t" | |
835 | "paddusw %%mm1, %%mm2\n\t" | |
836 | "movq %0, %%mm1\n\t" | |
837 | "psrlw $2, %%mm0\n\t" | |
838 | "movq %%mm1, %%mm3\n\t" | |
839 | "psrlw $2, %%mm2\n\t" | |
840 | "punpcklbw %%mm7, %%mm1\n\t" | |
841 | "punpckhbw %%mm7, %%mm3\n\t" | |
842 | "paddusw %%mm1, %%mm0\n\t" | |
843 | "paddusw %%mm3, %%mm2\n\t" | |
844 | "psrlw $1, %%mm0\n\t" | |
845 | "psrlw $1, %%mm2\n\t" | |
846 | "packuswb %%mm2, %%mm0\n\t" | |
847 | "movq %%mm0, %0\n\t" | |
a822a479 | 848 | :"+m"(*p) |
de6d9b64 FB |
849 | :"m"(*pix), |
850 | "m"(*(pix+line_size)) | |
851 | :"memory"); | |
852 | pix += line_size; | |
853 | p += line_size; | |
854 | } while(--h); | |
de6d9b64 FB |
855 | } |
856 | ||
/*
 * Subtract pixels from coefficients in place: for each of `h` rows and
 * each of the 8 entries in the row,
 *     block[x] = block[x] - pixels[x]      (signed saturating, psubsw)
 * `pixels` rows are `line_size` bytes apart; `block` rows are 8
 * coefficients apart (16 bytes are read and written per row).
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
857 | static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
858 | { | |
859 | DCTELEM *p; | |
860 | const UINT8 *pix; | |
861 | p = block; | |
862 | pix = pixels; | |
/* mm7 = 0, used to zero-extend the pixel bytes to words */
d6a4c0b1 | 863 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
864 | do { |
865 | __asm __volatile( | |
866 | "movq %0, %%mm0\n\t" | |
867 | "movq %1, %%mm2\n\t" | |
868 | "movq 8%0, %%mm1\n\t" | |
869 | "movq %%mm2, %%mm3\n\t" | |
870 | "punpcklbw %%mm7, %%mm2\n\t" | |
871 | "punpckhbw %%mm7, %%mm3\n\t" | |
872 | "psubsw %%mm2, %%mm0\n\t" | |
873 | "psubsw %%mm3, %%mm1\n\t" | |
874 | "movq %%mm0, %0\n\t" | |
875 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 876 | :"+m"(*p) |
de6d9b64 FB |
877 | :"m"(*pix) |
878 | :"memory"); | |
879 | pix += line_size; | |
880 | p += 8; | |
881 | } while (--h); | |
de6d9b64 FB |
882 | } |
883 | ||
/*
 * Subtract a horizontally interpolated prediction from coefficients in
 * place.  For each of `h` rows and each of the 8 entries in the row:
 *     t        = (pixels[x] + pixels[x+1] + 1) >> 1
 *     block[x] = block[x] - t              (signed saturating, psubsw)
 * `pixels` rows are `line_size` bytes apart; `block` rows are 8
 * coefficients apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
884 | static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
885 | { | |
886 | DCTELEM *p; | |
887 | const UINT8 *pix; | |
888 | p = block; | |
889 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (rounding term) */
890 | MOVQ_ZERO(mm7); |
891 | MOVQ_WONE(mm6); | |
892 | JUMPALIGN(); | |
de6d9b64 FB |
893 | do { |
894 | __asm __volatile( | |
895 | "movq %0, %%mm0\n\t" | |
896 | "movq %1, %%mm2\n\t" | |
897 | "movq 8%0, %%mm1\n\t" | |
898 | "movq 1%1, %%mm4\n\t" | |
899 | "movq %%mm2, %%mm3\n\t" | |
900 | "movq %%mm4, %%mm5\n\t" | |
901 | "punpcklbw %%mm7, %%mm2\n\t" | |
902 | "punpckhbw %%mm7, %%mm3\n\t" | |
903 | "punpcklbw %%mm7, %%mm4\n\t" | |
904 | "punpckhbw %%mm7, %%mm5\n\t" | |
905 | "paddusw %%mm4, %%mm2\n\t" | |
906 | "paddusw %%mm5, %%mm3\n\t" | |
907 | "paddusw %%mm6, %%mm2\n\t" | |
908 | "paddusw %%mm6, %%mm3\n\t" | |
909 | "psrlw $1, %%mm2\n\t" | |
910 | "psrlw $1, %%mm3\n\t" | |
911 | "psubsw %%mm2, %%mm0\n\t" | |
912 | "psubsw %%mm3, %%mm1\n\t" | |
913 | "movq %%mm0, %0\n\t" | |
914 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 915 | :"+m"(*p) |
de6d9b64 FB |
916 | :"m"(*pix) |
917 | :"memory"); | |
918 | pix += line_size; | |
919 | p += 8; | |
920 | } while (--h); | |
de6d9b64 FB |
921 | } |
922 | ||
/*
 * Subtract a vertically interpolated prediction from coefficients in
 * place.  For each of `h` rows and each of the 8 entries in the row:
 *     t        = (pixels[x] + pixels[x+line_size] + 1) >> 1
 *     block[x] = block[x] - t              (signed saturating, psubsw)
 * `pixels` rows are `line_size` bytes apart; `block` rows are 8
 * coefficients apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
923 | static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
924 | { | |
925 | DCTELEM *p; | |
926 | const UINT8 *pix; | |
927 | p = block; | |
928 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 1 (rounding term) */
929 | MOVQ_ZERO(mm7); |
930 | MOVQ_WONE(mm6); | |
de6d9b64 FB |
931 | do { |
932 | __asm __volatile( | |
933 | "movq %0, %%mm0\n\t" | |
934 | "movq %1, %%mm2\n\t" | |
935 | "movq 8%0, %%mm1\n\t" | |
936 | "movq %2, %%mm4\n\t" | |
937 | "movq %%mm2, %%mm3\n\t" | |
938 | "movq %%mm4, %%mm5\n\t" | |
939 | "punpcklbw %%mm7, %%mm2\n\t" | |
940 | "punpckhbw %%mm7, %%mm3\n\t" | |
941 | "punpcklbw %%mm7, %%mm4\n\t" | |
942 | "punpckhbw %%mm7, %%mm5\n\t" | |
943 | "paddusw %%mm4, %%mm2\n\t" | |
944 | "paddusw %%mm5, %%mm3\n\t" | |
945 | "paddusw %%mm6, %%mm2\n\t" | |
946 | "paddusw %%mm6, %%mm3\n\t" | |
947 | "psrlw $1, %%mm2\n\t" | |
948 | "psrlw $1, %%mm3\n\t" | |
949 | "psubsw %%mm2, %%mm0\n\t" | |
950 | "psubsw %%mm3, %%mm1\n\t" | |
951 | "movq %%mm0, %0\n\t" | |
952 | "movq %%mm1, 8%0\n\t" | |
a822a479 | 953 | :"+m"(*p) |
de6d9b64 FB |
954 | :"m"(*pix), "m"(*(pix+line_size)) |
955 | :"memory"); | |
956 | pix += line_size; | |
957 | p += 8; | |
958 | } while (--h); | |
de6d9b64 FB |
959 | } |
960 | ||
/*
 * Subtract a diagonally interpolated prediction from coefficients in
 * place.  For each of `h` rows and each of the 8 entries in the row:
 *     t        = (pixels[x] + pixels[x+1] +
 *                 pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 *     block[x] = block[x] - t              (signed saturating, psubsw)
 * `pixels` rows are `line_size` bytes apart; `block` rows are 8
 * coefficients apart.
 *
 * The loop is do/while(--h), so the body always runs at least once.
 */
961 | static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
962 | { | |
963 | DCTELEM *p; | |
964 | const UINT8 *pix; | |
965 | p = block; | |
966 | pix = pixels; | |
d6a4c0b1 ZK |
/* mm7 = 0 for zero-extension, mm6 = four words of 2 (rounding term) */
967 | MOVQ_ZERO(mm7); |
968 | MOVQ_WTWO(mm6); | |
969 | JUMPALIGN(); | |
de6d9b64 FB |
970 | do { |
971 | __asm __volatile( | |
972 | "movq %1, %%mm0\n\t" | |
973 | "movq %2, %%mm1\n\t" | |
974 | "movq 1%1, %%mm4\n\t" | |
975 | "movq 1%2, %%mm5\n\t" | |
976 | "movq %%mm0, %%mm2\n\t" | |
977 | "movq %%mm1, %%mm3\n\t" | |
978 | "punpcklbw %%mm7, %%mm0\n\t" | |
979 | "punpcklbw %%mm7, %%mm1\n\t" | |
980 | "punpckhbw %%mm7, %%mm2\n\t" | |
981 | "punpckhbw %%mm7, %%mm3\n\t" | |
982 | "paddusw %%mm1, %%mm0\n\t" | |
983 | "paddusw %%mm3, %%mm2\n\t" | |
984 | "movq %%mm4, %%mm1\n\t" | |
985 | "movq %%mm5, %%mm3\n\t" | |
986 | "punpcklbw %%mm7, %%mm4\n\t" | |
987 | "punpcklbw %%mm7, %%mm5\n\t" | |
988 | "punpckhbw %%mm7, %%mm1\n\t" | |
989 | "punpckhbw %%mm7, %%mm3\n\t" | |
990 | "paddusw %%mm5, %%mm4\n\t" | |
991 | "paddusw %%mm3, %%mm1\n\t" | |
992 | "paddusw %%mm6, %%mm4\n\t" | |
993 | "paddusw %%mm6, %%mm1\n\t" | |
994 | "paddusw %%mm4, %%mm0\n\t" | |
995 | "paddusw %%mm1, %%mm2\n\t" | |
996 | "movq %0, %%mm1\n\t" | |
997 | "movq 8%0, %%mm3\n\t" | |
998 | "psrlw $2, %%mm0\n\t" | |
999 | "psrlw $2, %%mm2\n\t" | |
1000 | "psubsw %%mm0, %%mm1\n\t" | |
1001 | "psubsw %%mm2, %%mm3\n\t" | |
1002 | "movq %%mm1, %0\n\t" | |
1003 | "movq %%mm3, 8%0\n\t" | |
a822a479 | 1004 | :"+m"(*p) |
de6d9b64 FB |
1005 | :"m"(*pix), |
1006 | "m"(*(pix+line_size)) | |
1007 | :"memory"); | |
1008 | pix += line_size; | |
1009 | p += 8 ; | |
1010 | } while(--h); | |
de6d9b64 FB |
1011 | } |
1012 | ||
d6a4c0b1 ZK |
/*
 * just_return: does nothing and returns immediately. In the visible part of
 * this file it is referenced only from the disabled "#if 0" speed-testing
 * section of dsputil_init_mmx, where it replaces every DSP routine.
 * It is declared with an empty (unprototyped) parameter list.
 */
1013 | static void just_return() { return; } | |
1014 | ||
de6d9b64 FB |
/*
 * dsputil_init_mmx: query the CPU's multimedia capabilities and point the
 * DSP function pointers and tables at the matching x86 implementations.
 *
 * Takes no arguments and returns nothing. Side effects: stores the result of
 * mm_support() in the global mm_flags, prints the detected flags with
 * printf (compiled in by the "#if 1" below), and, when MM_MMX is set,
 * overwrites the function pointers and table entries assigned below (all of
 * them are declared elsewhere). When MM_MMX is not set nothing is installed.
 *
 * Statement order matters: the plain MMX versions are installed first and
 * are then selectively overridden by the MMXEXT, SSE or 3DNow! versions.
 */
1015 | void dsputil_init_mmx(void) | |
1016 | { | |
1017 | mm_flags = mm_support(); | |
/* report every capability bit that is set, on a single line */
f4470e09 MN |
1018 | #if 1 |
1019 | printf("libavcodec: CPU flags:"); | |
de6d9b64 FB |
1020 | if (mm_flags & MM_MMX) |
1021 | printf(" mmx"); | |
1022 | if (mm_flags & MM_MMXEXT) | |
1023 | printf(" mmxext"); | |
1024 | if (mm_flags & MM_3DNOW) | |
1025 | printf(" 3dnow"); | |
1026 | if (mm_flags & MM_SSE) | |
1027 | printf(" sse"); | |
1028 | if (mm_flags & MM_SSE2) | |
1029 | printf(" sse2"); | |
1030 | printf("\n"); | |
1031 | #endif | |
1032 | ||
/* baseline: every routine below gets its plain MMX implementation */
1033 | if (mm_flags & MM_MMX) { | |
1034 | get_pixels = get_pixels_mmx; | |
1035 | put_pixels_clamped = put_pixels_clamped_mmx; | |
1036 | add_pixels_clamped = add_pixels_clamped_mmx; | |
1037 | ||
1038 | pix_abs16x16 = pix_abs16x16_mmx; | |
1039 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1040 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
1041 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
1042 | av_fdct = fdct_mmx; | |
1043 | ||
/* in each table, entries 0..3 receive the plain, _x2, _y2 and _xy2
   variants, as the names assigned show */
1044 | put_pixels_tab[0] = put_pixels_mmx; | |
1045 | put_pixels_tab[1] = put_pixels_x2_mmx; | |
1046 | put_pixels_tab[2] = put_pixels_y2_mmx; | |
1047 | put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1048 | ||
/* entry 0 reuses put_pixels_mmx rather than a separate no_rnd routine */
1049 | put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1050 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1051 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1052 | put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1053 | ||
1054 | avg_pixels_tab[0] = avg_pixels_mmx; | |
1055 | avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1056 | avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1057 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1058 | ||
1059 | avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1060 | avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1061 | avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1062 | avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1063 | ||
1064 | sub_pixels_tab[0] = sub_pixels_mmx; | |
1065 | sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1066 | sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1067 | sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1068 | ||
/* MMXEXT override: note the routine selected here carries an _sse suffix
   although it is gated on MM_MMXEXT */
1069 | if (mm_flags & MM_MMXEXT) { | |
1070 | pix_abs16x16 = pix_abs16x16_sse; | |
1071 | } | |
1072 | ||
/* SSE overrides take precedence; the 3DNow! set is used only when MM_SSE
   is not set. Both branches replace the same entries and leave
   put_pixels_tab[0], put_pixels_tab[3], sub_pixels_tab[0],
   sub_pixels_tab[3] and both no_rnd tables on the MMX versions. */
1073 | if (mm_flags & MM_SSE) { | |
1074 | put_pixels_tab[1] = put_pixels_x2_sse; | |
1075 | put_pixels_tab[2] = put_pixels_y2_sse; | |
1076 | ||
1077 | avg_pixels_tab[0] = avg_pixels_sse; | |
1078 | avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1079 | avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1080 | avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1081 | ||
1082 | sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1083 | sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1084 | } else if (mm_flags & MM_3DNOW) { | |
1085 | put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1086 | put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1087 | ||
1088 | avg_pixels_tab[0] = avg_pixels_3dnow; | |
1089 | avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1090 | avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1091 | avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1092 | ||
1093 | sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1094 | sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1095 | } | |
4af7bcc1 | 1096 |
8def0299 FB |
1097 | /* idct */ |
1098 | if (mm_flags & MM_MMXEXT) { | |
1099 | ff_idct = ff_mmxext_idct; | |
1100 | } else { | |
1101 | ff_idct = ff_mmx_idct; | |
1102 | } | |
/* when SIMPLE_IDCT is defined, the choice made just above is overwritten
   unconditionally with simple_idct_mmx */
d962f6fd A |
1103 | #ifdef SIMPLE_IDCT |
1104 | // ff_idct = simple_idct; | |
1105 | ff_idct = simple_idct_mmx; | |
1106 | #endif | |
de6d9b64 | 1107 | } |
d6a4c0b1 ZK |
1108 | |
/* Disabled benchmarking aid: if enabled it would replace every routine with
   the no-op just_return, regardless of mm_flags (this section sits outside
   the MM_MMX test above). */
1109 | #if 0 | |
1110 | // for speed testing | |
1111 | get_pixels = just_return; | |
1112 | put_pixels_clamped = just_return; | |
1113 | add_pixels_clamped = just_return; | |
1114 | ||
1115 | pix_abs16x16 = just_return; | |
1116 | pix_abs16x16_x2 = just_return; | |
1117 | pix_abs16x16_y2 = just_return; | |
1118 | pix_abs16x16_xy2 = just_return; | |
1119 | ||
1120 | put_pixels_tab[0] = just_return; | |
1121 | put_pixels_tab[1] = just_return; | |
1122 | put_pixels_tab[2] = just_return; | |
1123 | put_pixels_tab[3] = just_return; | |
1124 | ||
1125 | put_no_rnd_pixels_tab[0] = just_return; | |
1126 | put_no_rnd_pixels_tab[1] = just_return; | |
1127 | put_no_rnd_pixels_tab[2] = just_return; | |
1128 | put_no_rnd_pixels_tab[3] = just_return; | |
1129 | ||
1130 | avg_pixels_tab[0] = just_return; | |
1131 | avg_pixels_tab[1] = just_return; | |
1132 | avg_pixels_tab[2] = just_return; | |
1133 | avg_pixels_tab[3] = just_return; | |
1134 | ||
1135 | avg_no_rnd_pixels_tab[0] = just_return; | |
1136 | avg_no_rnd_pixels_tab[1] = just_return; | |
1137 | avg_no_rnd_pixels_tab[2] = just_return; | |
1138 | avg_no_rnd_pixels_tab[3] = just_return; | |
1139 | ||
1140 | sub_pixels_tab[0] = just_return; | |
1141 | sub_pixels_tab[1] = just_return; | |
1142 | sub_pixels_tab[2] = just_return; | |
1143 | sub_pixels_tab[3] = just_return; | |
1144 | ||
1145 | //av_fdct = just_return; | |
1146 | //ff_idct = just_return; | |
1147 | #endif | |
de6d9b64 | 1148 | } |