Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
ff4ec49e | 3 | * Copyright (c) 2000, 2001 Fabrice Bellard. |
de6d9b64 | 4 | * |
ff4ec49e FB |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public | |
7 | * License as published by the Free Software Foundation; either | |
8 | * version 2 of the License, or (at your option) any later version. | |
de6d9b64 | 9 | * |
ff4ec49e | 10 | * This library is distributed in the hope that it will be useful, |
de6d9b64 | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ff4ec49e FB |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. | |
de6d9b64 | 14 | * |
ff4ec49e FB |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this library; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
de6d9b64 FB |
18 | * |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
d962f6fd | 23 | #include "../simple_idct.h" |
de6d9b64 | 24 | |
7d650cb5 FB |
25 | int mm_flags; /* multimedia extension flags */ |
26 | ||
ba6802de MN |
27 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | ||
32 | int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 | int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 | int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 | int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | ||
37 | int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 | int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 | int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 | int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | ||
42 | int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 | int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 | int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 | int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | ||
8def0299 FB |
47 | /* external functions, from idct_mmx.c */ |
48 | void ff_mmx_idct(DCTELEM *block); | |
49 | void ff_mmxext_idct(DCTELEM *block); | |
4af7bcc1 | 50 | |
de6d9b64 | 51 | /* pixel operations */ |
/*
 * 64-bit constants used by the MMX pixel routines.
 *
 * mm_bfe and mm_bone are referenced only by symbol name from inside asm
 * strings (through MANGLE() in the non-PIC MOVQ_BFE / MOVQ_BONE macros),
 * which the compiler cannot see.  __attribute__((used)) stops an optimizing
 * compiler from discarding them as unreferenced statics; it is applied to
 * all four constants for uniformity.
 */
static const uint64_t mm_bfe  __attribute__ ((used, aligned(8))) = 0xfefefefefefefefeULL; /* 0xfe in every byte   */
static const uint64_t mm_bone __attribute__ ((used, aligned(8))) = 0x0101010101010101ULL; /* 0x01 in every byte   */
static const uint64_t mm_wone __attribute__ ((used, aligned(8))) = 0x0001000100010001ULL; /* 1 in every 16-bit word */
static const uint64_t mm_wtwo __attribute__ ((used, aligned(8))) = 0x0002000200020002ULL; /* 2 in every 16-bit word */
de6d9b64 | 58 | |
/* Emit an 8-byte alignment directive at the current code position. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear MMX register regd by xor-ing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * Constant loaders.
 *   MOVQ_WONE / MOVQ_WTWO are complete asm statements taking a bare
 *   register name; MOVQ_BONE / MOVQ_BFE expand to asm string fragments
 *   meant to be pasted into a larger asm statement.
 */
#ifdef PIC
// for shared library it's better to synthesize the constants in the
// register instead of loading them from memory
// pcmpeqd reg, reg -> all bits set (-1)

/* -1 >> 15 in every word -> 0x0001 per word */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* (-1 >> 15) << 1 in every word -> 0x0002 per word */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)

/* words of 0x0001 packed down to bytes -> 0x01 per byte */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"

/* 0xff + 0xff per byte (wrapping) -> 0xfe per byte */
#define MOVQ_BFE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t"\
    "paddb " #regd ", " #regd " \n\t"
#else
/* non-PIC: load the constants straight from the static variables */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
#define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t"
#endif
90 | ||
/*
 * Byte-wise average of two MMX registers using only plain MMX instructions.
 * Both macros expand to asm string fragments.
 *
 *   rega - first source, left unmodified
 *   regb - second source, trashed (used as scratch)
 *   regr - receives the result (the PAVGB() wrappers defined below pass %%mm6)
 *
 * mm7 is supposed to contain 0xfefefefefefefefe: masking with it before the
 * 64-bit shift keeps a byte's low bit from leaking into its neighbour.
 */

/* no rounding: regr = (a & b) + (((a ^ b) & mm7) >> 1) */
#define PAVGB_MMX_NO_RND(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm7, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* rounding up: regr = (a | b) - (((a ^ b) & mm7) >> 1) */
#define PAVGB_MMX(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm7, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"
def60345 | 109 | |
91abb473 ZK |
110 | /***********************************/ |
111 | /* MMX no rounding */ | |
112 | #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx | |
113 | ||
114 | #define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6) | |
115 | #define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c) | |
116 | #include "dsputil_mmx_rnd.h" | |
117 | ||
118 | #undef DEF | |
119 | #undef PAVGB | |
120 | #undef PAVGBR | |
121 | /***********************************/ | |
122 | /* MMX rounding */ | |
123 | ||
124 | #define DEF(x, y) x ## _ ## y ##_mmx | |
125 | ||
126 | #define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6) | |
127 | #define PAVGBR(a, b, c) PAVGB_MMX(a, b, c) | |
128 | #include "dsputil_mmx_rnd.h" | |
129 | ||
130 | #undef DEF | |
131 | #undef PAVGB | |
132 | #undef PAVGBR | |
a7bd8797 | 133 | |
de6d9b64 FB |
134 | /***********************************/ |
135 | /* 3Dnow specific */ | |
136 | ||
137 | #define DEF(x) x ## _3dnow | |
138 | /* for Athlons PAVGUSB is preferred */ | |
139 | #define PAVGB "pavgusb" | |
140 | ||
141 | #include "dsputil_mmx_avg.h" | |
142 | ||
143 | #undef DEF | |
144 | #undef PAVGB | |
145 | ||
146 | /***********************************/ | |
147 | /* MMX2 specific */ | |
148 | ||
607dce96 | 149 | #define DEF(x) x ## _mmx2 |
de6d9b64 FB |
150 | |
151 | /* Introduced only in MMX2 set */ | |
152 | #define PAVGB "pavgb" | |
153 | ||
154 | #include "dsputil_mmx_avg.h" | |
155 | ||
156 | #undef DEF | |
157 | #undef PAVGB | |
158 | ||
159 | /***********************************/ | |
160 | /* standard MMX */ | |
161 | ||
162 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
163 | { | |
607dce96 MN |
164 | asm volatile( |
165 | "movl $-128, %%eax \n\t" | |
166 | "pxor %%mm7, %%mm7 \n\t" | |
167 | ".balign 16 \n\t" | |
168 | "1: \n\t" | |
169 | "movq (%0), %%mm0 \n\t" | |
170 | "movq (%0, %2), %%mm2 \n\t" | |
171 | "movq %%mm0, %%mm1 \n\t" | |
172 | "movq %%mm2, %%mm3 \n\t" | |
173 | "punpcklbw %%mm7, %%mm0 \n\t" | |
174 | "punpckhbw %%mm7, %%mm1 \n\t" | |
175 | "punpcklbw %%mm7, %%mm2 \n\t" | |
176 | "punpckhbw %%mm7, %%mm3 \n\t" | |
177 | "movq %%mm0, (%1, %%eax)\n\t" | |
178 | "movq %%mm1, 8(%1, %%eax)\n\t" | |
179 | "movq %%mm2, 16(%1, %%eax)\n\t" | |
180 | "movq %%mm3, 24(%1, %%eax)\n\t" | |
181 | "addl %3, %0 \n\t" | |
182 | "addl $32, %%eax \n\t" | |
183 | "js 1b \n\t" | |
184 | : "+r" (pixels) | |
185 | : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
186 | : "%eax" | |
187 | ); | |
de6d9b64 FB |
188 | } |
189 | ||
9dbcbd92 MN |
190 | static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
191 | { | |
192 | asm volatile( | |
607dce96 | 193 | "pxor %%mm7, %%mm7 \n\t" |
9dbcbd92 | 194 | "movl $-128, %%eax \n\t" |
607dce96 | 195 | ".balign 16 \n\t" |
9dbcbd92 MN |
196 | "1: \n\t" |
197 | "movq (%0), %%mm0 \n\t" | |
198 | "movq (%1), %%mm2 \n\t" | |
199 | "movq %%mm0, %%mm1 \n\t" | |
200 | "movq %%mm2, %%mm3 \n\t" | |
201 | "punpcklbw %%mm7, %%mm0 \n\t" | |
202 | "punpckhbw %%mm7, %%mm1 \n\t" | |
203 | "punpcklbw %%mm7, %%mm2 \n\t" | |
204 | "punpckhbw %%mm7, %%mm3 \n\t" | |
205 | "psubw %%mm2, %%mm0 \n\t" | |
206 | "psubw %%mm3, %%mm1 \n\t" | |
207 | "movq %%mm0, (%2, %%eax)\n\t" | |
208 | "movq %%mm1, 8(%2, %%eax)\n\t" | |
209 | "addl %3, %0 \n\t" | |
210 | "addl %3, %1 \n\t" | |
211 | "addl $16, %%eax \n\t" | |
212 | "jnz 1b \n\t" | |
213 | : "+r" (s1), "+r" (s2) | |
214 | : "r" (block+64), "r" (stride) | |
215 | : "%eax" | |
216 | ); | |
217 | } | |
218 | ||
de6d9b64 FB |
219 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
220 | { | |
221 | const DCTELEM *p; | |
222 | UINT8 *pix; | |
de6d9b64 FB |
223 | |
224 | /* read the pixels */ | |
225 | p = block; | |
226 | pix = pixels; | |
d6a4c0b1 | 227 | /* unrolled loop */ |
de6d9b64 | 228 | __asm __volatile( |
a822a479 NK |
229 | "movq %3, %%mm0\n\t" |
230 | "movq 8%3, %%mm1\n\t" | |
231 | "movq 16%3, %%mm2\n\t" | |
232 | "movq 24%3, %%mm3\n\t" | |
233 | "movq 32%3, %%mm4\n\t" | |
234 | "movq 40%3, %%mm5\n\t" | |
235 | "movq 48%3, %%mm6\n\t" | |
236 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
237 | "packuswb %%mm1, %%mm0\n\t" |
238 | "packuswb %%mm3, %%mm2\n\t" | |
239 | "packuswb %%mm5, %%mm4\n\t" | |
240 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
241 | "movq %%mm0, (%0)\n\t" |
242 | "movq %%mm2, (%0, %1)\n\t" | |
243 | "movq %%mm4, (%0, %1, 2)\n\t" | |
244 | "movq %%mm6, (%0, %2)\n\t" | |
245 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
246 | :"memory"); |
247 | pix += line_size*4; | |
248 | p += 32; | |
d6a4c0b1 ZK |
249 | |
250 | // if here would be an exact copy of the code above | |
251 | // compiler would generate some very strange code | |
252 | // thus using "r" | |
253 | __asm __volatile( | |
254 | "movq (%3), %%mm0\n\t" | |
255 | "movq 8(%3), %%mm1\n\t" | |
256 | "movq 16(%3), %%mm2\n\t" | |
257 | "movq 24(%3), %%mm3\n\t" | |
258 | "movq 32(%3), %%mm4\n\t" | |
259 | "movq 40(%3), %%mm5\n\t" | |
260 | "movq 48(%3), %%mm6\n\t" | |
261 | "movq 56(%3), %%mm7\n\t" | |
262 | "packuswb %%mm1, %%mm0\n\t" | |
263 | "packuswb %%mm3, %%mm2\n\t" | |
264 | "packuswb %%mm5, %%mm4\n\t" | |
265 | "packuswb %%mm7, %%mm6\n\t" | |
266 | "movq %%mm0, (%0)\n\t" | |
267 | "movq %%mm2, (%0, %1)\n\t" | |
268 | "movq %%mm4, (%0, %1, 2)\n\t" | |
269 | "movq %%mm6, (%0, %2)\n\t" | |
270 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
271 | :"memory"); | |
de6d9b64 FB |
272 | } |
273 | ||
274 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
275 | { | |
276 | const DCTELEM *p; | |
277 | UINT8 *pix; | |
278 | int i; | |
279 | ||
280 | /* read the pixels */ | |
281 | p = block; | |
282 | pix = pixels; | |
d6a4c0b1 ZK |
283 | MOVQ_ZERO(mm7); |
284 | i = 4; | |
cd8e5f96 | 285 | do { |
de6d9b64 | 286 | __asm __volatile( |
cd8e5f96 ZK |
287 | "movq (%2), %%mm0\n\t" |
288 | "movq 8(%2), %%mm1\n\t" | |
289 | "movq 16(%2), %%mm2\n\t" | |
290 | "movq 24(%2), %%mm3\n\t" | |
de6d9b64 FB |
291 | "movq %0, %%mm4\n\t" |
292 | "movq %1, %%mm6\n\t" | |
293 | "movq %%mm4, %%mm5\n\t" | |
294 | "punpcklbw %%mm7, %%mm4\n\t" | |
295 | "punpckhbw %%mm7, %%mm5\n\t" | |
296 | "paddsw %%mm4, %%mm0\n\t" | |
297 | "paddsw %%mm5, %%mm1\n\t" | |
298 | "movq %%mm6, %%mm5\n\t" | |
299 | "punpcklbw %%mm7, %%mm6\n\t" | |
300 | "punpckhbw %%mm7, %%mm5\n\t" | |
301 | "paddsw %%mm6, %%mm2\n\t" | |
302 | "paddsw %%mm5, %%mm3\n\t" | |
303 | "packuswb %%mm1, %%mm0\n\t" | |
304 | "packuswb %%mm3, %%mm2\n\t" | |
305 | "movq %%mm0, %0\n\t" | |
306 | "movq %%mm2, %1\n\t" | |
a822a479 | 307 | :"+m"(*pix), "+m"(*(pix+line_size)) |
cd8e5f96 | 308 | :"r"(p) |
de6d9b64 FB |
309 | :"memory"); |
310 | pix += line_size*2; | |
311 | p += 16; | |
cd8e5f96 | 312 | } while (--i); |
de6d9b64 FB |
313 | } |
314 | ||
/*
 * Copy h rows of 8 bytes from pixels to block; both use the same row
 * distance line_size.
 *
 * Four rows are copied per loop iteration and the loop ends only when the
 * row counter reaches exactly zero (subl $4 / jnz), so h is expected to be
 * a positive multiple of 4.
 */
static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    asm volatile
    (
        /* eax = 2 * line_size, the advance after each pair of rows */
        "lea (%3, %3), %%eax \n\t"
        ".balign 8 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
341 | ||
#if 0
/*
 * Disabled word-arithmetic versions of the xy2 (four-neighbour) put
 * routines.  NOTE(review): the names still used by dsputil_init_mmx are
 * presumably provided by the DEF() instantiations of dsputil_mmx_rnd.h
 * now - confirm before deleting this block.
 */

/* block[x] = (pix[x] + pix[x+1] + below[x] + below[x+1] + 2) >> 2,
   8 pixels per row, h rows */
static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels; // 1s
    MOVQ_ZERO(mm7);
    MOVQ_WTWO(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"
            "movq %2, %%mm1\n\t"
            "movq 1%1, %%mm4\n\t"
            "movq 1%2, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}

/* same as above but with a bias of 1 instead of 2 (mm6 = mm_wone) */
static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"
            "movq %2, %%mm1\n\t"
            "movq 1%1, %%mm4\n\t"
            "movq 1%2, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
#endif
de6d9b64 FB |
439 | static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
440 | { | |
441 | UINT8 *p; | |
442 | const UINT8 *pix; | |
443 | p = block; | |
444 | pix = pixels; | |
d6a4c0b1 ZK |
445 | MOVQ_ZERO(mm7); |
446 | MOVQ_WONE(mm6); | |
447 | JUMPALIGN(); | |
de6d9b64 FB |
448 | do { |
449 | __asm __volatile( | |
450 | "movq %0, %%mm0\n\t" | |
451 | "movq %1, %%mm1\n\t" | |
452 | "movq %%mm0, %%mm2\n\t" | |
453 | "movq %%mm1, %%mm3\n\t" | |
454 | "punpcklbw %%mm7, %%mm0\n\t" | |
455 | "punpcklbw %%mm7, %%mm1\n\t" | |
456 | "punpckhbw %%mm7, %%mm2\n\t" | |
457 | "punpckhbw %%mm7, %%mm3\n\t" | |
458 | "paddusw %%mm1, %%mm0\n\t" | |
459 | "paddusw %%mm3, %%mm2\n\t" | |
460 | "paddusw %%mm6, %%mm0\n\t" | |
461 | "paddusw %%mm6, %%mm2\n\t" | |
462 | "psrlw $1, %%mm0\n\t" | |
463 | "psrlw $1, %%mm2\n\t" | |
464 | "packuswb %%mm2, %%mm0\n\t" | |
465 | "movq %%mm0, %0\n\t" | |
a822a479 | 466 | :"+m"(*p) |
de6d9b64 FB |
467 | :"m"(*pix) |
468 | :"memory"); | |
469 | pix += line_size; | |
470 | p += line_size; | |
471 | } | |
472 | while (--h); | |
de6d9b64 FB |
473 | } |
474 | ||
475 | static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
476 | { | |
477 | UINT8 *p; | |
478 | const UINT8 *pix; | |
479 | p = block; | |
480 | pix = pixels; | |
d6a4c0b1 ZK |
481 | MOVQ_ZERO(mm7); |
482 | MOVQ_WONE(mm6); | |
483 | JUMPALIGN(); | |
de6d9b64 FB |
484 | do { |
485 | __asm __volatile( | |
486 | "movq %1, %%mm1\n\t" | |
487 | "movq %0, %%mm0\n\t" | |
488 | "movq 1%1, %%mm4\n\t" | |
489 | "movq %%mm0, %%mm2\n\t" | |
490 | "movq %%mm1, %%mm3\n\t" | |
491 | "movq %%mm4, %%mm5\n\t" | |
492 | "punpcklbw %%mm7, %%mm1\n\t" | |
493 | "punpckhbw %%mm7, %%mm3\n\t" | |
494 | "punpcklbw %%mm7, %%mm4\n\t" | |
495 | "punpckhbw %%mm7, %%mm5\n\t" | |
496 | "punpcklbw %%mm7, %%mm0\n\t" | |
497 | "punpckhbw %%mm7, %%mm2\n\t" | |
498 | "paddusw %%mm4, %%mm1\n\t" | |
499 | "paddusw %%mm5, %%mm3\n\t" | |
500 | "paddusw %%mm6, %%mm1\n\t" | |
501 | "paddusw %%mm6, %%mm3\n\t" | |
502 | "psrlw $1, %%mm1\n\t" | |
503 | "psrlw $1, %%mm3\n\t" | |
504 | "paddusw %%mm6, %%mm0\n\t" | |
505 | "paddusw %%mm6, %%mm2\n\t" | |
506 | "paddusw %%mm1, %%mm0\n\t" | |
507 | "paddusw %%mm3, %%mm2\n\t" | |
508 | "psrlw $1, %%mm0\n\t" | |
509 | "psrlw $1, %%mm2\n\t" | |
510 | "packuswb %%mm2, %%mm0\n\t" | |
511 | "movq %%mm0, %0\n\t" | |
a822a479 | 512 | :"+m"(*p) |
de6d9b64 FB |
513 | :"m"(*pix) |
514 | :"memory"); | |
515 | pix += line_size; | |
516 | p += line_size; | |
517 | } while (--h); | |
de6d9b64 FB |
518 | } |
519 | ||
520 | static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
521 | { | |
522 | UINT8 *p; | |
523 | const UINT8 *pix; | |
524 | p = block; | |
525 | pix = pixels; | |
d6a4c0b1 ZK |
526 | MOVQ_ZERO(mm7); |
527 | MOVQ_WONE(mm6); | |
528 | JUMPALIGN(); | |
de6d9b64 FB |
529 | do { |
530 | __asm __volatile( | |
531 | "movq %1, %%mm1\n\t" | |
532 | "movq %0, %%mm0\n\t" | |
533 | "movq %2, %%mm4\n\t" | |
534 | "movq %%mm0, %%mm2\n\t" | |
535 | "movq %%mm1, %%mm3\n\t" | |
536 | "movq %%mm4, %%mm5\n\t" | |
537 | "punpcklbw %%mm7, %%mm1\n\t" | |
538 | "punpckhbw %%mm7, %%mm3\n\t" | |
539 | "punpcklbw %%mm7, %%mm4\n\t" | |
540 | "punpckhbw %%mm7, %%mm5\n\t" | |
541 | "punpcklbw %%mm7, %%mm0\n\t" | |
542 | "punpckhbw %%mm7, %%mm2\n\t" | |
543 | "paddusw %%mm4, %%mm1\n\t" | |
544 | "paddusw %%mm5, %%mm3\n\t" | |
545 | "paddusw %%mm6, %%mm1\n\t" | |
546 | "paddusw %%mm6, %%mm3\n\t" | |
547 | "psrlw $1, %%mm1\n\t" | |
548 | "psrlw $1, %%mm3\n\t" | |
549 | "paddusw %%mm6, %%mm0\n\t" | |
550 | "paddusw %%mm6, %%mm2\n\t" | |
551 | "paddusw %%mm1, %%mm0\n\t" | |
552 | "paddusw %%mm3, %%mm2\n\t" | |
553 | "psrlw $1, %%mm0\n\t" | |
554 | "psrlw $1, %%mm2\n\t" | |
555 | "packuswb %%mm2, %%mm0\n\t" | |
556 | "movq %%mm0, %0\n\t" | |
a822a479 | 557 | :"+m"(*p) |
de6d9b64 FB |
558 | :"m"(*pix), "m"(*(pix+line_size)) |
559 | :"memory"); | |
560 | pix += line_size; | |
561 | p += line_size ; | |
562 | } while(--h); | |
de6d9b64 FB |
563 | } |
564 | ||
565 | static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
566 | { | |
567 | UINT8 *p; | |
568 | const UINT8 *pix; | |
569 | p = block; | |
570 | pix = pixels; | |
d6a4c0b1 ZK |
571 | MOVQ_ZERO(mm7); |
572 | // this doesn't seem to be used offten - so | |
573 | // the inside usage of mm_wone is not optimized | |
574 | MOVQ_WTWO(mm6); | |
de6d9b64 FB |
575 | do { |
576 | __asm __volatile( | |
577 | "movq %1, %%mm0\n\t" | |
578 | "movq %2, %%mm1\n\t" | |
579 | "movq 1%1, %%mm4\n\t" | |
580 | "movq 1%2, %%mm5\n\t" | |
581 | "movq %%mm0, %%mm2\n\t" | |
582 | "movq %%mm1, %%mm3\n\t" | |
583 | "punpcklbw %%mm7, %%mm0\n\t" | |
584 | "punpcklbw %%mm7, %%mm1\n\t" | |
585 | "punpckhbw %%mm7, %%mm2\n\t" | |
586 | "punpckhbw %%mm7, %%mm3\n\t" | |
587 | "paddusw %%mm1, %%mm0\n\t" | |
588 | "paddusw %%mm3, %%mm2\n\t" | |
589 | "movq %%mm4, %%mm1\n\t" | |
590 | "movq %%mm5, %%mm3\n\t" | |
591 | "punpcklbw %%mm7, %%mm4\n\t" | |
592 | "punpcklbw %%mm7, %%mm5\n\t" | |
593 | "punpckhbw %%mm7, %%mm1\n\t" | |
594 | "punpckhbw %%mm7, %%mm3\n\t" | |
595 | "paddusw %%mm5, %%mm4\n\t" | |
596 | "paddusw %%mm3, %%mm1\n\t" | |
597 | "paddusw %%mm6, %%mm4\n\t" | |
598 | "paddusw %%mm6, %%mm1\n\t" | |
599 | "paddusw %%mm4, %%mm0\n\t" | |
600 | "paddusw %%mm1, %%mm2\n\t" | |
601 | "movq %3, %%mm5\n\t" | |
602 | "psrlw $2, %%mm0\n\t" | |
603 | "movq %0, %%mm1\n\t" | |
604 | "psrlw $2, %%mm2\n\t" | |
605 | "movq %%mm1, %%mm3\n\t" | |
606 | "punpcklbw %%mm7, %%mm1\n\t" | |
607 | "punpckhbw %%mm7, %%mm3\n\t" | |
608 | "paddusw %%mm1, %%mm0\n\t" | |
609 | "paddusw %%mm3, %%mm2\n\t" | |
610 | "paddusw %%mm5, %%mm0\n\t" | |
611 | "paddusw %%mm5, %%mm2\n\t" | |
612 | "psrlw $1, %%mm0\n\t" | |
613 | "psrlw $1, %%mm2\n\t" | |
614 | "packuswb %%mm2, %%mm0\n\t" | |
615 | "movq %%mm0, %0\n\t" | |
a822a479 | 616 | :"+m"(*p) |
de6d9b64 | 617 | :"m"(*pix), |
a9b3f630 | 618 | "m"(*(pix+line_size)), "m"(mm_wone) |
de6d9b64 FB |
619 | :"memory"); |
620 | pix += line_size; | |
621 | p += line_size ; | |
622 | } while(--h); | |
de6d9b64 FB |
623 | } |
624 | ||
625 | static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
626 | { | |
627 | UINT8 *p; | |
628 | const UINT8 *pix; | |
629 | p = block; | |
630 | pix = pixels; | |
d6a4c0b1 | 631 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
632 | do { |
633 | __asm __volatile( | |
634 | "movq %1, %%mm0\n\t" | |
635 | "movq %0, %%mm1\n\t" | |
636 | "movq %%mm0, %%mm2\n\t" | |
637 | "movq %%mm1, %%mm3\n\t" | |
638 | "punpcklbw %%mm7, %%mm0\n\t" | |
639 | "punpcklbw %%mm7, %%mm1\n\t" | |
640 | "punpckhbw %%mm7, %%mm2\n\t" | |
641 | "punpckhbw %%mm7, %%mm3\n\t" | |
642 | "paddusw %%mm1, %%mm0\n\t" | |
643 | "paddusw %%mm3, %%mm2\n\t" | |
644 | "psrlw $1, %%mm0\n\t" | |
645 | "psrlw $1, %%mm2\n\t" | |
646 | "packuswb %%mm2, %%mm0\n\t" | |
647 | "movq %%mm0, %0\n\t" | |
a822a479 | 648 | :"+m"(*p) |
de6d9b64 FB |
649 | :"m"(*pix) |
650 | :"memory"); | |
651 | pix += line_size; | |
652 | p += line_size ; | |
653 | } while (--h); | |
de6d9b64 FB |
654 | } |
655 | ||
656 | static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
657 | { | |
658 | UINT8 *p; | |
659 | const UINT8 *pix; | |
660 | p = block; | |
661 | pix = pixels; | |
d6a4c0b1 | 662 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
663 | do { |
664 | __asm __volatile( | |
665 | "movq %1, %%mm0\n\t" | |
666 | "movq 1%1, %%mm1\n\t" | |
667 | "movq %0, %%mm4\n\t" | |
668 | "movq %%mm0, %%mm2\n\t" | |
669 | "movq %%mm1, %%mm3\n\t" | |
670 | "movq %%mm4, %%mm5\n\t" | |
671 | "punpcklbw %%mm7, %%mm0\n\t" | |
672 | "punpcklbw %%mm7, %%mm1\n\t" | |
673 | "punpckhbw %%mm7, %%mm2\n\t" | |
674 | "punpckhbw %%mm7, %%mm3\n\t" | |
675 | "punpcklbw %%mm7, %%mm4\n\t" | |
676 | "punpckhbw %%mm7, %%mm5\n\t" | |
677 | "paddusw %%mm1, %%mm0\n\t" | |
678 | "paddusw %%mm3, %%mm2\n\t" | |
679 | "psrlw $1, %%mm0\n\t" | |
680 | "psrlw $1, %%mm2\n\t" | |
681 | "paddusw %%mm4, %%mm0\n\t" | |
682 | "paddusw %%mm5, %%mm2\n\t" | |
683 | "psrlw $1, %%mm0\n\t" | |
684 | "psrlw $1, %%mm2\n\t" | |
685 | "packuswb %%mm2, %%mm0\n\t" | |
686 | "movq %%mm0, %0\n\t" | |
a822a479 | 687 | :"+m"(*p) |
de6d9b64 FB |
688 | :"m"(*pix) |
689 | :"memory"); | |
690 | pix += line_size; | |
691 | p += line_size; | |
692 | } while (--h); | |
de6d9b64 FB |
693 | } |
694 | ||
695 | static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
696 | { | |
697 | UINT8 *p; | |
698 | const UINT8 *pix; | |
699 | p = block; | |
700 | pix = pixels; | |
d6a4c0b1 | 701 | MOVQ_ZERO(mm7); |
de6d9b64 FB |
702 | do { |
703 | __asm __volatile( | |
704 | "movq %1, %%mm0\n\t" | |
705 | "movq %2, %%mm1\n\t" | |
706 | "movq %0, %%mm4\n\t" | |
707 | "movq %%mm0, %%mm2\n\t" | |
708 | "movq %%mm1, %%mm3\n\t" | |
709 | "movq %%mm4, %%mm5\n\t" | |
710 | "punpcklbw %%mm7, %%mm0\n\t" | |
711 | "punpcklbw %%mm7, %%mm1\n\t" | |
712 | "punpckhbw %%mm7, %%mm2\n\t" | |
713 | "punpckhbw %%mm7, %%mm3\n\t" | |
714 | "punpcklbw %%mm7, %%mm4\n\t" | |
715 | "punpckhbw %%mm7, %%mm5\n\t" | |
716 | "paddusw %%mm1, %%mm0\n\t" | |
717 | "paddusw %%mm3, %%mm2\n\t" | |
718 | "psrlw $1, %%mm0\n\t" | |
719 | "psrlw $1, %%mm2\n\t" | |
720 | "paddusw %%mm4, %%mm0\n\t" | |
721 | "paddusw %%mm5, %%mm2\n\t" | |
722 | "psrlw $1, %%mm0\n\t" | |
723 | "psrlw $1, %%mm2\n\t" | |
724 | "packuswb %%mm2, %%mm0\n\t" | |
725 | "movq %%mm0, %0\n\t" | |
a822a479 | 726 | :"+m"(*p) |
de6d9b64 FB |
727 | :"m"(*pix), "m"(*(pix+line_size)) |
728 | :"memory"); | |
729 | pix += line_size; | |
730 | p += line_size ; | |
731 | } while(--h); | |
de6d9b64 FB |
732 | } |
733 | ||
734 | static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
735 | { | |
736 | UINT8 *p; | |
737 | const UINT8 *pix; | |
738 | p = block; | |
739 | pix = pixels; | |
d6a4c0b1 ZK |
740 | MOVQ_ZERO(mm7); |
741 | MOVQ_WONE(mm6); | |
742 | JUMPALIGN(); | |
de6d9b64 FB |
743 | do { |
744 | __asm __volatile( | |
745 | "movq %1, %%mm0\n\t" | |
746 | "movq %2, %%mm1\n\t" | |
747 | "movq 1%1, %%mm4\n\t" | |
748 | "movq 1%2, %%mm5\n\t" | |
749 | "movq %%mm0, %%mm2\n\t" | |
750 | "movq %%mm1, %%mm3\n\t" | |
751 | "punpcklbw %%mm7, %%mm0\n\t" | |
752 | "punpcklbw %%mm7, %%mm1\n\t" | |
753 | "punpckhbw %%mm7, %%mm2\n\t" | |
754 | "punpckhbw %%mm7, %%mm3\n\t" | |
755 | "paddusw %%mm1, %%mm0\n\t" | |
756 | "paddusw %%mm3, %%mm2\n\t" | |
757 | "movq %%mm4, %%mm1\n\t" | |
758 | "movq %%mm5, %%mm3\n\t" | |
759 | "punpcklbw %%mm7, %%mm4\n\t" | |
760 | "punpcklbw %%mm7, %%mm5\n\t" | |
761 | "punpckhbw %%mm7, %%mm1\n\t" | |
762 | "punpckhbw %%mm7, %%mm3\n\t" | |
763 | "paddusw %%mm5, %%mm4\n\t" | |
764 | "paddusw %%mm3, %%mm1\n\t" | |
765 | "paddusw %%mm6, %%mm4\n\t" | |
766 | "paddusw %%mm6, %%mm1\n\t" | |
767 | "paddusw %%mm4, %%mm0\n\t" | |
768 | "paddusw %%mm1, %%mm2\n\t" | |
769 | "movq %0, %%mm1\n\t" | |
770 | "psrlw $2, %%mm0\n\t" | |
771 | "movq %%mm1, %%mm3\n\t" | |
772 | "psrlw $2, %%mm2\n\t" | |
773 | "punpcklbw %%mm7, %%mm1\n\t" | |
774 | "punpckhbw %%mm7, %%mm3\n\t" | |
775 | "paddusw %%mm1, %%mm0\n\t" | |
776 | "paddusw %%mm3, %%mm2\n\t" | |
777 | "psrlw $1, %%mm0\n\t" | |
778 | "psrlw $1, %%mm2\n\t" | |
779 | "packuswb %%mm2, %%mm0\n\t" | |
780 | "movq %%mm0, %0\n\t" | |
a822a479 | 781 | :"+m"(*p) |
de6d9b64 FB |
782 | :"m"(*pix), |
783 | "m"(*(pix+line_size)) | |
784 | :"memory"); | |
785 | pix += line_size; | |
786 | p += line_size; | |
787 | } while(--h); | |
de6d9b64 FB |
788 | } |
789 | ||
649c00c9 MN |
790 | static void clear_blocks_mmx(DCTELEM *blocks) |
791 | { | |
792 | asm volatile( | |
793 | "pxor %%mm7, %%mm7 \n\t" | |
794 | "movl $-128*6, %%eax \n\t" | |
795 | "1: \n\t" | |
796 | "movq %%mm7, (%0, %%eax) \n\t" | |
797 | "movq %%mm7, 8(%0, %%eax) \n\t" | |
798 | "movq %%mm7, 16(%0, %%eax) \n\t" | |
799 | "movq %%mm7, 24(%0, %%eax) \n\t" | |
800 | "addl $32, %%eax \n\t" | |
801 | " js 1b \n\t" | |
802 | : : "r" (((int)blocks)+128*6) | |
803 | : "%eax" | |
804 | ); | |
805 | } | |
806 | ||
#if 0
/* Disabled no-op; only referenced by the disabled "for speed testing"
   assignments at the end of dsputil_init_mmx(). */
static void just_return() { return; }
#endif
d6a4c0b1 | 810 | |
de6d9b64 FB |
811 | void dsputil_init_mmx(void) |
812 | { | |
813 | mm_flags = mm_support(); | |
f4470e09 MN |
814 | #if 1 |
815 | printf("libavcodec: CPU flags:"); | |
de6d9b64 FB |
816 | if (mm_flags & MM_MMX) |
817 | printf(" mmx"); | |
818 | if (mm_flags & MM_MMXEXT) | |
819 | printf(" mmxext"); | |
820 | if (mm_flags & MM_3DNOW) | |
821 | printf(" 3dnow"); | |
822 | if (mm_flags & MM_SSE) | |
823 | printf(" sse"); | |
824 | if (mm_flags & MM_SSE2) | |
825 | printf(" sse2"); | |
826 | printf("\n"); | |
827 | #endif | |
828 | ||
829 | if (mm_flags & MM_MMX) { | |
830 | get_pixels = get_pixels_mmx; | |
9dbcbd92 | 831 | diff_pixels = diff_pixels_mmx; |
de6d9b64 FB |
832 | put_pixels_clamped = put_pixels_clamped_mmx; |
833 | add_pixels_clamped = add_pixels_clamped_mmx; | |
649c00c9 | 834 | clear_blocks= clear_blocks_mmx; |
dcb9cd4b | 835 | |
ba6802de MN |
836 | pix_abs16x16 = pix_abs16x16_mmx; |
837 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
838 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
de6d9b64 | 839 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
ba6802de MN |
840 | pix_abs8x8 = pix_abs8x8_mmx; |
841 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
842 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
843 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
de6d9b64 FB |
844 | av_fdct = fdct_mmx; |
845 | ||
846 | put_pixels_tab[0] = put_pixels_mmx; | |
847 | put_pixels_tab[1] = put_pixels_x2_mmx; | |
848 | put_pixels_tab[2] = put_pixels_y2_mmx; | |
849 | put_pixels_tab[3] = put_pixels_xy2_mmx; | |
850 | ||
851 | put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
852 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
853 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
854 | put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
dcb9cd4b | 855 | |
de6d9b64 FB |
856 | avg_pixels_tab[0] = avg_pixels_mmx; |
857 | avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
858 | avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
859 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
860 | ||
861 | avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
862 | avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
863 | avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
864 | avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
607dce96 | 865 | |
de6d9b64 | 866 | if (mm_flags & MM_MMXEXT) { |
ba6802de MN |
867 | pix_abs16x16 = pix_abs16x16_mmx2; |
868 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
869 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
870 | pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
dcb9cd4b | 871 | |
ba6802de MN |
872 | pix_abs8x8 = pix_abs8x8_mmx2; |
873 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
874 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
875 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
607dce96 MN |
876 | |
877 | put_pixels_tab[1] = put_pixels_x2_mmx2; | |
878 | put_pixels_tab[2] = put_pixels_y2_mmx2; | |
879 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
880 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
dcb9cd4b | 881 | |
607dce96 MN |
882 | avg_pixels_tab[0] = avg_pixels_mmx2; |
883 | avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
884 | avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
885 | avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
de6d9b64 FB |
886 | } else if (mm_flags & MM_3DNOW) { |
887 | put_pixels_tab[1] = put_pixels_x2_3dnow; | |
888 | put_pixels_tab[2] = put_pixels_y2_3dnow; | |
607dce96 MN |
889 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
890 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
61a4e8ae | 891 | |
de6d9b64 FB |
892 | avg_pixels_tab[0] = avg_pixels_3dnow; |
893 | avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
894 | avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
895 | avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
de6d9b64 | 896 | } |
4af7bcc1 | 897 | |
8def0299 FB |
898 | /* idct */ |
899 | if (mm_flags & MM_MMXEXT) { | |
900 | ff_idct = ff_mmxext_idct; | |
901 | } else { | |
902 | ff_idct = ff_mmx_idct; | |
903 | } | |
d962f6fd A |
904 | #ifdef SIMPLE_IDCT |
905 | // ff_idct = simple_idct; | |
906 | ff_idct = simple_idct_mmx; | |
907 | #endif | |
de6d9b64 | 908 | } |
d6a4c0b1 ZK |
909 | |
910 | #if 0 | |
911 | // for speed testing | |
912 | get_pixels = just_return; | |
913 | put_pixels_clamped = just_return; | |
914 | add_pixels_clamped = just_return; | |
915 | ||
916 | pix_abs16x16 = just_return; | |
917 | pix_abs16x16_x2 = just_return; | |
918 | pix_abs16x16_y2 = just_return; | |
919 | pix_abs16x16_xy2 = just_return; | |
920 | ||
921 | put_pixels_tab[0] = just_return; | |
922 | put_pixels_tab[1] = just_return; | |
923 | put_pixels_tab[2] = just_return; | |
924 | put_pixels_tab[3] = just_return; | |
925 | ||
926 | put_no_rnd_pixels_tab[0] = just_return; | |
927 | put_no_rnd_pixels_tab[1] = just_return; | |
928 | put_no_rnd_pixels_tab[2] = just_return; | |
929 | put_no_rnd_pixels_tab[3] = just_return; | |
930 | ||
931 | avg_pixels_tab[0] = just_return; | |
932 | avg_pixels_tab[1] = just_return; | |
933 | avg_pixels_tab[2] = just_return; | |
934 | avg_pixels_tab[3] = just_return; | |
935 | ||
936 | avg_no_rnd_pixels_tab[0] = just_return; | |
937 | avg_no_rnd_pixels_tab[1] = just_return; | |
938 | avg_no_rnd_pixels_tab[2] = just_return; | |
939 | avg_no_rnd_pixels_tab[3] = just_return; | |
940 | ||
d6a4c0b1 ZK |
941 | //av_fdct = just_return; |
942 | //ff_idct = just_return; | |
943 | #endif | |
de6d9b64 | 944 | } |
4f12a497 FB |
945 | |
946 | /* remove any non bit exact operation (testing purpose). NOTE that | |
947 | this function should be kept as small as possible because it is | |
948 | always difficult to test automatically non bit exact cases. */ | |
949 | void dsputil_set_bit_exact_mmx(void) | |
950 | { | |
951 | if (mm_flags & MM_MMX) { | |
952 | if (mm_flags & MM_MMXEXT) { | |
953 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
954 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
955 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
956 | } else if (mm_flags & MM_3DNOW) { | |
957 | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
958 | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
959 | avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
960 | } | |
961 | } | |
962 | } |