Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
ff4ec49e | 3 | * Copyright (c) 2000, 2001 Fabrice Bellard. |
de6d9b64 | 4 | * |
ff4ec49e FB |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public | |
7 | * License as published by the Free Software Foundation; either | |
8 | * version 2 of the License, or (at your option) any later version. | |
de6d9b64 | 9 | * |
ff4ec49e | 10 | * This library is distributed in the hope that it will be useful, |
de6d9b64 | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ff4ec49e FB |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. | |
de6d9b64 | 14 | * |
ff4ec49e FB |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this library; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
de6d9b64 FB |
18 | * |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
d962f6fd | 23 | #include "../simple_idct.h" |
de6d9b64 | 24 | |
7d650cb5 FB |
25 | int mm_flags; /* multimedia extension flags */ |
26 | ||
ba6802de MN |
27 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | ||
32 | int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 | int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 | int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 | int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | ||
37 | int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 | int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 | int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 | int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | ||
42 | int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 | int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 | int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 | int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | ||
8def0299 FB |
47 | /* external functions, from idct_mmx.c */ |
48 | void ff_mmx_idct(DCTELEM *block); | |
49 | void ff_mmxext_idct(DCTELEM *block); | |
4af7bcc1 | 50 | |
/* pixel operations */

/*
 * 8-byte aligned 64-bit constants for use as MMX memory operands:
 *   mm_bone - 0x01 in every byte
 *   mm_wone - 0x0001 in every 16-bit word
 *   mm_wtwo - 0x0002 in every 16-bit word
 */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* emits an 8-byte alignment directive at the current position */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* clears regd by XOR-ing it with itself */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* regd = 0x0001 in every word: all-ones (pcmpeqd), then each word shifted right by 15 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* regd = 0xfe in every byte: all-ones (0xff) added to itself bytewise wraps to 0xfe */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* non-PIC build: load the constants directly from memory */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* regd = 0x01 in every byte: words of 0x0001 packed down to bytes */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* regd = 0x0002 in every word: words of 0x0001 shifted left by one */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
88 | ||
/*
 * Bytewise averaging built from plain MMX instructions. These macros expand to
 * string fragments meant to be pasted into an asm statement.
 *
 * Identities used (per byte):
 *   no rounding: (a & b) + (((a ^ b) & 0xfe) >> 1)   == (a + b) >> 1
 *   rounding:    (a | b) - (((a ^ b) & 0xfe) >> 1)   == (a + b + 1) >> 1
 * Masking with 0xfe clears the low bit of every byte first, so the 64-bit
 * shift (psrlq) cannot move a bit from one byte into its neighbour.
 */

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

// rounding variant; same register roles as PAVGB_MMX_NO_RND
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// paired form: averages (rega, regb) into regr and (regc, regd) into regp,
// with the two computations interleaved; regb and regd are trashed
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

// paired rounding form; mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
136 | ||
/*
 * The template headers below are included several times. Before each include,
 * DEF() is defined to build the function-name suffix and PAVGB/PAVGBP/SET_RND
 * select the averaging primitive; everything is #undef'd again afterwards so
 * the next instantiation starts clean.
 */

/***********************************/
/* MMX no rounding */
/* DEF(x, y) pastes to x_no_rnd_y_mmx */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

/* DEF(x, y) pastes to x_y_mmx */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

/* here DEF takes a single argument and PAVGB is an instruction mnemonic string */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
189 | ||
190 | /***********************************/ | |
191 | /* standard MMX */ | |
192 | ||
193 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
194 | { | |
607dce96 MN |
195 | asm volatile( |
196 | "movl $-128, %%eax \n\t" | |
197 | "pxor %%mm7, %%mm7 \n\t" | |
198 | ".balign 16 \n\t" | |
199 | "1: \n\t" | |
200 | "movq (%0), %%mm0 \n\t" | |
201 | "movq (%0, %2), %%mm2 \n\t" | |
202 | "movq %%mm0, %%mm1 \n\t" | |
203 | "movq %%mm2, %%mm3 \n\t" | |
204 | "punpcklbw %%mm7, %%mm0 \n\t" | |
205 | "punpckhbw %%mm7, %%mm1 \n\t" | |
206 | "punpcklbw %%mm7, %%mm2 \n\t" | |
207 | "punpckhbw %%mm7, %%mm3 \n\t" | |
208 | "movq %%mm0, (%1, %%eax)\n\t" | |
209 | "movq %%mm1, 8(%1, %%eax)\n\t" | |
210 | "movq %%mm2, 16(%1, %%eax)\n\t" | |
211 | "movq %%mm3, 24(%1, %%eax)\n\t" | |
212 | "addl %3, %0 \n\t" | |
213 | "addl $32, %%eax \n\t" | |
214 | "js 1b \n\t" | |
215 | : "+r" (pixels) | |
216 | : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
217 | : "%eax" | |
218 | ); | |
de6d9b64 FB |
219 | } |
220 | ||
9dbcbd92 MN |
221 | static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
222 | { | |
223 | asm volatile( | |
607dce96 | 224 | "pxor %%mm7, %%mm7 \n\t" |
9dbcbd92 | 225 | "movl $-128, %%eax \n\t" |
607dce96 | 226 | ".balign 16 \n\t" |
9dbcbd92 MN |
227 | "1: \n\t" |
228 | "movq (%0), %%mm0 \n\t" | |
229 | "movq (%1), %%mm2 \n\t" | |
230 | "movq %%mm0, %%mm1 \n\t" | |
231 | "movq %%mm2, %%mm3 \n\t" | |
232 | "punpcklbw %%mm7, %%mm0 \n\t" | |
233 | "punpckhbw %%mm7, %%mm1 \n\t" | |
234 | "punpcklbw %%mm7, %%mm2 \n\t" | |
235 | "punpckhbw %%mm7, %%mm3 \n\t" | |
236 | "psubw %%mm2, %%mm0 \n\t" | |
237 | "psubw %%mm3, %%mm1 \n\t" | |
238 | "movq %%mm0, (%2, %%eax)\n\t" | |
239 | "movq %%mm1, 8(%2, %%eax)\n\t" | |
240 | "addl %3, %0 \n\t" | |
241 | "addl %3, %1 \n\t" | |
242 | "addl $16, %%eax \n\t" | |
243 | "jnz 1b \n\t" | |
244 | : "+r" (s1), "+r" (s2) | |
245 | : "r" (block+64), "r" (stride) | |
246 | : "%eax" | |
247 | ); | |
248 | } | |
249 | ||
de6d9b64 FB |
250 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
251 | { | |
252 | const DCTELEM *p; | |
253 | UINT8 *pix; | |
de6d9b64 FB |
254 | |
255 | /* read the pixels */ | |
256 | p = block; | |
257 | pix = pixels; | |
d6a4c0b1 | 258 | /* unrolled loop */ |
de6d9b64 | 259 | __asm __volatile( |
a822a479 NK |
260 | "movq %3, %%mm0\n\t" |
261 | "movq 8%3, %%mm1\n\t" | |
262 | "movq 16%3, %%mm2\n\t" | |
263 | "movq 24%3, %%mm3\n\t" | |
264 | "movq 32%3, %%mm4\n\t" | |
265 | "movq 40%3, %%mm5\n\t" | |
266 | "movq 48%3, %%mm6\n\t" | |
267 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
268 | "packuswb %%mm1, %%mm0\n\t" |
269 | "packuswb %%mm3, %%mm2\n\t" | |
270 | "packuswb %%mm5, %%mm4\n\t" | |
271 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
272 | "movq %%mm0, (%0)\n\t" |
273 | "movq %%mm2, (%0, %1)\n\t" | |
274 | "movq %%mm4, (%0, %1, 2)\n\t" | |
275 | "movq %%mm6, (%0, %2)\n\t" | |
276 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
277 | :"memory"); |
278 | pix += line_size*4; | |
279 | p += 32; | |
d6a4c0b1 ZK |
280 | |
281 | // if here would be an exact copy of the code above | |
282 | // compiler would generate some very strange code | |
283 | // thus using "r" | |
284 | __asm __volatile( | |
285 | "movq (%3), %%mm0\n\t" | |
286 | "movq 8(%3), %%mm1\n\t" | |
287 | "movq 16(%3), %%mm2\n\t" | |
288 | "movq 24(%3), %%mm3\n\t" | |
289 | "movq 32(%3), %%mm4\n\t" | |
290 | "movq 40(%3), %%mm5\n\t" | |
291 | "movq 48(%3), %%mm6\n\t" | |
292 | "movq 56(%3), %%mm7\n\t" | |
293 | "packuswb %%mm1, %%mm0\n\t" | |
294 | "packuswb %%mm3, %%mm2\n\t" | |
295 | "packuswb %%mm5, %%mm4\n\t" | |
296 | "packuswb %%mm7, %%mm6\n\t" | |
297 | "movq %%mm0, (%0)\n\t" | |
298 | "movq %%mm2, (%0, %1)\n\t" | |
299 | "movq %%mm4, (%0, %1, 2)\n\t" | |
300 | "movq %%mm6, (%0, %2)\n\t" | |
301 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
302 | :"memory"); | |
de6d9b64 FB |
303 | } |
304 | ||
305 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
306 | { | |
307 | const DCTELEM *p; | |
308 | UINT8 *pix; | |
309 | int i; | |
310 | ||
311 | /* read the pixels */ | |
312 | p = block; | |
313 | pix = pixels; | |
d6a4c0b1 ZK |
314 | MOVQ_ZERO(mm7); |
315 | i = 4; | |
cd8e5f96 | 316 | do { |
de6d9b64 | 317 | __asm __volatile( |
cd8e5f96 ZK |
318 | "movq (%2), %%mm0\n\t" |
319 | "movq 8(%2), %%mm1\n\t" | |
320 | "movq 16(%2), %%mm2\n\t" | |
321 | "movq 24(%2), %%mm3\n\t" | |
de6d9b64 FB |
322 | "movq %0, %%mm4\n\t" |
323 | "movq %1, %%mm6\n\t" | |
324 | "movq %%mm4, %%mm5\n\t" | |
325 | "punpcklbw %%mm7, %%mm4\n\t" | |
326 | "punpckhbw %%mm7, %%mm5\n\t" | |
327 | "paddsw %%mm4, %%mm0\n\t" | |
328 | "paddsw %%mm5, %%mm1\n\t" | |
329 | "movq %%mm6, %%mm5\n\t" | |
330 | "punpcklbw %%mm7, %%mm6\n\t" | |
331 | "punpckhbw %%mm7, %%mm5\n\t" | |
332 | "paddsw %%mm6, %%mm2\n\t" | |
333 | "paddsw %%mm5, %%mm3\n\t" | |
334 | "packuswb %%mm1, %%mm0\n\t" | |
335 | "packuswb %%mm3, %%mm2\n\t" | |
336 | "movq %%mm0, %0\n\t" | |
337 | "movq %%mm2, %1\n\t" | |
a822a479 | 338 | :"+m"(*pix), "+m"(*(pix+line_size)) |
cd8e5f96 | 339 | :"r"(p) |
de6d9b64 FB |
340 | :"memory"); |
341 | pix += line_size*2; | |
342 | p += 16; | |
cd8e5f96 | 343 | } while (--i); |
de6d9b64 FB |
344 | } |
345 | ||
/*
 * Copy h rows of 8 bytes from pixels to block; both advance by line_size
 * bytes per row.
 *
 * Four rows are copied per loop pass and the loop only exits when h, reduced
 * by 4 each pass, reaches exactly zero -- so h must be a positive multiple
 * of 4.
 */
static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax \n\t"          /* eax = 2 * line_size */
        ".balign 8 \n\t"
        "1: \n\t"
        /* rows 0 and 1 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
371 | ||
/*
 * Copy h rows of 16 bytes from pixels to block; both advance by line_size
 * bytes per row. Each row is moved as two 8-byte halves.
 *
 * Four rows are copied per loop pass and the loop only exits when h, reduced
 * by 4 each pass, reaches exactly zero -- so h must be a positive multiple
 * of 4.
 */
static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%eax \n\t"          /* eax = 2 * line_size */
        ".balign 8 \n\t"
        "1: \n\t"
        /* rows 0 and 1 */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "addl %%eax, %1 \n\t"
        "addl %%eax, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(line_size)
        : "%eax", "memory"
    );
}
405 | ||
649c00c9 MN |
406 | static void clear_blocks_mmx(DCTELEM *blocks) |
407 | { | |
39825f31 | 408 | __asm __volatile( |
649c00c9 MN |
409 | "pxor %%mm7, %%mm7 \n\t" |
410 | "movl $-128*6, %%eax \n\t" | |
411 | "1: \n\t" | |
412 | "movq %%mm7, (%0, %%eax) \n\t" | |
413 | "movq %%mm7, 8(%0, %%eax) \n\t" | |
414 | "movq %%mm7, 16(%0, %%eax) \n\t" | |
415 | "movq %%mm7, 24(%0, %%eax) \n\t" | |
416 | "addl $32, %%eax \n\t" | |
417 | " js 1b \n\t" | |
418 | : : "r" (((int)blocks)+128*6) | |
419 | : "%eax" | |
420 | ); | |
421 | } | |
422 | ||
61a4e8ae | 423 | #if 0 |
d6a4c0b1 | 424 | static void just_return() { return; } |
61a4e8ae | 425 | #endif |
d6a4c0b1 | 426 | |
de6d9b64 FB |
427 | void dsputil_init_mmx(void) |
428 | { | |
429 | mm_flags = mm_support(); | |
1565dabc LB |
430 | #if 0 |
431 | fprintf(stderr, "libavcodec: CPU flags:"); | |
de6d9b64 | 432 | if (mm_flags & MM_MMX) |
1565dabc | 433 | fprintf(stderr, " mmx"); |
de6d9b64 | 434 | if (mm_flags & MM_MMXEXT) |
1565dabc | 435 | fprintf(stderr, " mmxext"); |
de6d9b64 | 436 | if (mm_flags & MM_3DNOW) |
1565dabc | 437 | fprintf(stderr, " 3dnow"); |
de6d9b64 | 438 | if (mm_flags & MM_SSE) |
1565dabc | 439 | fprintf(stderr, " sse"); |
de6d9b64 | 440 | if (mm_flags & MM_SSE2) |
1565dabc LB |
441 | fprintf(stderr, " sse2"); |
442 | fprintf(stderr, "\n"); | |
de6d9b64 FB |
443 | #endif |
444 | ||
445 | if (mm_flags & MM_MMX) { | |
446 | get_pixels = get_pixels_mmx; | |
9dbcbd92 | 447 | diff_pixels = diff_pixels_mmx; |
de6d9b64 FB |
448 | put_pixels_clamped = put_pixels_clamped_mmx; |
449 | add_pixels_clamped = add_pixels_clamped_mmx; | |
649c00c9 | 450 | clear_blocks= clear_blocks_mmx; |
dcb9cd4b | 451 | |
ba6802de MN |
452 | pix_abs16x16 = pix_abs16x16_mmx; |
453 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
454 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
de6d9b64 | 455 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
ba6802de MN |
456 | pix_abs8x8 = pix_abs8x8_mmx; |
457 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
458 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
459 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
4afeaec9 | 460 | |
b3184779 MN |
461 | put_pixels_tab[0][0] = put_pixels16_mmx; |
462 | put_pixels_tab[0][1] = put_pixels16_x2_mmx; | |
463 | put_pixels_tab[0][2] = put_pixels16_y2_mmx; | |
464 | put_pixels_tab[0][3] = put_pixels16_xy2_mmx; | |
465 | ||
466 | put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; | |
467 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; | |
468 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; | |
469 | put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; | |
470 | ||
471 | avg_pixels_tab[0][0] = avg_pixels16_mmx; | |
472 | avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; | |
473 | avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; | |
474 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; | |
475 | ||
476 | avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; | |
477 | avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; | |
478 | avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; | |
479 | avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; | |
480 | ||
481 | put_pixels_tab[1][0] = put_pixels8_mmx; | |
482 | put_pixels_tab[1][1] = put_pixels8_x2_mmx; | |
483 | put_pixels_tab[1][2] = put_pixels8_y2_mmx; | |
484 | put_pixels_tab[1][3] = put_pixels8_xy2_mmx; | |
485 | ||
486 | put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; | |
487 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
488 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
489 | put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; | |
490 | ||
491 | avg_pixels_tab[1][0] = avg_pixels8_mmx; | |
492 | avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; | |
493 | avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; | |
494 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
495 | ||
496 | avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; | |
497 | avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; | |
498 | avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; | |
499 | avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; | |
607dce96 | 500 | |
de6d9b64 | 501 | if (mm_flags & MM_MMXEXT) { |
ba6802de MN |
502 | pix_abs16x16 = pix_abs16x16_mmx2; |
503 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
504 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
505 | pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
dcb9cd4b | 506 | |
ba6802de MN |
507 | pix_abs8x8 = pix_abs8x8_mmx2; |
508 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
509 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
510 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
607dce96 | 511 | |
b3184779 MN |
512 | put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
513 | put_pixels_tab[0][2] = put_pixels16_y2_mmx2; | |
514 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
515 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
516 | ||
517 | avg_pixels_tab[0][0] = avg_pixels16_mmx2; | |
518 | avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; | |
519 | avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; | |
520 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
521 | ||
522 | put_pixels_tab[1][1] = put_pixels8_x2_mmx2; | |
523 | put_pixels_tab[1][2] = put_pixels8_y2_mmx2; | |
524 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
525 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
526 | ||
527 | avg_pixels_tab[1][0] = avg_pixels8_mmx2; | |
528 | avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | |
529 | avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | |
530 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
de6d9b64 | 531 | } else if (mm_flags & MM_3DNOW) { |
b3184779 MN |
532 | put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
533 | put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | |
534 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
535 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
536 | ||
537 | avg_pixels_tab[0][0] = avg_pixels16_3dnow; | |
538 | avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; | |
539 | avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; | |
540 | avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
541 | ||
542 | put_pixels_tab[1][1] = put_pixels8_x2_3dnow; | |
543 | put_pixels_tab[1][2] = put_pixels8_y2_3dnow; | |
544 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
545 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
546 | ||
547 | avg_pixels_tab[1][0] = avg_pixels8_3dnow; | |
548 | avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; | |
549 | avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; | |
550 | avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
de6d9b64 | 551 | } |
4af7bcc1 | 552 | |
8def0299 FB |
553 | /* idct */ |
554 | if (mm_flags & MM_MMXEXT) { | |
555 | ff_idct = ff_mmxext_idct; | |
556 | } else { | |
557 | ff_idct = ff_mmx_idct; | |
558 | } | |
d962f6fd A |
559 | #ifdef SIMPLE_IDCT |
560 | // ff_idct = simple_idct; | |
561 | ff_idct = simple_idct_mmx; | |
562 | #endif | |
de6d9b64 | 563 | } |
d6a4c0b1 ZK |
564 | |
565 | #if 0 | |
566 | // for speed testing | |
567 | get_pixels = just_return; | |
568 | put_pixels_clamped = just_return; | |
569 | add_pixels_clamped = just_return; | |
570 | ||
571 | pix_abs16x16 = just_return; | |
572 | pix_abs16x16_x2 = just_return; | |
573 | pix_abs16x16_y2 = just_return; | |
574 | pix_abs16x16_xy2 = just_return; | |
575 | ||
576 | put_pixels_tab[0] = just_return; | |
577 | put_pixels_tab[1] = just_return; | |
578 | put_pixels_tab[2] = just_return; | |
579 | put_pixels_tab[3] = just_return; | |
580 | ||
581 | put_no_rnd_pixels_tab[0] = just_return; | |
582 | put_no_rnd_pixels_tab[1] = just_return; | |
583 | put_no_rnd_pixels_tab[2] = just_return; | |
584 | put_no_rnd_pixels_tab[3] = just_return; | |
585 | ||
586 | avg_pixels_tab[0] = just_return; | |
587 | avg_pixels_tab[1] = just_return; | |
588 | avg_pixels_tab[2] = just_return; | |
589 | avg_pixels_tab[3] = just_return; | |
590 | ||
591 | avg_no_rnd_pixels_tab[0] = just_return; | |
592 | avg_no_rnd_pixels_tab[1] = just_return; | |
593 | avg_no_rnd_pixels_tab[2] = just_return; | |
594 | avg_no_rnd_pixels_tab[3] = just_return; | |
595 | ||
d6a4c0b1 ZK |
596 | //av_fdct = just_return; |
597 | //ff_idct = just_return; | |
598 | #endif | |
de6d9b64 | 599 | } |
4f12a497 | 600 | |
e7fce5e9 MN |
601 | void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block); |
602 | ||
603 | /** | |
604 | * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT | |
605 | */ | |
606 | void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){ | |
607 | if( block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0 | |
608 | && block[8]==0 && block[9]==0 && block[12]==0 && block[13]==0){ | |
609 | int16_t tmp[64]; | |
610 | int i; | |
611 | ||
612 | for(i=0; i<64; i++) | |
613 | tmp[i]= block[i]; | |
614 | for(i=0; i<64; i++) | |
615 | block[i]= tmp[block_permute_op(i)]; | |
616 | ||
617 | simple_idct_put(dest, line_size, block); | |
618 | } | |
619 | else | |
620 | gen_idct_put(dest, line_size, block); | |
621 | } | |
622 | ||
4f12a497 FB |
623 | /* remove any non bit exact operation (testing purpose). NOTE that |
624 | this function should be kept as small as possible because it is | |
625 | always difficult to test automatically non bit exact cases. */ | |
626 | void dsputil_set_bit_exact_mmx(void) | |
627 | { | |
628 | if (mm_flags & MM_MMX) { | |
b3184779 MN |
629 | |
630 | /* MMX2 & 3DNOW */ | |
631 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; | |
632 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; | |
633 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; | |
634 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
635 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
636 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
4afeaec9 | 637 | |
b3184779 | 638 | if (mm_flags & MM_MMXEXT) { |
4afeaec9 MN |
639 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; |
640 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
641 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
642 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
643 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
644 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
4f12a497 | 645 | } |
e7fce5e9 MN |
646 | #ifdef SIMPLE_IDCT |
647 | if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) | |
648 | ff_idct_put= bit_exact_idct_put; | |
649 | #endif | |
4f12a497 FB |
650 | } |
651 | } |