Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | /* |
2 | * MMX optimized DSP utils | |
ff4ec49e | 3 | * Copyright (c) 2000, 2001 Fabrice Bellard. |
de6d9b64 | 4 | * |
ff4ec49e FB |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public | |
7 | * License as published by the Free Software Foundation; either | |
8 | * version 2 of the License, or (at your option) any later version. | |
de6d9b64 | 9 | * |
ff4ec49e | 10 | * This library is distributed in the hope that it will be useful, |
de6d9b64 | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ff4ec49e FB |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. | |
de6d9b64 | 14 | * |
ff4ec49e FB |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this library; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
de6d9b64 FB |
18 | * |
19 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 | */ | |
21 | ||
22 | #include "../dsputil.h" | |
23 | ||
7d650cb5 FB |
24 | int mm_flags; /* multimedia extension flags */ |
25 | ||
ba6802de MN |
26 | int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
27 | int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
28 | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 | ||
31 | int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
32 | int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 | int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 | int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 | ||
36 | int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
37 | int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 | int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 | int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 | ||
41 | int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
42 | int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 | int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 | int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 | ||
de6d9b64 | 46 | /* pixel operations */ |
a7bd8797 MN |
47 | static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
48 | static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
49 | static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
de6d9b64 | 50 | |
d6a4c0b1 ZK |
// Emit a ".balign 8" directive at the current position of the generated code.
51 | #define JUMPALIGN() __asm __volatile (".balign 8"::) |
// Clear MMX register regd: pxor of a register with itself gives all-zero bits.
52 | #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) | |
53 | ||
fca0f0e5 ZK |
// Build 0x0001 in every 16-bit word of regd without a memory load:
// pcmpeqd reg,reg sets all bits, psrlw $15 keeps only bit 0 of each word.
54 | #define MOVQ_WONE(regd) \ |
55 | __asm __volatile ( \ | |
56 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
57 | "psrlw $15, %%" #regd ::) | |
58 | ||
// Build 0xfe in every byte of regd: all-ones bytes (0xff) added to
// themselves with paddb wrap around to 0xfe.
59 | #define MOVQ_BFE(regd) \ | |
60 | __asm __volatile ( \ | |
61 | "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
62 | "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
63 | ||
// Non-PIC build: MOVQ_BONE / MOVQ_WTWO load the constants mm_bone / mm_wtwo
// straight from memory.
d6a4c0b1 | 64 | #ifndef PIC |
fca0f0e5 | 65 | #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) |
d6a4c0b1 ZK |
66 | #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
67 | #else | |
68 | // for shared library it's better to use this way for accessing constants | |
69 | // pcmpeqd -> -1 | |
// PIC variant of MOVQ_BONE: words of 0x0001 (all-ones shifted right by 15)
// are packed to bytes with packuswb, giving 0x01 in every byte.
fca0f0e5 | 70 | #define MOVQ_BONE(regd) \ |
d6a4c0b1 | 71 | __asm __volatile ( \ |
fca0f0e5 ZK |
72 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
73 | "psrlw $15, %%" #regd " \n\t" \ | |
74 | "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
d6a4c0b1 ZK |
75 |
// PIC variant of MOVQ_WTWO: words of 0x0001 shifted left by one give 0x0002
// in every 16-bit word.
76 | #define MOVQ_WTWO(regd) \ | |
77 | __asm __volatile ( \ | |
fca0f0e5 ZK |
78 | "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
79 | "psrlw $15, %%" #regd " \n\t" \ | |
80 | "psllw $1, %%" #regd " \n\t"::) | |
a7bd8797 | 81 | |
d6a4c0b1 ZK |
82 | #endif |
83 | ||
fca0f0e5 | 84 | // using regr as temporary and for the output result |
def60345 | 85 | // first argument is unmodified and second is trashed |
39825f31 ZK |
86 | // regfe is supposed to contain 0xfefefefefefefefe |
// Per-byte average rounding down: regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1).
// Masking with regfe (0xfe in every byte) clears the low bit of each byte, so
// the 64-bit psrlq cannot move a bit from one byte into its neighbour.
87 | #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ |
91abb473 ZK |
88 | "movq " #rega ", " #regr " \n\t"\ |
89 | "pand " #regb ", " #regr " \n\t"\ | |
def60345 | 90 | "pxor " #rega ", " #regb " \n\t"\ |
39825f31 | 91 | "pand " #regfe "," #regb " \n\t"\ |
def60345 | 92 | "psrlq $1, " #regb " \n\t"\ |
91abb473 | 93 | "paddb " #regb ", " #regr " \n\t" |
def60345 | 94 |
// Per-byte average rounding up: regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1).
// Same operand roles as PAVGB_MMX_NO_RND: rega is preserved, regb is overwritten.
39825f31 | 95 | #define PAVGB_MMX(rega, regb, regr, regfe) \ |
91abb473 ZK |
96 | "movq " #rega ", " #regr " \n\t"\ |
97 | "por " #regb ", " #regr " \n\t"\ | |
def60345 | 98 | "pxor " #rega ", " #regb " \n\t"\ |
39825f31 | 99 | "pand " #regfe "," #regb " \n\t"\ |
def60345 | 100 | "psrlq $1, " #regb " \n\t"\ |
91abb473 | 101 | "psubb " #regb ", " #regr " \n\t" |
def60345 | 102 | |
39825f31 | 103 | // mm6 is supposed to contain 0xfefefefefefefefe |
6aa6ea8e ZK |
// Two interleaved round-down byte averages: regr = avg(rega, regb) and
// regp = avg(regc, regd), each computed as (x & y) + (((x ^ y) & mm6) >> 1).
// regb and regd are overwritten; mm6 is used as the 0xfe byte mask.
104 | #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
105 | "movq " #rega ", " #regr " \n\t"\ | |
106 | "movq " #regc ", " #regp " \n\t"\ | |
107 | "pand " #regb ", " #regr " \n\t"\ | |
108 | "pand " #regd ", " #regp " \n\t"\ | |
109 | "pxor " #rega ", " #regb " \n\t"\ | |
110 | "pxor " #regc ", " #regd " \n\t"\ | |
fca0f0e5 ZK |
111 | "pand %%mm6, " #regb " \n\t"\ |
112 | "pand %%mm6, " #regd " \n\t"\ | |
6aa6ea8e ZK |
113 | "psrlq $1, " #regb " \n\t"\ |
114 | "psrlq $1, " #regd " \n\t"\ | |
115 | "paddb " #regb ", " #regr " \n\t"\ | |
116 | "paddb " #regd ", " #regp " \n\t" | |
117 | ||
// Two interleaved round-up byte averages: regr = avg(rega, regb) and
// regp = avg(regc, regd), each computed as (x | y) - (((x ^ y) & mm6) >> 1).
// regb and regd are overwritten; mm6 is used as the 0xfe byte mask.
118 | #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |
119 | "movq " #rega ", " #regr " \n\t"\ | |
120 | "movq " #regc ", " #regp " \n\t"\ | |
121 | "por " #regb ", " #regr " \n\t"\ | |
122 | "por " #regd ", " #regp " \n\t"\ | |
123 | "pxor " #rega ", " #regb " \n\t"\ | |
124 | "pxor " #regc ", " #regd " \n\t"\ | |
fca0f0e5 ZK |
125 | "pand %%mm6, " #regb " \n\t"\ |
126 | "pand %%mm6, " #regd " \n\t"\ | |
6aa6ea8e ZK |
127 | "psrlq $1, " #regd " \n\t"\ |
128 | "psrlq $1, " #regb " \n\t"\ | |
129 | "psubb " #regb ", " #regr " \n\t"\ | |
130 | "psubb " #regd ", " #regp " \n\t" | |
131 | ||
91abb473 ZK |
132 | /***********************************/ |
133 | /* MMX no rounding */ | |
134 | #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx | |
fca0f0e5 | 135 | #define SET_RND MOVQ_WONE |
6aa6ea8e | 136 | #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) |
39825f31 | 137 | #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) |
fca0f0e5 | 138 | |
91abb473 ZK |
139 | #include "dsputil_mmx_rnd.h" |
140 | ||
141 | #undef DEF | |
fca0f0e5 | 142 | #undef SET_RND |
6aa6ea8e | 143 | #undef PAVGBP |
39825f31 | 144 | #undef PAVGB |
91abb473 ZK |
145 | /***********************************/ |
146 | /* MMX rounding */ | |
147 | ||
148 | #define DEF(x, y) x ## _ ## y ##_mmx | |
fca0f0e5 | 149 | #define SET_RND MOVQ_WTWO |
6aa6ea8e | 150 | #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
39825f31 | 151 | #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
fca0f0e5 | 152 | |
91abb473 ZK |
153 | #include "dsputil_mmx_rnd.h" |
154 | ||
155 | #undef DEF | |
fca0f0e5 | 156 | #undef SET_RND |
6aa6ea8e | 157 | #undef PAVGBP |
39825f31 | 158 | #undef PAVGB |
a7bd8797 | 159 | |
de6d9b64 FB |
160 | /***********************************/ |
161 | /* 3Dnow specific */ | |
162 | ||
163 | #define DEF(x) x ## _3dnow | |
164 | /* for Athlons PAVGUSB is preferred */ | |
165 | #define PAVGB "pavgusb" | |
166 | ||
167 | #include "dsputil_mmx_avg.h" | |
168 | ||
169 | #undef DEF | |
170 | #undef PAVGB | |
171 | ||
172 | /***********************************/ | |
173 | /* MMX2 specific */ | |
174 | ||
607dce96 | 175 | #define DEF(x) x ## _mmx2 |
de6d9b64 FB |
176 | |
177 | /* Introduced only in MMX2 set */ | |
178 | #define PAVGB "pavgb" | |
179 | ||
180 | #include "dsputil_mmx_avg.h" | |
181 | ||
182 | #undef DEF | |
183 | #undef PAVGB | |
184 | ||
185 | /***********************************/ | |
186 | /* standard MMX */ | |
187 | ||
188 | static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
189 | { | |
607dce96 MN |
190 | asm volatile( |
191 | "movl $-128, %%eax \n\t" | |
192 | "pxor %%mm7, %%mm7 \n\t" | |
193 | ".balign 16 \n\t" | |
194 | "1: \n\t" | |
195 | "movq (%0), %%mm0 \n\t" | |
196 | "movq (%0, %2), %%mm2 \n\t" | |
197 | "movq %%mm0, %%mm1 \n\t" | |
198 | "movq %%mm2, %%mm3 \n\t" | |
199 | "punpcklbw %%mm7, %%mm0 \n\t" | |
200 | "punpckhbw %%mm7, %%mm1 \n\t" | |
201 | "punpcklbw %%mm7, %%mm2 \n\t" | |
202 | "punpckhbw %%mm7, %%mm3 \n\t" | |
203 | "movq %%mm0, (%1, %%eax)\n\t" | |
204 | "movq %%mm1, 8(%1, %%eax)\n\t" | |
205 | "movq %%mm2, 16(%1, %%eax)\n\t" | |
206 | "movq %%mm3, 24(%1, %%eax)\n\t" | |
207 | "addl %3, %0 \n\t" | |
208 | "addl $32, %%eax \n\t" | |
209 | "js 1b \n\t" | |
210 | : "+r" (pixels) | |
211 | : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
212 | : "%eax" | |
213 | ); | |
de6d9b64 FB |
214 | } |
215 | ||
9dbcbd92 MN |
216 | static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
217 | { | |
218 | asm volatile( | |
607dce96 | 219 | "pxor %%mm7, %%mm7 \n\t" |
9dbcbd92 | 220 | "movl $-128, %%eax \n\t" |
607dce96 | 221 | ".balign 16 \n\t" |
9dbcbd92 MN |
222 | "1: \n\t" |
223 | "movq (%0), %%mm0 \n\t" | |
224 | "movq (%1), %%mm2 \n\t" | |
225 | "movq %%mm0, %%mm1 \n\t" | |
226 | "movq %%mm2, %%mm3 \n\t" | |
227 | "punpcklbw %%mm7, %%mm0 \n\t" | |
228 | "punpckhbw %%mm7, %%mm1 \n\t" | |
229 | "punpcklbw %%mm7, %%mm2 \n\t" | |
230 | "punpckhbw %%mm7, %%mm3 \n\t" | |
231 | "psubw %%mm2, %%mm0 \n\t" | |
232 | "psubw %%mm3, %%mm1 \n\t" | |
233 | "movq %%mm0, (%2, %%eax)\n\t" | |
234 | "movq %%mm1, 8(%2, %%eax)\n\t" | |
235 | "addl %3, %0 \n\t" | |
236 | "addl %3, %1 \n\t" | |
237 | "addl $16, %%eax \n\t" | |
238 | "jnz 1b \n\t" | |
239 | : "+r" (s1), "+r" (s2) | |
240 | : "r" (block+64), "r" (stride) | |
241 | : "%eax" | |
242 | ); | |
243 | } | |
244 | ||
de6d9b64 FB |
245 | static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
246 | { | |
247 | const DCTELEM *p; | |
248 | UINT8 *pix; | |
de6d9b64 FB |
249 | |
250 | /* read the pixels */ | |
251 | p = block; | |
252 | pix = pixels; | |
d6a4c0b1 | 253 | /* unrolled loop */ |
de6d9b64 | 254 | __asm __volatile( |
a822a479 NK |
255 | "movq %3, %%mm0\n\t" |
256 | "movq 8%3, %%mm1\n\t" | |
257 | "movq 16%3, %%mm2\n\t" | |
258 | "movq 24%3, %%mm3\n\t" | |
259 | "movq 32%3, %%mm4\n\t" | |
260 | "movq 40%3, %%mm5\n\t" | |
261 | "movq 48%3, %%mm6\n\t" | |
262 | "movq 56%3, %%mm7\n\t" | |
de6d9b64 FB |
263 | "packuswb %%mm1, %%mm0\n\t" |
264 | "packuswb %%mm3, %%mm2\n\t" | |
265 | "packuswb %%mm5, %%mm4\n\t" | |
266 | "packuswb %%mm7, %%mm6\n\t" | |
a822a479 NK |
267 | "movq %%mm0, (%0)\n\t" |
268 | "movq %%mm2, (%0, %1)\n\t" | |
269 | "movq %%mm4, (%0, %1, 2)\n\t" | |
270 | "movq %%mm6, (%0, %2)\n\t" | |
271 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | |
de6d9b64 FB |
272 | :"memory"); |
273 | pix += line_size*4; | |
274 | p += 32; | |
d6a4c0b1 ZK |
275 | |
276 | // if here would be an exact copy of the code above | |
277 | // compiler would generate some very strange code | |
278 | // thus using "r" | |
279 | __asm __volatile( | |
280 | "movq (%3), %%mm0\n\t" | |
281 | "movq 8(%3), %%mm1\n\t" | |
282 | "movq 16(%3), %%mm2\n\t" | |
283 | "movq 24(%3), %%mm3\n\t" | |
284 | "movq 32(%3), %%mm4\n\t" | |
285 | "movq 40(%3), %%mm5\n\t" | |
286 | "movq 48(%3), %%mm6\n\t" | |
287 | "movq 56(%3), %%mm7\n\t" | |
288 | "packuswb %%mm1, %%mm0\n\t" | |
289 | "packuswb %%mm3, %%mm2\n\t" | |
290 | "packuswb %%mm5, %%mm4\n\t" | |
291 | "packuswb %%mm7, %%mm6\n\t" | |
292 | "movq %%mm0, (%0)\n\t" | |
293 | "movq %%mm2, (%0, %1)\n\t" | |
294 | "movq %%mm4, (%0, %1, 2)\n\t" | |
295 | "movq %%mm6, (%0, %2)\n\t" | |
296 | ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | |
297 | :"memory"); | |
de6d9b64 FB |
298 | } |
299 | ||
300 | static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
301 | { | |
302 | const DCTELEM *p; | |
303 | UINT8 *pix; | |
304 | int i; | |
305 | ||
306 | /* read the pixels */ | |
307 | p = block; | |
308 | pix = pixels; | |
d6a4c0b1 ZK |
309 | MOVQ_ZERO(mm7); |
310 | i = 4; | |
cd8e5f96 | 311 | do { |
de6d9b64 | 312 | __asm __volatile( |
cd8e5f96 ZK |
313 | "movq (%2), %%mm0\n\t" |
314 | "movq 8(%2), %%mm1\n\t" | |
315 | "movq 16(%2), %%mm2\n\t" | |
316 | "movq 24(%2), %%mm3\n\t" | |
de6d9b64 FB |
317 | "movq %0, %%mm4\n\t" |
318 | "movq %1, %%mm6\n\t" | |
319 | "movq %%mm4, %%mm5\n\t" | |
320 | "punpcklbw %%mm7, %%mm4\n\t" | |
321 | "punpckhbw %%mm7, %%mm5\n\t" | |
322 | "paddsw %%mm4, %%mm0\n\t" | |
323 | "paddsw %%mm5, %%mm1\n\t" | |
324 | "movq %%mm6, %%mm5\n\t" | |
325 | "punpcklbw %%mm7, %%mm6\n\t" | |
326 | "punpckhbw %%mm7, %%mm5\n\t" | |
327 | "paddsw %%mm6, %%mm2\n\t" | |
328 | "paddsw %%mm5, %%mm3\n\t" | |
329 | "packuswb %%mm1, %%mm0\n\t" | |
330 | "packuswb %%mm3, %%mm2\n\t" | |
331 | "movq %%mm0, %0\n\t" | |
332 | "movq %%mm2, %1\n\t" | |
a822a479 | 333 | :"+m"(*pix), "+m"(*(pix+line_size)) |
cd8e5f96 | 334 | :"r"(p) |
de6d9b64 FB |
335 | :"memory"); |
336 | pix += line_size*2; | |
337 | p += 16; | |
cd8e5f96 | 338 | } while (--i); |
de6d9b64 FB |
339 | } |
340 | ||
b3184779 | 341 | static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
de6d9b64 | 342 | { |
39825f31 | 343 | __asm __volatile( |
31ddcf98 | 344 | "lea (%3, %3), %%eax \n\t" |
52af45ad | 345 | ".balign 8 \n\t" |
31ddcf98 ZK |
346 | "1: \n\t" |
347 | "movq (%1), %%mm0 \n\t" | |
348 | "movq (%1, %3), %%mm1 \n\t" | |
349 | "movq %%mm0, (%2) \n\t" | |
350 | "movq %%mm1, (%2, %3) \n\t" | |
351 | "addl %%eax, %1 \n\t" | |
352 | "addl %%eax, %2 \n\t" | |
353 | "movq (%1), %%mm0 \n\t" | |
354 | "movq (%1, %3), %%mm1 \n\t" | |
355 | "movq %%mm0, (%2) \n\t" | |
356 | "movq %%mm1, (%2, %3) \n\t" | |
357 | "addl %%eax, %1 \n\t" | |
358 | "addl %%eax, %2 \n\t" | |
359 | "subl $4, %0 \n\t" | |
360 | "jnz 1b \n\t" | |
361 | : "+g"(h), "+r" (pixels), "+r" (block) | |
362 | : "r"(line_size) | |
363 | : "%eax", "memory" | |
364 | ); | |
de6d9b64 FB |
365 | } |
366 | ||
b3184779 MN |
367 | static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
368 | { | |
369 | __asm __volatile( | |
370 | "lea (%3, %3), %%eax \n\t" | |
371 | ".balign 8 \n\t" | |
372 | "1: \n\t" | |
373 | "movq (%1), %%mm0 \n\t" | |
374 | "movq 8(%1), %%mm4 \n\t" | |
375 | "movq (%1, %3), %%mm1 \n\t" | |
376 | "movq 8(%1, %3), %%mm5 \n\t" | |
377 | "movq %%mm0, (%2) \n\t" | |
378 | "movq %%mm4, 8(%2) \n\t" | |
379 | "movq %%mm1, (%2, %3) \n\t" | |
380 | "movq %%mm5, 8(%2, %3) \n\t" | |
381 | "addl %%eax, %1 \n\t" | |
382 | "addl %%eax, %2 \n\t" | |
383 | "movq (%1), %%mm0 \n\t" | |
384 | "movq 8(%1), %%mm4 \n\t" | |
385 | "movq (%1, %3), %%mm1 \n\t" | |
386 | "movq 8(%1, %3), %%mm5 \n\t" | |
387 | "movq %%mm0, (%2) \n\t" | |
388 | "movq %%mm4, 8(%2) \n\t" | |
389 | "movq %%mm1, (%2, %3) \n\t" | |
390 | "movq %%mm5, 8(%2, %3) \n\t" | |
391 | "addl %%eax, %1 \n\t" | |
392 | "addl %%eax, %2 \n\t" | |
393 | "subl $4, %0 \n\t" | |
394 | "jnz 1b \n\t" | |
395 | : "+g"(h), "+r" (pixels), "+r" (block) | |
396 | : "r"(line_size) | |
397 | : "%eax", "memory" | |
398 | ); | |
399 | } | |
400 | ||
649c00c9 MN |
401 | static void clear_blocks_mmx(DCTELEM *blocks) |
402 | { | |
39825f31 | 403 | __asm __volatile( |
649c00c9 MN |
404 | "pxor %%mm7, %%mm7 \n\t" |
405 | "movl $-128*6, %%eax \n\t" | |
406 | "1: \n\t" | |
407 | "movq %%mm7, (%0, %%eax) \n\t" | |
408 | "movq %%mm7, 8(%0, %%eax) \n\t" | |
409 | "movq %%mm7, 16(%0, %%eax) \n\t" | |
410 | "movq %%mm7, 24(%0, %%eax) \n\t" | |
411 | "addl $32, %%eax \n\t" | |
412 | " js 1b \n\t" | |
413 | : : "r" (((int)blocks)+128*6) | |
414 | : "%eax" | |
415 | ); | |
416 | } | |
417 | ||
084c726b MN |
418 | static int pix_sum16_mmx(UINT8 * pix, int line_size){ |
419 | const int h=16; | |
420 | int sum; | |
421 | int index= -line_size*h; | |
422 | ||
423 | __asm __volatile( | |
424 | "pxor %%mm7, %%mm7 \n\t" | |
425 | "pxor %%mm6, %%mm6 \n\t" | |
426 | "1: \n\t" | |
427 | "movq (%2, %1), %%mm0 \n\t" | |
428 | "movq (%2, %1), %%mm1 \n\t" | |
429 | "movq 8(%2, %1), %%mm2 \n\t" | |
430 | "movq 8(%2, %1), %%mm3 \n\t" | |
431 | "punpcklbw %%mm7, %%mm0 \n\t" | |
432 | "punpckhbw %%mm7, %%mm1 \n\t" | |
433 | "punpcklbw %%mm7, %%mm2 \n\t" | |
434 | "punpckhbw %%mm7, %%mm3 \n\t" | |
435 | "paddw %%mm0, %%mm1 \n\t" | |
436 | "paddw %%mm2, %%mm3 \n\t" | |
437 | "paddw %%mm1, %%mm3 \n\t" | |
438 | "paddw %%mm3, %%mm6 \n\t" | |
439 | "addl %3, %1 \n\t" | |
440 | " js 1b \n\t" | |
441 | "movq %%mm6, %%mm5 \n\t" | |
442 | "psrlq $32, %%mm6 \n\t" | |
443 | "paddw %%mm5, %%mm6 \n\t" | |
444 | "movq %%mm6, %%mm5 \n\t" | |
445 | "psrlq $16, %%mm6 \n\t" | |
446 | "paddw %%mm5, %%mm6 \n\t" | |
447 | "movd %%mm6, %0 \n\t" | |
448 | "andl $0xFFFF, %0 \n\t" | |
449 | : "=&r" (sum), "+r" (index) | |
450 | : "r" (pix - index), "r" (line_size) | |
451 | ); | |
452 | ||
453 | return sum; | |
454 | } | |
455 | ||
/* Disabled no-op stub; it is only referenced by the "#if 0" speed-testing
   assignments inside dsputil_init_mmx. */
61a4e8ae | 456 | #if 0 |
d6a4c0b1 | 457 | static void just_return() { return; } |
61a4e8ae | 458 | #endif |
d6a4c0b1 | 459 | |
de6d9b64 FB |
460 | void dsputil_init_mmx(void) |
461 | { | |
462 | mm_flags = mm_support(); | |
1565dabc LB |
463 | #if 0 |
464 | fprintf(stderr, "libavcodec: CPU flags:"); | |
de6d9b64 | 465 | if (mm_flags & MM_MMX) |
1565dabc | 466 | fprintf(stderr, " mmx"); |
de6d9b64 | 467 | if (mm_flags & MM_MMXEXT) |
1565dabc | 468 | fprintf(stderr, " mmxext"); |
de6d9b64 | 469 | if (mm_flags & MM_3DNOW) |
1565dabc | 470 | fprintf(stderr, " 3dnow"); |
de6d9b64 | 471 | if (mm_flags & MM_SSE) |
1565dabc | 472 | fprintf(stderr, " sse"); |
de6d9b64 | 473 | if (mm_flags & MM_SSE2) |
1565dabc LB |
474 | fprintf(stderr, " sse2"); |
475 | fprintf(stderr, "\n"); | |
de6d9b64 FB |
476 | #endif |
477 | ||
478 | if (mm_flags & MM_MMX) { | |
479 | get_pixels = get_pixels_mmx; | |
9dbcbd92 | 480 | diff_pixels = diff_pixels_mmx; |
de6d9b64 FB |
481 | put_pixels_clamped = put_pixels_clamped_mmx; |
482 | add_pixels_clamped = add_pixels_clamped_mmx; | |
649c00c9 | 483 | clear_blocks= clear_blocks_mmx; |
084c726b | 484 | pix_sum= pix_sum16_mmx; |
dcb9cd4b | 485 | |
ba6802de MN |
486 | pix_abs16x16 = pix_abs16x16_mmx; |
487 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
488 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
de6d9b64 | 489 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
ba6802de MN |
490 | pix_abs8x8 = pix_abs8x8_mmx; |
491 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
492 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
493 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
4afeaec9 | 494 | |
b3184779 MN |
495 | put_pixels_tab[0][0] = put_pixels16_mmx; |
496 | put_pixels_tab[0][1] = put_pixels16_x2_mmx; | |
497 | put_pixels_tab[0][2] = put_pixels16_y2_mmx; | |
498 | put_pixels_tab[0][3] = put_pixels16_xy2_mmx; | |
499 | ||
500 | put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; | |
501 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; | |
502 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; | |
503 | put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; | |
504 | ||
505 | avg_pixels_tab[0][0] = avg_pixels16_mmx; | |
506 | avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; | |
507 | avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; | |
508 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; | |
509 | ||
510 | avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; | |
511 | avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; | |
512 | avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; | |
513 | avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; | |
514 | ||
515 | put_pixels_tab[1][0] = put_pixels8_mmx; | |
516 | put_pixels_tab[1][1] = put_pixels8_x2_mmx; | |
517 | put_pixels_tab[1][2] = put_pixels8_y2_mmx; | |
518 | put_pixels_tab[1][3] = put_pixels8_xy2_mmx; | |
519 | ||
520 | put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; | |
521 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
522 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
523 | put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; | |
524 | ||
525 | avg_pixels_tab[1][0] = avg_pixels8_mmx; | |
526 | avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; | |
527 | avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; | |
528 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
529 | ||
530 | avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; | |
531 | avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; | |
532 | avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; | |
533 | avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; | |
607dce96 | 534 | |
de6d9b64 | 535 | if (mm_flags & MM_MMXEXT) { |
ba6802de MN |
536 | pix_abs16x16 = pix_abs16x16_mmx2; |
537 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
538 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
539 | pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
dcb9cd4b | 540 | |
ba6802de MN |
541 | pix_abs8x8 = pix_abs8x8_mmx2; |
542 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
543 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
544 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
607dce96 | 545 | |
b3184779 MN |
546 | put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
547 | put_pixels_tab[0][2] = put_pixels16_y2_mmx2; | |
548 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
549 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
550 | ||
551 | avg_pixels_tab[0][0] = avg_pixels16_mmx2; | |
552 | avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; | |
553 | avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; | |
554 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
555 | ||
556 | put_pixels_tab[1][1] = put_pixels8_x2_mmx2; | |
557 | put_pixels_tab[1][2] = put_pixels8_y2_mmx2; | |
558 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
559 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
560 | ||
561 | avg_pixels_tab[1][0] = avg_pixels8_mmx2; | |
562 | avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | |
563 | avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | |
564 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
de6d9b64 | 565 | } else if (mm_flags & MM_3DNOW) { |
b3184779 MN |
566 | put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
567 | put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | |
568 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
569 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
570 | ||
571 | avg_pixels_tab[0][0] = avg_pixels16_3dnow; | |
572 | avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; | |
573 | avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; | |
574 | avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
575 | ||
576 | put_pixels_tab[1][1] = put_pixels8_x2_3dnow; | |
577 | put_pixels_tab[1][2] = put_pixels8_y2_3dnow; | |
578 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
579 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
580 | ||
581 | avg_pixels_tab[1][0] = avg_pixels8_3dnow; | |
582 | avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; | |
583 | avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; | |
584 | avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
de6d9b64 FB |
585 | } |
586 | } | |
d6a4c0b1 ZK |
587 | |
588 | #if 0 | |
589 | // for speed testing | |
590 | get_pixels = just_return; | |
591 | put_pixels_clamped = just_return; | |
592 | add_pixels_clamped = just_return; | |
593 | ||
594 | pix_abs16x16 = just_return; | |
595 | pix_abs16x16_x2 = just_return; | |
596 | pix_abs16x16_y2 = just_return; | |
597 | pix_abs16x16_xy2 = just_return; | |
598 | ||
599 | put_pixels_tab[0] = just_return; | |
600 | put_pixels_tab[1] = just_return; | |
601 | put_pixels_tab[2] = just_return; | |
602 | put_pixels_tab[3] = just_return; | |
603 | ||
604 | put_no_rnd_pixels_tab[0] = just_return; | |
605 | put_no_rnd_pixels_tab[1] = just_return; | |
606 | put_no_rnd_pixels_tab[2] = just_return; | |
607 | put_no_rnd_pixels_tab[3] = just_return; | |
608 | ||
609 | avg_pixels_tab[0] = just_return; | |
610 | avg_pixels_tab[1] = just_return; | |
611 | avg_pixels_tab[2] = just_return; | |
612 | avg_pixels_tab[3] = just_return; | |
613 | ||
614 | avg_no_rnd_pixels_tab[0] = just_return; | |
615 | avg_no_rnd_pixels_tab[1] = just_return; | |
616 | avg_no_rnd_pixels_tab[2] = just_return; | |
617 | avg_no_rnd_pixels_tab[3] = just_return; | |
618 | ||
d6a4c0b1 ZK |
619 | //av_fdct = just_return; |
620 | //ff_idct = just_return; | |
621 | #endif | |
de6d9b64 | 622 | } |
4f12a497 FB |
623 | |
624 | /* remove any non bit exact operation (testing purpose). NOTE that | |
625 | this function should be kept as small as possible because it is | |
626 | always difficult to test automatically non bit exact cases. */ | |
627 | void dsputil_set_bit_exact_mmx(void) | |
628 | { | |
629 | if (mm_flags & MM_MMX) { | |
b3184779 MN |
630 | |
631 | /* MMX2 & 3DNOW */ | |
632 | put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; | |
633 | put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; | |
634 | avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; | |
635 | put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; | |
636 | put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; | |
637 | avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; | |
4afeaec9 | 638 | |
b3184779 | 639 | if (mm_flags & MM_MMXEXT) { |
4afeaec9 MN |
640 | pix_abs16x16_x2 = pix_abs16x16_x2_mmx; |
641 | pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
642 | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
643 | pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
644 | pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
645 | pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
4f12a497 FB |
646 | } |
647 | } | |
648 | } |