/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

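/*
 * Runtime AltiVec detection on non-Darwin systems (see has_altivec() at
 * the bottom of this file): we execute an AltiVec instruction and catch
 * the SIGILL it raises on CPUs without a vector unit. canjump guards
 * against a SIGILL that arrives before sigsetjmp() has armed jmpbuf;
 * in that case the default action is restored and the signal re-raised.
 */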
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
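        /*
           (vec_lvsl/vec_perm is the standard AltiVec unaligned-load idiom:
           vec_ld can only load from 16-byte-aligned addresses, so we load
           the two aligned vectors straddling the address and let the
           permute vector from vec_lvsl shift the 16 wanted bytes into
           place. For example, with pix1 = base+3 (base 16-byte aligned),
           tv[0] holds bytes base[0..15], tv[1] holds base[16..31], and
           vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)) yields base[3..18].)
        */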
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
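    /*
       (vec_sums adds the four 32-bit partial sums and leaves the total in
       element 3; vec_splat then copies it into every element, so the
       single-element store vec_ste writes the total through the 16-byte
       aligned address of s regardless of which element that address
       selects.)
    */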
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv;
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting
       and vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */
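        /*
           (Worked example: pixels 3, 0, 0, 1 should average, with rounding,
           to (3+0+0+1+2)>>2 = 1; but vec_avg(3,0) = 2 and vec_avg(0,1) = 1,
           so vec_avg(vec_avg(3,0), vec_avg(0,1)) = vec_avg(2,1) = 2, because
           each vec_avg rounds up individually.)
        */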

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
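        /* (The same vector of 2s serves both as the rounding addend and as
           the shift count: each 16-bit lane computes (t1 + t3 + 2) >> 2.) */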

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
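    /* (AVV comes from gcc_fixes.h; it papers over the two vector-literal
       syntaxes, (a,b,...) for Apple's compiler vs. {a,b,...} for gcc.) */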

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */
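
        /* (vec_msum below multiplies each unsigned byte of t5 by itself and
           accumulates every group of four products into the matching 32-bit
           element of sum, so the squaring and the partial summation happen
           in a single instruction.) */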

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, finish the tail in scalar code */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = (((const struct unaligned_32 *) (pixels))->l);
        *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
        *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // Hand-unrolling the loop by 4 gains about 15%;
    // minimum execution time goes from 74 to 60 cycles.
    // It's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // All this is on a 7450, tuning for the 7450.
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
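/* (Branch-free byte-wise rounding average on four packed bytes: from
   a + b = 2*(a&b) + (a^b) and a|b = (a&b) + (a^b), it follows that
   (a + b + 1) >> 1 = (a|b) - ((a^b) >> 1); the & 0xFEFEFEFEUL clears each
   byte's low bit before the shift so no bit leaks into its neighbour.) */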
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->l))) &
              0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels + 4))->l))) &
              0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8-byte aligned, so we're either in the left half of a
           16-byte-aligned line or in the (unaligned) right half
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }
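
        /* (vcprm builds a 32-bit-word permute constant: plain indices pick
           words of the first operand, s-prefixed ones words of the second,
           so the two cases above splice the 8 new bytes into whichever
           half of the 16-byte destination line the block occupies.) */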

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
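    /* (pixelssum1 now holds row0[x] + row0[x+1] + 2 for each of the 8
       left-hand shorts; every iteration below adds the next row's pair sum
       and shifts right by 2, i.e. the rounded half-pel average
       (a + b + c + d + 2) >> 2, with the +2 carried from row to row.) */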

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);
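    /* (Same scheme as put_pixels8_xy2_altivec above, but with +1 instead
       of +2 as the carried addend, giving the no-rounding average
       (a + b + c + d + 1) >> 2.) */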

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* not Darwin, so do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

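            /* (mtspr 256 writes -1 to VRSAVE, telling the OS to preserve
               all vector registers across context switches; the vand is a
               harmless AltiVec instruction. On a CPU without AltiVec this
               raises SIGILL and sigill_handler longjmps back to the
               sigsetjmp above, so we fall through to return 0.) */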
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}