/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "../dsputil.h"
#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */

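/**
 * Sum of absolute differences for a 16x16 block, the reference block being
 * interpolated horizontally at half-pel precision (vec_avg of pix2[x] and
 * pix2[x+1], which rounds up).
 * AltiVec-enhanced.
 */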
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);
#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the pix_abs16x16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

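/**
 * Sum of the pixel values of a 16x16 block.
 * AltiVec-enhanced.
 */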
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
                         const UINT8 *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs16x16_altivec(a,b,stride);
}

int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs8x8_altivec(a,b,stride);
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = (((const struct unaligned_32 *) (pixels))->l);
        *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
        *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels + 4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* not Darwin, so do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}