1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21 #include "../dsputil.h"
22 #include "dsputil_altivec.h"
23
24 #ifdef CONFIG_DARWIN
25 #include <sys/sysctl.h>
26 #else /* CONFIG_DARWIN */
27 #include <signal.h>
28 #include <setjmp.h>
29
30 static sigjmp_buf jmpbuf;
31 static volatile sig_atomic_t canjump = 0;
32
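/* SIGILL handler used by the AltiVec probe in has_altivec() below:
   while canjump is set, the first SIGILL longjmps back to the probe;
   any other SIGILL is re-raised with the default handler. */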
33 static void sigill_handler (int sig)
34 {
35 if (!canjump) {
36 signal (sig, SIG_DFL);
37 raise (sig);
38 }
39
40 canjump = 0;
41 siglongjmp (jmpbuf, 1);
42 }
43 #endif /* CONFIG_DARWIN */
44
45 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
46 {
47 int i;
48 int s __attribute__((aligned(16)));
49 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
50 vector unsigned char *tv;
51 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
52 vector unsigned int sad;
53 vector signed int sumdiffs;
54
55 s = 0;
56 sad = (vector unsigned int)vec_splat_u32(0);
57 for(i=0;i<16;i++) {
58 /*
59 Read unaligned pixels into our vectors. The vectors are as follows:
60 pix1v: pix1[0]-pix1[15]
61 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
62 */
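/* vec_lvsl builds a permute mask from the low four bits of the address;
   vec_perm of the two aligned quadwords tv[0] and tv[1] then yields the
   16 bytes starting at the (possibly unaligned) address. */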
63 tv = (vector unsigned char *) pix1;
64 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
65
66 tv = (vector unsigned char *) &pix2[0];
67 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
68
69 tv = (vector unsigned char *) &pix2[1];
70 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
71
72 /* Calculate the average vector */
73 avgv = vec_avg(pix2v, pix2iv);
74
75 /* Calculate a sum of abs differences vector */
76 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
77
78 /* Add each 4 pixel group together and put 4 results into sad */
79 sad = vec_sum4s(t5, sad);
80
81 pix1 += line_size;
82 pix2 += line_size;
83 }
84 /* Sum up the four partial sums, and put the result into s */
85 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
86 sumdiffs = vec_splat(sumdiffs, 3);
87 vec_ste(sumdiffs, 0, &s);
88
89 return s;
90 }
91
92 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
93 {
94 int i;
95 int s __attribute__((aligned(16)));
96 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
97 vector unsigned char *tv;
98 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
99 vector unsigned int sad;
100 vector signed int sumdiffs;
101 uint8_t *pix3 = pix2 + line_size;
102
103 s = 0;
104 sad = (vector unsigned int)vec_splat_u32(0);
105
106 /*
107 Because pix3 = pix2 + line_size, the pix3 of one iteration
108 becomes pix2 in the next iteration. We can use this fact to
109 avoid a potentially expensive unaligned read each time around
110 the loop.
111 Read unaligned pixels into our vectors. The vectors are as follows:
112 pix2v: pix2[0]-pix2[15]
113 (pix3v is read inside the loop and carried over into pix2v)
114 */
115 tv = (vector unsigned char *) &pix2[0];
116 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
117
118 for(i=0;i<16;i++) {
119 /*
120 Read unaligned pixels into our vectors. The vectors are as follows:
121 pix1v: pix1[0]-pix1[15]
122 pix3v: pix3[0]-pix3[15]
123 */
124 tv = (vector unsigned char *) pix1;
125 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
126
127 tv = (vector unsigned char *) &pix3[0];
128 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
129
130 /* Calculate the average vector */
131 avgv = vec_avg(pix2v, pix3v);
132
133 /* Calculate a sum of abs differences vector */
134 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
135
136 /* Add each 4 pixel group together and put 4 results into sad */
137 sad = vec_sum4s(t5, sad);
138
139 pix1 += line_size;
140 pix2v = pix3v;
141 pix3 += line_size;
142
143 }
144
145 /* Sum up the four partial sums, and put the result into s */
146 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
147 sumdiffs = vec_splat(sumdiffs, 3);
148 vec_ste(sumdiffs, 0, &s);
149 return s;
150 }
151
152 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
153 {
154 int i;
155 int s __attribute__((aligned(16)));
156 uint8_t *pix3 = pix2 + line_size;
157 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
158 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
159 vector unsigned char *tv, avgv, t5;
160 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
161 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
162 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
163 vector unsigned short avghv, avglv;
164 vector unsigned short t1, t2, t3, t4;
165 vector unsigned int sad;
166 vector signed int sumdiffs;
167
168 sad = (vector unsigned int)vec_splat_u32(0);
169
170 s = 0;
171
172 /*
173 Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
174 pix2 in the next iteration. We can use this fact to avoid a
175 potentially expensive unaligned read, as well as some splitting and
176 vector addition, each time around the loop.
177 Read unaligned pixels into our vectors. The vectors are as follows:
178 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
179 Split the pixel vectors into shorts
180 */
181 tv = (vector unsigned char *) &pix2[0];
182 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
183
184 tv = (vector unsigned char *) &pix2[1];
185 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
186
187 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
188 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
189 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
190 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
191 t1 = vec_add(pix2hv, pix2ihv);
192 t2 = vec_add(pix2lv, pix2ilv);
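/* t1/t2 now hold the 16-bit horizontal sums pix2[x] + pix2[x+1] for the
   high and low halves of the row; inside the loop they act as the
   previous-row sums. */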
193
194 for(i=0;i<16;i++) {
195 /*
196 Read unaligned pixels into our vectors. The vectors are as follows:
197 pix1v: pix1[0]-pix1[15]
198 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
199 */
200 tv = (vector unsigned char *) pix1;
201 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
202
203 tv = (vector unsigned char *) &pix3[0];
204 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
205
206 tv = (vector unsigned char *) &pix3[1];
207 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
208
209 /*
210 Note that AltiVec does have vec_avg, but it averages only pairs of
211 operands and rounds up. We could do avg(avg(a,b),avg(c,d)), but the
212 double rounding would mean that, for example, avg(3,0,0,1) = 2 when
213 it should be 1. Instead, we have to split the pixel vectors into
214 vectors of shorts and do the averaging by hand.
215 */
216
217 /* Split the pixel vectors into shorts */
218 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
219 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
220 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
221 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
222
223 /* Do the averaging on them */
224 t3 = vec_add(pix3hv, pix3ihv);
225 t4 = vec_add(pix3lv, pix3ilv);
226
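/* Each 16-bit element becomes (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1]
   + 2) >> 2, the correctly rounded four-pixel average. */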
227 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
228 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
229
230 /* Pack the shorts back into a result */
231 avgv = vec_pack(avghv, avglv);
232
233 /* Calculate a sum of abs differences vector */
234 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
235
236 /* Add each 4 pixel group together and put 4 results into sad */
237 sad = vec_sum4s(t5, sad);
238
239 pix1 += line_size;
240 pix3 += line_size;
241 /* Transfer the calculated values for pix3 into pix2 */
242 t1 = t3;
243 t2 = t4;
244 }
245 /* Sum up the four partial sums, and put the result into s */
246 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
247 sumdiffs = vec_splat(sumdiffs, 3);
248 vec_ste(sumdiffs, 0, &s);
249
250 return s;
251 }
252
253 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
254 {
255 int i;
256 int s __attribute__((aligned(16)));
257 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
258 vector unsigned char perm1, perm2, *pix1v, *pix2v;
259 vector unsigned char t1, t2, t3,t4, t5;
260 vector unsigned int sad;
261 vector signed int sumdiffs;
262
263 sad = (vector unsigned int)vec_splat_u32(0);
264
265
266 for(i=0;i<16;i++) {
267 /* Read potentially unaligned pixels into t1 and t2 */
268 perm1 = vec_lvsl(0, pix1);
269 pix1v = (vector unsigned char *) pix1;
270 perm2 = vec_lvsl(0, pix2);
271 pix2v = (vector unsigned char *) pix2;
272 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
273 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
274
275 /* Calculate a sum of abs differences vector */
276 t3 = vec_max(t1, t2);
277 t4 = vec_min(t1, t2);
278 t5 = vec_sub(t3, t4);
279
280 /* Add each 4 pixel group together and put 4 results into sad */
281 sad = vec_sum4s(t5, sad);
282
283 pix1 += line_size;
284 pix2 += line_size;
285 }
286
287 /* Sum up the four partial sums, and put the result into s */
288 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
289 sumdiffs = vec_splat(sumdiffs, 3);
290 vec_ste(sumdiffs, 0, &s);
291
292 return s;
293 }
294
295 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
296 {
297 int i;
298 int s __attribute__((aligned(16)));
299 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
300 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
301 vector unsigned char t1, t2, t3,t4, t5;
302 vector unsigned int sad;
303 vector signed int sumdiffs;
304
305 sad = (vector unsigned int)vec_splat_u32(0);
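/* Apple's gcc uses the Motorola-style parenthesised vector literal,
   while FSF gcc expects curly braces, hence the CONFIG_DARWIN switch. */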
306 #ifdef CONFIG_DARWIN
307 permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
308 #else
309 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
310 #endif
311
312 for(i=0;i<8;i++) {
313 /* Read potentially unaligned pixels into t1 and t2
314 Since we're reading 16 pixels, and actually only want 8,
315 mask out the last 8 pixels. The 0s don't change the sum. */
316 perm1 = vec_lvsl(0, pix1);
317 pix1v = (vector unsigned char *) pix1;
318 perm2 = vec_lvsl(0, pix2);
319 pix2v = (vector unsigned char *) pix2;
320 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
321 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
322
323 /* Calculate a sum of abs differences vector */
324 t3 = vec_max(t1, t2);
325 t4 = vec_min(t1, t2);
326 t5 = vec_sub(t3, t4);
327
328 /* Add each 4 pixel group together and put 4 results into sad */
329 sad = vec_sum4s(t5, sad);
330
331 pix1 += line_size;
332 pix2 += line_size;
333 }
334
335 /* Sum up the four partial sums, and put the result into s */
336 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
337 sumdiffs = vec_splat(sumdiffs, 3);
338 vec_ste(sumdiffs, 0, &s);
339
340 return s;
341 }
342
343 int pix_norm1_altivec(uint8_t *pix, int line_size)
344 {
345 int i;
346 int s __attribute__((aligned(16)));
347 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
348 vector unsigned char *tv;
349 vector unsigned char pixv;
350 vector unsigned int sv;
351 vector signed int sum;
352
353 sv = (vector unsigned int)vec_splat_u32(0);
354
355 s = 0;
356 for (i = 0; i < 16; i++) {
357 /* Read in the potentially unaligned pixels */
358 tv = (vector unsigned char *) pix;
359 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
360
361 /* Square the values, and add them to our sum */
362 sv = vec_msum(pixv, pixv, sv);
363
364 pix += line_size;
365 }
366 /* Sum up the four partial sums, and put the result into s */
367 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
368 sum = vec_splat(sum, 3);
369 vec_ste(sum, 0, &s);
370
371 return s;
372 }
373
374 /**
375  * Sum of Squared Errors for an 8x8 block.
376 * AltiVec-enhanced.
377 * It's the pix_abs8x8_altivec code above w/ squaring added.
378 */
379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
380 {
381 int i;
382 int s __attribute__((aligned(16)));
383 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
384 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
385 vector unsigned char t1, t2, t3,t4, t5;
386 vector unsigned int sum;
387 vector signed int sumsqr;
388
389 sum = (vector unsigned int)vec_splat_u32(0);
390 #ifdef CONFIG_DARWIN
391 permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
392 #else
393 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
394 #endif
395
396 for(i=0;i<8;i++) {
397 /* Read potentially unaligned pixels into t1 and t2
398 Since we're reading 16 pixels, and actually only want 8,
399 mask out the last 8 pixels. The 0s don't change the sum. */
400 perm1 = vec_lvsl(0, pix1);
401 pix1v = (vector unsigned char *) pix1;
402 perm2 = vec_lvsl(0, pix2);
403 pix2v = (vector unsigned char *) pix2;
404 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
405 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
406
407 /*
408 Since we work with unsigned chars, abs(a-b) is computed as
409 max(a,b) - min(a,b); squaring it gives (a-b)^2, the squared error.
410 */
411
412 /* Calculate abs differences vector */
413 t3 = vec_max(t1, t2);
414 t4 = vec_min(t1, t2);
415 t5 = vec_sub(t3, t4);
416
417 /* Square the values and add them to our sum */
418 sum = vec_msum(t5, t5, sum);
419
420 pix1 += line_size;
421 pix2 += line_size;
422 }
423
424 /* Sum up the four partial sums, and put the result into s */
425 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
426 sumsqr = vec_splat(sumsqr, 3);
427 vec_ste(sumsqr, 0, &s);
428
429 return s;
430 }
431
432 /**
433 * Sum of Squared Errors for a 16x16 block.
434 * AltiVec-enhanced.
435 * It's the pix_abs16x16_altivec code above w/ squaring added.
436 */
437 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
438 {
439 int i;
440 int s __attribute__((aligned(16)));
441 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
442 vector unsigned char perm1, perm2, *pix1v, *pix2v;
443 vector unsigned char t1, t2, t3,t4, t5;
444 vector unsigned int sum;
445 vector signed int sumsqr;
446
447 sum = (vector unsigned int)vec_splat_u32(0);
448
449 for(i=0;i<16;i++) {
450 /* Read potentially unaligned pixels into t1 and t2 */
451 perm1 = vec_lvsl(0, pix1);
452 pix1v = (vector unsigned char *) pix1;
453 perm2 = vec_lvsl(0, pix2);
454 pix2v = (vector unsigned char *) pix2;
455 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
456 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
457
458 /*
459 Since we work with unsigned chars, abs(a-b) is computed as
460 max(a,b) - min(a,b); squaring it gives (a-b)^2, the squared error.
461 */
462
463 /* Calculate abs differences vector */
464 t3 = vec_max(t1, t2);
465 t4 = vec_min(t1, t2);
466 t5 = vec_sub(t3, t4);
467
468 /* Square the values and add them to our sum */
469 sum = vec_msum(t5, t5, sum);
470
471 pix1 += line_size;
472 pix2 += line_size;
473 }
474
475 /* Sum up the four partial sums, and put the result into s */
476 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
477 sumsqr = vec_splat(sumsqr, 3);
478 vec_ste(sumsqr, 0, &s);
479
480 return s;
481 }
482
483 int pix_sum_altivec(UINT8 * pix, int line_size)
484 {
485 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
486 vector unsigned char perm, *pixv;
487 vector unsigned char t1;
488 vector unsigned int sad;
489 vector signed int sumdiffs;
490
491 int i;
492 int s __attribute__((aligned(16)));
493
494 sad = (vector unsigned int)vec_splat_u32(0);
495
496 for (i = 0; i < 16; i++) {
497 /* Read the potentially unaligned 16 pixels into t1 */
498 perm = vec_lvsl(0, pix);
499 pixv = (vector unsigned char *) pix;
500 t1 = vec_perm(pixv[0], pixv[1], perm);
501
502 /* Add each 4 pixel group together and put 4 results into sad */
503 sad = vec_sum4s(t1, sad);
504
505 pix += line_size;
506 }
507
508 /* Sum up the four partial sums, and put the result into s */
509 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
510 sumdiffs = vec_splat(sumdiffs, 3);
511 vec_ste(sumdiffs, 0, &s);
512
513 return s;
514 }
515
516 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
517 {
518 int i;
519 vector unsigned char perm, bytes, *pixv;
520 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
521 vector signed short shorts;
522
523 for(i=0;i<8;i++)
524 {
525 // Read potentially unaligned pixels.
526 // We're reading 16 pixels, and actually only want 8,
527 // but we simply ignore the extras.
528 perm = vec_lvsl(0, pixels);
529 pixv = (vector unsigned char *) pixels;
530 bytes = vec_perm(pixv[0], pixv[1], perm);
531
532 // convert the bytes into shorts
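// (vec_mergeh with the zero vector interleaves a zero byte before each
// pixel byte, which zero-extends the leftmost 8 pixels to 16-bit shorts)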
533 shorts = (vector signed short)vec_mergeh(zero, bytes);
534
535 // save the data to the block, we assume the block is 16-byte aligned
536 vec_st(shorts, i*16, (vector signed short*)block);
537
538 pixels += line_size;
539 }
540 }
541
542 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
543 const UINT8 *s2, int stride)
544 {
545 int i;
546 vector unsigned char perm, bytes, *pixv;
547 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
548 vector signed short shorts1, shorts2;
549
550 for(i=0;i<4;i++)
551 {
552 // Read potentially unaligned pixels
553 // We're reading 16 pixels, and actually only want 8,
554 // but we simply ignore the extras.
555 perm = vec_lvsl(0, s1);
556 pixv = (vector unsigned char *) s1;
557 bytes = vec_perm(pixv[0], pixv[1], perm);
558
559 // convert the bytes into shorts
560 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
561
562 // Do the same for the second block of pixels
563 perm = vec_lvsl(0, s2);
564 pixv = (vector unsigned char *) s2;
565 bytes = vec_perm(pixv[0], pixv[1], perm);
566
567 // convert the bytes into shorts
568 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
569
570 // Do the subtraction
571 shorts1 = vec_sub(shorts1, shorts2);
572
573 // save the data to the block, we assume the block is 16-byte aligned
574 vec_st(shorts1, 0, (vector signed short*)block);
575
576 s1 += stride;
577 s2 += stride;
578 block += 8;
579
580
581 // The code below is a copy of the code above... This is a manual
582 // unroll.
583
584 // Read potentially unaligned pixels
585 // We're reading 16 pixels, and actually only want 8,
586 // but we simply ignore the extras.
587 perm = vec_lvsl(0, s1);
588 pixv = (vector unsigned char *) s1;
589 bytes = vec_perm(pixv[0], pixv[1], perm);
590
591 // convert the bytes into shorts
592 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
593
594 // Do the same for the second block of pixels
595 perm = vec_lvsl(0, s2);
596 pixv = (vector unsigned char *) s2;
597 bytes = vec_perm(pixv[0], pixv[1], perm);
598
599 // convert the bytes into shorts
600 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
601
602 // Do the subtraction
603 shorts1 = vec_sub(shorts1, shorts2);
604
605 // save the data to the block, we assume the block is 16-byte aligned
606 vec_st(shorts1, 0, (vector signed short*)block);
607
608 s1 += stride;
609 s2 += stride;
610 block += 8;
611 }
612 }
613
614 int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
615 return pix_abs16x16_altivec(a,b,stride);
616 }
617
618 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
619 return pix_abs8x8_altivec(a,b,stride);
620 }
621
622 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
623 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
624 int i;
625 for(i=0; i+7<w; i+=8){
626 dst[i+0] += src[i+0];
627 dst[i+1] += src[i+1];
628 dst[i+2] += src[i+2];
629 dst[i+3] += src[i+3];
630 dst[i+4] += src[i+4];
631 dst[i+5] += src[i+5];
632 dst[i+6] += src[i+6];
633 dst[i+7] += src[i+7];
634 }
635 for(; i<w; i++)
636 dst[i+0] += src[i+0];
637 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
638 register int i;
639 register vector unsigned char vdst, vsrc;
640
641 /* dst and src are 16-byte aligned (guaranteed) */
642 for(i = 0 ; (i + 15) < w ; i+=16)
643 {
644 vdst = vec_ld(i, (unsigned char*)dst);
645 vsrc = vec_ld(i, (unsigned char*)src);
646 vdst = vec_add(vsrc, vdst);
647 vec_st(vdst, i, (unsigned char*)dst);
648 }
649 /* if w is not a multiple of 16, add the trailing bytes one by one */
650 for (; (i < w) ; i++)
651 {
652 dst[i] += src[i];
653 }
654 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
655 }
656
657 /* next one assumes that ((line_size % 16) == 0) */
658 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
659 {
660 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
661 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
662 int i;
663
664 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
665
666 for(i=0; i<h; i++) {
667 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
668 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
669 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
670 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
671 pixels+=line_size;
672 block +=line_size;
673 }
674
675 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
676
677 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
678 register vector unsigned char pixelsv1, pixelsv2;
679 register vector unsigned char perm = vec_lvsl(0, pixels);
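/* line_size is a multiple of 16 (see the comment above), so the
   alignment of pixels never changes inside the loop and the permute
   vector can be computed once and reused. */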
680 int i;
681
682 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
683
684 for(i=0; i<h; i++) {
685 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
686 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
687 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
688 0, (unsigned char*)block);
689 pixels+=line_size;
690 block +=line_size;
691 }
692
693 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
694
695 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
696 }
697
698 /* next one assumes that ((line_size % 16) == 0) */
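/* op_avg below is the usual packed rounded average: per byte,
   (a|b) - (((a^b) & 0xFE) >> 1) == (a + b + 1) >> 1; the 0xFEFEFEFE mask
   keeps bits from shifting across byte boundaries. */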
699 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
700 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
701 {
702 POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
703 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
704 int i;
705
706 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
707
708 for(i=0; i<h; i++) {
709 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
710 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
711 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
712 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
713 pixels+=line_size;
714 block +=line_size;
715 }
716
717 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
718
719 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
720 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
721 register vector unsigned char perm = vec_lvsl(0, pixels);
722 int i;
723
724 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
725
726 for(i=0; i<h; i++) {
727 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
728 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
729 blockv = vec_ld(0, block);
730 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
731 blockv = vec_avg(blockv,pixelsv);
732 vec_st(blockv, 0, (unsigned char*)block);
733 pixels+=line_size;
734 block +=line_size;
735 }
736
737 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
738
739 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
740 }
741
742 /* next one assumes that ((line_size % 8) == 0) */
743 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
744 {
745 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
746 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
747 int i;
748 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
749 for (i = 0; i < h; i++) {
750 *((uint32_t *) (block)) =
751 (((*((uint32_t *) (block))) |
752 ((((const struct unaligned_32 *) (pixels))->l))) -
753 ((((*((uint32_t *) (block))) ^
754 ((((const struct unaligned_32 *) (pixels))->
755 l))) & 0xFEFEFEFEUL) >> 1));
756 *((uint32_t *) (block + 4)) =
757 (((*((uint32_t *) (block + 4))) |
758 ((((const struct unaligned_32 *) (pixels + 4))->l))) -
759 ((((*((uint32_t *) (block + 4))) ^
760 ((((const struct unaligned_32 *) (pixels +
761 4))->
762 l))) & 0xFEFEFEFEUL) >> 1));
763 pixels += line_size;
764 block += line_size;
765 }
766 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
767
768 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
769 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
770 int i;
771
772 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
773
774 for (i = 0; i < h; i++) {
775 /*
776 block is only 8-byte aligned, so our 8 pixels are either in the
777 left half of a 16-byte aligned vector or in the right half
778 */
779 int rightside = ((unsigned long)block & 0x0000000F);
780
781 blockv = vec_ld(0, block);
782 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
783 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
784 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
785
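/* vcprm builds the vec_perm selector at 32-bit granularity: plain
   indices take words from the first operand (blockv), s-prefixed ones
   from the second (pixelsv). The half of the vector that does not hold
   our 8 pixels is filled from blockv itself, so the vec_avg below
   leaves it unchanged when the result is stored back. */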
786 if (rightside)
787 {
788 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
789 }
790 else
791 {
792 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
793 }
794
795 blockv = vec_avg(blockv, pixelsv);
796
797 vec_st(blockv, 0, block);
798
799 pixels += line_size;
800 block += line_size;
801 }
802
803 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
804
805 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
806 }
807
808 /* next one assumes that ((line_size % 8) == 0) */
809 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
810 {
811 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
812 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
813 int j;
814 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
815 for (j = 0; j < 2; j++) {
816 int i;
817 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
818 const uint32_t b =
819 (((const struct unaligned_32 *) (pixels + 1))->l);
820 uint32_t l0 =
821 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
822 uint32_t h0 =
823 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
824 uint32_t l1, h1;
825 pixels += line_size;
826 for (i = 0; i < h; i += 2) {
827 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
828 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
829 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
830 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
831 *((uint32_t *) block) =
832 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
833 pixels += line_size;
834 block += line_size;
835 a = (((const struct unaligned_32 *) (pixels))->l);
836 b = (((const struct unaligned_32 *) (pixels + 1))->l);
837 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
838 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
839 *((uint32_t *) block) =
840 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
841 pixels += line_size;
842 block += line_size;
843 } pixels += 4 - line_size * (h + 1);
844 block += 4 - line_size * h;
845 }
846
847 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
848
849 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
850 register int i;
851 register vector unsigned char
852 pixelsv1, pixelsv2,
853 pixelsavg;
854 register vector unsigned char
855 blockv, temp1, temp2;
856 register vector unsigned short
857 pixelssum1, pixelssum2, temp3;
858 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
859 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
860
861 temp1 = vec_ld(0, pixels);
862 temp2 = vec_ld(16, pixels);
863 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
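/* Special case: when pixels is 15 bytes past a 16-byte boundary,
   vec_lvsl(1, pixels) wraps around to a shift of 0 and would select
   only from temp1; the data starting at pixels + 1 is then exactly the
   second aligned quadword, temp2. */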
864 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
865 {
866 pixelsv2 = temp2;
867 }
868 else
869 {
870 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
871 }
872 pixelsv1 = vec_mergeh(vczero, pixelsv1);
873 pixelsv2 = vec_mergeh(vczero, pixelsv2);
874 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
875 (vector unsigned short)pixelsv2);
876 pixelssum1 = vec_add(pixelssum1, vctwo);
877
878 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
879 for (i = 0; i < h ; i++) {
880 int rightside = ((unsigned long)block & 0x0000000F);
881 blockv = vec_ld(0, block);
882
883 temp1 = vec_ld(line_size, pixels);
884 temp2 = vec_ld(line_size + 16, pixels);
885 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
886 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
887 {
888 pixelsv2 = temp2;
889 }
890 else
891 {
892 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
893 }
894
895 pixelsv1 = vec_mergeh(vczero, pixelsv1);
896 pixelsv2 = vec_mergeh(vczero, pixelsv2);
897 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
898 (vector unsigned short)pixelsv2);
899 temp3 = vec_add(pixelssum1, pixelssum2);
900 temp3 = vec_sra(temp3, vctwo);
901 pixelssum1 = vec_add(pixelssum2, vctwo);
902 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
903
904 if (rightside)
905 {
906 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
907 }
908 else
909 {
910 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
911 }
912
913 vec_st(blockv, 0, block);
914
915 block += line_size;
916 pixels += line_size;
917 }
918
919 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
920 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
921 }
922
923 /* next one assumes that ((line_size % 8) == 0) */
924 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
925 {
926 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
927 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
928 int j;
929 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
930 for (j = 0; j < 2; j++) {
931 int i;
932 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
933 const uint32_t b =
934 (((const struct unaligned_32 *) (pixels + 1))->l);
935 uint32_t l0 =
936 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
937 uint32_t h0 =
938 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
939 uint32_t l1, h1;
940 pixels += line_size;
941 for (i = 0; i < h; i += 2) {
942 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
943 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
944 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
945 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
946 *((uint32_t *) block) =
947 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
948 pixels += line_size;
949 block += line_size;
950 a = (((const struct unaligned_32 *) (pixels))->l);
951 b = (((const struct unaligned_32 *) (pixels + 1))->l);
952 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
953 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
954 *((uint32_t *) block) =
955 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
956 pixels += line_size;
957 block += line_size;
958 } pixels += 4 - line_size * (h + 1);
959 block += 4 - line_size * h;
960 }
961
962 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
963
964 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
965 register int i;
966 register vector unsigned char
967 pixelsv1, pixelsv2,
968 pixelsavg;
969 register vector unsigned char
970 blockv, temp1, temp2;
971 register vector unsigned short
972 pixelssum1, pixelssum2, temp3;
973 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
974 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
975 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
976
977 temp1 = vec_ld(0, pixels);
978 temp2 = vec_ld(16, pixels);
979 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
980 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
981 {
982 pixelsv2 = temp2;
983 }
984 else
985 {
986 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
987 }
988 pixelsv1 = vec_mergeh(vczero, pixelsv1);
989 pixelsv2 = vec_mergeh(vczero, pixelsv2);
990 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
991 (vector unsigned short)pixelsv2);
992 pixelssum1 = vec_add(pixelssum1, vcone);
993
994 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
995 for (i = 0; i < h ; i++) {
996 int rightside = ((unsigned long)block & 0x0000000F);
997 blockv = vec_ld(0, block);
998
999 temp1 = vec_ld(line_size, pixels);
1000 temp2 = vec_ld(line_size + 16, pixels);
1001 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1002 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1003 {
1004 pixelsv2 = temp2;
1005 }
1006 else
1007 {
1008 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1009 }
1010
1011 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1012 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1013 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1014 (vector unsigned short)pixelsv2);
1015 temp3 = vec_add(pixelssum1, pixelssum2);
1016 temp3 = vec_sra(temp3, vctwo);
1017 pixelssum1 = vec_add(pixelssum2, vcone);
1018 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1019
1020 if (rightside)
1021 {
1022 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1023 }
1024 else
1025 {
1026 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1027 }
1028
1029 vec_st(blockv, 0, block);
1030
1031 block += line_size;
1032 pixels += line_size;
1033 }
1034
1035 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1036 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1037 }
1038
1039 /* next one assumes that ((line_size % 16) == 0) */
1040 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1041 {
1042 POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
1043 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1044 int j;
1045 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1046 for (j = 0; j < 4; j++) {
1047 int i;
1048 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1049 const uint32_t b =
1050 (((const struct unaligned_32 *) (pixels + 1))->l);
1051 uint32_t l0 =
1052 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1053 uint32_t h0 =
1054 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1055 uint32_t l1, h1;
1056 pixels += line_size;
1057 for (i = 0; i < h; i += 2) {
1058 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1059 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1060 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1061 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1062 *((uint32_t *) block) =
1063 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1064 pixels += line_size;
1065 block += line_size;
1066 a = (((const struct unaligned_32 *) (pixels))->l);
1067 b = (((const struct unaligned_32 *) (pixels + 1))->l);
1068 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1069 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1070 *((uint32_t *) block) =
1071 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1072 pixels += line_size;
1073 block += line_size;
1074 } pixels += 4 - line_size * (h + 1);
1075 block += 4 - line_size * h;
1076 }
1077
1078 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1079
1080 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1081 register int i;
1082 register vector unsigned char
1083 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1084 register vector unsigned char
1085 blockv, temp1, temp2;
1086 register vector unsigned short
1087 pixelssum1, pixelssum2, temp3,
1088 pixelssum3, pixelssum4, temp4;
1089 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1090 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1091
1092 temp1 = vec_ld(0, pixels);
1093 temp2 = vec_ld(16, pixels);
1094 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1095 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1096 {
1097 pixelsv2 = temp2;
1098 }
1099 else
1100 {
1101 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1102 }
1103 pixelsv3 = vec_mergel(vczero, pixelsv1);
1104 pixelsv4 = vec_mergel(vczero, pixelsv2);
1105 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1106 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1107 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1108 (vector unsigned short)pixelsv4);
1109 pixelssum3 = vec_add(pixelssum3, vctwo);
1110 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1111 (vector unsigned short)pixelsv2);
1112 pixelssum1 = vec_add(pixelssum1, vctwo);
1113
1114 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1115 for (i = 0; i < h ; i++) {
1116 blockv = vec_ld(0, block);
1117
1118 temp1 = vec_ld(line_size, pixels);
1119 temp2 = vec_ld(line_size + 16, pixels);
1120 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1121 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1122 {
1123 pixelsv2 = temp2;
1124 }
1125 else
1126 {
1127 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1128 }
1129
1130 pixelsv3 = vec_mergel(vczero, pixelsv1);
1131 pixelsv4 = vec_mergel(vczero, pixelsv2);
1132 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1133 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1134
1135 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1136 (vector unsigned short)pixelsv4);
1137 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1138 (vector unsigned short)pixelsv2);
1139 temp4 = vec_add(pixelssum3, pixelssum4);
1140 temp4 = vec_sra(temp4, vctwo);
1141 temp3 = vec_add(pixelssum1, pixelssum2);
1142 temp3 = vec_sra(temp3, vctwo);
1143
1144 pixelssum3 = vec_add(pixelssum4, vctwo);
1145 pixelssum1 = vec_add(pixelssum2, vctwo);
1146
1147 blockv = vec_packsu(temp3, temp4);
1148
1149 vec_st(blockv, 0, block);
1150
1151 block += line_size;
1152 pixels += line_size;
1153 }
1154
1155 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1156 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1157 }
1158
1159 /* next one assumes that ((line_size % 16) == 0) */
1160 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1161 {
1162 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1163 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1164 int j;
1165 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1166 for (j = 0; j < 4; j++) {
1167 int i;
1168 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1169 const uint32_t b =
1170 (((const struct unaligned_32 *) (pixels + 1))->l);
1171 uint32_t l0 =
1172 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1173 uint32_t h0 =
1174 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1175 uint32_t l1, h1;
1176 pixels += line_size;
1177 for (i = 0; i < h; i += 2) {
1178 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1179 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1180 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1181 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1182 *((uint32_t *) block) =
1183 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1184 pixels += line_size;
1185 block += line_size;
1186 a = (((const struct unaligned_32 *) (pixels))->l);
1187 b = (((const struct unaligned_32 *) (pixels + 1))->l);
1188 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1189 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1190 *((uint32_t *) block) =
1191 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1192 pixels += line_size;
1193 block += line_size;
1194 } pixels += 4 - line_size * (h + 1);
1195 block += 4 - line_size * h;
1196 }
1197
1198 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1199
1200 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1201 register int i;
1202 register vector unsigned char
1203 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1204 register vector unsigned char
1205 blockv, temp1, temp2;
1206 register vector unsigned short
1207 pixelssum1, pixelssum2, temp3,
1208 pixelssum3, pixelssum4, temp4;
1209 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1210 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1211 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1212
1213 temp1 = vec_ld(0, pixels);
1214 temp2 = vec_ld(16, pixels);
1215 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1216 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1217 {
1218 pixelsv2 = temp2;
1219 }
1220 else
1221 {
1222 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1223 }
1224 pixelsv3 = vec_mergel(vczero, pixelsv1);
1225 pixelsv4 = vec_mergel(vczero, pixelsv2);
1226 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1227 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1228 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1229 (vector unsigned short)pixelsv4);
1230 pixelssum3 = vec_add(pixelssum3, vcone);
1231 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1232 (vector unsigned short)pixelsv2);
1233 pixelssum1 = vec_add(pixelssum1, vcone);
1234
1235 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1236 for (i = 0; i < h ; i++) {
1237 blockv = vec_ld(0, block);
1238
1239 temp1 = vec_ld(line_size, pixels);
1240 temp2 = vec_ld(line_size + 16, pixels);
1241 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1242 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1243 {
1244 pixelsv2 = temp2;
1245 }
1246 else
1247 {
1248 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1249 }
1250
1251 pixelsv3 = vec_mergel(vczero, pixelsv1);
1252 pixelsv4 = vec_mergel(vczero, pixelsv2);
1253 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1254 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1255
1256 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1257 (vector unsigned short)pixelsv4);
1258 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1259 (vector unsigned short)pixelsv2);
1260 temp4 = vec_add(pixelssum3, pixelssum4);
1261 temp4 = vec_sra(temp4, vctwo);
1262 temp3 = vec_add(pixelssum1, pixelssum2);
1263 temp3 = vec_sra(temp3, vctwo);
1264
1265 pixelssum3 = vec_add(pixelssum4, vcone);
1266 pixelssum1 = vec_add(pixelssum2, vcone);
1267
1268 blockv = vec_packsu(temp3, temp4);
1269
1270 vec_st(blockv, 0, block);
1271
1272 block += line_size;
1273 pixels += line_size;
1274 }
1275
1276 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1277 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1278 }
1279
1280 int has_altivec(void)
1281 {
1282 #ifdef CONFIG_DARWIN
1283 int sels[2] = {CTL_HW, HW_VECTORUNIT};
1284 int has_vu = 0;
1285 size_t len = sizeof(has_vu);
1286 int err;
1287
1288 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1289
1290 if (err == 0) return (has_vu != 0);
1291 #else /* CONFIG_DARWIN */
1292 /* not Darwin, do it the brute-force way */
1293 /* this is borrowed from the libmpeg2 library */
1294 {
1295 signal (SIGILL, sigill_handler);
1296 if (sigsetjmp (jmpbuf, 1)) {
1297 signal (SIGILL, SIG_DFL);
1298 } else {
1299 canjump = 1;
1300
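/* Setting VRSAVE (SPR 256) to -1 marks all vector registers as in use;
   the vand is an AltiVec instruction, so on a CPU without AltiVec this
   sequence raises SIGILL, which sigill_handler turns into a siglongjmp
   back to the sigsetjmp above. */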
1301 asm volatile ("mtspr 256, %0\n\t"
1302 "vand %%v0, %%v0, %%v0"
1303 :
1304 : "r" (-1));
1305
1306 signal (SIGILL, SIG_DFL);
1307 return 1;
1308 }
1309 }
1310 #endif /* CONFIG_DARWIN */
1311 return 0;
1312 }