/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "../dsputil.h"
#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
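/*
  Added note (not part of the original code): this file's unaligned loads all
  follow the same AltiVec idiom.  vec_ld only fetches from 16-byte aligned
  addresses, so an unaligned vector is built by loading the two aligned
  quadwords that straddle the pointer and selecting bytes with vec_perm,
  using the permute control returned by vec_lvsl.  For a pointer p with
  (p & 15) == 3, a minimal sketch:

      vector unsigned char *tv = (vector unsigned char *) p;
      vector unsigned char v  = vec_perm(tv[0], tv[1], vec_lvsl(0, p));

  tv[0] and tv[1] are the aligned quadwords at (p & ~15) and (p & ~15) + 16,
  and vec_lvsl(0, p) = {3,4,...,18} makes vec_perm pick p[0]..p[15] out of
  that 32-byte window.
*/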
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]	pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
          Note that Altivec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts,
          and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
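/*
  Added worked example for the rounding note above: take the four neighbours
  a=3, b=0, c=0, d=1.  The correctly rounded xy2 average is
  (3+0+0+1+2) >> 2 = 1.  Cascading the round-up byte average instead gives
  vec_avg(3,0) = (3+0+1)>>1 = 2, vec_avg(0,1) = (0+1+1)>>1 = 1, and then
  vec_avg(2,1) = (2+1+1)>>1 = 2, one too high.  Widening to shorts and
  computing (a+b+c+d+2) >> 2 by hand, as done above, avoids that bias.
*/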
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
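/*
  Added note: for unsigned bytes the absolute difference is formed as
  max(a,b) - min(a,b), since a plain vec_sub would wrap modulo 256 whenever
  b > a.  vec_sum4s then adds each group of four byte differences into one
  32-bit lane of sad, vec_sums folds the four lanes into the last element,
  vec_splat copies that element to every lane, and vec_ste stores one lane
  to the 16-byte aligned scalar s.
*/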
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for an 8x8 block.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
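/*
  Added note on the identity used above: |a-b| = max(a,b) - min(a,b) fits in
  a byte, and (a-b)^2 == |a-b|^2, so squaring the byte-wise absolute
  difference gives the squared error directly.  vec_msum(t5, t5, sum)
  multiplies each byte by itself and accumulates every four products into one
  32-bit lane; e.g. for the differences 3, 1, 0, 2 in one group it adds
  9 + 1 + 0 + 4 = 14 to that lane.
*/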
/**
 * Sum of Squared Errors for a 16x16 block.
 * It's the pix_abs16x16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
                         const UINT8 *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unrolling.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs16x16_altivec(a,b,stride);
}

int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs8x8_altivec(a,b,stride);
}
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t*)(block)) = (((const struct unaligned_32 *) (pixels))->l);
        *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
        *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
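/*
  Added note on the op_avg macro above: per byte it computes the round-up
  average (a + b + 1) >> 1 via the identity (a | b) - ((a ^ b) >> 1).
  E.g. a=3, b=0: (3|0) - ((3^0)>>1) = 3 - 1 = 2 = (3+0+1)>>1.  The
  0xFEFEFEFEUL mask clears each byte's low bit before the shift so no bit
  leaks between the four pixels packed in the uint32_t; the AltiVec path
  gets the same rounding behaviour from vec_avg.
*/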
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
               ((((const struct unaligned_32 *) (pixels))->
                 l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
               ((((const struct unaligned_32 *) (pixels + 4))->
                 l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
          block is 8 bytes-aligned, so we're either in the
          left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
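/*
  Added note: an 8-byte-aligned block lies either in the low or the high half
  of a 16-byte quadword.  The rightside test selects the matching vcprm mask
  so that only the 8 bytes belonging to this block are replaced by the new
  pixels before vec_avg; the other half of blockv is averaged with itself and
  therefore stored back unchanged.
*/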
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
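/*
  Added note on the scalar reference path above: each uint32_t holds four
  pixels, and the per-byte average (A+B+C+D+2) >> 2 is split into the low two
  bits (l0, l1, with the 0x02020202UL rounding bias) and the upper six bits
  already shifted down by two (h0, h1).  Because no byte lane can overflow,
  h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL) reassembles the rounded average
  of the four neighbouring pixels without carries crossing byte boundaries.
  The AltiVec path does the same arithmetic in 16-bit lanes, and keeps the
  previous row's sum in pixelssum1 so only one new row is loaded per
  iteration.
*/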
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* no Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}
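/*
  Added note: the non-Darwin probe enables the vector unit by writing VRSAVE
  (SPR 256) and then executes a harmless "vand".  On a CPU without AltiVec
  that instruction raises SIGILL, the handler siglongjmps back to the
  sigsetjmp above, and the function falls through to return 0; on an AltiVec
  CPU the instruction completes and the function returns 1.  canjump guards
  against a stray SIGILL arriving before the jump buffer is valid.
*/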