/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
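/*
  Illustrative scalar sketch (not part of the original file, hence kept out of
  the build with #if 0): what sad16_x2_altivec computes, one pixel at a time.
  vec_avg rounds up, so the matching scalar average is (a + b + 1) >> 1.
*/
#if 0
static int sad16_x2_scalar_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i, j, s = 0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int avg = (pix2[j] + pix2[j + 1] + 1) >> 1; /* rounding-up average */
            int d = pix1[j] - avg;
            s += (d < 0) ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#endif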
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        /* This row's pix3 becomes next row's pix2 */
        pix2v = pix3v;

        pix1 += line_size;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
          Note that AltiVec does have vec_avg, but this works on vector pairs
          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
          Instead, we have to split the pixel vectors into vectors of shorts,
          and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;

        pix1 += line_size;
        pix3 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
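/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  the xy2 case averages four neighbours as (A + B + C + D + 2) >> 2, which is
  what the 16-bit arithmetic above implements. Nesting the rounding-up byte
  average instead, avg(avg(A,B), avg(C,D)), can be off by one: A=3, B=0, C=0,
  D=1 gives 2 that way, while the correct value is 1.
*/
#if 0
static int sad16_xy2_scalar_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i, j, s = 0;
    uint8_t *pix3 = pix2 + line_size;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int avg = (pix2[j] + pix2[j + 1] + pix3[j] + pix3[j + 1] + 2) >> 2;
            int d = pix1[j] - avg;
            s += (d < 0) ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
#endif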
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
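/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  plain 16xh SAD. The vector code gets |a - b| for unsigned bytes without a
  wider type by computing max(a,b) - min(a,b), which this loop mirrors.
*/
#if 0
static int sad16_scalar_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i, j, s = 0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int a = pix1[j], b = pix2[j];
            s += (a > b) ? (a - b) : (b - a);   /* max(a,b) - min(a,b) */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#endif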
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
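/*
  Illustrative sketch (not part of the original file, hence #if 0): for
  8-pixel-wide blocks the code still loads 16 bytes, but ANDs both inputs with
  permclear so the unwanted upper half becomes zero in both operands; the
  absolute differences there are then zero and do not disturb the sum.
*/
#if 0
static int masked_abs_diff_sketch(uint8_t a, uint8_t b, uint8_t mask)
{
    a &= mask;   /* mask = 255 for wanted lanes, 0 for the ignored ones */
    b &= mask;
    return (a > b) ? (a - b) : (b - a);
}
#endif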
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
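/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  pix_norm1 is the sum of squared pixel values of a 16x16 block; vec_msum
  above multiplies byte pairs and accumulates them into 32-bit lanes in a
  single instruction, which this loop spells out one pixel at a time.
*/
#if 0
static int pix_norm1_scalar_sketch(uint8_t *pix, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
#endif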
/**
 * Sum of Squared Errors for an 8x8 block.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for a 16x16 block.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
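/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  the SSE routines rely on |a - b|^2 == (a - b)^2, so the unsigned difference
  max(a,b) - min(a,b) computed above can be squared directly.
*/
#if 0
static int sse16_scalar_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i, j, s = 0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#endif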
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
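/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  on big-endian AltiVec, vec_mergeh(zero, bytes) interleaves a zero byte in
  front of each pixel byte, which is exactly a zero-extension of the first
  eight bytes to 16-bit values, i.e. what this per-row loop does.
*/
#if 0
static void get_pixels_row_sketch(DCTELEM *dst, const uint8_t *pixels)
{
    int j;
    for (j = 0; j < 8; j++)
        dst[j] = pixels[j];   /* uint8_t widened to a 16-bit DCT coefficient */
}
#endif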
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unrolling.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
        dst[i] += src[i];
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block += line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
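/*
  Illustrative sketch (not part of the original file, hence #if 0): op_avg is
  the classic SWAR trick for a rounding-up byte-wise average of four bytes
  packed in a 32-bit word.  Per byte it computes
      (a | b) - ((a ^ b) >> 1) == (a & b) + ceil((a ^ b) / 2) == (a + b + 1) >> 1,
  and the 0xFEFEFEFE mask drops each byte's low bit before the shift so no bit
  leaks into the neighbouring byte.
*/
#if 0
static uint32_t op_avg_sketch(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
}
#endif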
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            ((*((uint32_t *) (block)) |
              ((const struct unaligned_32 *) (pixels))->l) -
             (((*((uint32_t *) (block)) ^
                ((const struct unaligned_32 *) (pixels))->l) &
               0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            ((*((uint32_t *) (block + 4)) |
              ((const struct unaligned_32 *) (pixels + 4))->l) -
             (((*((uint32_t *) (block + 4)) ^
                ((const struct unaligned_32 *) (pixels + 4))->l) &
               0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
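/*
  Illustrative scalar sketch (not part of the original file, hence #if 0):
  the xy2 interpolators keep the horizontal pair sums of the previous row
  (pixelssum1 above) so each iteration only reads and sums the new row, then
  writes ((top pair) + (bottom pair) + 2) >> 2 per pixel, as this row-wise
  loop spells out.
*/
#if 0
static void put_pixels8_xy2_row_sketch(uint8_t *block, const uint8_t *top,
                                       const uint8_t *bottom)
{
    int j;
    for (j = 0; j < 8; j++)
        block[j] = (top[j] + top[j + 1] + bottom[j] + bottom[j + 1] + 2) >> 2;
}
#endif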
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        } pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* no Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}
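/*
  Illustrative usage sketch (not part of the original file, hence #if 0):
  has_altivec() is meant to be called once at init time so function pointers
  can be switched to the routines above. The helper below is only a
  hypothetical example of such a dispatch, not the actual dsputil init code.
*/
#if 0
typedef int (*pix_sum_fn)(uint8_t *pix, int line_size);

static pix_sum_fn select_pix_sum(pix_sum_fn c_version)
{
    /* keep the portable C routine unless the CPU really has the vector unit */
    return has_altivec() ? pix_sum_altivec : c_version;
}
#endif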