/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#ifdef __AMIGAOS4__
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#else /* __AMIGAOS4__ */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */
#endif /* __AMIGAOS4__ */
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
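
/* A minimal sketch of the unaligned-load idiom used above and throughout this
   file: vec_ld can only fetch from 16-byte boundaries, so two aligned loads
   are merged with vec_perm using the shift-left control vector from vec_lvsl.
   The helper name below is illustrative only (it is not used elsewhere), and
   it assumes the 16 bytes following addr are readable, as the callers above
   already require. */
static inline vector unsigned char load_unaligned16_sketch(const uint8_t *addr)
{
    vector unsigned char lo = vec_ld(0, addr);     /* 16 bytes at the aligned base */
    vector unsigned char hi = vec_ld(16, addr);    /* the next 16 aligned bytes    */
    return vec_perm(lo, hi, vec_lvsl(0, addr));    /* rotate the wanted bytes into place */
}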
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;

        pix1 += line_size;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
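
/* A small scalar illustration of the rounding problem described in the comment
   inside sad16_xy2_altivec above: nesting two round-up averages (the vec_avg
   behaviour) is not the same as the single rounded 4-way average that the
   short-based code computes.  Both helper names are illustrative only. */
static inline int avg4_rounded_sketch(int a, int b, int c, int d)
{
    return (a + b + c + d + 2) >> 2;    /* e.g. (3+0+0+1+2)>>2 = 1 */
}

static inline int avg4_via_nested_avg_sketch(int a, int b, int c, int d)
{
    int ab = (a + b + 1) >> 1;          /* vec_avg rounds up */
    int cd = (c + d + 1) >> 1;
    return (ab + cd + 1) >> 1;          /* e.g. avg(avg(3,0), avg(0,1)) = 2 */
}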
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
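
/* A scalar sketch (illustrative helper, not used elsewhere) of the value
   sad16_altivec computes: the vector code builds the same per-row absolute
   differences with vec_max/vec_min/vec_sub, accumulates four partial word
   sums with vec_sum4s, and folds them with vec_sums/vec_splat/vec_ste at the
   end. */
static inline int sad16_scalar_sketch(const uint8_t *pix1, const uint8_t *pix2, int line_size, int h)
{
    int i, x, s = 0;
    for (i = 0; i < h; i++) {
        for (x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            s += d < 0 ? -d : d;    /* |pix1 - pix2| per byte */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}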
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
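
/* pix_norm1_altivec relies on vec_msum(pixv, pixv, sv), which multiplies each
   unsigned byte by itself and accumulates groups of four products into 32-bit
   lanes.  A scalar sketch of what one row contributes (hypothetical helper,
   for illustration only): */
static inline unsigned int row_sum_of_squares_sketch(const uint8_t *pix)
{
    unsigned int s = 0;
    int x;
    for (x = 0; x < 16; x++)
        s += pix[x] * pix[x];    /* same total the vector accumulator holds for this row */
    return s;
}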
/**
 * Sum of Squared Errors for an 8x8 block.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for a 16x16 block.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unrolling.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for (i = 0; i + 7 < w; i += 8) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for (; i < w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w); i++) {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
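
/* vec_ld and vec_st ignore the low four bits of the effective address, so the
   vector path of add_bytes_altivec above is only correct because dst and src
   are 16-byte aligned, as its comment states.  A scalar sketch of the same
   per-byte operation (hypothetical helper, for illustration only): */
static inline void add_bytes_scalar_sketch(uint8_t *dst, const uint8_t *src, int w)
{
    int i;
    for (i = 0; i < w; i++)
        dst[i] += src[i];    /* byte-wise wrap-around add, like vec_add on unsigned chars */
}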
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t*)(block))    = LD32(pixels);
        *((uint32_t*)(block+4))  = LD32(pixels+4);
        *((uint32_t*)(block+8))  = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    register int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld(0, (unsigned char*)pixels);
        pixelsv2  = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block += line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        op_avg(*((uint32_t*)(block)),    LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),  LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),  LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)), LD32(pixels+12));
        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    register int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
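
/* The op_avg macro above is the classic SWAR identity for a per-byte average
   rounded up, without the carries of a + b crossing byte lanes.  Using
   a + b = 2*(a & b) + (a ^ b) and a | b = (a & b) + (a ^ b), the expression
   (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1) equals (a + b + 1) >> 1 in every
   byte, which is the same rounding vec_avg applies in the vector path.
   The helper below is illustrative only: */
static inline uint32_t packed_avg_round_up_sketch(uint32_t a, uint32_t b)
{
    /* per-byte (a + b + 1) >> 1 on four packed bytes */
    return (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
}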
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels + 4))->l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8-byte aligned, so we're either in the
           left block (16-byte aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
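
/* A scalar sketch (hypothetical helper, not used elsewhere) of what
   put_pixels8_xy2_altivec computes, and of the row-sum reuse the vector code
   performs by carrying pixelssum1 from one iteration to the next: each output
   byte is the rounded average of a 2x2 neighbourhood, and the horizontal pair
   sums of a row are computed once and reused for the line below. */
static inline void put_pixels8_xy2_scalar_sketch(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i, x;
    uint16_t sum0[8], sum1[8];

    for (x = 0; x < 8; x++)
        sum0[x] = pixels[x] + pixels[x + 1];            /* pair sums of the top row */

    for (i = 0; i < h; i++) {
        const uint8_t *next = pixels + (i + 1) * line_size;
        for (x = 0; x < 8; x++) {
            sum1[x] = next[x] + next[x + 1];            /* pair sums of the row below */
            block[x] = (sum0[x] + sum1[x] + 2) >> 2;    /* rounded 4-tap average */
            sum0[x] = sum1[x];                          /* reuse for the next line */
        }
        block += line_size;
    }
}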
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    {
    register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01,
            0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09,
            0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07,
            0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    src1 = vec_ld(stride * i, src); \
    if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
        src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
        dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    register vector signed short srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    register vector signed short but0 = vec_sub(srcV, dstV); \
    register vector signed short op1 = vec_perm(but0, but0, perm1); \
    register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
    register vector signed short op2 = vec_perm(but1, but1, perm2); \
    register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
    register vector signed short op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}
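
/* Each ONEITERBUTTERFLY above performs a 1-D 8-point Hadamard transform on a
   row of src-dst differences, using vec_perm to pair elements and vec_mladd
   with the +/-1 vprod constants to add and subtract them in three butterfly
   stages; the line0/line0B/line0C additions afterwards apply the same
   butterflies down the columns.  A scalar sketch of the per-row part
   (hypothetical helper, not used elsewhere): */
static inline void hadamard8_1d_sketch(int d[8])
{
    int t[8], u[8], k;
    for (k = 0; k < 8; k += 2) {               /* stage 1: distance-1 butterflies */
        t[k]     = d[k] + d[k + 1];
        t[k + 1] = d[k] - d[k + 1];
    }
    for (k = 0; k < 8; k += 4) {               /* stage 2: distance-2 butterflies */
        u[k]     = t[k]     + t[k + 2];
        u[k + 1] = t[k + 1] + t[k + 3];
        u[k + 2] = t[k]     - t[k + 2];
        u[k + 3] = t[k + 1] - t[k + 3];
    }
    for (k = 0; k < 4; k++) {                  /* stage 3: distance-4 butterflies */
        d[k]     = u[k] + u[k + 4];
        d[k + 4] = u[k] - u[k + 4];
    }
}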
/*
  16x8 works with 16 elements; it allows us to avoid replicating loads, and
  gives the compiler more room for scheduling.
  It's only used from inside hadamard8_diff16_altivec.

  Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a
  LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code includes hand-made register
  allocation. It's not clean, but on a 7450 the resulting code is much faster
  (best case falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25%
  fewer instructions...).

  On the 970, the hand-made RA is still a win (around 690 vs. around 780),
  but xlc goes to around 660 on the regular C code.
*/
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const_vector unsigned char vzero REG_v(v31)= (const_vector unsigned char)vec_splat_u8(0);
    {
    register const_vector signed short vprod1 REG_v(v16)= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 REG_v(v17)= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 REG_v(v18)= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 REG_v(v19)= (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01,
            0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09,
            0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 REG_v(v20)= (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07,
            0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 REG_v(v21)= (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 REG_v(v22), src2 REG_v(v23); \
    register vector unsigned char dst1 REG_v(v24), dst2 REG_v(v25); \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    register vector unsigned char srcO REG_v(v22) = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    register vector unsigned char dstO REG_v(v23) = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    register vector signed short srcV REG_v(v24) = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV REG_v(v25) = \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    register vector signed short srcW REG_v(v26) = \
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstW REG_v(v27) = \
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    register vector signed short but0 REG_v(v28) = vec_sub(srcV, dstV); \
    register vector signed short but0S REG_v(v29) = vec_sub(srcW, dstW); \
    register vector signed short op1 REG_v(v30) = vec_perm(but0, but0, perm1); \
    register vector signed short but1 REG_v(v22) = vec_mladd(but0, vprod1, op1); \
    register vector signed short op1S REG_v(v23) = vec_perm(but0S, but0S, perm1); \
    register vector signed short but1S REG_v(v24) = vec_mladd(but0S, vprod1, op1S); \
    register vector signed short op2 REG_v(v25) = vec_perm(but1, but1, perm2); \
    register vector signed short but2 REG_v(v26) = vec_mladd(but1, vprod2, op2); \
    register vector signed short op2S REG_v(v27) = vec_perm(but1S, but1S, perm2); \
    register vector signed short but2S REG_v(v28) = vec_mladd(but1S, vprod2, op2S); \
    register vector signed short op3 REG_v(v29) = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    register vector signed short op3S REG_v(v30) = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    register vector signed short line0S = vec_add(temp0S, temp1S);
    register vector signed short line1S = vec_sub(temp0S, temp1S);
    register vector signed short line2S = vec_add(temp2S, temp3S);
    register vector signed short line3S = vec_sub(temp2S, temp3S);
    register vector signed short line4S = vec_add(temp4S, temp5S);
    register vector signed short line5S = vec_sub(temp4S, temp5S);
    register vector signed short line6S = vec_add(temp6S, temp7S);
    register vector signed short line7S = vec_sub(temp6S, temp7S);

    register vector signed short line0BS = vec_add(line0S, line2S);
    register vector signed short line2BS = vec_sub(line0S, line2S);
    register vector signed short line1BS = vec_add(line1S, line3S);
    register vector signed short line3BS = vec_sub(line1S, line3S);
    register vector signed short line4BS = vec_add(line4S, line6S);
    register vector signed short line6BS = vec_sub(line4S, line6S);
    register vector signed short line5BS = vec_add(line5S, line7S);
    register vector signed short line7BS = vec_sub(line5S, line7S);

    register vector signed short line0CS = vec_add(line0BS, line4BS);
    register vector signed short line4CS = vec_sub(line0BS, line4BS);
    register vector signed short line1CS = vec_add(line1BS, line5BS);
    register vector signed short line5CS = vec_sub(line1BS, line5BS);
    register vector signed short line2CS = vec_add(line2BS, line6BS);
    register vector signed short line6CS = vec_sub(line2BS, line6BS);
    register vector signed short line3CS = vec_add(line3BS, line7BS);
    register vector signed short line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}
int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#else /* __AMIGAOS4__ */
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* no Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
#endif /* __AMIGAOS4__ */
}
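
/* A hedged usage sketch: callers typically probe has_altivec() once at init
   time and install the AltiVec routines as function pointers.  The typedef
   and helper below are illustrative only; they are not part of the dsputil
   initialization code and return a null pointer where a caller would normally
   fall back to the plain C implementation. */
typedef int (*sad_fn_sketch)(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);

static inline sad_fn_sketch select_sad16_sketch(void)
{
    /* pick the AltiVec SAD when the vector unit is present */
    return has_altivec() ? sad16_altivec : (sad_fn_sketch)0;
}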
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2, blocktemp;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */