1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "../dsputil.h"
22
23 #include "gcc_fixes.h"
24
25 #include "dsputil_altivec.h"
26
27 #ifdef CONFIG_DARWIN
28 #include <sys/sysctl.h>
29 #else /* CONFIG_DARWIN */
30 #ifdef __AMIGAOS4__
31 #include <exec/exec.h>
32 #include <interfaces/exec.h>
33 #include <proto/exec.h>
34 #else /* __AMIGAOS4__ */
35 #include <signal.h>
36 #include <setjmp.h>
37
38 static sigjmp_buf jmpbuf;
39 static volatile sig_atomic_t canjump = 0;
40
41 static void sigill_handler (int sig)
42 {
43 if (!canjump) {
44 signal (sig, SIG_DFL);
45 raise (sig);
46 }
47
48 canjump = 0;
49 siglongjmp (jmpbuf, 1);
50 }
#endif /* __AMIGAOS4__ */
#endif /* CONFIG_DARWIN */
53
54 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
55 {
56 int i;
57 int s __attribute__((aligned(16)));
58 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
59 vector unsigned char *tv;
60 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
61 vector unsigned int sad;
62 vector signed int sumdiffs;
63
64 s = 0;
65 sad = (vector unsigned int)vec_splat_u32(0);
66 for(i=0;i<h;i++) {
67 /*
68 Read unaligned pixels into our vectors. The vectors are as follows:
69 pix1v: pix1[0]-pix1[15]
70 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
71 */
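        /* Illustrative note, not part of the original source: vec_lvsl(0, p)
           builds a permute mask from the low four bits of the address, so
           vec_perm(tv[0], tv[1], vec_lvsl(0, p)) is the usual AltiVec idiom
           for an unaligned 16-byte load - it shifts the two aligned quadwords
           around p so that p[0]..p[15] end up contiguous in the result. */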
72 tv = (vector unsigned char *) pix1;
73 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
74
75 tv = (vector unsigned char *) &pix2[0];
76 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
77
78 tv = (vector unsigned char *) &pix2[1];
79 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
80
81 /* Calculate the average vector */
82 avgv = vec_avg(pix2v, pix2iv);
83
84 /* Calculate a sum of abs differences vector */
85 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
86
87 /* Add each 4 pixel group together and put 4 results into sad */
88 sad = vec_sum4s(t5, sad);
89
90 pix1 += line_size;
91 pix2 += line_size;
92 }
93 /* Sum up the four partial sums, and put the result into s */
94 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
95 sumdiffs = vec_splat(sumdiffs, 3);
96 vec_ste(sumdiffs, 0, &s);
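    /* Illustrative note, not part of the original source: vec_sums folds the
       four partial sums (plus element 3 of its second operand, zero here) into
       element 3 of the result, vec_splat broadcasts that element to all four
       lanes so that vec_ste - which picks the lane matching the alignment of
       &s - stores the correct 32-bit value.  This "horizontal sum to scalar"
       ending is shared by every SAD/SSE routine in this file. */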
97
98 return s;
99 }
100
101 int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
102 {
103 int i;
104 int s __attribute__((aligned(16)));
105 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
106 vector unsigned char *tv;
107 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
108 vector unsigned int sad;
109 vector signed int sumdiffs;
110 uint8_t *pix3 = pix2 + line_size;
111
112 s = 0;
113 sad = (vector unsigned int)vec_splat_u32(0);
114
    /*
       Because pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
124 tv = (vector unsigned char *) &pix2[0];
125 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
126
127 for(i=0;i<h;i++) {
128 /*
129 Read unaligned pixels into our vectors. The vectors are as follows:
130 pix1v: pix1[0]-pix1[15]
131 pix3v: pix3[0]-pix3[15]
132 */
133 tv = (vector unsigned char *) pix1;
134 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
135
136 tv = (vector unsigned char *) &pix3[0];
137 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
138
139 /* Calculate the average vector */
140 avgv = vec_avg(pix2v, pix3v);
141
142 /* Calculate a sum of abs differences vector */
143 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
144
145 /* Add each 4 pixel group together and put 4 results into sad */
146 sad = vec_sum4s(t5, sad);
147
148 pix1 += line_size;
149 pix2v = pix3v;
150 pix3 += line_size;
151
152 }
153
154 /* Sum up the four partial sums, and put the result into s */
155 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
156 sumdiffs = vec_splat(sumdiffs, 3);
157 vec_ste(sumdiffs, 0, &s);
158 return s;
159 }
160
161 int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
162 {
163 int i;
164 int s __attribute__((aligned(16)));
165 uint8_t *pix3 = pix2 + line_size;
166 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
167 const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
168 vector unsigned char *tv, avgv, t5;
169 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
170 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
171 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
172 vector unsigned short avghv, avglv;
173 vector unsigned short t1, t2, t3, t4;
174 vector unsigned int sad;
175 vector signed int sumdiffs;
176
177 sad = (vector unsigned int)vec_splat_u32(0);
178
179 s = 0;
180
    /*
       Because pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting and vector additions, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
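    /* Illustrative scalar sketch, not part of the original source: for the
       half-pel (x+1/2, y+1/2) case each interpolated reference pixel is
           avg[x] = (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + 2) >> 2;
       t1/t2 hold the per-lane horizontal sums p[y][x] + p[y][x+1] of the
       current top row, and each iteration copies the bottom-row sums t3/t4
       into them so that the next row never recomputes them. */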
190 tv = (vector unsigned char *) &pix2[0];
191 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
192
193 tv = (vector unsigned char *) &pix2[1];
194 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
195
196 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
197 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
198 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
199 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
200 t1 = vec_add(pix2hv, pix2ihv);
201 t2 = vec_add(pix2lv, pix2ilv);
202
203 for(i=0;i<h;i++) {
204 /*
205 Read unaligned pixels into our vectors. The vectors are as follows:
206 pix1v: pix1[0]-pix1[15]
207 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
208 */
209 tv = (vector unsigned char *) pix1;
210 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
211
212 tv = (vector unsigned char *) &pix3[0];
213 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
214
215 tv = (vector unsigned char *) &pix3[1];
216 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
217
218 /*
219 Note that Altivec does have vec_avg, but this works on vector pairs
220 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
221 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
222 Instead, we have to split the pixel vectors into vectors of shorts,
223 and do the averaging by hand.
224 */
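        /* Worked example, not part of the original source: with top-row pixels
           3,0 and bottom-row pixels 0,1, vec_avg's round-up semantics would
           give avg(avg(3,0), avg(0,1)) = avg(2,1) = 2, whereas the exact
           rounded average is (3+0+0+1+2)>>2 = 1.  The short arithmetic below
           computes the latter: t1+t3 = (3+0)+(0+1) = 4, and (4+2)>>2 = 1. */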
225
226 /* Split the pixel vectors into shorts */
227 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
228 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
229 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
230 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
231
232 /* Do the averaging on them */
233 t3 = vec_add(pix3hv, pix3ihv);
234 t4 = vec_add(pix3lv, pix3ilv);
235
236 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
237 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
238
239 /* Pack the shorts back into a result */
240 avgv = vec_pack(avghv, avglv);
241
242 /* Calculate a sum of abs differences vector */
243 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
244
245 /* Add each 4 pixel group together and put 4 results into sad */
246 sad = vec_sum4s(t5, sad);
247
248 pix1 += line_size;
249 pix3 += line_size;
250 /* Transfer the calculated values for pix3 into pix2 */
251 t1 = t3;
252 t2 = t4;
253 }
254 /* Sum up the four partial sums, and put the result into s */
255 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
256 sumdiffs = vec_splat(sumdiffs, 3);
257 vec_ste(sumdiffs, 0, &s);
258
259 return s;
260 }
261
262 int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
263 {
264 int i;
265 int s __attribute__((aligned(16)));
266 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
267 vector unsigned char perm1, perm2, *pix1v, *pix2v;
268 vector unsigned char t1, t2, t3,t4, t5;
269 vector unsigned int sad;
270 vector signed int sumdiffs;
271
272 sad = (vector unsigned int)vec_splat_u32(0);
273
274
275 for(i=0;i<h;i++) {
276 /* Read potentially unaligned pixels into t1 and t2 */
277 perm1 = vec_lvsl(0, pix1);
278 pix1v = (vector unsigned char *) pix1;
279 perm2 = vec_lvsl(0, pix2);
280 pix2v = (vector unsigned char *) pix2;
281 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
282 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
283
284 /* Calculate a sum of abs differences vector */
285 t3 = vec_max(t1, t2);
286 t4 = vec_min(t1, t2);
287 t5 = vec_sub(t3, t4);
288
289 /* Add each 4 pixel group together and put 4 results into sad */
290 sad = vec_sum4s(t5, sad);
291
292 pix1 += line_size;
293 pix2 += line_size;
294 }
295
296 /* Sum up the four partial sums, and put the result into s */
297 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
298 sumdiffs = vec_splat(sumdiffs, 3);
299 vec_ste(sumdiffs, 0, &s);
300
301 return s;
302 }
303
304 int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
305 {
306 int i;
307 int s __attribute__((aligned(16)));
308 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
309 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
310 vector unsigned char t1, t2, t3,t4, t5;
311 vector unsigned int sad;
312 vector signed int sumdiffs;
313
314 sad = (vector unsigned int)vec_splat_u32(0);
315
316 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
317
318 for(i=0;i<h;i++) {
319 /* Read potentially unaligned pixels into t1 and t2
320 Since we're reading 16 pixels, and actually only want 8,
321 mask out the last 8 pixels. The 0s don't change the sum. */
322 perm1 = vec_lvsl(0, pix1);
323 pix1v = (vector unsigned char *) pix1;
324 perm2 = vec_lvsl(0, pix2);
325 pix2v = (vector unsigned char *) pix2;
326 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
327 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
328
329 /* Calculate a sum of abs differences vector */
330 t3 = vec_max(t1, t2);
331 t4 = vec_min(t1, t2);
332 t5 = vec_sub(t3, t4);
333
334 /* Add each 4 pixel group together and put 4 results into sad */
335 sad = vec_sum4s(t5, sad);
336
337 pix1 += line_size;
338 pix2 += line_size;
339 }
340
341 /* Sum up the four partial sums, and put the result into s */
342 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
343 sumdiffs = vec_splat(sumdiffs, 3);
344 vec_ste(sumdiffs, 0, &s);
345
346 return s;
347 }
348
349 int pix_norm1_altivec(uint8_t *pix, int line_size)
350 {
351 int i;
352 int s __attribute__((aligned(16)));
353 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
354 vector unsigned char *tv;
355 vector unsigned char pixv;
356 vector unsigned int sv;
357 vector signed int sum;
358
359 sv = (vector unsigned int)vec_splat_u32(0);
360
361 s = 0;
362 for (i = 0; i < 16; i++) {
363 /* Read in the potentially unaligned pixels */
364 tv = (vector unsigned char *) pix;
365 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
366
367 /* Square the values, and add them to our sum */
368 sv = vec_msum(pixv, pixv, sv);
369
370 pix += line_size;
371 }
372 /* Sum up the four partial sums, and put the result into s */
373 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
374 sum = vec_splat(sum, 3);
375 vec_ste(sum, 0, &s);
376
377 return s;
378 }
379
380 /**
 * Sum of Squared Errors for an 8x8 block.
382 * AltiVec-enhanced.
383 * It's the sad8_altivec code above w/ squaring added.
384 */
385 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
386 {
387 int i;
388 int s __attribute__((aligned(16)));
389 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
390 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
391 vector unsigned char t1, t2, t3,t4, t5;
392 vector unsigned int sum;
393 vector signed int sumsqr;
394
395 sum = (vector unsigned int)vec_splat_u32(0);
396
397 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
398
399
400 for(i=0;i<h;i++) {
401 /* Read potentially unaligned pixels into t1 and t2
402 Since we're reading 16 pixels, and actually only want 8,
403 mask out the last 8 pixels. The 0s don't change the sum. */
404 perm1 = vec_lvsl(0, pix1);
405 pix1v = (vector unsigned char *) pix1;
406 perm2 = vec_lvsl(0, pix2);
407 pix2v = (vector unsigned char *) pix2;
408 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
409 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
410
        /*
          Since we're working with unsigned chars, we compute |a-b| as
          max(a,b) - min(a,b); squaring that gives exactly (a-b)^2, with no
          need for signed arithmetic.
        */
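        /* For example (illustrative, not from the original source): a byte
           pair of 3 and 250 gives max-min = 247 = |a-b|, and vec_msum then
           accumulates 247*247 = 61009 = (a-b)^2 into the matching 32-bit
           lane of sum. */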
415
416 /* Calculate abs differences vector */
417 t3 = vec_max(t1, t2);
418 t4 = vec_min(t1, t2);
419 t5 = vec_sub(t3, t4);
420
421 /* Square the values and add them to our sum */
422 sum = vec_msum(t5, t5, sum);
423
424 pix1 += line_size;
425 pix2 += line_size;
426 }
427
428 /* Sum up the four partial sums, and put the result into s */
429 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
430 sumsqr = vec_splat(sumsqr, 3);
431 vec_ste(sumsqr, 0, &s);
432
433 return s;
434 }
435
436 /**
437 * Sum of Squared Errors for a 16x16 block.
438 * AltiVec-enhanced.
439 * It's the sad16_altivec code above w/ squaring added.
440 */
441 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
442 {
443 int i;
444 int s __attribute__((aligned(16)));
445 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
446 vector unsigned char perm1, perm2, *pix1v, *pix2v;
447 vector unsigned char t1, t2, t3,t4, t5;
448 vector unsigned int sum;
449 vector signed int sumsqr;
450
451 sum = (vector unsigned int)vec_splat_u32(0);
452
453 for(i=0;i<h;i++) {
454 /* Read potentially unaligned pixels into t1 and t2 */
455 perm1 = vec_lvsl(0, pix1);
456 pix1v = (vector unsigned char *) pix1;
457 perm2 = vec_lvsl(0, pix2);
458 pix2v = (vector unsigned char *) pix2;
459 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
460 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
461
        /*
          Since we're working with unsigned chars, we compute |a-b| as
          max(a,b) - min(a,b); squaring that gives exactly (a-b)^2, with no
          need for signed arithmetic.
        */
466
467 /* Calculate abs differences vector */
468 t3 = vec_max(t1, t2);
469 t4 = vec_min(t1, t2);
470 t5 = vec_sub(t3, t4);
471
472 /* Square the values and add them to our sum */
473 sum = vec_msum(t5, t5, sum);
474
475 pix1 += line_size;
476 pix2 += line_size;
477 }
478
479 /* Sum up the four partial sums, and put the result into s */
480 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
481 sumsqr = vec_splat(sumsqr, 3);
482 vec_ste(sumsqr, 0, &s);
483
484 return s;
485 }
486
487 int pix_sum_altivec(uint8_t * pix, int line_size)
488 {
489 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
490 vector unsigned char perm, *pixv;
491 vector unsigned char t1;
492 vector unsigned int sad;
493 vector signed int sumdiffs;
494
495 int i;
496 int s __attribute__((aligned(16)));
497
498 sad = (vector unsigned int)vec_splat_u32(0);
499
500 for (i = 0; i < 16; i++) {
501 /* Read the potentially unaligned 16 pixels into t1 */
502 perm = vec_lvsl(0, pix);
503 pixv = (vector unsigned char *) pix;
504 t1 = vec_perm(pixv[0], pixv[1], perm);
505
506 /* Add each 4 pixel group together and put 4 results into sad */
507 sad = vec_sum4s(t1, sad);
508
509 pix += line_size;
510 }
511
512 /* Sum up the four partial sums, and put the result into s */
513 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
514 sumdiffs = vec_splat(sumdiffs, 3);
515 vec_ste(sumdiffs, 0, &s);
516
517 return s;
518 }
519
520 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
521 {
522 int i;
523 vector unsigned char perm, bytes, *pixv;
524 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
525 vector signed short shorts;
526
527 for(i=0;i<8;i++)
528 {
529 // Read potentially unaligned pixels.
530 // We're reading 16 pixels, and actually only want 8,
531 // but we simply ignore the extras.
532 perm = vec_lvsl(0, pixels);
533 pixv = (vector unsigned char *) pixels;
534 bytes = vec_perm(pixv[0], pixv[1], perm);
535
536 // convert the bytes into shorts
537 shorts = (vector signed short)vec_mergeh(zero, bytes);
538
539 // save the data to the block, we assume the block is 16-byte aligned
540 vec_st(shorts, i*16, (vector signed short*)block);
541
542 pixels += line_size;
543 }
544 }
545
546 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
547 const uint8_t *s2, int stride)
548 {
549 int i;
550 vector unsigned char perm, bytes, *pixv;
551 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
552 vector signed short shorts1, shorts2;
553
554 for(i=0;i<4;i++)
555 {
556 // Read potentially unaligned pixels
557 // We're reading 16 pixels, and actually only want 8,
558 // but we simply ignore the extras.
559 perm = vec_lvsl(0, s1);
560 pixv = (vector unsigned char *) s1;
561 bytes = vec_perm(pixv[0], pixv[1], perm);
562
563 // convert the bytes into shorts
564 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
565
566 // Do the same for the second block of pixels
567 perm = vec_lvsl(0, s2);
568 pixv = (vector unsigned char *) s2;
569 bytes = vec_perm(pixv[0], pixv[1], perm);
570
571 // convert the bytes into shorts
572 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
573
574 // Do the subtraction
575 shorts1 = vec_sub(shorts1, shorts2);
576
577 // save the data to the block, we assume the block is 16-byte aligned
578 vec_st(shorts1, 0, (vector signed short*)block);
579
580 s1 += stride;
581 s2 += stride;
582 block += 8;
583
584
585 // The code below is a copy of the code above... This is a manual
586 // unroll.
587
588 // Read potentially unaligned pixels
589 // We're reading 16 pixels, and actually only want 8,
590 // but we simply ignore the extras.
591 perm = vec_lvsl(0, s1);
592 pixv = (vector unsigned char *) s1;
593 bytes = vec_perm(pixv[0], pixv[1], perm);
594
595 // convert the bytes into shorts
596 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
597
598 // Do the same for the second block of pixels
599 perm = vec_lvsl(0, s2);
600 pixv = (vector unsigned char *) s2;
601 bytes = vec_perm(pixv[0], pixv[1], perm);
602
603 // convert the bytes into shorts
604 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
605
606 // Do the subtraction
607 shorts1 = vec_sub(shorts1, shorts2);
608
609 // save the data to the block, we assume the block is 16-byte aligned
610 vec_st(shorts1, 0, (vector signed short*)block);
611
612 s1 += stride;
613 s2 += stride;
614 block += 8;
615 }
616 }
617
618 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
619 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
620 int i;
621 for(i=0; i+7<w; i++){
622 dst[i+0] += src[i+0];
623 dst[i+1] += src[i+1];
624 dst[i+2] += src[i+2];
625 dst[i+3] += src[i+3];
626 dst[i+4] += src[i+4];
627 dst[i+5] += src[i+5];
628 dst[i+6] += src[i+6];
629 dst[i+7] += src[i+7];
630 }
631 for(; i<w; i++)
632 dst[i+0] += src[i+0];
633 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
634 register int i;
635 register vector unsigned char vdst, vsrc;
636
    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
      vdst = vec_ld(i, (unsigned char*)dst);
      vsrc = vec_ld(i, (unsigned char*)src);
      vdst = vec_add(vsrc, vdst);
      vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, add the remaining bytes in scalar code */
    for (; (i < w) ; i++)
    {
      dst[i] += src[i];
    }
650 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
651 }
652
653 /* next one assumes that ((line_size % 16) == 0) */
654 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
655 {
656 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
657 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
658 int i;
659
660 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
661
662 for(i=0; i<h; i++) {
663 *((uint32_t*)(block)) = LD32(pixels);
664 *((uint32_t*)(block+4)) = LD32(pixels+4);
665 *((uint32_t*)(block+8)) = LD32(pixels+8);
666 *((uint32_t*)(block+12)) = LD32(pixels+12);
667 pixels+=line_size;
668 block +=line_size;
669 }
670
671 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
672
673 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
674 register vector unsigned char pixelsv1, pixelsv2;
675 register vector unsigned char pixelsv1B, pixelsv2B;
676 register vector unsigned char pixelsv1C, pixelsv2C;
677 register vector unsigned char pixelsv1D, pixelsv2D;
678
679 register vector unsigned char perm = vec_lvsl(0, pixels);
680 int i;
681 register int line_size_2 = line_size << 1;
682 register int line_size_3 = line_size + line_size_2;
683 register int line_size_4 = line_size << 2;
684
685 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // Hand-unrolling the loop by 4 gains about 15%:
    // minimum execution time goes from 74 to 60 cycles.
    // It's faster than -funroll-loops, but using
    // -funroll-loops with this is bad - 74 cycles again.
    // All of this was measured on a 7450, tuning for the 7450.
691 #if 0
692 for(i=0; i<h; i++) {
693 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
694 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
695 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
696 0, (unsigned char*)block);
697 pixels+=line_size;
698 block +=line_size;
699 }
700 #else
701 for(i=0; i<h; i+=4) {
702 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
703 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
704 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
705 pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
706 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
707 pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
708 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
709 pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
710 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
711 0, (unsigned char*)block);
712 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
713 line_size, (unsigned char*)block);
714 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
715 line_size_2, (unsigned char*)block);
716 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
717 line_size_3, (unsigned char*)block);
718 pixels+=line_size_4;
719 block +=line_size_4;
720 }
721 #endif
722 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
723
724 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
725 }
726
727 /* next one assumes that ((line_size % 16) == 0) */
728 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
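/* Illustrative note, not part of the original source: op_avg is the usual SWAR
   trick for a per-byte rounded-up average,
       (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1)  ==  (a + b + 1) >> 1  per byte,
   where the 0xFEFEFEFE mask stops bits shifted out of one byte from leaking
   into the lane below.  For a single byte pair, e.g. a = 5, b = 8:
   (5|8) - (((5^8)&0xFE)>>1) = 13 - 6 = 7 = (5+8+1)>>1. */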
729 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
730 {
731 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
732 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
733 int i;
734
735 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
736
737 for(i=0; i<h; i++) {
738 op_avg(*((uint32_t*)(block)),LD32(pixels));
739 op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
740 op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
741 op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
742 pixels+=line_size;
743 block +=line_size;
744 }
745
746 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
747
748 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
749 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
750 register vector unsigned char perm = vec_lvsl(0, pixels);
751 int i;
752
753 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
754
755 for(i=0; i<h; i++) {
756 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
757 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
758 blockv = vec_ld(0, block);
759 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
760 blockv = vec_avg(blockv,pixelsv);
761 vec_st(blockv, 0, (unsigned char*)block);
762 pixels+=line_size;
763 block +=line_size;
764 }
765
766 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
767
768 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
769 }
770
771 /* next one assumes that ((line_size % 8) == 0) */
772 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
773 {
774 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
775 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
776 int i;
777 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
778 for (i = 0; i < h; i++) {
779 *((uint32_t *) (block)) =
780 (((*((uint32_t *) (block))) |
781 ((((const struct unaligned_32 *) (pixels))->l))) -
782 ((((*((uint32_t *) (block))) ^
783 ((((const struct unaligned_32 *) (pixels))->
784 l))) & 0xFEFEFEFEUL) >> 1));
785 *((uint32_t *) (block + 4)) =
786 (((*((uint32_t *) (block + 4))) |
787 ((((const struct unaligned_32 *) (pixels + 4))->l))) -
788 ((((*((uint32_t *) (block + 4))) ^
789 ((((const struct unaligned_32 *) (pixels +
790 4))->
791 l))) & 0xFEFEFEFEUL) >> 1));
792 pixels += line_size;
793 block += line_size;
794 }
795 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
796
797 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
798 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
799 int i;
800
801 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
802
803 for (i = 0; i < h; i++) {
      /*
         block is 8-byte aligned, so it is either the left half of a
         16-byte line (16-byte aligned) or the right half (not)
      */
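      /* Illustrative note, not part of the original source: the vcprm selects
         below put the 8 source bytes into whichever half of the 16-byte line
         this block occupies and keep blockv's own bytes in the other half;
         since vec_avg(x, x) == x, the untouched half passes through the
         averaging unchanged, which makes the full 16-byte store safe. */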
808 int rightside = ((unsigned long)block & 0x0000000F);
809
810 blockv = vec_ld(0, block);
811 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
812 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
813 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
814
815 if (rightside)
816 {
817 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
818 }
819 else
820 {
821 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
822 }
823
824 blockv = vec_avg(blockv, pixelsv);
825
826 vec_st(blockv, 0, block);
827
828 pixels += line_size;
829 block += line_size;
830 }
831
832 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
833
834 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
835 }
836
837 /* next one assumes that ((line_size % 8) == 0) */
838 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
839 {
840 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
841 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
842 int j;
843 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
844 for (j = 0; j < 2; j++) {
845 int i;
846 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
847 const uint32_t b =
848 (((const struct unaligned_32 *) (pixels + 1))->l);
849 uint32_t l0 =
850 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
851 uint32_t h0 =
852 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
853 uint32_t l1, h1;
854 pixels += line_size;
855 for (i = 0; i < h; i += 2) {
856 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
857 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
858 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
859 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
860 *((uint32_t *) block) =
861 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
862 pixels += line_size;
863 block += line_size;
864 a = (((const struct unaligned_32 *) (pixels))->l);
865 b = (((const struct unaligned_32 *) (pixels + 1))->l);
866 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
867 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
868 *((uint32_t *) block) =
869 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
870 pixels += line_size;
871 block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
873 block += 4 - line_size * h;
874 }
875
876 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
877
878 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
879 register int i;
880 register vector unsigned char
881 pixelsv1, pixelsv2,
882 pixelsavg;
883 register vector unsigned char
884 blockv, temp1, temp2;
885 register vector unsigned short
886 pixelssum1, pixelssum2, temp3;
887 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
888 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
889
890 temp1 = vec_ld(0, pixels);
891 temp2 = vec_ld(16, pixels);
892 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
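   /* Illustrative note, not part of the original source: when
      (pixels & 0xF) == 0xF, pixels+1 is 16-byte aligned, so vec_lvsl(1, pixels)
      wraps around to a shift of 0 and the vec_perm in the else branch would
      return the aligned quadword *before* pixels+1; in that exact case
      pixels[1..16] is simply temp2, which is why it is used directly. */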
893 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
894 {
895 pixelsv2 = temp2;
896 }
897 else
898 {
899 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
900 }
901 pixelsv1 = vec_mergeh(vczero, pixelsv1);
902 pixelsv2 = vec_mergeh(vczero, pixelsv2);
903 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
904 (vector unsigned short)pixelsv2);
905 pixelssum1 = vec_add(pixelssum1, vctwo);
906
907 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
908 for (i = 0; i < h ; i++) {
909 int rightside = ((unsigned long)block & 0x0000000F);
910 blockv = vec_ld(0, block);
911
912 temp1 = vec_ld(line_size, pixels);
913 temp2 = vec_ld(line_size + 16, pixels);
914 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
915 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
916 {
917 pixelsv2 = temp2;
918 }
919 else
920 {
921 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
922 }
923
924 pixelsv1 = vec_mergeh(vczero, pixelsv1);
925 pixelsv2 = vec_mergeh(vczero, pixelsv2);
926 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
927 (vector unsigned short)pixelsv2);
928 temp3 = vec_add(pixelssum1, pixelssum2);
929 temp3 = vec_sra(temp3, vctwo);
930 pixelssum1 = vec_add(pixelssum2, vctwo);
931 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
932
933 if (rightside)
934 {
935 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
936 }
937 else
938 {
939 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
940 }
941
942 vec_st(blockv, 0, block);
943
944 block += line_size;
945 pixels += line_size;
946 }
947
948 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
949 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
950 }
951
952 /* next one assumes that ((line_size % 8) == 0) */
953 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
954 {
955 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
956 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
957 int j;
958 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
959 for (j = 0; j < 2; j++) {
960 int i;
961 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
962 const uint32_t b =
963 (((const struct unaligned_32 *) (pixels + 1))->l);
964 uint32_t l0 =
965 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
966 uint32_t h0 =
967 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
968 uint32_t l1, h1;
969 pixels += line_size;
970 for (i = 0; i < h; i += 2) {
971 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
972 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
973 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
974 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
975 *((uint32_t *) block) =
976 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
977 pixels += line_size;
978 block += line_size;
979 a = (((const struct unaligned_32 *) (pixels))->l);
980 b = (((const struct unaligned_32 *) (pixels + 1))->l);
981 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
982 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
983 *((uint32_t *) block) =
984 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
985 pixels += line_size;
986 block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
988 block += 4 - line_size * h;
989 }
990
991 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
992
993 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
994 register int i;
995 register vector unsigned char
996 pixelsv1, pixelsv2,
997 pixelsavg;
998 register vector unsigned char
999 blockv, temp1, temp2;
1000 register vector unsigned short
1001 pixelssum1, pixelssum2, temp3;
1002 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
1003 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
1004 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
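   /* Illustrative note, not part of the original source: the algorithmic
      difference from put_pixels8_xy2_altivec above is the rounding constant -
      the running sums get vcone (+1) instead of vctwo (+2) added before the
      >> 2, which is exactly what the "no_rnd" (no-rounding) variants change. */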
1005
1006 temp1 = vec_ld(0, pixels);
1007 temp2 = vec_ld(16, pixels);
1008 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1009 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1010 {
1011 pixelsv2 = temp2;
1012 }
1013 else
1014 {
1015 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1016 }
1017 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1018 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1019 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1020 (vector unsigned short)pixelsv2);
1021 pixelssum1 = vec_add(pixelssum1, vcone);
1022
1023 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024 for (i = 0; i < h ; i++) {
1025 int rightside = ((unsigned long)block & 0x0000000F);
1026 blockv = vec_ld(0, block);
1027
1028 temp1 = vec_ld(line_size, pixels);
1029 temp2 = vec_ld(line_size + 16, pixels);
1030 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1031 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1032 {
1033 pixelsv2 = temp2;
1034 }
1035 else
1036 {
1037 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1038 }
1039
1040 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1041 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1042 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1043 (vector unsigned short)pixelsv2);
1044 temp3 = vec_add(pixelssum1, pixelssum2);
1045 temp3 = vec_sra(temp3, vctwo);
1046 pixelssum1 = vec_add(pixelssum2, vcone);
1047 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1048
1049 if (rightside)
1050 {
1051 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1052 }
1053 else
1054 {
1055 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1056 }
1057
1058 vec_st(blockv, 0, block);
1059
1060 block += line_size;
1061 pixels += line_size;
1062 }
1063
1064 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1065 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1066 }
1067
1068 /* next one assumes that ((line_size % 16) == 0) */
1069 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1070 {
1071 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1072 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1073 int j;
1074 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1075 for (j = 0; j < 4; j++) {
1076 int i;
1077 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1078 const uint32_t b =
1079 (((const struct unaligned_32 *) (pixels + 1))->l);
1080 uint32_t l0 =
1081 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1082 uint32_t h0 =
1083 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1084 uint32_t l1, h1;
1085 pixels += line_size;
1086 for (i = 0; i < h; i += 2) {
1087 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1088 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1089 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1090 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1091 *((uint32_t *) block) =
1092 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1093 pixels += line_size;
1094 block += line_size;
1095 a = (((const struct unaligned_32 *) (pixels))->l);
1096 b = (((const struct unaligned_32 *) (pixels + 1))->l);
1097 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1098 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1099 *((uint32_t *) block) =
1100 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1101 pixels += line_size;
1102 block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
1104 block += 4 - line_size * h;
1105 }
1106
1107 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1108
1109 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1110 register int i;
1111 register vector unsigned char
1112 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1113 register vector unsigned char
1114 blockv, temp1, temp2;
1115 register vector unsigned short
1116 pixelssum1, pixelssum2, temp3,
1117 pixelssum3, pixelssum4, temp4;
1118 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
1119 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
1120
1121 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1122
1123 temp1 = vec_ld(0, pixels);
1124 temp2 = vec_ld(16, pixels);
1125 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1126 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1127 {
1128 pixelsv2 = temp2;
1129 }
1130 else
1131 {
1132 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1133 }
1134 pixelsv3 = vec_mergel(vczero, pixelsv1);
1135 pixelsv4 = vec_mergel(vczero, pixelsv2);
1136 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1137 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1138 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1139 (vector unsigned short)pixelsv4);
1140 pixelssum3 = vec_add(pixelssum3, vctwo);
1141 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1142 (vector unsigned short)pixelsv2);
1143 pixelssum1 = vec_add(pixelssum1, vctwo);
1144
1145 for (i = 0; i < h ; i++) {
1146 blockv = vec_ld(0, block);
1147
1148 temp1 = vec_ld(line_size, pixels);
1149 temp2 = vec_ld(line_size + 16, pixels);
1150 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1151 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1152 {
1153 pixelsv2 = temp2;
1154 }
1155 else
1156 {
1157 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1158 }
1159
1160 pixelsv3 = vec_mergel(vczero, pixelsv1);
1161 pixelsv4 = vec_mergel(vczero, pixelsv2);
1162 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1163 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1164
1165 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1166 (vector unsigned short)pixelsv4);
1167 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1168 (vector unsigned short)pixelsv2);
1169 temp4 = vec_add(pixelssum3, pixelssum4);
1170 temp4 = vec_sra(temp4, vctwo);
1171 temp3 = vec_add(pixelssum1, pixelssum2);
1172 temp3 = vec_sra(temp3, vctwo);
1173
1174 pixelssum3 = vec_add(pixelssum4, vctwo);
1175 pixelssum1 = vec_add(pixelssum2, vctwo);
1176
1177 blockv = vec_packsu(temp3, temp4);
1178
1179 vec_st(blockv, 0, block);
1180
1181 block += line_size;
1182 pixels += line_size;
1183 }
1184
1185 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1186 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1187 }
1188
1189 /* next one assumes that ((line_size % 16) == 0) */
1190 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1191 {
1192 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1193 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1194 int j;
1195 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1196 for (j = 0; j < 4; j++) {
1197 int i;
1198 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1199 const uint32_t b =
1200 (((const struct unaligned_32 *) (pixels + 1))->l);
1201 uint32_t l0 =
1202 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1203 uint32_t h0 =
1204 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1205 uint32_t l1, h1;
1206 pixels += line_size;
1207 for (i = 0; i < h; i += 2) {
1208 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1209 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1210 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1211 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1212 *((uint32_t *) block) =
1213 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1214 pixels += line_size;
1215 block += line_size;
1216 a = (((const struct unaligned_32 *) (pixels))->l);
1217 b = (((const struct unaligned_32 *) (pixels + 1))->l);
1218 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1219 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1220 *((uint32_t *) block) =
1221 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1222 pixels += line_size;
1223 block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
1225 block += 4 - line_size * h;
1226 }
1227
1228 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1229
1230 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1231 register int i;
1232 register vector unsigned char
1233 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1234 register vector unsigned char
1235 blockv, temp1, temp2;
1236 register vector unsigned short
1237 pixelssum1, pixelssum2, temp3,
1238 pixelssum3, pixelssum4, temp4;
1239 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
1240 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
1241 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
1242
1243 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1244
1245 temp1 = vec_ld(0, pixels);
1246 temp2 = vec_ld(16, pixels);
1247 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1248 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1249 {
1250 pixelsv2 = temp2;
1251 }
1252 else
1253 {
1254 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1255 }
1256 pixelsv3 = vec_mergel(vczero, pixelsv1);
1257 pixelsv4 = vec_mergel(vczero, pixelsv2);
1258 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1259 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1260 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1261 (vector unsigned short)pixelsv4);
1262 pixelssum3 = vec_add(pixelssum3, vcone);
1263 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1264 (vector unsigned short)pixelsv2);
1265 pixelssum1 = vec_add(pixelssum1, vcone);
1266
1267 for (i = 0; i < h ; i++) {
1268 blockv = vec_ld(0, block);
1269
1270 temp1 = vec_ld(line_size, pixels);
1271 temp2 = vec_ld(line_size + 16, pixels);
1272 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1273 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1274 {
1275 pixelsv2 = temp2;
1276 }
1277 else
1278 {
1279 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1280 }
1281
1282 pixelsv3 = vec_mergel(vczero, pixelsv1);
1283 pixelsv4 = vec_mergel(vczero, pixelsv2);
1284 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1285 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1286
1287 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1288 (vector unsigned short)pixelsv4);
1289 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1290 (vector unsigned short)pixelsv2);
1291 temp4 = vec_add(pixelssum3, pixelssum4);
1292 temp4 = vec_sra(temp4, vctwo);
1293 temp3 = vec_add(pixelssum1, pixelssum2);
1294 temp3 = vec_sra(temp3, vctwo);
1295
1296 pixelssum3 = vec_add(pixelssum4, vcone);
1297 pixelssum1 = vec_add(pixelssum2, vcone);
1298
1299 blockv = vec_packsu(temp3, temp4);
1300
1301 vec_st(blockv, 0, block);
1302
1303 block += line_size;
1304 pixels += line_size;
1305 }
1306
1307 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1308 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1309 }
1310
1311 #ifdef CONFIG_DARWIN
1312 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1313 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
1314 int sum;
1315 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1316 register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
1317 register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1318 {
1319 register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
1320 register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
1321 register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
1322 register const_vector unsigned char perm1 = (const_vector unsigned char)
1323 AVV(0x02, 0x03, 0x00, 0x01,
1324 0x06, 0x07, 0x04, 0x05,
1325 0x0A, 0x0B, 0x08, 0x09,
1326 0x0E, 0x0F, 0x0C, 0x0D);
1327 register const_vector unsigned char perm2 = (const_vector unsigned char)
1328 AVV(0x04, 0x05, 0x06, 0x07,
1329 0x00, 0x01, 0x02, 0x03,
1330 0x0C, 0x0D, 0x0E, 0x0F,
1331 0x08, 0x09, 0x0A, 0x0B);
1332 register const_vector unsigned char perm3 = (const_vector unsigned char)
1333 AVV(0x08, 0x09, 0x0A, 0x0B,
1334 0x0C, 0x0D, 0x0E, 0x0F,
1335 0x00, 0x01, 0x02, 0x03,
1336 0x04, 0x05, 0x06, 0x07);
1337
1338 #define ONEITERBUTTERFLY(i, res) \
1339 { \
1340 register vector unsigned char src1, src2, srcO; \
1341 register vector unsigned char dst1, dst2, dstO; \
1342 src1 = vec_ld(stride * i, src); \
1343 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
1344 src2 = vec_ld((stride * i) + 16, src); \
1345 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1346 dst1 = vec_ld(stride * i, dst); \
1347 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
1348 dst2 = vec_ld((stride * i) + 16, dst); \
1349 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1350 /* promote the unsigned chars to signed shorts */ \
        /* we're in the 8x8 function, so we only care about the first 8 */ \
1352 register vector signed short srcV = \
1353 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1354 register vector signed short dstV = \
1355 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
        /* subtractions inside the first butterfly */ \
1357 register vector signed short but0 = vec_sub(srcV, dstV); \
1358 register vector signed short op1 = vec_perm(but0, but0, perm1); \
1359 register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
1360 register vector signed short op2 = vec_perm(but1, but1, perm2); \
1361 register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
1362 register vector signed short op3 = vec_perm(but2, but2, perm3); \
1363 res = vec_mladd(but2, vprod3, op3); \
1364 }
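    /* Illustrative note, not part of the original source: each vec_mladd in the
       macro above is one butterfly stage of the horizontal 8-point Hadamard
       transform.  op1 = vec_perm(but0, but0, perm1) pairs every element with
       its neighbour, and multiplying by vprod1 = {1,-1,1,-1,...} before the add
       yields
           but1[2k]   = but0[2k] + but0[2k+1]
           but1[2k+1] = but0[2k] - but0[2k+1]
       perm2/vprod2 and perm3/vprod3 repeat the same pattern at distances of
       2 and 4 elements. */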
1365 ONEITERBUTTERFLY(0, temp0);
1366 ONEITERBUTTERFLY(1, temp1);
1367 ONEITERBUTTERFLY(2, temp2);
1368 ONEITERBUTTERFLY(3, temp3);
1369 ONEITERBUTTERFLY(4, temp4);
1370 ONEITERBUTTERFLY(5, temp5);
1371 ONEITERBUTTERFLY(6, temp6);
1372 ONEITERBUTTERFLY(7, temp7);
1373 }
1374 #undef ONEITERBUTTERFLY
1375 {
1376 register vector signed int vsum;
1377 register vector signed short line0 = vec_add(temp0, temp1);
1378 register vector signed short line1 = vec_sub(temp0, temp1);
1379 register vector signed short line2 = vec_add(temp2, temp3);
1380 register vector signed short line3 = vec_sub(temp2, temp3);
1381 register vector signed short line4 = vec_add(temp4, temp5);
1382 register vector signed short line5 = vec_sub(temp4, temp5);
1383 register vector signed short line6 = vec_add(temp6, temp7);
1384 register vector signed short line7 = vec_sub(temp6, temp7);
1385
1386 register vector signed short line0B = vec_add(line0, line2);
1387 register vector signed short line2B = vec_sub(line0, line2);
1388 register vector signed short line1B = vec_add(line1, line3);
1389 register vector signed short line3B = vec_sub(line1, line3);
1390 register vector signed short line4B = vec_add(line4, line6);
1391 register vector signed short line6B = vec_sub(line4, line6);
1392 register vector signed short line5B = vec_add(line5, line7);
1393 register vector signed short line7B = vec_sub(line5, line7);
1394
1395 register vector signed short line0C = vec_add(line0B, line4B);
1396 register vector signed short line4C = vec_sub(line0B, line4B);
1397 register vector signed short line1C = vec_add(line1B, line5B);
1398 register vector signed short line5C = vec_sub(line1B, line5B);
1399 register vector signed short line2C = vec_add(line2B, line6B);
1400 register vector signed short line6C = vec_sub(line2B, line6B);
1401 register vector signed short line3C = vec_add(line3B, line7B);
1402 register vector signed short line7C = vec_sub(line3B, line7B);
1403
1404 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1405 vsum = vec_sum4s(vec_abs(line1C), vsum);
1406 vsum = vec_sum4s(vec_abs(line2C), vsum);
1407 vsum = vec_sum4s(vec_abs(line3C), vsum);
1408 vsum = vec_sum4s(vec_abs(line4C), vsum);
1409 vsum = vec_sum4s(vec_abs(line5C), vsum);
1410 vsum = vec_sum4s(vec_abs(line6C), vsum);
1411 vsum = vec_sum4s(vec_abs(line7C), vsum);
1412 vsum = vec_sums(vsum, (vector signed int)vzero);
1413 vsum = vec_splat(vsum, 3);
1414 vec_ste(vsum, 0, &sum);
1415 }
1416 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
1417 return sum;
1418 }
1419
/*
  16x8 works with 16 elements; it avoids replicating
  loads and gives the compiler more room for scheduling.
  It's only used from inside hadamard8_diff16_altivec.

  Unfortunately, gcc-3.3 seems to be a bit dumb, and
  the compiled code has a LOT of spill code; it seems
  gcc (unlike xlc) cannot keep everything in registers
  by itself. The following code includes hand-made
  register allocation. It's not clean, but on
  a 7450 the resulting code is much faster (the best case
  falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to
  schedule for the 7450, and its code isn't much faster than
  gcc-3.3's on the 7450 (though it uses 25% fewer instructions...)

  On the 970, the hand-made RA is still a win (around 690
  vs. around 780), but xlc goes to around 660 on the
  regular C code...
*/
1441
1442 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
1443 int sum;
1444 register vector signed short
1445 temp0 asm ("v0"),
1446 temp1 asm ("v1"),
1447 temp2 asm ("v2"),
1448 temp3 asm ("v3"),
1449 temp4 asm ("v4"),
1450 temp5 asm ("v5"),
1451 temp6 asm ("v6"),
1452 temp7 asm ("v7");
1453 register vector signed short
1454 temp0S asm ("v8"),
1455 temp1S asm ("v9"),
1456 temp2S asm ("v10"),
1457 temp3S asm ("v11"),
1458 temp4S asm ("v12"),
1459 temp5S asm ("v13"),
1460 temp6S asm ("v14"),
1461 temp7S asm ("v15");
1462 register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
1463 {
1464 register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
1465 register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
1466 register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
1467 register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
1468 AVV(0x02, 0x03, 0x00, 0x01,
1469 0x06, 0x07, 0x04, 0x05,
1470 0x0A, 0x0B, 0x08, 0x09,
1471 0x0E, 0x0F, 0x0C, 0x0D);
1472 register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
1473 AVV(0x04, 0x05, 0x06, 0x07,
1474 0x00, 0x01, 0x02, 0x03,
1475 0x0C, 0x0D, 0x0E, 0x0F,
1476 0x08, 0x09, 0x0A, 0x0B);
1477 register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
1478 AVV(0x08, 0x09, 0x0A, 0x0B,
1479 0x0C, 0x0D, 0x0E, 0x0F,
1480 0x00, 0x01, 0x02, 0x03,
1481 0x04, 0x05, 0x06, 0x07);
1482
1483 #define ONEITERBUTTERFLY(i, res1, res2) \
1484 { \
1485 register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
1486 register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
1487 src1 = vec_ld(stride * i, src); \
1488 src2 = vec_ld((stride * i) + 16, src); \
1489 register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1490 dst1 = vec_ld(stride * i, dst); \
1491 dst2 = vec_ld((stride * i) + 16, dst); \
1492 register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1493 /* promote the unsigned chars to signed shorts */ \
1494 register vector signed short srcV asm ("v24") = \
1495 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1496 register vector signed short dstV asm ("v25") = \
1497 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1498 register vector signed short srcW asm ("v26") = \
1499 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
1500 register vector signed short dstW asm ("v27") = \
1501 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
      /* subtractions inside the first butterfly */ \
1503 register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
1504 register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
1505 register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
1506 register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
1507 register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
1508 register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
1509 register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
1510 register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
1511 register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
1512 register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
1513 register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
1514 res1 = vec_mladd(but2, vprod3, op3); \
1515 register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
1516 res2 = vec_mladd(but2S, vprod3, op3S); \
1517 }
1518 ONEITERBUTTERFLY(0, temp0, temp0S);
1519 ONEITERBUTTERFLY(1, temp1, temp1S);
1520 ONEITERBUTTERFLY(2, temp2, temp2S);
1521 ONEITERBUTTERFLY(3, temp3, temp3S);
1522 ONEITERBUTTERFLY(4, temp4, temp4S);
1523 ONEITERBUTTERFLY(5, temp5, temp5S);
1524 ONEITERBUTTERFLY(6, temp6, temp6S);
1525 ONEITERBUTTERFLY(7, temp7, temp7S);
1526 }
1527 #undef ONEITERBUTTERFLY
1528 {
1529 register vector signed int vsum;
1530 register vector signed short line0 = vec_add(temp0, temp1);
1531 register vector signed short line1 = vec_sub(temp0, temp1);
1532 register vector signed short line2 = vec_add(temp2, temp3);
1533 register vector signed short line3 = vec_sub(temp2, temp3);
1534 register vector signed short line4 = vec_add(temp4, temp5);
1535 register vector signed short line5 = vec_sub(temp4, temp5);
1536 register vector signed short line6 = vec_add(temp6, temp7);
1537 register vector signed short line7 = vec_sub(temp6, temp7);
1538
1539 register vector signed short line0B = vec_add(line0, line2);
1540 register vector signed short line2B = vec_sub(line0, line2);
1541 register vector signed short line1B = vec_add(line1, line3);
1542 register vector signed short line3B = vec_sub(line1, line3);
1543 register vector signed short line4B = vec_add(line4, line6);
1544 register vector signed short line6B = vec_sub(line4, line6);
1545 register vector signed short line5B = vec_add(line5, line7);
1546 register vector signed short line7B = vec_sub(line5, line7);
1547
1548 register vector signed short line0C = vec_add(line0B, line4B);
1549 register vector signed short line4C = vec_sub(line0B, line4B);
1550 register vector signed short line1C = vec_add(line1B, line5B);
1551 register vector signed short line5C = vec_sub(line1B, line5B);
1552 register vector signed short line2C = vec_add(line2B, line6B);
1553 register vector signed short line6C = vec_sub(line2B, line6B);
1554 register vector signed short line3C = vec_add(line3B, line7B);
1555 register vector signed short line7C = vec_sub(line3B, line7B);
1556
1557 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1558 vsum = vec_sum4s(vec_abs(line1C), vsum);
1559 vsum = vec_sum4s(vec_abs(line2C), vsum);
1560 vsum = vec_sum4s(vec_abs(line3C), vsum);
1561 vsum = vec_sum4s(vec_abs(line4C), vsum);
1562 vsum = vec_sum4s(vec_abs(line5C), vsum);
1563 vsum = vec_sum4s(vec_abs(line6C), vsum);
1564 vsum = vec_sum4s(vec_abs(line7C), vsum);
1565
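    /* Identical column transform for the second set of temporaries
       (pixels 8-15 of each row); their absolute coefficients are
       accumulated into the same vsum. */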
1566 register vector signed short line0S = vec_add(temp0S, temp1S);
1567 register vector signed short line1S = vec_sub(temp0S, temp1S);
1568 register vector signed short line2S = vec_add(temp2S, temp3S);
1569 register vector signed short line3S = vec_sub(temp2S, temp3S);
1570 register vector signed short line4S = vec_add(temp4S, temp5S);
1571 register vector signed short line5S = vec_sub(temp4S, temp5S);
1572 register vector signed short line6S = vec_add(temp6S, temp7S);
1573 register vector signed short line7S = vec_sub(temp6S, temp7S);
1574
1575 register vector signed short line0BS = vec_add(line0S, line2S);
1576 register vector signed short line2BS = vec_sub(line0S, line2S);
1577 register vector signed short line1BS = vec_add(line1S, line3S);
1578 register vector signed short line3BS = vec_sub(line1S, line3S);
1579 register vector signed short line4BS = vec_add(line4S, line6S);
1580 register vector signed short line6BS = vec_sub(line4S, line6S);
1581 register vector signed short line5BS = vec_add(line5S, line7S);
1582 register vector signed short line7BS = vec_sub(line5S, line7S);
1583
1584 register vector signed short line0CS = vec_add(line0BS, line4BS);
1585 register vector signed short line4CS = vec_sub(line0BS, line4BS);
1586 register vector signed short line1CS = vec_add(line1BS, line5BS);
1587 register vector signed short line5CS = vec_sub(line1BS, line5BS);
1588 register vector signed short line2CS = vec_add(line2BS, line6BS);
1589 register vector signed short line6CS = vec_sub(line2BS, line6BS);
1590 register vector signed short line3CS = vec_add(line3BS, line7BS);
1591 register vector signed short line7CS = vec_sub(line3BS, line7BS);
1592
1593 vsum = vec_sum4s(vec_abs(line0CS), vsum);
1594 vsum = vec_sum4s(vec_abs(line1CS), vsum);
1595 vsum = vec_sum4s(vec_abs(line2CS), vsum);
1596 vsum = vec_sum4s(vec_abs(line3CS), vsum);
1597 vsum = vec_sum4s(vec_abs(line4CS), vsum);
1598 vsum = vec_sum4s(vec_abs(line5CS), vsum);
1599 vsum = vec_sum4s(vec_abs(line6CS), vsum);
1600 vsum = vec_sum4s(vec_abs(line7CS), vsum);
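    /* Final reduction: vec_sums folds the four partial sums into element 3,
       which is splatted and stored into the 16-byte-aligned scalar sum. */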
1601 vsum = vec_sums(vsum, (vector signed int)vzero);
1602 vsum = vec_splat(vsum, 3);
1603 vec_ste(vsum, 0, &sum);
1604 }
1605 return sum;
1606 }
1607
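/* SATD over a 16-pixel-wide block: the 16x8 kernel above is run once for
   h==8 and twice (upper and lower 8 rows) for h==16. */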
1608 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1609 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
1610 int score;
1611 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
1612 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1613 if (h==16) {
1614 dst += 8*stride;
1615 src += 8*stride;
1616 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1617 }
1618 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
1619 return score;
1620 }
1621 #endif /* CONFIG_DARWIN */
1622
1623 int has_altivec(void)
1624 {
1625 #ifdef __AMIGAOS4__
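    /* AmigaOS 4: exec.library reports the vector unit type directly. */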
1626 ULONG result = 0;
1627 extern struct ExecIFace *IExec;
1628
1629 IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
1630 if (result == VECTORTYPE_ALTIVEC) return 1;
1631 return 0;
1632 #else /* __AMIGAOS4__ */
1633
1634 #ifdef CONFIG_DARWIN
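    /* On Darwin the kernel exports the presence of a vector unit
       through sysctl(CTL_HW, HW_VECTORUNIT). */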
1635 int sels[2] = {CTL_HW, HW_VECTORUNIT};
1636 int has_vu = 0;
1637 size_t len = sizeof(has_vu);
1638 int err;
1639
1640 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1641
1642 if (err == 0) return (has_vu != 0);
1643 #else /* CONFIG_DARWIN */
1644     /* not Darwin, do it the brute-force way */
1645 /* this is borrowed from the libmpeg2 library */
1646 {
1647 signal (SIGILL, sigill_handler);
1648 if (sigsetjmp (jmpbuf, 1)) {
1649 signal (SIGILL, SIG_DFL);
1650 } else {
1651 canjump = 1;
1652
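            /* Probe for AltiVec by executing two AltiVec-only instructions:
               mtspr 256 writes VRSAVE and vand is a VMX opcode.  On a CPU
               without AltiVec this raises SIGILL; the handler defined above
               longjmps back to the sigsetjmp branch and we fall through to
               "return 0".  If execution reaches the next statement, AltiVec
               is available. */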
1653 asm volatile ("mtspr 256, %0\n\t"
1654 "vand %%v0, %%v0, %%v0"
1655 :
1656 : "r" (-1));
1657
1658 signal (SIGILL, SIG_DFL);
1659 return 1;
1660 }
1661 }
1662 #endif /* CONFIG_DARWIN */
1663 return 0;
1664 #endif /* __AMIGAOS4__ */
1665 }
1666
1667 /* next one assumes that ((line_size % 8) == 0) */
1668 void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
1669 {
1670 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
1671 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1672
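    /* Scalar reference path (essentially the same scheme as the generic C
       code in dsputil): each 32-bit word packs four pixels; the low two bits
       (l0/l1) and the high six bits (h0/h1) of every byte are summed
       separately so the four-neighbour average plus rounding bias, i.e.
       (a+b+c+d+2)>>2 per byte, can be formed without the byte lanes
       overflowing, then recombined and rnd_avg32'ed with the existing block. */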
1673 int j;
1674 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
1675 for (j = 0; j < 2; j++) {
1676 int i;
1677 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1678 const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1679 uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1680 uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1681 uint32_t l1, h1;
1682 pixels += line_size;
1683 for (i = 0; i < h; i += 2) {
1684 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1685 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1686 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1687 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1688 *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
1689 pixels += line_size;
1690 block += line_size;
1691 a = (((const struct unaligned_32 *) (pixels))->l);
1692 b = (((const struct unaligned_32 *) (pixels + 1))->l);
1693 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1694 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1695 *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
1696 pixels += line_size;
1697 block += line_size;
1698         }
             pixels += 4 - line_size * (h + 1);
1699 block += 4 - line_size * h;
1700 }
1701 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
1702 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1703 register int i;
1704 register vector unsigned char
1705 pixelsv1, pixelsv2,
1706 pixelsavg;
1707 register vector unsigned char
1708 blockv, temp1, temp2, blocktemp;
1709 register vector unsigned short
1710 pixelssum1, pixelssum2, temp3;
1711 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
1712 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
1713
1714 temp1 = vec_ld(0, pixels);
1715 temp2 = vec_ld(16, pixels);
1716 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
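    /* If pixels+1 is 16-byte aligned, vec_lvsl(1, pixels) would produce the
       identity permute and select the wrong input; in that case temp2 already
       holds exactly pixels[1..16], so it is used directly.  The same special
       case appears inside the loop below. */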
1717 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1718 {
1719 pixelsv2 = temp2;
1720 }
1721 else
1722 {
1723 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1724 }
1725 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1726 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1727 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1728 (vector unsigned short)pixelsv2);
1729 pixelssum1 = vec_add(pixelssum1, vctwo);
1730
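    /* The loop below computes, for each output row, the rounded half-pel
       average in 16-bit lanes: pixelssum1 carries the previous row's
       horizontal sum (with the +2 rounding bias already folded in),
       pixelssum2 is the current row's, and their sum is shifted right by two.
       Roughly, per byte x of each row (a sketch, with avg() the rounded
       byte average performed by vec_avg):
           block[x] = avg(block[x],
                          (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 2) >> 2);
       i.e. the "avg" flavour of the half-pel primitive. */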
1731 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
1732 for (i = 0; i < h ; i++) {
1733 int rightside = ((unsigned long)block & 0x0000000F);
1734 blockv = vec_ld(0, block);
1735
1736 temp1 = vec_ld(line_size, pixels);
1737 temp2 = vec_ld(line_size + 16, pixels);
1738 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1739 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1740 {
1741 pixelsv2 = temp2;
1742 }
1743 else
1744 {
1745 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1746 }
1747
1748 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1749 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1750 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1751 (vector unsigned short)pixelsv2);
1752 temp3 = vec_add(pixelssum1, pixelssum2);
1753 temp3 = vec_sra(temp3, vctwo);
1754 pixelssum1 = vec_add(pixelssum2, vctwo);
1755 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1756
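        /* block is only 8 bytes wide but vec_st writes 16, so merge the 8
           averaged pixels into whichever half of the destination vector the
           block address selects; the following vec_avg then combines that
           half with the old block contents, while the untouched half averages
           with itself and stays unchanged. */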
1757 if (rightside)
1758 {
1759 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1760 }
1761 else
1762 {
1763 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1764 }
1765
1766 blockv = vec_avg(blocktemp, blockv);
1767 vec_st(blockv, 0, block);
1768
1769 block += line_size;
1770 pixels += line_size;
1771 }
1772
1773 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
1774 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1775 }