ed34a2d92ba952cac61216384e82108c4d18ed11
[libav.git] / libavcodec / ppc / dsputil_altivec.c
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #include "../dsputil.h"
21 #include "dsputil_altivec.h"
22
23 #if CONFIG_DARWIN
24 #include <sys/sysctl.h>
25 #endif
26
27 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
28 {
29 int s, i;
30 vector unsigned char *tv, zero;
31 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
32 vector unsigned int sad;
33 vector signed int sumdiffs;
34
35 s = 0;
36 zero = vec_splat_u8(0);
37 sad = vec_splat_u32(0);
38 for(i=0;i<16;i++) {
39 /*
40 Read unaligned pixels into our vectors. The vectors are as follows:
41 pix1v: pix1[0]-pix1[15]
42 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
43 */
44 tv = (vector unsigned char *) pix1;
45 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
46
47 tv = (vector unsigned char *) &pix2[0];
48 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
49
50 tv = (vector unsigned char *) &pix2[1];
51 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
52
53 /* Calculate the average vector */
54 avgv = vec_avg(pix2v, pix2iv);
55
56 /* Calculate a sum of abs differences vector */
57 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
58
59 /* Add each 4 pixel group together and put 4 results into sad */
60 sad = vec_sum4s(t5, sad);
61
62 pix1 += line_size;
63 pix2 += line_size;
64 }
65 /* Sum up the four partial sums, and put the result into s */
66 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
67 sumdiffs = vec_splat(sumdiffs, 3);
68 vec_ste(sumdiffs, 0, &s);
69
70 return s;
71 }
72
73 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
74 {
75 int s, i;
76 vector unsigned char *tv, zero;
77 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
78 vector unsigned int sad;
79 vector signed int sumdiffs;
80 uint8_t *pix3 = pix2 + line_size;
81
82 s = 0;
83 zero = vec_splat_u8(0);
84 sad = vec_splat_u32(0);
85
86 /*
87 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
88 iteration becomes pix2 in the next iteration. We can use this
89 fact to avoid a potentially expensive unaligned read, each
90 time around the loop.
91 Read unaligned pixels into our vectors. The vectors are as follows:
92 pix2v: pix2[0]-pix2[15]
93 Split the pixel vectors into shorts
94 */
95 tv = (vector unsigned char *) &pix2[0];
96 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
97
98 for(i=0;i<16;i++) {
99 /*
100 Read unaligned pixels into our vectors. The vectors are as follows:
101 pix1v: pix1[0]-pix1[15]
102 pix3v: pix3[0]-pix3[15]
103 */
104 tv = (vector unsigned char *) pix1;
105 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
106
107 tv = (vector unsigned char *) &pix3[0];
108 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
109
110 /* Calculate the average vector */
111 avgv = vec_avg(pix2v, pix3v);
112
113 /* Calculate a sum of abs differences vector */
114 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
115
116 /* Add each 4 pixel group together and put 4 results into sad */
117 sad = vec_sum4s(t5, sad);
118
119 pix1 += line_size;
120 pix2v = pix3v;
121 pix3 += line_size;
122
123 }
124
125 /* Sum up the four partial sums, and put the result into s */
126 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
127 sumdiffs = vec_splat(sumdiffs, 3);
128 vec_ste(sumdiffs, 0, &s);
129 return s;
130 }
131
132 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
133 {
134 int s, i;
135 uint8_t *pix3 = pix2 + line_size;
136 vector unsigned char *tv, avgv, t5, zero;
137 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
138 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
139 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
140 vector unsigned short avghv, avglv, two;
141 vector unsigned short t1, t2, t3, t4;
142 vector unsigned int sad;
143 vector signed int sumdiffs;
144
145 zero = vec_splat_u8(0);
146 two = vec_splat_u16(2);
147 sad = vec_splat_u32(0);
148
149 s = 0;
150
151 /*
152 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
153 iteration becomes pix2 in the next iteration. We can use this
154 fact to avoid a potentially expensive unaligned read, as well
155 as some splitting, and vector addition each time around the loop.
156 Read unaligned pixels into our vectors. The vectors are as follows:
157 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
158 Split the pixel vectors into shorts
159 */
160 tv = (vector unsigned char *) &pix2[0];
161 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
162
163 tv = (vector unsigned char *) &pix2[1];
164 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
165
166 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
167 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
168 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
169 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
170 t1 = vec_add(pix2hv, pix2ihv);
171 t2 = vec_add(pix2lv, pix2ilv);
172
173 for(i=0;i<16;i++) {
174 /*
175 Read unaligned pixels into our vectors. The vectors are as follows:
176 pix1v: pix1[0]-pix1[15]
177 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
178 */
179 tv = (vector unsigned char *) pix1;
180 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
181
182 tv = (vector unsigned char *) &pix3[0];
183 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
184
185 tv = (vector unsigned char *) &pix3[1];
186 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
187
188 /*
189 Note that Altivec does have vec_avg, but this works on vector pairs
190 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
191 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
192 Instead, we have to split the pixel vectors into vectors of shorts,
193 and do the averaging by hand.
194 */
195
196 /* Split the pixel vectors into shorts */
197 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
198 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
199 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
200 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
201
202 /* Do the averaging on them */
203 t3 = vec_add(pix3hv, pix3ihv);
204 t4 = vec_add(pix3lv, pix3ilv);
205
206 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
207 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
208
209 /* Pack the shorts back into a result */
210 avgv = vec_pack(avghv, avglv);
211
212 /* Calculate a sum of abs differences vector */
213 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
214
215 /* Add each 4 pixel group together and put 4 results into sad */
216 sad = vec_sum4s(t5, sad);
217
218 pix1 += line_size;
219 pix3 += line_size;
220 /* Transfer the calculated values for pix3 into pix2 */
221 t1 = t3;
222 t2 = t4;
223 }
224 /* Sum up the four partial sums, and put the result into s */
225 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
226 sumdiffs = vec_splat(sumdiffs, 3);
227 vec_ste(sumdiffs, 0, &s);
228
229 return s;
230 }
231
232 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
233 {
234 int i, s;
235 vector unsigned char perm1, perm2, *pix1v, *pix2v;
236 vector unsigned char t1, t2, t3,t4, t5;
237 vector unsigned int sad, zero;
238 vector signed int sumdiffs;
239
240 zero = (vector unsigned int) (0);
241 sad = (vector unsigned int) (0);
242
243
244 for(i=0;i<16;i++) {
245 /* Read potentially unaligned pixels into t1 and t2 */
246 perm1 = vec_lvsl(0, pix1);
247 pix1v = (vector unsigned char *) pix1;
248 perm2 = vec_lvsl(0, pix2);
249 pix2v = (vector unsigned char *) pix2;
250 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
251 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
252
253 /* Calculate a sum of abs differences vector */
254 t3 = vec_max(t1, t2);
255 t4 = vec_min(t1, t2);
256 t5 = vec_sub(t3, t4);
257
258 /* Add each 4 pixel group together and put 4 results into sad */
259 sad = vec_sum4s(t5, sad);
260
261 pix1 += line_size;
262 pix2 += line_size;
263 }
264
265 /* Sum up the four partial sums, and put the result into s */
266 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
267 sumdiffs = vec_splat(sumdiffs, 3);
268 vec_ste(sumdiffs, 0, &s);
269
270 return s;
271 }
272
273 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
274 {
275 int i, s;
276 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
277 vector unsigned char t1, t2, t3,t4, t5;
278 vector unsigned int sad, zero;
279 vector signed int sumdiffs;
280
281 zero = (vector unsigned int) (0);
282 sad = (vector unsigned int) (0);
283 permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
284
285 for(i=0;i<8;i++) {
286 /* Read potentially unaligned pixels into t1 and t2
287 Since we're reading 16 pixels, and actually only want 8,
288 mask out the last 8 pixels. The 0s don't change the sum. */
289 perm1 = vec_lvsl(0, pix1);
290 pix1v = (vector unsigned char *) pix1;
291 perm2 = vec_lvsl(0, pix2);
292 pix2v = (vector unsigned char *) pix2;
293 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
294 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
295
296 /* Calculate a sum of abs differences vector */
297 t3 = vec_max(t1, t2);
298 t4 = vec_min(t1, t2);
299 t5 = vec_sub(t3, t4);
300
301 /* Add each 4 pixel group together and put 4 results into sad */
302 sad = vec_sum4s(t5, sad);
303
304 pix1 += line_size;
305 pix2 += line_size;
306 }
307
308 /* Sum up the four partial sums, and put the result into s */
309 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
310 sumdiffs = vec_splat(sumdiffs, 3);
311 vec_ste(sumdiffs, 0, &s);
312
313 return s;
314 }
315
316 int pix_norm1_altivec(uint8_t *pix, int line_size)
317 {
318 int s, i;
319 vector unsigned char *tv, zero;
320 vector unsigned char pixv;
321 vector unsigned int sv;
322 vector signed int sum;
323
324 zero = vec_splat_u8(0);
325 sv = vec_splat_u32(0);
326
327 s = 0;
328 for (i = 0; i < 16; i++) {
329 /* Read in the potentially unaligned pixels */
330 tv = (vector unsigned char *) pix;
331 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
332
333 /* Square the values, and add them to our sum */
334 sv = vec_msum(pixv, pixv, sv);
335
336 pix += line_size;
337 }
338 /* Sum up the four partial sums, and put the result into s */
339 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
340 sum = vec_splat(sum, 3);
341 vec_ste(sum, 0, &s);
342
343 return s;
344 }
345
346
347 int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
348 {
349 int s, i;
350 vector unsigned char *tv, zero;
351 vector unsigned char pix1v, pix2v, t5;
352 vector unsigned int sv;
353 vector signed int sum;
354
355 zero = vec_splat_u8(0);
356 sv = vec_splat_u32(0);
357 s = 0;
358 for (i = 0; i < 16; i++) {
359 /* Read in the potentially unaligned pixels */
360 tv = (vector unsigned char *) pix1;
361 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
362
363 tv = (vector unsigned char *) pix2;
364 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));
365
366 /*
367 Since we want to use unsigned chars, we can take advantage
368 of the fact that abs(a-b)^2 = (a-b)^2.
369 */
370
371 /* Calculate a sum of abs differences vector */
372 t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
373
374 /* Square the values and add them to our sum */
375 sv = vec_msum(t5, t5, sv);
376
377 pix1 += line_size;
378 pix2 += line_size;
379 }
380 /* Sum up the four partial sums, and put the result into s */
381 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
382 sum = vec_splat(sum, 3);
383 vec_ste(sum, 0, &s);
384 return s;
385 }
386
387
388 int pix_sum_altivec(UINT8 * pix, int line_size)
389 {
390
391 vector unsigned char perm, *pixv;
392 vector unsigned char t1;
393 vector unsigned int sad, zero;
394 vector signed int sumdiffs;
395
396 int s, i;
397
398 zero = (vector unsigned int) (0);
399 sad = (vector unsigned int) (0);
400
401 for (i = 0; i < 16; i++) {
402 /* Read the potentially unaligned 16 pixels into t1 */
403 perm = vec_lvsl(0, pix);
404 pixv = (vector unsigned char *) pix;
405 t1 = vec_perm(pixv[0], pixv[1], perm);
406
407 /* Add each 4 pixel group together and put 4 results into sad */
408 sad = vec_sum4s(t1, sad);
409
410 pix += line_size;
411 }
412
413 /* Sum up the four partial sums, and put the result into s */
414 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
415 sumdiffs = vec_splat(sumdiffs, 3);
416 vec_ste(sumdiffs, 0, &s);
417
418 return s;
419 }
420
421 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
422 {
423 int i;
424 vector unsigned char perm, bytes, *pixv;
425 vector unsigned char zero = (vector unsigned char) (0);
426 vector signed short shorts;
427
428 for(i=0;i<8;i++)
429 {
430 // Read potentially unaligned pixels.
431 // We're reading 16 pixels, and actually only want 8,
432 // but we simply ignore the extras.
433 perm = vec_lvsl(0, pixels);
434 pixv = (vector unsigned char *) pixels;
435 bytes = vec_perm(pixv[0], pixv[1], perm);
436
437 // convert the bytes into shorts
438 shorts = (vector signed short)vec_mergeh(zero, bytes);
439
440 // save the data to the block, we assume the block is 16-byte aligned
441 vec_st(shorts, i*16, (vector signed short*)block);
442
443 pixels += line_size;
444 }
445 }
446
447 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
448 const UINT8 *s2, int stride)
449 {
450 int i;
451 vector unsigned char perm, bytes, *pixv;
452 vector unsigned char zero = (vector unsigned char) (0);
453 vector signed short shorts1, shorts2;
454
455 for(i=0;i<4;i++)
456 {
457 // Read potentially unaligned pixels
458 // We're reading 16 pixels, and actually only want 8,
459 // but we simply ignore the extras.
460 perm = vec_lvsl(0, s1);
461 pixv = (vector unsigned char *) s1;
462 bytes = vec_perm(pixv[0], pixv[1], perm);
463
464 // convert the bytes into shorts
465 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
466
467 // Do the same for the second block of pixels
468 perm = vec_lvsl(0, s2);
469 pixv = (vector unsigned char *) s2;
470 bytes = vec_perm(pixv[0], pixv[1], perm);
471
472 // convert the bytes into shorts
473 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
474
475 // Do the subtraction
476 shorts1 = vec_sub(shorts1, shorts2);
477
478 // save the data to the block, we assume the block is 16-byte aligned
479 vec_st(shorts1, 0, (vector signed short*)block);
480
481 s1 += stride;
482 s2 += stride;
483 block += 8;
484
485
486 // The code below is a copy of the code above... This is a manual
487 // unroll.
488
489 // Read potentially unaligned pixels
490 // We're reading 16 pixels, and actually only want 8,
491 // but we simply ignore the extras.
492 perm = vec_lvsl(0, s1);
493 pixv = (vector unsigned char *) s1;
494 bytes = vec_perm(pixv[0], pixv[1], perm);
495
496 // convert the bytes into shorts
497 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
498
499 // Do the same for the second block of pixels
500 perm = vec_lvsl(0, s2);
501 pixv = (vector unsigned char *) s2;
502 bytes = vec_perm(pixv[0], pixv[1], perm);
503
504 // convert the bytes into shorts
505 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
506
507 // Do the subtraction
508 shorts1 = vec_sub(shorts1, shorts2);
509
510 // save the data to the block, we assume the block is 16-byte aligned
511 vec_st(shorts1, 0, (vector signed short*)block);
512
513 s1 += stride;
514 s2 += stride;
515 block += 8;
516 }
517 }
518
519
/**
 * Report whether an AltiVec vector unit is available.
 *
 * When CONFIG_DARWIN is set, the hw.vectorunit sysctl
 * (CTL_HW / HW_VECTORUNIT) is queried. In every other configuration, and
 * when the sysctl call fails, the function reports that no unit is present.
 *
 * @return 1 if the sysctl succeeded and returned a non-zero value,
 *         0 otherwise
 */
int has_altivec(void)
{
#if CONFIG_DARWIN
    int mib[2] = { CTL_HW, HW_VECTORUNIT };
    int vector_unit = 0;
    size_t size = sizeof(vector_unit);

    if (sysctl(mib, 2, &vector_unit, &size, NULL, 0) == 0)
        return vector_unit != 0;
#endif
    return 0;
}
534