altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot...
[libav.git] / libavcodec / ppc / dsputil_altivec.c
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #include "../dsputil.h"
21 #include "dsputil_altivec.h"
22
23 #if CONFIG_DARWIN
24 #include <sys/sysctl.h>
25 #endif
26
/*
 * Sum of absolute differences over a 16x16 block between pix1 and the
 * horizontal average of pix2, i.e. vec_avg(pix2[x], pix2[x + 1]).
 *
 * pix1, pix2: top-left pixels of the two blocks; neither needs to be
 *             16-byte aligned.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * Every row is fetched by reading two consecutive vectors (tv[0] and tv[1])
 * and merging them with the vec_lvsl permute, so memory past the 16 (or 17)
 * pixels actually used is touched as well.
 *
 * Returns the accumulated sum.
 */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, total;
    vector unsigned char *blk;
    vector unsigned char zero_bytes;
    vector unsigned char cur, ref, ref_right, ref_avg, abs_diff;
    vector unsigned int acc;
    vector signed int folded;

    total = 0;
    zero_bytes = vec_splat_u8(0);
    acc = vec_splat_u32(0);

    for (row = 0; row < 16; row++) {
        uint8_t *right = &pix2[1];

        /* cur: pix1[0]-pix1[15] */
        blk = (vector unsigned char *) pix1;
        cur = vec_perm(blk[0], blk[1], vec_lvsl(0, pix1));

        /* ref: pix2[0]-pix2[15] */
        blk = (vector unsigned char *) pix2;
        ref = vec_perm(blk[0], blk[1], vec_lvsl(0, pix2));

        /* ref_right: pix2[1]-pix2[16] */
        blk = (vector unsigned char *) right;
        ref_right = vec_perm(blk[0], blk[1], vec_lvsl(0, right));

        /* Average each reference pixel with its right-hand neighbour */
        ref_avg = vec_avg(ref, ref_right);

        /* |cur - ref_avg| computed as max - min on unsigned bytes */
        abs_diff = vec_sub(vec_max(cur, ref_avg), vec_min(cur, ref_avg));

        /* Fold each group of 4 bytes into the 4 running 32-bit sums */
        acc = vec_sum4s(abs_diff, acc);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Collapse the four partial sums and copy the result out to total */
    folded = vec_sums((vector signed int) acc, (vector signed int) zero_bytes);
    folded = vec_splat(folded, 3);
    vec_ste(folded, 0, &total);

    return total;
}
72
/*
 * Sum of absolute differences over a 16x16 block between pix1 and the
 * vertical average of pix2, i.e. vec_avg(pix2[x], pix2[x + line_size]).
 *
 * pix1, pix2: top-left pixels of the two blocks; neither needs to be
 *             16-byte aligned.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * 17 rows of pix2 are read (row 0 before the loop, rows 1..16 inside it).
 *
 * Returns the accumulated sum.
 */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, total;
    uint8_t *below = pix2 + line_size;
    vector unsigned char *blk;
    vector unsigned char zero_bytes;
    vector unsigned char cur, upper, lower, ref_avg, abs_diff;
    vector unsigned int acc;
    vector signed int folded;

    total = 0;
    zero_bytes = vec_splat_u8(0);
    acc = vec_splat_u32(0);

    /*
     * The lower reference row of one iteration is the upper reference row
     * of the next, so only the very first upper row is loaded here; after
     * that it is carried over in a register instead of being re-read.
     * upper: pix2[0]-pix2[15]
     */
    blk = (vector unsigned char *) pix2;
    upper = vec_perm(blk[0], blk[1], vec_lvsl(0, pix2));

    for (row = 0; row < 16; row++) {
        /* cur: pix1[0]-pix1[15] */
        blk = (vector unsigned char *) pix1;
        cur = vec_perm(blk[0], blk[1], vec_lvsl(0, pix1));

        /* lower: the reference row directly below `upper` */
        blk = (vector unsigned char *) below;
        lower = vec_perm(blk[0], blk[1], vec_lvsl(0, below));

        /* Average the two vertically adjacent reference rows */
        ref_avg = vec_avg(upper, lower);

        /* |cur - ref_avg| computed as max - min on unsigned bytes */
        abs_diff = vec_sub(vec_max(cur, ref_avg), vec_min(cur, ref_avg));

        /* Fold each group of 4 bytes into the 4 running 32-bit sums */
        acc = vec_sum4s(abs_diff, acc);

        /* Step down one row, reusing the row just loaded */
        upper = lower;
        pix1 += line_size;
        below += line_size;
    }

    /* Collapse the four partial sums and copy the result out to total */
    folded = vec_sums((vector signed int) acc, (vector signed int) zero_bytes);
    folded = vec_splat(folded, 3);
    vec_ste(folded, 0, &total);

    return total;
}
131
/*
 * Sum of absolute differences over a 16x16 block between pix1 and the
 * 2x2 average of pix2:
 *   (pix2[x] + pix2[x+1] + pix2[x+line_size] + pix2[x+line_size+1] + 2) >> 2
 *
 * pix1, pix2: top-left pixels of the two blocks; neither needs to be
 *             16-byte aligned.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * 17 rows of 17 pixels of pix2 are used (row 0 before the loop, rows 1..16
 * inside it).
 *
 * Returns the accumulated sum.
 */
int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;
    vector unsigned char *tv, avgv, t5, zero;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv, two;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    zero = vec_splat_u8(0);
    two = vec_splat_u16(2);
    sad = vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    /* t1/t2: horizontal pair sums of the upper row (high / low 8 pixels) */
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Horizontal pair sums of the lower row */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        /*
           (sum of 4 pixels + 2) >> 2, per 16-bit element.
           vec_sr shifts every element on its own. The whole-register
           vec_srl must not be used here: it needs the same shift count in
           every byte of its second operand, which a splatted 16-bit 2
           (bytes 0x00, 0x02, ...) does not provide.
        */
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
236
/*
 * Sum of absolute differences between two 16x16 pixel blocks.
 *
 * pix1, pix2: top-left pixels of the two blocks; neither needs to be
 *             16-byte aligned.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * Returns the accumulated sum.
 */
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, total;
    vector unsigned char *blk_a, *blk_b;
    vector unsigned char align_a, align_b;
    vector unsigned char row_a, row_b, abs_diff;
    vector unsigned int acc, no_bias;
    vector signed int folded;

    no_bias = vec_splat_u32(0);
    acc = vec_splat_u32(0);

    for (row = 0; row < 16; row++) {
        /* Fetch one possibly unaligned 16-pixel row from each block */
        blk_a = (vector unsigned char *) pix1;
        align_a = vec_lvsl(0, pix1);
        row_a = vec_perm(blk_a[0], blk_a[1], align_a);

        blk_b = (vector unsigned char *) pix2;
        align_b = vec_lvsl(0, pix2);
        row_b = vec_perm(blk_b[0], blk_b[1], align_b);

        /* |a - b| on unsigned bytes is max(a, b) - min(a, b) */
        abs_diff = vec_sub(vec_max(row_a, row_b), vec_min(row_a, row_b));

        /* Fold each group of 4 bytes into the 4 running 32-bit sums */
        acc = vec_sum4s(abs_diff, acc);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Collapse the four partial sums and copy the result out to total */
    folded = vec_sums((vector signed int) acc, (vector signed int) no_bias);
    folded = vec_splat(folded, 3);
    vec_ste(folded, 0, &total);

    return total;
}
277
/*
 * Sum of absolute differences between two 8x8 pixel blocks.
 *
 * pix1, pix2: top-left pixels of the two blocks; neither needs to be
 *             16-byte aligned.
 * line_size:  distance in bytes between the starts of consecutive rows.
 *
 * Full 16-byte rows are loaded even though only 8 pixels per row are
 * wanted; the surplus 8 bytes are forced to zero before differencing so
 * they add nothing to the sum.
 *
 * Returns the accumulated sum.
 */
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, total;
    vector unsigned char *blk_a, *blk_b;
    vector unsigned char align_a, align_b, keep_first8;
    vector unsigned char row_a, row_b, abs_diff;
    vector unsigned int acc, no_bias;
    vector signed int folded;

    no_bias = vec_splat_u32(0);
    acc = vec_splat_u32(0);
    /* Mask that keeps bytes 0-7 and clears bytes 8-15 */
    keep_first8 = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (row = 0; row < 8; row++) {
        /* Fetch one possibly unaligned row from each block, then drop
           the 8 pixels that lie outside the 8x8 block */
        blk_a = (vector unsigned char *) pix1;
        align_a = vec_lvsl(0, pix1);
        row_a = vec_perm(blk_a[0], blk_a[1], align_a);
        row_a = vec_and(row_a, keep_first8);

        blk_b = (vector unsigned char *) pix2;
        align_b = vec_lvsl(0, pix2);
        row_b = vec_perm(blk_b[0], blk_b[1], align_b);
        row_b = vec_and(row_b, keep_first8);

        /* |a - b| on unsigned bytes is max(a, b) - min(a, b) */
        abs_diff = vec_sub(vec_max(row_a, row_b), vec_min(row_a, row_b));

        /* Fold each group of 4 bytes into the 4 running 32-bit sums */
        acc = vec_sum4s(abs_diff, acc);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Collapse the four partial sums and copy the result out to total */
    folded = vec_sums((vector signed int) acc, (vector signed int) no_bias);
    folded = vec_splat(folded, 3);
    vec_ste(folded, 0, &total);

    return total;
}
320
/*
 * Sum of the squares of all pixel values in a 16x16 block.
 *
 * pix:       top-left pixel of the block; does not need to be 16-byte
 *            aligned.
 * line_size: distance in bytes between the starts of consecutive rows.
 *
 * Returns the accumulated sum of squares.
 */
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int s, i;
    vector unsigned char *tv, zero;
    vector unsigned char pixv;
    vector unsigned short pixlv, pixhv;
    vector unsigned int sv;
    vector signed int sum;

    zero = vec_splat_u8(0);
    sv = vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Widen the 16 bytes into two vectors of 8 shorts by interleaving
           with zero bytes */
        pixhv = (vector unsigned short) vec_mergeh(zero, pixv);
        pixlv = (vector unsigned short) vec_mergel(zero, pixv);

        /* Square the values and add them to our sum */
        sv = vec_msum(pixhv, pixhv, sv);
        sv = vec_msum(pixlv, pixlv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
363
/*
 * Sum of all pixel values in a 16x16 block.
 *
 * pix:       top-left pixel of the block; does not need to be 16-byte
 *            aligned.
 * line_size: distance in bytes between the starts of consecutive rows.
 *
 * Returns the accumulated sum.
 */
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    int row, total;
    vector unsigned char *blk;
    vector unsigned char align, pixels;
    vector unsigned int acc, no_bias;
    vector signed int folded;

    no_bias = vec_splat_u32(0);
    acc = vec_splat_u32(0);

    for (row = 0; row < 16; row++) {
        /* Fetch one possibly unaligned 16-pixel row */
        blk = (vector unsigned char *) pix;
        align = vec_lvsl(0, pix);
        pixels = vec_perm(blk[0], blk[1], align);

        /* Fold each group of 4 bytes into the 4 running 32-bit sums */
        acc = vec_sum4s(pixels, acc);

        pix += line_size;
    }

    /* Collapse the four partial sums and copy the result out to total */
    folded = vec_sums((vector signed int) acc, (vector signed int) no_bias);
    folded = vec_splat(folded, 3);
    vec_ste(folded, 0, &total);

    return total;
}
396
/*
 * Copy an 8x8 block of 8-bit pixels into `block`, widening every pixel to
 * a 16-bit value (zero-extended).
 *
 * block:     destination, written as 8 consecutive vectors of 8 shorts;
 *            assumed to be 16-byte aligned.
 * pixels:    top-left source pixel; does not need to be 16-byte aligned.
 * line_size: distance in bytes between the starts of consecutive rows.
 */
void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row;
    vector unsigned char *src;
    vector unsigned char align, raw;
    vector unsigned char zero_bytes = vec_splat_u8(0);
    vector signed short *dst = (vector signed short *) block;
    vector signed short widened;

    for (row = 0; row < 8; row++) {
        // Fetch a possibly unaligned row. 16 pixels are loaded although
        // only the first 8 are needed; the rest are simply dropped below.
        src = (vector unsigned char *) pixels;
        align = vec_lvsl(0, pixels);
        raw = vec_perm(src[0], src[1], align);

        // Interleave with zero bytes: the first 8 pixels become 8 shorts
        widened = (vector signed short) vec_mergeh(zero_bytes, raw);

        // Store one 16-byte output row; block is assumed 16-byte aligned
        vec_st(widened, 0, dst);
        dst++;

        pixels += line_size;
    }
}
422
/*
 * Load 8 possibly unaligned pixels starting at p and return them widened
 * to 8 shorts (zero-extended). 16 pixels are actually read; the last 8
 * are discarded.
 */
static inline vector signed short diff_pixels_load8_altivec(const UINT8 *p)
{
    vector unsigned char zero_bytes = vec_splat_u8(0);
    vector unsigned char *src = (vector unsigned char *) p;
    vector unsigned char align = vec_lvsl(0, p);
    vector unsigned char raw = vec_perm(src[0], src[1], align);

    return (vector signed short) vec_mergeh(zero_bytes, raw);
}

/*
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks into
 * `block` as 16-bit values.
 *
 * block:  destination, written as 8 consecutive rows of 8 shorts; assumed
 *         to be 16-byte aligned.
 * s1, s2: top-left pixels of the two source blocks; neither needs to be
 *         16-byte aligned.
 * stride: distance in bytes between the starts of consecutive source rows.
 */
void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
                         const UINT8 *s2, int stride)
{
    int pair;
    vector signed short minuend, subtrahend, delta;

    // Two rows are handled per pass (manual unroll), so 4 passes cover
    // the 8 rows of the block.
    for (pair = 0; pair < 4; pair++) {
        // First row of the pair
        minuend = diff_pixels_load8_altivec(s1);
        subtrahend = diff_pixels_load8_altivec(s2);
        delta = vec_sub(minuend, subtrahend);

        // block is assumed to be 16-byte aligned
        vec_st(delta, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // Second row of the pair
        minuend = diff_pixels_load8_altivec(s1);
        subtrahend = diff_pixels_load8_altivec(s2);
        delta = vec_sub(minuend, subtrahend);

        vec_st(delta, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
494
495
/*
 * Report whether the machine has a vector unit.
 *
 * When CONFIG_DARWIN is set, the answer comes from the hw.vectorunit
 * sysctl (CTL_HW / HW_VECTORUNIT). In every other case, and also when the
 * sysctl call fails, 0 is returned.
 *
 * Returns 1 if a vector unit was reported, 0 otherwise.
 */
int has_altivec(void)
{
#if CONFIG_DARWIN
    int mib[2] = {CTL_HW, HW_VECTORUNIT};
    int vector_unit = 0;
    size_t vector_unit_size = sizeof(vector_unit);

    if (sysctl(mib, 2, &vector_unit, &vector_unit_size, NULL, 0) == 0)
        return (vector_unit != 0);
#endif
    return 0;
}
510