[libav.git] / libavcodec / ppc / dsputil_altivec.c @ commit 925b6c83b78a0664904938ea30c811d54eed2337
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "config.h"
24 #if HAVE_ALTIVEC_H
25 #include <altivec.h>
26 #endif
27 #include "libavcodec/dsputil.h"
28 #include "dsputil_ppc.h"
29 #include "util_altivec.h"
30 #include "types_altivec.h"
31 #include "dsputil_altivec.h"
32
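/**
 * Sum of absolute differences for a 16xh block, with the second block
 * interpolated horizontally by half a pixel (pix2 averaged with pix2+1).
 * AltiVec-enhanced.
 */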
33 static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
34 {
35 int i;
36 int s;
37 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
38 vector unsigned char *tv;
39 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
40 vector unsigned int sad;
41 vector signed int sumdiffs;
42
43 s = 0;
44 sad = (vector unsigned int)vec_splat_u32(0);
45 for (i = 0; i < h; i++) {
46 /* Read unaligned pixels into our vectors. The vectors are as follows:
47 pix1v: pix1[0]-pix1[15]
48 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */
49 tv = (vector unsigned char *) pix1;
50 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
51
52 tv = (vector unsigned char *) &pix2[0];
53 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
54
55 tv = (vector unsigned char *) &pix2[1];
56 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
57
58 /* Calculate the average vector */
59 avgv = vec_avg(pix2v, pix2iv);
60
61 /* Calculate a sum of abs differences vector */
62 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
63
64 /* Add each 4 pixel group together and put 4 results into sad */
65 sad = vec_sum4s(t5, sad);
66
67 pix1 += line_size;
68 pix2 += line_size;
69 }
70 /* Sum up the four partial sums, and put the result into s */
71 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
72 sumdiffs = vec_splat(sumdiffs, 3);
73 vec_ste(sumdiffs, 0, &s);
74
75 return s;
76 }
77
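/**
 * Sum of absolute differences for a 16xh block, with the second block
 * interpolated vertically by half a pixel (pix2 averaged with pix2+line_size).
 * AltiVec-enhanced.
 */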
78 static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
79 {
80 int i;
81 int s;
82 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
83 vector unsigned char *tv;
84 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
85 vector unsigned int sad;
86 vector signed int sumdiffs;
87 uint8_t *pix3 = pix2 + line_size;
88
89 s = 0;
90 sad = (vector unsigned int)vec_splat_u32(0);
91
92 /* Since pix3 = pix2 + line_size, the pix3 of one iteration
93 becomes pix2 in the next iteration. We can use this fact to
94 avoid a potentially expensive unaligned read each time
95 around the loop.
96 Read unaligned pixels into our vector. The vector is as follows:
97 pix2v: pix2[0]-pix2[15]
98 This first (unaligned) load of pix2 is done once, before the loop. */
99 tv = (vector unsigned char *) &pix2[0];
100 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
101
102 for (i = 0; i < h; i++) {
103 /* Read unaligned pixels into our vectors. The vectors are as follows:
104 pix1v: pix1[0]-pix1[15]
105 pix3v: pix3[0]-pix3[15] */
106 tv = (vector unsigned char *) pix1;
107 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
108
109 tv = (vector unsigned char *) &pix3[0];
110 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
111
112 /* Calculate the average vector */
113 avgv = vec_avg(pix2v, pix3v);
114
115 /* Calculate a sum of abs differences vector */
116 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
117
118 /* Add each 4 pixel group together and put 4 results into sad */
119 sad = vec_sum4s(t5, sad);
120
121 pix1 += line_size;
122 pix2v = pix3v;
123 pix3 += line_size;
124
125 }
126
127 /* Sum up the four partial sums, and put the result into s */
128 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
129 sumdiffs = vec_splat(sumdiffs, 3);
130 vec_ste(sumdiffs, 0, &s);
131 return s;
132 }
133
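/**
 * Sum of absolute differences for a 16xh block, with the second block
 * interpolated by half a pixel both horizontally and vertically
 * (each reference pixel is (a+b+c+d+2)>>2 of its four neighbours).
 * AltiVec-enhanced.
 */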
134 static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
135 {
136 int i;
137 int s;
138 uint8_t *pix3 = pix2 + line_size;
139 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
140 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
141 vector unsigned char *tv, avgv, t5;
142 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
143 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
144 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
145 vector unsigned short avghv, avglv;
146 vector unsigned short t1, t2, t3, t4;
147 vector unsigned int sad;
148 vector signed int sumdiffs;
149
150 sad = (vector unsigned int)vec_splat_u32(0);
151
152 s = 0;
153
154 /* Since pix3 = pix2 + line_size, the pix3 of one iteration
155 becomes pix2 in the next iteration. We can use this fact to
156 avoid a potentially expensive unaligned read, as well as some
157 splitting and vector additions, each time around the loop.
158 Read unaligned pixels into our vectors. The vectors are as follows:
159 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
160 Split the pixel vectors into shorts. */
161 tv = (vector unsigned char *) &pix2[0];
162 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
163
164 tv = (vector unsigned char *) &pix2[1];
165 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
166
167 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
168 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
169 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
170 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
171 t1 = vec_add(pix2hv, pix2ihv);
172 t2 = vec_add(pix2lv, pix2ilv);
173
174 for (i = 0; i < h; i++) {
175 /* Read unaligned pixels into our vectors. The vectors are as follows:
176 pix1v: pix1[0]-pix1[15]
177 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */
178 tv = (vector unsigned char *) pix1;
179 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
180
181 tv = (vector unsigned char *) &pix3[0];
182 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
183
184 tv = (vector unsigned char *) &pix3[1];
185 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
186
187 /* Note that AltiVec does have vec_avg, but it only averages pairs
188 and rounds up. Nesting it as avg(avg(a,b),avg(c,d)) would accumulate
189 the rounding error: for example, avg(3,0,0,1) would give 2 when it should be 1.
190 Instead, we have to split the pixel vectors into vectors of shorts
191 and do the averaging by hand. */
192
193 /* Split the pixel vectors into shorts */
194 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
195 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
196 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
197 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
198
199 /* Do the averaging on them */
200 t3 = vec_add(pix3hv, pix3ihv);
201 t4 = vec_add(pix3lv, pix3ilv);
202
203 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
204 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
205
206 /* Pack the shorts back into a result */
207 avgv = vec_pack(avghv, avglv);
208
209 /* Calculate a sum of abs differences vector */
210 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
211
212 /* Add each 4 pixel group together and put 4 results into sad */
213 sad = vec_sum4s(t5, sad);
214
215 pix1 += line_size;
216 pix3 += line_size;
217 /* Transfer the calculated values for pix3 into pix2 */
218 t1 = t3;
219 t2 = t4;
220 }
221 /* Sum up the four partial sums, and put the result into s */
222 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
223 sumdiffs = vec_splat(sumdiffs, 3);
224 vec_ste(sumdiffs, 0, &s);
225
226 return s;
227 }
228
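/**
 * Plain sum of absolute differences for a 16xh block.
 * AltiVec-enhanced.
 */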
229 static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
230 {
231 int i;
232 int s;
233 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
234 vector unsigned char perm1, perm2, *pix1v, *pix2v;
235 vector unsigned char t1, t2, t3,t4, t5;
236 vector unsigned int sad;
237 vector signed int sumdiffs;
238
239 sad = (vector unsigned int)vec_splat_u32(0);
240
241
242 for (i = 0; i < h; i++) {
243 /* Read potentially unaligned pixels into t1 and t2 */
244 perm1 = vec_lvsl(0, pix1);
245 pix1v = (vector unsigned char *) pix1;
246 perm2 = vec_lvsl(0, pix2);
247 pix2v = (vector unsigned char *) pix2;
248 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
249 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
250
251 /* Calculate a sum of abs differences vector */
252 t3 = vec_max(t1, t2);
253 t4 = vec_min(t1, t2);
254 t5 = vec_sub(t3, t4);
255
256 /* Add each 4 pixel group together and put 4 results into sad */
257 sad = vec_sum4s(t5, sad);
258
259 pix1 += line_size;
260 pix2 += line_size;
261 }
262
263 /* Sum up the four partial sums, and put the result into s */
264 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
265 sumdiffs = vec_splat(sumdiffs, 3);
266 vec_ste(sumdiffs, 0, &s);
267
268 return s;
269 }
270
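/**
 * Plain sum of absolute differences for an 8xh block.
 * AltiVec-enhanced.
 */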
271 static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 {
273 int i;
274 int s;
275 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
276 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
277 vector unsigned char t1, t2, t3,t4, t5;
278 vector unsigned int sad;
279 vector signed int sumdiffs;
280
281 sad = (vector unsigned int)vec_splat_u32(0);
282
283 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
284
285 for (i = 0; i < h; i++) {
286 /* Read potentially unaligned pixels into t1 and t2
287 Since we're reading 16 pixels, and actually only want 8,
288 mask out the last 8 pixels. The 0s don't change the sum. */
289 perm1 = vec_lvsl(0, pix1);
290 pix1v = (vector unsigned char *) pix1;
291 perm2 = vec_lvsl(0, pix2);
292 pix2v = (vector unsigned char *) pix2;
293 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
294 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
295
296 /* Calculate a sum of abs differences vector */
297 t3 = vec_max(t1, t2);
298 t4 = vec_min(t1, t2);
299 t5 = vec_sub(t3, t4);
300
301 /* Add each 4 pixel group together and put 4 results into sad */
302 sad = vec_sum4s(t5, sad);
303
304 pix1 += line_size;
305 pix2 += line_size;
306 }
307
308 /* Sum up the four partial sums, and put the result into s */
309 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
310 sumdiffs = vec_splat(sumdiffs, 3);
311 vec_ste(sumdiffs, 0, &s);
312
313 return s;
314 }
315
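/**
 * Sum of squared pixel values over a 16x16 block.
 * AltiVec-enhanced.
 */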
316 static int pix_norm1_altivec(uint8_t *pix, int line_size)
317 {
318 int i;
319 int s;
320 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
321 vector unsigned char *tv;
322 vector unsigned char pixv;
323 vector unsigned int sv;
324 vector signed int sum;
325
326 sv = (vector unsigned int)vec_splat_u32(0);
327
328 s = 0;
329 for (i = 0; i < 16; i++) {
330 /* Read in the potentially unaligned pixels */
331 tv = (vector unsigned char *) pix;
332 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
333
334 /* Square the values, and add them to our sum */
335 sv = vec_msum(pixv, pixv, sv);
336
337 pix += line_size;
338 }
339 /* Sum up the four partial sums, and put the result into s */
340 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
341 sum = vec_splat(sum, 3);
342 vec_ste(sum, 0, &s);
343
344 return s;
345 }
346
347 /**
348 * Sum of Squared Errors for an 8x8 block.
349 * AltiVec-enhanced.
350 * It's the sad8_altivec code above with squaring added.
351 */
352 static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
353 {
354 int i;
355 int s;
356 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
357 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
358 vector unsigned char t1, t2, t3,t4, t5;
359 vector unsigned int sum;
360 vector signed int sumsqr;
361
362 sum = (vector unsigned int)vec_splat_u32(0);
363
364 permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
365
366
367 for (i = 0; i < h; i++) {
368 /* Read potentially unaligned pixels into t1 and t2
369 Since we're reading 16 pixels, and actually only want 8,
370 mask out the last 8 pixels. The 0s don't change the sum. */
371 perm1 = vec_lvsl(0, pix1);
372 pix1v = (vector unsigned char *) pix1;
373 perm2 = vec_lvsl(0, pix2);
374 pix2v = (vector unsigned char *) pix2;
375 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
376 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
377
378 /* Since the pixels are unsigned chars, we compute |a-b| as max(a,b)-min(a,b);
379 squaring that still gives the squared error, because |a-b|^2 = (a-b)^2. */
380
381 /* Calculate abs differences vector */
382 t3 = vec_max(t1, t2);
383 t4 = vec_min(t1, t2);
384 t5 = vec_sub(t3, t4);
385
386 /* Square the values and add them to our sum */
387 sum = vec_msum(t5, t5, sum);
388
389 pix1 += line_size;
390 pix2 += line_size;
391 }
392
393 /* Sum up the four partial sums, and put the result into s */
394 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
395 sumsqr = vec_splat(sumsqr, 3);
396 vec_ste(sumsqr, 0, &s);
397
398 return s;
399 }
400
401 /**
402 * Sum of Squared Errors for a 16x16 block.
403 * AltiVec-enhanced.
404 * It's the sad16_altivec code above with squaring added.
405 */
406 static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
407 {
408 int i;
409 int s;
410 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
411 vector unsigned char perm1, perm2, *pix1v, *pix2v;
412 vector unsigned char t1, t2, t3,t4, t5;
413 vector unsigned int sum;
414 vector signed int sumsqr;
415
416 sum = (vector unsigned int)vec_splat_u32(0);
417
418 for (i = 0; i < h; i++) {
419 /* Read potentially unaligned pixels into t1 and t2 */
420 perm1 = vec_lvsl(0, pix1);
421 pix1v = (vector unsigned char *) pix1;
422 perm2 = vec_lvsl(0, pix2);
423 pix2v = (vector unsigned char *) pix2;
424 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
425 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
426
427 /* Since the pixels are unsigned chars, we compute |a-b| as max(a,b)-min(a,b);
428 squaring that still gives the squared error, because |a-b|^2 = (a-b)^2. */
429
430 /* Calculate abs differences vector */
431 t3 = vec_max(t1, t2);
432 t4 = vec_min(t1, t2);
433 t5 = vec_sub(t3, t4);
434
435 /* Square the values and add them to our sum */
436 sum = vec_msum(t5, t5, sum);
437
438 pix1 += line_size;
439 pix2 += line_size;
440 }
441
442 /* Sum up the four partial sums, and put the result into s */
443 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
444 sumsqr = vec_splat(sumsqr, 3);
445 vec_ste(sumsqr, 0, &s);
446
447 return s;
448 }
449
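/**
 * Sum of all pixel values in a 16x16 block.
 * AltiVec-enhanced.
 */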
450 static int pix_sum_altivec(uint8_t * pix, int line_size)
451 {
452 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
453 vector unsigned char perm, *pixv;
454 vector unsigned char t1;
455 vector unsigned int sad;
456 vector signed int sumdiffs;
457
458 int i;
459 int s;
460
461 sad = (vector unsigned int)vec_splat_u32(0);
462
463 for (i = 0; i < 16; i++) {
464 /* Read the potentially unaligned 16 pixels into t1 */
465 perm = vec_lvsl(0, pix);
466 pixv = (vector unsigned char *) pix;
467 t1 = vec_perm(pixv[0], pixv[1], perm);
468
469 /* Add each 4 pixel group together and put 4 results into sad */
470 sad = vec_sum4s(t1, sad);
471
472 pix += line_size;
473 }
474
475 /* Sum up the four partial sums, and put the result into s */
476 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
477 sumdiffs = vec_splat(sumdiffs, 3);
478 vec_ste(sumdiffs, 0, &s);
479
480 return s;
481 }
482
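/**
 * Read an 8x8 block of unsigned bytes and store it, widened to signed
 * 16-bit values, into the (16-byte aligned) DCTELEM block.
 * AltiVec-enhanced.
 */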
483 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
484 {
485 int i;
486 vector unsigned char perm, bytes, *pixv;
487 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
488 vector signed short shorts;
489
490 for (i = 0; i < 8; i++) {
491 // Read potentially unaligned pixels.
492 // We're reading 16 pixels, and actually only want 8,
493 // but we simply ignore the extras.
494 perm = vec_lvsl(0, pixels);
495 pixv = (vector unsigned char *) pixels;
496 bytes = vec_perm(pixv[0], pixv[1], perm);
497
498 // convert the bytes into shorts
499 shorts = (vector signed short)vec_mergeh(zero, bytes);
500
501 // save the data to the block; we assume the block is 16-byte aligned
502 vec_st(shorts, i*16, (vector signed short*)block);
503
504 pixels += line_size;
505 }
506 }
507
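/**
 * Compute the per-pixel difference s1 - s2 of two 8x8 byte blocks and
 * store it as signed 16-bit values in the (16-byte aligned) DCTELEM block.
 * AltiVec-enhanced.
 */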
508 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
509 const uint8_t *s2, int stride)
510 {
511 int i;
512 vector unsigned char perm, bytes, *pixv;
513 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
514 vector signed short shorts1, shorts2;
515
516 for (i = 0; i < 4; i++) {
517 // Read potentially unaligned pixels
518 // We're reading 16 pixels, and actually only want 8,
519 // but we simply ignore the extras.
520 perm = vec_lvsl(0, s1);
521 pixv = (vector unsigned char *) s1;
522 bytes = vec_perm(pixv[0], pixv[1], perm);
523
524 // convert the bytes into shorts
525 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
526
527 // Do the same for the second block of pixels
528 perm = vec_lvsl(0, s2);
529 pixv = (vector unsigned char *) s2;
530 bytes = vec_perm(pixv[0], pixv[1], perm);
531
532 // convert the bytes into shorts
533 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
534
535 // Do the subtraction
536 shorts1 = vec_sub(shorts1, shorts2);
537
538 // save the data to the block; we assume the block is 16-byte aligned
539 vec_st(shorts1, 0, (vector signed short*)block);
540
541 s1 += stride;
542 s2 += stride;
543 block += 8;
544
545
546 // The code below is a copy of the code above... This is a manual
547 // unroll.
548
549 // Read potentially unaligned pixels
550 // We're reading 16 pixels, and actually only want 8,
551 // but we simply ignore the extras.
552 perm = vec_lvsl(0, s1);
553 pixv = (vector unsigned char *) s1;
554 bytes = vec_perm(pixv[0], pixv[1], perm);
555
556 // convert the bytes into shorts
557 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
558
559 // Do the same for the second block of pixels
560 perm = vec_lvsl(0, s2);
561 pixv = (vector unsigned char *) s2;
562 bytes = vec_perm(pixv[0], pixv[1], perm);
563
564 // convert the bytes into shorts
565 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
566
567 // Do the subtraction
568 shorts1 = vec_sub(shorts1, shorts2);
569
570 // save the data to the block; we assume the block is 16-byte aligned
571 vec_st(shorts1, 0, (vector signed short*)block);
572
573 s1 += stride;
574 s2 += stride;
575 block += 8;
576 }
577 }
578
579
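/** Zero a 64-element DCTELEM block (128 bytes), 16 bytes per store. */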
580 static void clear_block_altivec(DCTELEM *block) {
581 LOAD_ZERO;
582 vec_st(zero_s16v, 0, block);
583 vec_st(zero_s16v, 16, block);
584 vec_st(zero_s16v, 32, block);
585 vec_st(zero_s16v, 48, block);
586 vec_st(zero_s16v, 64, block);
587 vec_st(zero_s16v, 80, block);
588 vec_st(zero_s16v, 96, block);
589 vec_st(zero_s16v, 112, block);
590 }
591
592
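/** Add w bytes of src to dst, byte by byte. */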
593 static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
594 register int i;
595 register vector unsigned char vdst, vsrc;
596
597 /* dst and src are 16-byte aligned (guaranteed) */
598 for (i = 0 ; (i + 15) < w ; i+=16) {
599 vdst = vec_ld(i, (unsigned char*)dst);
600 vsrc = vec_ld(i, (unsigned char*)src);
601 vdst = vec_add(vsrc, vdst);
602 vec_st(vdst, i, (unsigned char*)dst);
603 }
604 /* if w is not a multiple of 16, add (not copy) the remaining bytes */
605 for (; (i < w) ; i++) {
606 dst[i] += src[i];
607 }
608 }
609
610 /* next one assumes that ((line_size % 16) == 0) */
611 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
612 {
613 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
614 register vector unsigned char pixelsv1, pixelsv2;
615 register vector unsigned char pixelsv1B, pixelsv2B;
616 register vector unsigned char pixelsv1C, pixelsv2C;
617 register vector unsigned char pixelsv1D, pixelsv2D;
618
619 register vector unsigned char perm = vec_lvsl(0, pixels);
620 int i;
621 register int line_size_2 = line_size << 1;
622 register int line_size_3 = line_size + line_size_2;
623 register int line_size_4 = line_size << 2;
624
625 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
626 // Hand-unrolling the loop by 4 gains about 15%:
627 // minimum execution time goes from 74 to 60 cycles.
628 // It's faster than -funroll-loops, but using
629 // -funroll-loops with this is bad - 74 cycles again.
630 // All of this is on a 7450, tuning for the 7450.
631 #if 0
632 for (i = 0; i < h; i++) {
633 pixelsv1 = vec_ld(0, pixels);
634 pixelsv2 = vec_ld(16, pixels);
635 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
636 0, block);
637 pixels+=line_size;
638 block +=line_size;
639 }
640 #else
641 for (i = 0; i < h; i += 4) {
642 pixelsv1 = vec_ld( 0, pixels);
643 pixelsv2 = vec_ld(15, pixels);
644 pixelsv1B = vec_ld(line_size, pixels);
645 pixelsv2B = vec_ld(15 + line_size, pixels);
646 pixelsv1C = vec_ld(line_size_2, pixels);
647 pixelsv2C = vec_ld(15 + line_size_2, pixels);
648 pixelsv1D = vec_ld(line_size_3, pixels);
649 pixelsv2D = vec_ld(15 + line_size_3, pixels);
650 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
651 0, (unsigned char*)block);
652 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
653 line_size, (unsigned char*)block);
654 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
655 line_size_2, (unsigned char*)block);
656 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
657 line_size_3, (unsigned char*)block);
658 pixels+=line_size_4;
659 block +=line_size_4;
660 }
661 #endif
662 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
663 }
664
665 /* next one assumes that ((line_size % 16) == 0) */
666 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
667 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
668 {
669 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
670 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
671 register vector unsigned char perm = vec_lvsl(0, pixels);
672 int i;
673
674 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
675
676 for (i = 0; i < h; i++) {
677 pixelsv1 = vec_ld( 0, pixels);
678 pixelsv2 = vec_ld(16,pixels);
679 blockv = vec_ld(0, block);
680 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
681 blockv = vec_avg(blockv,pixelsv);
682 vec_st(blockv, 0, (unsigned char*)block);
683 pixels+=line_size;
684 block +=line_size;
685 }
686
687 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
688 }
689
690 /* next one assumes that ((line_size % 8) == 0) */
691 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
692 {
693 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
694 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
695 int i;
696
697 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
698
699 for (i = 0; i < h; i++) {
700 /* block is 8-byte aligned, so we're either in the left half
701 of a 16-byte aligned chunk or in the right (unaligned) half */
702 int rightside = ((unsigned long)block & 0x0000000F);
703
704 blockv = vec_ld(0, block);
705 pixelsv1 = vec_ld( 0, pixels);
706 pixelsv2 = vec_ld(16, pixels);
707 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
708
709 if (rightside) {
710 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
711 } else {
712 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
713 }
714
715 blockv = vec_avg(blockv, pixelsv);
716
717 vec_st(blockv, 0, block);
718
719 pixels += line_size;
720 block += line_size;
721 }
722
723 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
724 }
725
726 /* next one assumes that ((line_size % 8) == 0) */
727 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
728 {
729 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
730 register int i;
731 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
732 register vector unsigned char blockv, temp1, temp2;
733 register vector unsigned short pixelssum1, pixelssum2, temp3;
734 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
735 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
736
737 temp1 = vec_ld(0, pixels);
738 temp2 = vec_ld(16, pixels);
739 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
740 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
741 pixelsv2 = temp2;
742 } else {
743 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
744 }
745 pixelsv1 = vec_mergeh(vczero, pixelsv1);
746 pixelsv2 = vec_mergeh(vczero, pixelsv2);
747 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
748 (vector unsigned short)pixelsv2);
749 pixelssum1 = vec_add(pixelssum1, vctwo);
750
751 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
752 for (i = 0; i < h ; i++) {
753 int rightside = ((unsigned long)block & 0x0000000F);
754 blockv = vec_ld(0, block);
755
756 temp1 = vec_ld(line_size, pixels);
757 temp2 = vec_ld(line_size + 16, pixels);
758 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
759 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
760 pixelsv2 = temp2;
761 } else {
762 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
763 }
764
765 pixelsv1 = vec_mergeh(vczero, pixelsv1);
766 pixelsv2 = vec_mergeh(vczero, pixelsv2);
767 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
768 (vector unsigned short)pixelsv2);
769 temp3 = vec_add(pixelssum1, pixelssum2);
770 temp3 = vec_sra(temp3, vctwo);
771 pixelssum1 = vec_add(pixelssum2, vctwo);
772 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
773
774 if (rightside) {
775 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
776 } else {
777 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
778 }
779
780 vec_st(blockv, 0, block);
781
782 block += line_size;
783 pixels += line_size;
784 }
785
786 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
787 }
788
789 /* next one assumes that ((line_size % 8) == 0) */
790 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
791 {
792 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
793 register int i;
794 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
795 register vector unsigned char blockv, temp1, temp2;
796 register vector unsigned short pixelssum1, pixelssum2, temp3;
797 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
798 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
799 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
800
801 temp1 = vec_ld(0, pixels);
802 temp2 = vec_ld(16, pixels);
803 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
804 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
805 pixelsv2 = temp2;
806 } else {
807 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
808 }
809 pixelsv1 = vec_mergeh(vczero, pixelsv1);
810 pixelsv2 = vec_mergeh(vczero, pixelsv2);
811 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
812 (vector unsigned short)pixelsv2);
813 pixelssum1 = vec_add(pixelssum1, vcone);
814
815 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
816 for (i = 0; i < h ; i++) {
817 int rightside = ((unsigned long)block & 0x0000000F);
818 blockv = vec_ld(0, block);
819
820 temp1 = vec_ld(line_size, pixels);
821 temp2 = vec_ld(line_size + 16, pixels);
822 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
823 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
824 pixelsv2 = temp2;
825 } else {
826 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
827 }
828
829 pixelsv1 = vec_mergeh(vczero, pixelsv1);
830 pixelsv2 = vec_mergeh(vczero, pixelsv2);
831 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
832 (vector unsigned short)pixelsv2);
833 temp3 = vec_add(pixelssum1, pixelssum2);
834 temp3 = vec_sra(temp3, vctwo);
835 pixelssum1 = vec_add(pixelssum2, vcone);
836 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
837
838 if (rightside) {
839 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
840 } else {
841 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
842 }
843
844 vec_st(blockv, 0, block);
845
846 block += line_size;
847 pixels += line_size;
848 }
849
850 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
851 }
852
853 /* next one assumes that ((line_size % 16) == 0) */
854 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
855 {
856 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
857 register int i;
858 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
859 register vector unsigned char blockv, temp1, temp2;
860 register vector unsigned short temp3, temp4,
861 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
862 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
863 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
864
865 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
866
867 temp1 = vec_ld(0, pixels);
868 temp2 = vec_ld(16, pixels);
869 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
870 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
871 pixelsv2 = temp2;
872 } else {
873 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
874 }
875 pixelsv3 = vec_mergel(vczero, pixelsv1);
876 pixelsv4 = vec_mergel(vczero, pixelsv2);
877 pixelsv1 = vec_mergeh(vczero, pixelsv1);
878 pixelsv2 = vec_mergeh(vczero, pixelsv2);
879 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
880 (vector unsigned short)pixelsv4);
881 pixelssum3 = vec_add(pixelssum3, vctwo);
882 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
883 (vector unsigned short)pixelsv2);
884 pixelssum1 = vec_add(pixelssum1, vctwo);
885
886 for (i = 0; i < h ; i++) {
887 blockv = vec_ld(0, block);
888
889 temp1 = vec_ld(line_size, pixels);
890 temp2 = vec_ld(line_size + 16, pixels);
891 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
892 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
893 pixelsv2 = temp2;
894 } else {
895 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
896 }
897
898 pixelsv3 = vec_mergel(vczero, pixelsv1);
899 pixelsv4 = vec_mergel(vczero, pixelsv2);
900 pixelsv1 = vec_mergeh(vczero, pixelsv1);
901 pixelsv2 = vec_mergeh(vczero, pixelsv2);
902
903 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
904 (vector unsigned short)pixelsv4);
905 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
906 (vector unsigned short)pixelsv2);
907 temp4 = vec_add(pixelssum3, pixelssum4);
908 temp4 = vec_sra(temp4, vctwo);
909 temp3 = vec_add(pixelssum1, pixelssum2);
910 temp3 = vec_sra(temp3, vctwo);
911
912 pixelssum3 = vec_add(pixelssum4, vctwo);
913 pixelssum1 = vec_add(pixelssum2, vctwo);
914
915 blockv = vec_packsu(temp3, temp4);
916
917 vec_st(blockv, 0, block);
918
919 block += line_size;
920 pixels += line_size;
921 }
922
923 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
924 }
925
926 /* next one assumes that ((line_size % 16) == 0) */
927 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
928 {
929 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
930 register int i;
931 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
932 register vector unsigned char blockv, temp1, temp2;
933 register vector unsigned short temp3, temp4,
934 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
935 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
936 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
937 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
938
939 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
940
941 temp1 = vec_ld(0, pixels);
942 temp2 = vec_ld(16, pixels);
943 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
944 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
945 pixelsv2 = temp2;
946 } else {
947 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
948 }
949 pixelsv3 = vec_mergel(vczero, pixelsv1);
950 pixelsv4 = vec_mergel(vczero, pixelsv2);
951 pixelsv1 = vec_mergeh(vczero, pixelsv1);
952 pixelsv2 = vec_mergeh(vczero, pixelsv2);
953 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
954 (vector unsigned short)pixelsv4);
955 pixelssum3 = vec_add(pixelssum3, vcone);
956 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
957 (vector unsigned short)pixelsv2);
958 pixelssum1 = vec_add(pixelssum1, vcone);
959
960 for (i = 0; i < h ; i++) {
961 blockv = vec_ld(0, block);
962
963 temp1 = vec_ld(line_size, pixels);
964 temp2 = vec_ld(line_size + 16, pixels);
965 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
966 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
967 pixelsv2 = temp2;
968 } else {
969 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
970 }
971
972 pixelsv3 = vec_mergel(vczero, pixelsv1);
973 pixelsv4 = vec_mergel(vczero, pixelsv2);
974 pixelsv1 = vec_mergeh(vczero, pixelsv1);
975 pixelsv2 = vec_mergeh(vczero, pixelsv2);
976
977 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
978 (vector unsigned short)pixelsv4);
979 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
980 (vector unsigned short)pixelsv2);
981 temp4 = vec_add(pixelssum3, pixelssum4);
982 temp4 = vec_sra(temp4, vctwo);
983 temp3 = vec_add(pixelssum1, pixelssum2);
984 temp3 = vec_sra(temp3, vctwo);
985
986 pixelssum3 = vec_add(pixelssum4, vcone);
987 pixelssum1 = vec_add(pixelssum2, vcone);
988
989 blockv = vec_packsu(temp3, temp4);
990
991 vec_st(blockv, 0, block);
992
993 block += line_size;
994 pixels += line_size;
995 }
996
997 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
998 }
999
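/**
 * SATD of an 8x8 block: the sum of absolute values of the 2-D Hadamard
 * transform of the difference between src and dst.
 * AltiVec-enhanced.
 */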
1000 static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1001 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
1002 int sum;
1003 register const vector unsigned char vzero =
1004 (const vector unsigned char)vec_splat_u8(0);
1005 register vector signed short temp0, temp1, temp2, temp3, temp4,
1006 temp5, temp6, temp7;
1007 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1008 {
1009 register const vector signed short vprod1 =(const vector signed short)
1010 { 1,-1, 1,-1, 1,-1, 1,-1 };
1011 register const vector signed short vprod2 =(const vector signed short)
1012 { 1, 1,-1,-1, 1, 1,-1,-1 };
1013 register const vector signed short vprod3 =(const vector signed short)
1014 { 1, 1, 1, 1,-1,-1,-1,-1 };
1015 register const vector unsigned char perm1 = (const vector unsigned char)
1016 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
1017 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
1018 register const vector unsigned char perm2 = (const vector unsigned char)
1019 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
1020 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
1021 register const vector unsigned char perm3 = (const vector unsigned char)
1022 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1023 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
1024
1025 #define ONEITERBUTTERFLY(i, res) \
1026 { \
1027 register vector unsigned char src1, src2, srcO; \
1028 register vector unsigned char dst1, dst2, dstO; \
1029 register vector signed short srcV, dstV; \
1030 register vector signed short but0, but1, but2, op1, op2, op3; \
1031 src1 = vec_ld(stride * i, src); \
1032 src2 = vec_ld((stride * i) + 15, src); \
1033 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1034 dst1 = vec_ld(stride * i, dst); \
1035 dst2 = vec_ld((stride * i) + 15, dst); \
1036 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1037 /* promote the unsigned chars to signed shorts */ \
1038 /* we're in the 8x8 function, so we only care about the first 8 values */ \
1039 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1040 (vector signed char)srcO); \
1041 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1042 (vector signed char)dstO); \
1043 /* subtractions inside the first butterfly */ \
1044 but0 = vec_sub(srcV, dstV); \
1045 op1 = vec_perm(but0, but0, perm1); \
1046 but1 = vec_mladd(but0, vprod1, op1); \
1047 op2 = vec_perm(but1, but1, perm2); \
1048 but2 = vec_mladd(but1, vprod2, op2); \
1049 op3 = vec_perm(but2, but2, perm3); \
1050 res = vec_mladd(but2, vprod3, op3); \
1051 }
1052 ONEITERBUTTERFLY(0, temp0);
1053 ONEITERBUTTERFLY(1, temp1);
1054 ONEITERBUTTERFLY(2, temp2);
1055 ONEITERBUTTERFLY(3, temp3);
1056 ONEITERBUTTERFLY(4, temp4);
1057 ONEITERBUTTERFLY(5, temp5);
1058 ONEITERBUTTERFLY(6, temp6);
1059 ONEITERBUTTERFLY(7, temp7);
1060 }
1061 #undef ONEITERBUTTERFLY
1062 {
1063 register vector signed int vsum;
1064 register vector signed short line0 = vec_add(temp0, temp1);
1065 register vector signed short line1 = vec_sub(temp0, temp1);
1066 register vector signed short line2 = vec_add(temp2, temp3);
1067 register vector signed short line3 = vec_sub(temp2, temp3);
1068 register vector signed short line4 = vec_add(temp4, temp5);
1069 register vector signed short line5 = vec_sub(temp4, temp5);
1070 register vector signed short line6 = vec_add(temp6, temp7);
1071 register vector signed short line7 = vec_sub(temp6, temp7);
1072
1073 register vector signed short line0B = vec_add(line0, line2);
1074 register vector signed short line2B = vec_sub(line0, line2);
1075 register vector signed short line1B = vec_add(line1, line3);
1076 register vector signed short line3B = vec_sub(line1, line3);
1077 register vector signed short line4B = vec_add(line4, line6);
1078 register vector signed short line6B = vec_sub(line4, line6);
1079 register vector signed short line5B = vec_add(line5, line7);
1080 register vector signed short line7B = vec_sub(line5, line7);
1081
1082 register vector signed short line0C = vec_add(line0B, line4B);
1083 register vector signed short line4C = vec_sub(line0B, line4B);
1084 register vector signed short line1C = vec_add(line1B, line5B);
1085 register vector signed short line5C = vec_sub(line1B, line5B);
1086 register vector signed short line2C = vec_add(line2B, line6B);
1087 register vector signed short line6C = vec_sub(line2B, line6B);
1088 register vector signed short line3C = vec_add(line3B, line7B);
1089 register vector signed short line7C = vec_sub(line3B, line7B);
1090
1091 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1092 vsum = vec_sum4s(vec_abs(line1C), vsum);
1093 vsum = vec_sum4s(vec_abs(line2C), vsum);
1094 vsum = vec_sum4s(vec_abs(line3C), vsum);
1095 vsum = vec_sum4s(vec_abs(line4C), vsum);
1096 vsum = vec_sum4s(vec_abs(line5C), vsum);
1097 vsum = vec_sum4s(vec_abs(line6C), vsum);
1098 vsum = vec_sum4s(vec_abs(line7C), vsum);
1099 vsum = vec_sums(vsum, (vector signed int)vzero);
1100 vsum = vec_splat(vsum, 3);
1101 vec_ste(vsum, 0, &sum);
1102 }
1103 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
1104 return sum;
1105 }
1106
1107 /*
1108 16x8 works with 16 elements; it avoids replicating loads and gives
1109 the compiler more room for scheduling. It's only used from
1110 inside hadamard8_diff16_altivec.
1111
1112 Unfortunately, gcc-3.3 is a bit dumb and the compiled code has a LOT
1113 of spill code; it seems gcc (unlike xlc) cannot keep everything in registers
1114 by itself. The following code includes hand-made register allocation. It's not
1115 clean, but on a 7450 the resulting code is much faster (the best case falls from
1116 700+ cycles to 550).
1117
1118 xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
1119 and its code isn't much faster than gcc-3.3's on the 7450 (but uses 25% fewer
1120 instructions...).
1121
1122 On the 970, the hand-made register allocation is still a win (around 690 vs.
1123 around 780 cycles), but xlc gets down to around 660 on the regular C code...
1124 */
1125
1126 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
1127 int sum;
1128 register vector signed short
1129 temp0 __asm__ ("v0"),
1130 temp1 __asm__ ("v1"),
1131 temp2 __asm__ ("v2"),
1132 temp3 __asm__ ("v3"),
1133 temp4 __asm__ ("v4"),
1134 temp5 __asm__ ("v5"),
1135 temp6 __asm__ ("v6"),
1136 temp7 __asm__ ("v7");
1137 register vector signed short
1138 temp0S __asm__ ("v8"),
1139 temp1S __asm__ ("v9"),
1140 temp2S __asm__ ("v10"),
1141 temp3S __asm__ ("v11"),
1142 temp4S __asm__ ("v12"),
1143 temp5S __asm__ ("v13"),
1144 temp6S __asm__ ("v14"),
1145 temp7S __asm__ ("v15");
1146 register const vector unsigned char vzero __asm__ ("v31") =
1147 (const vector unsigned char)vec_splat_u8(0);
1148 {
1149 register const vector signed short vprod1 __asm__ ("v16") =
1150 (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
1151 register const vector signed short vprod2 __asm__ ("v17") =
1152 (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
1153 register const vector signed short vprod3 __asm__ ("v18") =
1154 (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
1155 register const vector unsigned char perm1 __asm__ ("v19") =
1156 (const vector unsigned char)
1157 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
1158 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
1159 register const vector unsigned char perm2 __asm__ ("v20") =
1160 (const vector unsigned char)
1161 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
1162 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
1163 register const vector unsigned char perm3 __asm__ ("v21") =
1164 (const vector unsigned char)
1165 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1166 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
1167
1168 #define ONEITERBUTTERFLY(i, res1, res2) \
1169 { \
1170 register vector unsigned char src1 __asm__ ("v22"), \
1171 src2 __asm__ ("v23"), \
1172 dst1 __asm__ ("v24"), \
1173 dst2 __asm__ ("v25"), \
1174 srcO __asm__ ("v22"), \
1175 dstO __asm__ ("v23"); \
1176 \
1177 register vector signed short srcV __asm__ ("v24"), \
1178 dstV __asm__ ("v25"), \
1179 srcW __asm__ ("v26"), \
1180 dstW __asm__ ("v27"), \
1181 but0 __asm__ ("v28"), \
1182 but0S __asm__ ("v29"), \
1183 op1 __asm__ ("v30"), \
1184 but1 __asm__ ("v22"), \
1185 op1S __asm__ ("v23"), \
1186 but1S __asm__ ("v24"), \
1187 op2 __asm__ ("v25"), \
1188 but2 __asm__ ("v26"), \
1189 op2S __asm__ ("v27"), \
1190 but2S __asm__ ("v28"), \
1191 op3 __asm__ ("v29"), \
1192 op3S __asm__ ("v30"); \
1193 \
1194 src1 = vec_ld(stride * i, src); \
1195 src2 = vec_ld((stride * i) + 16, src); \
1196 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1197 dst1 = vec_ld(stride * i, dst); \
1198 dst2 = vec_ld((stride * i) + 16, dst); \
1199 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1200 /* promote the unsigned chars to signed shorts */ \
1201 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1202 (vector signed char)srcO); \
1203 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1204 (vector signed char)dstO); \
1205 srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
1206 (vector signed char)srcO); \
1207 dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
1208 (vector signed char)dstO); \
1209 /* subtractions inside the first butterfly */ \
1210 but0 = vec_sub(srcV, dstV); \
1211 but0S = vec_sub(srcW, dstW); \
1212 op1 = vec_perm(but0, but0, perm1); \
1213 but1 = vec_mladd(but0, vprod1, op1); \
1214 op1S = vec_perm(but0S, but0S, perm1); \
1215 but1S = vec_mladd(but0S, vprod1, op1S); \
1216 op2 = vec_perm(but1, but1, perm2); \
1217 but2 = vec_mladd(but1, vprod2, op2); \
1218 op2S = vec_perm(but1S, but1S, perm2); \
1219 but2S = vec_mladd(but1S, vprod2, op2S); \
1220 op3 = vec_perm(but2, but2, perm3); \
1221 res1 = vec_mladd(but2, vprod3, op3); \
1222 op3S = vec_perm(but2S, but2S, perm3); \
1223 res2 = vec_mladd(but2S, vprod3, op3S); \
1224 }
1225 ONEITERBUTTERFLY(0, temp0, temp0S);
1226 ONEITERBUTTERFLY(1, temp1, temp1S);
1227 ONEITERBUTTERFLY(2, temp2, temp2S);
1228 ONEITERBUTTERFLY(3, temp3, temp3S);
1229 ONEITERBUTTERFLY(4, temp4, temp4S);
1230 ONEITERBUTTERFLY(5, temp5, temp5S);
1231 ONEITERBUTTERFLY(6, temp6, temp6S);
1232 ONEITERBUTTERFLY(7, temp7, temp7S);
1233 }
1234 #undef ONEITERBUTTERFLY
1235 {
1236 register vector signed int vsum;
1237 register vector signed short line0S, line1S, line2S, line3S, line4S,
1238 line5S, line6S, line7S, line0BS,line2BS,
1239 line1BS,line3BS,line4BS,line6BS,line5BS,
1240 line7BS,line0CS,line4CS,line1CS,line5CS,
1241 line2CS,line6CS,line3CS,line7CS;
1242
1243 register vector signed short line0 = vec_add(temp0, temp1);
1244 register vector signed short line1 = vec_sub(temp0, temp1);
1245 register vector signed short line2 = vec_add(temp2, temp3);
1246 register vector signed short line3 = vec_sub(temp2, temp3);
1247 register vector signed short line4 = vec_add(temp4, temp5);
1248 register vector signed short line5 = vec_sub(temp4, temp5);
1249 register vector signed short line6 = vec_add(temp6, temp7);
1250 register vector signed short line7 = vec_sub(temp6, temp7);
1251
1252 register vector signed short line0B = vec_add(line0, line2);
1253 register vector signed short line2B = vec_sub(line0, line2);
1254 register vector signed short line1B = vec_add(line1, line3);
1255 register vector signed short line3B = vec_sub(line1, line3);
1256 register vector signed short line4B = vec_add(line4, line6);
1257 register vector signed short line6B = vec_sub(line4, line6);
1258 register vector signed short line5B = vec_add(line5, line7);
1259 register vector signed short line7B = vec_sub(line5, line7);
1260
1261 register vector signed short line0C = vec_add(line0B, line4B);
1262 register vector signed short line4C = vec_sub(line0B, line4B);
1263 register vector signed short line1C = vec_add(line1B, line5B);
1264 register vector signed short line5C = vec_sub(line1B, line5B);
1265 register vector signed short line2C = vec_add(line2B, line6B);
1266 register vector signed short line6C = vec_sub(line2B, line6B);
1267 register vector signed short line3C = vec_add(line3B, line7B);
1268 register vector signed short line7C = vec_sub(line3B, line7B);
1269
1270 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1271 vsum = vec_sum4s(vec_abs(line1C), vsum);
1272 vsum = vec_sum4s(vec_abs(line2C), vsum);
1273 vsum = vec_sum4s(vec_abs(line3C), vsum);
1274 vsum = vec_sum4s(vec_abs(line4C), vsum);
1275 vsum = vec_sum4s(vec_abs(line5C), vsum);
1276 vsum = vec_sum4s(vec_abs(line6C), vsum);
1277 vsum = vec_sum4s(vec_abs(line7C), vsum);
1278
1279 line0S = vec_add(temp0S, temp1S);
1280 line1S = vec_sub(temp0S, temp1S);
1281 line2S = vec_add(temp2S, temp3S);
1282 line3S = vec_sub(temp2S, temp3S);
1283 line4S = vec_add(temp4S, temp5S);
1284 line5S = vec_sub(temp4S, temp5S);
1285 line6S = vec_add(temp6S, temp7S);
1286 line7S = vec_sub(temp6S, temp7S);
1287
1288 line0BS = vec_add(line0S, line2S);
1289 line2BS = vec_sub(line0S, line2S);
1290 line1BS = vec_add(line1S, line3S);
1291 line3BS = vec_sub(line1S, line3S);
1292 line4BS = vec_add(line4S, line6S);
1293 line6BS = vec_sub(line4S, line6S);
1294 line5BS = vec_add(line5S, line7S);
1295 line7BS = vec_sub(line5S, line7S);
1296
1297 line0CS = vec_add(line0BS, line4BS);
1298 line4CS = vec_sub(line0BS, line4BS);
1299 line1CS = vec_add(line1BS, line5BS);
1300 line5CS = vec_sub(line1BS, line5BS);
1301 line2CS = vec_add(line2BS, line6BS);
1302 line6CS = vec_sub(line2BS, line6BS);
1303 line3CS = vec_add(line3BS, line7BS);
1304 line7CS = vec_sub(line3BS, line7BS);
1305
1306 vsum = vec_sum4s(vec_abs(line0CS), vsum);
1307 vsum = vec_sum4s(vec_abs(line1CS), vsum);
1308 vsum = vec_sum4s(vec_abs(line2CS), vsum);
1309 vsum = vec_sum4s(vec_abs(line3CS), vsum);
1310 vsum = vec_sum4s(vec_abs(line4CS), vsum);
1311 vsum = vec_sum4s(vec_abs(line5CS), vsum);
1312 vsum = vec_sum4s(vec_abs(line6CS), vsum);
1313 vsum = vec_sum4s(vec_abs(line7CS), vsum);
1314 vsum = vec_sums(vsum, (vector signed int)vzero);
1315 vsum = vec_splat(vsum, 3);
1316 vec_ste(vsum, 0, &sum);
1317 }
1318 return sum;
1319 }
1320
1321 static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1322 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
1323 int score;
1324 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
1325 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1326 if (h==16) {
1327 dst += 8*stride;
1328 src += 8*stride;
1329 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1330 }
1331 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
1332 return score;
1333 }
1334
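/**
 * Vorbis inverse channel coupling: rebuild the two channel spectra from
 * the (magnitude, angle) representation, in place, four floats at a time.
 */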
1335 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
1336 int blocksize)
1337 {
1338 int i;
1339 vector float m, a;
1340 vector bool int t0, t1;
1341 const vector unsigned int v_31 = // 31 = 15+15+1, since vec_splat_u32() only takes immediates in -16..15
1342 vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
1343 for (i = 0; i < blocksize; i += 4) {
1344 m = vec_ld(0, mag+i);
1345 a = vec_ld(0, ang+i);
1346 t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
1347 t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
1348 a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
1349 t0 = (vector bool int)vec_and(a, t1);
1350 t1 = (vector bool int)vec_andc(a, t1);
1351 a = vec_sub(m, (vector float)t1);
1352 m = vec_add(m, (vector float)t0);
1353 vec_stl(a, 0, ang+i);
1354 vec_stl(m, 0, mag+i);
1355 }
1356 }
1357
1358 /* next one assumes that ((line_size % 8) == 0) */
1359 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
1360 {
1361 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
1362 register int i;
1363 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
1364 register vector unsigned char blockv, temp1, temp2, blocktemp;
1365 register vector unsigned short pixelssum1, pixelssum2, temp3;
1366
1367 register const vector unsigned char vczero = (const vector unsigned char)
1368 vec_splat_u8(0);
1369 register const vector unsigned short vctwo = (const vector unsigned short)
1370 vec_splat_u16(2);
1371
1372 temp1 = vec_ld(0, pixels);
1373 temp2 = vec_ld(16, pixels);
1374 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1375 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
1376 pixelsv2 = temp2;
1377 } else {
1378 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1379 }
1380 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1381 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1382 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1383 (vector unsigned short)pixelsv2);
1384 pixelssum1 = vec_add(pixelssum1, vctwo);
1385
1386 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
1387 for (i = 0; i < h ; i++) {
1388 int rightside = ((unsigned long)block & 0x0000000F);
1389 blockv = vec_ld(0, block);
1390
1391 temp1 = vec_ld(line_size, pixels);
1392 temp2 = vec_ld(line_size + 16, pixels);
1393 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1394 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
1395 pixelsv2 = temp2;
1396 } else {
1397 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1398 }
1399
1400 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1401 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1402 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1403 (vector unsigned short)pixelsv2);
1404 temp3 = vec_add(pixelssum1, pixelssum2);
1405 temp3 = vec_sra(temp3, vctwo);
1406 pixelssum1 = vec_add(pixelssum2, vctwo);
1407 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1408
1409 if (rightside) {
1410 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1411 } else {
1412 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1413 }
1414
1415 blockv = vec_avg(blocktemp, blockv);
1416 vec_st(blockv, 0, block);
1417
1418 block += line_size;
1419 pixels += line_size;
1420 }
1421
1422 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
1423 }
1424
1425 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
1426 {
1427 c->pix_abs[0][1] = sad16_x2_altivec;
1428 c->pix_abs[0][2] = sad16_y2_altivec;
1429 c->pix_abs[0][3] = sad16_xy2_altivec;
1430 c->pix_abs[0][0] = sad16_altivec;
1431 c->pix_abs[1][0] = sad8_altivec;
1432 c->sad[0]= sad16_altivec;
1433 c->sad[1]= sad8_altivec;
1434 c->pix_norm1 = pix_norm1_altivec;
1435 c->sse[1]= sse8_altivec;
1436 c->sse[0]= sse16_altivec;
1437 c->pix_sum = pix_sum_altivec;
1438 c->diff_pixels = diff_pixels_altivec;
1439 c->get_pixels = get_pixels_altivec;
1440 c->clear_block = clear_block_altivec;
1441 c->add_bytes= add_bytes_altivec;
1442 c->put_pixels_tab[0][0] = put_pixels16_altivec;
1443 /* the two functions do the same thing, so use the same code */
1444 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
1445 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
1446 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
1447 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
1448 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
1449 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
1450 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
1451 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
1452
1453 c->hadamard8_diff[0] = hadamard8_diff16_altivec;
1454 c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
1455 if (CONFIG_VORBIS_DECODER)
1456 c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
1457 }