/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */
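
/*
 * Two AltiVec idioms recur throughout this file (illustrative sketch only):
 *
 * 1) Loading 16 bytes from a possibly unaligned address p takes two aligned
 *    loads plus a permute:
 *        vector unsigned char v0 = vec_ld( 0, p);
 *        vector unsigned char v1 = vec_ld(16, p);
 *        vector unsigned char v  = vec_perm(v0, v1, vec_lvsl(0, p));
 *    vec_lvsl builds the shuffle pattern from the low four bits of p.
 *
 * 2) Reducing a vector of byte values to a scalar: vec_sum4s adds each group
 *    of four bytes into one of four 32-bit partial sums, vec_sums folds those
 *    partials into element 3, and vec_splat + vec_ste move that element into
 *    the 16-byte-aligned int s.
 */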

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
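
/*
 * For reference, a scalar sketch of what sad16_x2_altivec computes; vec_avg
 * rounds up, so each half-pel reference sample is (pix2[j]+pix2[j+1]+1)>>1:
 *
 *     s = 0;
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < 16; j++)
 *             s += abs(pix1[j] - ((pix2[j] + pix2[j+1] + 1) >> 1));
 *         pix1 += line_size;
 *         pix2 += line_size;
 *     }
 */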

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read unaligned pixels into our vector:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;

    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting and
       vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Then split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but it works on pairs of
           vectors and rounds up. We could do avg(avg(a,b), avg(c,d)), but
           the rounding would mean that, for example, avg(3,0,0,1) = 2, when
           it should be 1. Instead, we have to split the pixel vectors into
           vectors of shorts and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
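
/*
 * Scalar sketch of the half-pel-in-both-directions SAD above: each reference
 * sample is a 2x2 average with round-half-up, hence the 'two' added before
 * the shift by 2:
 *
 *     for (j = 0; j < 16; j++)
 *         s += abs(pix1[j] -
 *                  ((pix2[j] + pix2[j+1] + pix3[j] + pix3[j+1] + 2) >> 2));
 */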

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
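
/*
 * In pix_norm1_altivec, vec_msum(pixv, pixv, sv) multiplies each unsigned
 * byte by itself and accumulates each group of four products into the
 * matching 32-bit element of sv; the scalar equivalent is simply
 * s += pix[j] * pix[j] over the 16x16 block.
 */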

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
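
// For reference: on big-endian PowerPC, vec_mergeh(zero, bytes) interleaves a
// zero byte in front of each of the first 8 pixel bytes, so reinterpreting the
// result as shorts gives {p0, p1, ..., p7} -- a cheap zero-extension from
// uint8_t to the 16-bit DCTELEMs the transform expects.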

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;


        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0; i + 15 < w; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* handle the tail if w is not a multiple of 16 */
    for (; i < w; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block))    = (((const struct unaligned_32 *) (pixels))->l);
        *((uint32_t*)(block+4))  = (((const struct unaligned_32 *) (pixels+4))->l);
        *((uint32_t*)(block+8))  = (((const struct unaligned_32 *) (pixels+8))->l);
        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
        pixels+=line_size;
        block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
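    /* line_size is a multiple of 16, so the low four bits of the pixels
       address are the same for every row; the permute vector computed above
       can therefore be reused for the whole block instead of being rebuilt
       inside the loop. */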
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
        pixels+=line_size;
        block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            ((*((uint32_t *) (block)) |
              (((const struct unaligned_32 *) (pixels))->l)) -
             (((*((uint32_t *) (block)) ^
                (((const struct unaligned_32 *) (pixels))->l)) &
               0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            ((*((uint32_t *) (block + 4)) |
              (((const struct unaligned_32 *) (pixels + 4))->l)) -
             (((*((uint32_t *) (block + 4)) ^
                (((const struct unaligned_32 *) (pixels + 4))->l)) &
               0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
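
/*
 * About the rightside test above: block is only 8-byte aligned, so the 8
 * output pixels live in either the left or the right half of a 16-byte
 * aligned quadword. vcprm(0,1,s0,s1) keeps the left two words of blockv and
 * takes the right two from pixelsv (and vcprm(s0,s1,2,3) the opposite), so
 * after vec_avg the untouched half still holds the original block bytes and
 * the full 16-byte vec_st leaves the neighbouring 8 pixels unchanged.
 */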

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
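
/*
 * Scalar view of the kernel above: for each of the 8 output pixels per row,
 *
 *     block[j] = (p[j] + p[j+1] + q[j] + q[j+1] + 2) >> 2;
 *
 * where p is the current source row and q the next one. The running row sum
 * pixelssum1 already carries the +2 rounding bias, so only one addition of
 * vctwo is needed per iteration. The no_rnd variant below is identical except
 * that it biases by 1 instead of 2 before the shift.
 */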

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
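
/*
 * The 16-pixel-wide version keeps two pairs of running sums: pixelssum1 for
 * the first 8 source positions (vec_mergeh) and pixelssum3 for the last 8
 * (vec_mergel). Each source row is loaded and widened only once, and the two
 * 8-wide averages are packed back together with a single
 * vec_packsu(temp3, temp4) per output row.
 */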

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* not Darwin, so do it the brute-force way: execute an AltiVec
       instruction and catch SIGILL if the CPU lacks the vector unit */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
}
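
/*
 * Hypothetical init-time usage sketch: the real registration happens in the
 * PPC dsputil initialisation code and the exact field names may differ, but
 * the detection function is meant to be used along these lines:
 *
 *     if (has_altivec()) {
 *         c->pix_sum   = pix_sum_altivec;
 *         c->pix_norm1 = pix_norm1_altivec;
 *         // ... and so on for the other AltiVec versions above
 *     }
 */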