hadamard8_diff* enabled on linux/ppc
[libav.git] / libavcodec / ppc / dsputil_altivec.c
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#ifdef __AMIGAOS4__
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#else /* __AMIGAOS4__ */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* __AMIGAOS4__ */
#endif /* CONFIG_DARWIN */
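
/* For reference, the SIGILL machinery above is typically driven like the
 * following hedged sketch (the actual AltiVec runtime probe lives outside
 * this section; the asm shown is illustrative of the classic pattern):
 * arm the handler, sigsetjmp, then execute one AltiVec instruction. On a
 * CPU without AltiVec the instruction raises SIGILL and siglongjmp brings
 * us back with a nonzero return, so the probe reports failure instead of
 * crashing.
 *
 *     signal (SIGILL, sigill_handler);
 *     if (sigsetjmp (jmpbuf, 1)) {
 *         signal (SIGILL, SIG_DFL);          // trapped: no AltiVec
 *         return 0;
 *     }
 *     canjump = 1;
 *     asm volatile ("mtspr 256, %0\n\t"      // enable VRSAVE
 *                   "vand %%v0, %%v0, %%v0"  // any AltiVec instruction
 *                   : : "r" (-1));
 *     signal (SIGILL, SIG_DFL);              // survived: AltiVec present
 *     return 1;
 */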

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
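
/* Scalar reference for what sad16_x2_altivec computes (a sketch for
 * documentation only, not compiled): the SAD of pix1 against the
 * horizontal half-pel interpolation of pix2, where vec_avg rounds up,
 * i.e. avg(a,b) = (a + b + 1) >> 1:
 *
 *     int s = 0;
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < 16; j++)
 *             s += abs(pix1[j] - ((pix2[j] + pix2[j + 1] + 1) >> 1));
 *         pix1 += line_size;
 *         pix2 += line_size;
 *     }
 *     return s;
 */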

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Since pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Since pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration. We can use this fact to avoid a
       potentially expensive unaligned read, as well as some splitting
       and vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but it averages element
           pairs and rounds up. Chaining it as avg(avg(a,b), avg(c,d))
           would accumulate the rounding so that, for example, the pixels
           (3,0,0,1) average to 2 when the correct result is 1. Instead,
           we have to split the pixel vectors into vectors of shorts and
           do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
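
/* Worked example of the rounding pitfall described in the function above
 * (numbers only, for illustration): for the four pixels (3,0,0,1),
 *
 *     chained vec_avg:  avg(avg(3,0), avg(0,1)) = avg(2,1) = 2
 *     this function:    (3 + 0 + 0 + 1 + 2) >> 2 = 1
 *
 * because each vec_avg is (a + b + 1) >> 1 and the +1 biases accumulate.
 * That is why the code widens to shorts and computes
 * vec_sr(vec_add(vec_add(t1, t3), two), two) by hand. */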

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
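
/* All the SAD/SSE routines in this file finish with the same horizontal
 * reduction idiom; spelled out once for reference (element numbering per
 * big-endian AltiVec order, which is an assumption of this note):
 *
 *     sad      = vec_sum4s(t5, sad);        // 16 u8 -> 4 partial u32 sums
 *     sumdiffs = vec_sums((vector signed int) sad,
 *                         (vector signed int) zero);
 *                                           // 4 sums -> total, in element 3
 *     sumdiffs = vec_splat(sumdiffs, 3);    // broadcast element 3 everywhere
 *     vec_ste(sumdiffs, 0, &s);             // store one element to the
 *                                           // 16-byte-aligned int s
 */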

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
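
/* Illustration of the permclear masking used above (a sketch): only the
 * first 8 of the 16 loaded pixels belong to the block, so with
 *
 *     permclear = { 255 x 8, 0 x 8 }
 *     t1 = { p0..p7, x0..x7 } & permclear = { p0..p7, 0 x 8 }
 *
 * the zeroed lanes contribute |0 - 0| = 0 to vec_sum4s, leaving the
 * 8-pixel SAD unchanged while reusing the 16-wide data path. */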

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
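
/* What vec_msum does above, per 32-bit lane (a paraphrase of the AltiVec
 * semantics for the unsigned-char form, kept here for reference):
 *
 *     sv[i] += pix[4i]*pix[4i] + pix[4i+1]*pix[4i+1]
 *            + pix[4i+2]*pix[4i+2] + pix[4i+3]*pix[4i+3];
 *
 * so one instruction squares 16 pixels and folds them into four running
 * word sums, which the usual vec_sums reduction then collapses. */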

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
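
/* Why vec_mergeh(zero, bytes) converts bytes to shorts above (an
 * observation that assumes big-endian element order): merging the high
 * halves interleaves the operands as
 *
 *     { 0, b0, 0, b1, 0, b2, ..., 0, b7 }
 *
 * and read back as eight 16-bit elements that is exactly
 * { b0, b1, ..., b7 }, i.e. a zero-extension of the first 8 pixels. */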
545
0c1a9eda
ZK
546void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
547 const uint8_t *s2, int stride)
05c4072b
MN
548{
549 int i;
550 vector unsigned char perm, bytes, *pixv;
aab34ca0 551 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
05c4072b
MN
552 vector signed short shorts1, shorts2;
553
554 for(i=0;i<4;i++)
555 {
556 // Read potentially unaligned pixels
557 // We're reading 16 pixels, and actually only want 8,
558 // but we simply ignore the extras.
559 perm = vec_lvsl(0, s1);
560 pixv = (vector unsigned char *) s1;
561 bytes = vec_perm(pixv[0], pixv[1], perm);
562
563 // convert the bytes into shorts
564 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
565
566 // Do the same for the second block of pixels
567 perm = vec_lvsl(0, s2);
568 pixv = (vector unsigned char *) s2;
569 bytes = vec_perm(pixv[0], pixv[1], perm);
570
571 // convert the bytes into shorts
572 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
573
574 // Do the subtraction
575 shorts1 = vec_sub(shorts1, shorts2);
576
577 // save the data to the block, we assume the block is 16-byte aligned
578 vec_st(shorts1, 0, (vector signed short*)block);
579
580 s1 += stride;
581 s2 += stride;
582 block += 8;
583
584
585 // The code below is a copy of the code above... This is a manual
586 // unroll.
587
588 // Read potentially unaligned pixels
589 // We're reading 16 pixels, and actually only want 8,
590 // but we simply ignore the extras.
591 perm = vec_lvsl(0, s1);
592 pixv = (vector unsigned char *) s1;
593 bytes = vec_perm(pixv[0], pixv[1], perm);
594
595 // convert the bytes into shorts
596 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
597
598 // Do the same for the second block of pixels
599 perm = vec_lvsl(0, s2);
600 pixv = (vector unsigned char *) s2;
601 bytes = vec_perm(pixv[0], pixv[1], perm);
602
603 // convert the bytes into shorts
604 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
605
606 // Do the subtraction
607 shorts1 = vec_sub(shorts1, shorts2);
608
609 // save the data to the block, we assume the block is 16-byte aligned
610 vec_st(shorts1, 0, (vector signed short*)block);
611
612 s1 += stride;
613 s2 += stride;
614 block += 8;
615 }
616}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block += line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
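
/* Note on the hoisted perm in put_pixels16_altivec (an inference from the
 * (line_size % 16) == 0 precondition): every row starts at the same offset
 * within a 16-byte line, so the realignment vector from vec_lvsl(0, pixels)
 * is loop-invariant and can be computed once, unlike in the SAD routines
 * above where vec_lvsl is re-derived for each access. */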

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
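
/* How the rightside trick above works (a sketch; vcprm is the 4-word
 * permute-constant helper macro used throughout the PPC code, with s0/s1
 * naming words of the second operand): block is only 8-byte aligned, so
 * the row is either the left or the right half of the 16-byte vector
 * loaded from it.
 *
 *     vcprm(0,1,s0,s1): result = { blockv.w0, blockv.w1, pixelsv.w0, pixelsv.w1 }
 *     vcprm(s0,s1,2,3): result = { pixelsv.w0, pixelsv.w1, blockv.w2, blockv.w3 }
 *
 * The new pixels are steered into whichever half the row occupies; the
 * other half duplicates blockv, so vec_avg leaves it unchanged there. */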

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
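
/* The put and put_no_rnd xy2 variants above differ only in the constant
 * folded into the running sum: the rounding version computes
 * (a + b + c + d + 2) >> 2 (round to nearest) while the no_rnd version
 * computes (a + b + c + d + 1) >> 2, biasing ties downward as the no_rnd
 * half-pel prediction paths expect. */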

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    {
    register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01,
            0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09,
            0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07,
            0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B,
            0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03,
            0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res)                                        \
    {                                                                   \
    register vector unsigned char src1, src2, srcO;                     \
    register vector unsigned char dst1, dst2, dstO;                     \
    /* the second load is only needed when the 8 wanted bytes */        \
    /* straddle two 16-byte lines; otherwise src2/dst2 garbage */       \
    /* only feeds lanes we never look at */                             \
    src1 = vec_ld(stride * i, src);                                     \
    if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)         \
        src2 = vec_ld((stride * i) + 16, src);                          \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));             \
    dst1 = vec_ld(stride * i, dst);                                     \
    if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8)         \
        dst2 = vec_ld((stride * i) + 16, dst);                          \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
    /* promote the unsigned chars to signed shorts */                   \
    /* we're in the 8x8 function, we only care for the first 8 */       \
    register vector signed short srcV =                                 \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV =                                 \
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */                       \
    register vector signed short but0 = vec_sub(srcV, dstV);            \
    register vector signed short op1 = vec_perm(but0, but0, perm1);     \
    register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
    register vector signed short op2 = vec_perm(but1, but1, perm2);     \
    register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
    register vector signed short op3 = vec_perm(but2, but2, perm3);     \
    res = vec_mladd(but2, vprod3, op3);                                 \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}
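
/* How ONEITERBUTTERFLY maps onto the usual Walsh-Hadamard butterflies
 * (a sketch): perm1 swaps each adjacent pair of shorts and vprod1 is
 * { 1,-1, 1,-1, ... }, so vec_mladd(but0, vprod1, vec_perm(but0, but0, perm1))
 * produces, for each pair (a, b),
 *
 *     even lane:  a * ( 1) + b  =  a + b
 *     odd  lane:  b * (-1) + a  =  a - b
 *
 * perm2/vprod2 and perm3/vprod3 repeat this at distances 2 and 4, giving a
 * horizontal 8-point WHT of the src-dst differences; the line0..line7C
 * adds/subs in the second block then apply the same three stages
 * vertically, and the sum of absolute values of the result is returned. */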

/*
  16x8 works with 16 elements; it avoids replicating loads, and gives the
  compiler more room for scheduling. It's only used from inside
  hadamard8_diff16_altivec.

  Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
  a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code includes hand-made register
  allocation. It's not clean, but on a 7450 the resulting code is much
  faster (the best case falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
  25% fewer instructions...)

  On the 970, the hand-made RA is still a win (around 690 vs. around 780),
  but xlc goes to around 660 on the regular C code...
*/
1440
1441static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
1442 int sum;
1443 register vector signed short
a01e08ee
LB
1444 temp0 REG_v(v0),
1445 temp1 REG_v(v1),
1446 temp2 REG_v(v2),
1447 temp3 REG_v(v3),
1448 temp4 REG_v(v4),
1449 temp5 REG_v(v5),
1450 temp6 REG_v(v6),
1451 temp7 REG_v(v7);
9007f514 1452 register vector signed short
a01e08ee
LB
1453 temp0S REG_v(v8),
1454 temp1S REG_v(v9),
1455 temp2S REG_v(v10),
1456 temp3S REG_v(v11),
1457 temp4S REG_v(v12),
1458 temp5S REG_v(v13),
1459 temp6S REG_v(v14),
1460 temp7S REG_v(v15);
1461 register const_vector unsigned char vzero REG_v(v31)= (const_vector unsigned char)vec_splat_u8(0);
9007f514 1462 {
a01e08ee
LB
1463 register const_vector signed short vprod1 REG_v(v16)= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
1464 register const_vector signed short vprod2 REG_v(v17)= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
1465 register const_vector signed short vprod3 REG_v(v18)= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
1466 register const_vector unsigned char perm1 REG_v(v19)= (const_vector unsigned char)
16f5ef9c 1467 AVV(0x02, 0x03, 0x00, 0x01,
9007f514
RD
1468 0x06, 0x07, 0x04, 0x05,
1469 0x0A, 0x0B, 0x08, 0x09,
1470 0x0E, 0x0F, 0x0C, 0x0D);
a01e08ee 1471 register const_vector unsigned char perm2 REG_v(v20)= (const_vector unsigned char)
16f5ef9c 1472 AVV(0x04, 0x05, 0x06, 0x07,
9007f514
RD
1473 0x00, 0x01, 0x02, 0x03,
1474 0x0C, 0x0D, 0x0E, 0x0F,
1475 0x08, 0x09, 0x0A, 0x0B);
a01e08ee 1476 register const_vector unsigned char perm3 REG_v(v21)= (const_vector unsigned char)
16f5ef9c 1477 AVV(0x08, 0x09, 0x0A, 0x0B,
9007f514
RD
1478 0x0C, 0x0D, 0x0E, 0x0F,
1479 0x00, 0x01, 0x02, 0x03,
1480 0x04, 0x05, 0x06, 0x07);
16f5ef9c 1481
#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
        register vector unsigned char src1 REG_v(v22), src2 REG_v(v23); \
        register vector unsigned char dst1 REG_v(v24), dst2 REG_v(v25); \
        src1 = vec_ld(stride * i, src); \
        src2 = vec_ld((stride * i) + 16, src); \
        /* reusing v22 for srcO is deliberate: src1 is dead after the perm */ \
        register vector unsigned char srcO REG_v(v22) = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
        dst1 = vec_ld(stride * i, dst); \
        dst2 = vec_ld((stride * i) + 16, dst); \
        register vector unsigned char dstO REG_v(v23) = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
        /* promote the unsigned chars to signed shorts */ \
        register vector signed short srcV REG_v(v24) = \
            (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
        register vector signed short dstV REG_v(v25) = \
            (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
        register vector signed short srcW REG_v(v26) = \
            (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
        register vector signed short dstW REG_v(v27) = \
            (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0 REG_v(v28) = vec_sub(srcV, dstV); \
        register vector signed short but0S REG_v(v29) = vec_sub(srcW, dstW); \
        register vector signed short op1 REG_v(v30) = vec_perm(but0, but0, perm1); \
        register vector signed short but1 REG_v(v22) = vec_mladd(but0, vprod1, op1); \
        register vector signed short op1S REG_v(v23) = vec_perm(but0S, but0S, perm1); \
        register vector signed short but1S REG_v(v24) = vec_mladd(but0S, vprod1, op1S); \
        register vector signed short op2 REG_v(v25) = vec_perm(but1, but1, perm2); \
        register vector signed short but2 REG_v(v26) = vec_mladd(but1, vprod2, op2); \
        register vector signed short op2S REG_v(v27) = vec_perm(but1S, but1S, perm2); \
        register vector signed short but2S REG_v(v28) = vec_mladd(but1S, vprod2, op2S); \
        register vector signed short op3 REG_v(v29) = vec_perm(but2, but2, perm3); \
        res1 = vec_mladd(but2, vprod3, op3); \
        register vector signed short op3S REG_v(v30) = vec_perm(but2S, but2S, perm3); \
        res2 = vec_mladd(but2S, vprod3, op3S); \
    }
        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        /* vertical butterflies (distances 1, 2, 4) for the first 8x8 block */
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        /* the same three butterfly stages for the second 8x8 block */
        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int)vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}

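/* The vec_ld/vec_lvsl/vec_perm sequence used throughout this file is the
   standard AltiVec unaligned-load idiom. In isolation it looks like the
   sketch below (an illustrative helper with a hypothetical name; the
   offset 15 rather than 16 avoids touching the next 16-byte line when p
   happens to be aligned): */
static inline vector unsigned char unaligned_load_ref(const uint8_t *p)
{
    vector unsigned char hi = vec_ld(0, p);     /* line containing p */
    vector unsigned char lo = vec_ld(15, p);    /* line containing p+15 */
    return vec_perm(hi, lo, vec_lvsl(0, p));    /* rotate bytes into place */
}
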
int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#else /* __AMIGAOS4__ */

#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
/* not Darwin, so probe the brute-force way (borrowed from the libmpeg2
   library): execute an AltiVec instruction and catch the SIGILL it raises
   on CPUs without a vector unit */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            /* SPR 256 is VRSAVE; "vand" is an AltiVec instruction */
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
#endif /* __AMIGAOS4__ */
}
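
/* Hedged usage sketch (illustration only, not code from this file): the
   detection above is meant to gate function-pointer selection at init
   time, along these lines. The helper name is hypothetical and the
   DSPContext fields are assumed to match this era's dsputil.h, so the
   block is kept out of the build. */
#if 0
static void example_dsputil_init_altivec(DSPContext *c)
{
    if (has_altivec()) {
        c->hadamard8_diff[0]    = hadamard8_diff16_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    }
}
#endif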
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

    int j;
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2, blocktemp;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
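
/* Per pixel, both paths above compute the same thing: the rounded mean of
   the 2x2 source neighbourhood, averaged (with upward rounding, as in
   rnd_avg32 / vec_avg) into the existing block. A plain-C sketch, added
   for illustration only (the name avg_pixels8_xy2_ref is hypothetical): */
static inline void avg_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                       int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            /* (A + B + C + D + 2) >> 2: rounded half-pel interpolation */
            int p = (pixels[j]             + pixels[j + 1] +
                     pixels[j + line_size] + pixels[j + line_size + 1] + 2) >> 2;
            block[j] = (block[j] + p + 1) >> 1;   /* rounding average */
        }
        pixels += line_size;
        block  += line_size;
    }
}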