libavcodec/ppc/h264_template_altivec.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* assert that ptr is 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

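/* CHROMA_MC8_ALTIVEC_CORE computes one row of H.264 chroma interpolation:
 * dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + bias) >> 6,
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy. BIAS1/BIAS2 let the
 * callers plug in the rounding constant (32 for H.264, 28 for the VC-1
 * no-rounding variant below); OP_U8_ALTIVEC, defined by the file that
 * includes this template, selects between a plain store and averaging. */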
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;

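/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the degenerate cases where x == 0 or
 * y == 0, so only two of the four bilinear taps are non-zero:
 * dst[i] = (vA*s0[i] + vE*s1[i] + 32) >> 6, with vE = vB + vC. */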
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

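/* 8x[h] chroma motion compensation for the fractional position (x, y),
 * 0 <= x, y < 8. The bilinear weights are precomputed into ABCD and splatted
 * into vA..vD; unaligned source rows are assembled with vec_lvsl/vec_perm.
 * loadSecond is set when a 9-byte row may straddle two 16-byte blocks;
 * reallyBadAlign covers src % 16 == 15, where vec_lvsl(1, src) wraps to
 * alignment 0 and vec_perm would pick the wrong block, so the second
 * 16-byte load is used directly. */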
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

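    /* fperm merges the 8 interpolated bytes into the destination vector:
     * since stride % 16 == 0, dst % 16 is either 0 or 8, so the result
     * replaces either the low or the high half of the 16 bytes at dst. */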
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0, B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0, C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}

/* this code assumes that stride % 16 == 0 */
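/* Same bilinear filter as above, but with the VC-1 "no rounding" bias:
 * the accumulator starts at 0 and 28 (instead of 32) is added before the
 * shift by 6, via the add28 wrapper passed to CHROMA_MC8_ALTIVEC_CORE. */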
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) { // -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes that stride % 16 == 0 */
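/* 16-wide horizontal 6-tap luma lowpass (half-pel) filter:
 * out = clip((src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3] + 16) >> 5),
 * computed 16 pixels per row as two vectors of eight 16-bit intermediates. */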
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

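        /* Assemble the six shifted source vectors from the two (or, for
         * align >= 12, three) 16-byte blocks that cover src-2 .. src+18.
         * For align 11..15 one of the taps coincides exactly with srcR2 and
         * needs no permute; taps past byte 31 come from srcR3. */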
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}

/* this code assumes that stride % 16 == 0 */
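/* 16-wide vertical 6-tap luma lowpass (half-pel) filter: same kernel as the
 * horizontal pass, applied down the columns. The six source rows are kept in
 * a sliding window of unpacked 16-bit vectors, so each iteration only loads
 * the new bottom row. */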
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}

/* this code assumes that stride % 16 == 0 *and* that tmp is properly aligned */
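/* 16-wide 2D (horizontal then vertical) 6-tap luma lowpass filter for the
 * center half-pel position. The first pass stores 21 rows of horizontally
 * filtered 16-bit intermediates in tmp (no rounding or shift yet); the
 * second pass applies the vertical kernel to tmp with 32-bit accumulators
 * and produces clip((v + 512) >> 10). */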
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

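    /* First pass: horizontal filter over 21 rows (16 output rows plus the
     * two rows above and three below needed by the vertical 6-tap kernel),
     * written to tmp as signed 16-bit values without rounding. */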
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

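    /* Second pass: vertical filter on the 16-bit intermediates. The products
     * are formed as separate even/odd 32-bit lanes (vec_mule/vec_mulo) to
     * avoid overflow, rounded with +512, shifted right by 10, saturated, and
     * re-interleaved with mperm before the final store. */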
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
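
/* This file is a template: it only compiles when #included from a file that
 * first defines the PREFIX_* function names and OP_U8_ALTIVEC(d, s, dst),
 * where d is the vector to store, s the filtered result, and dst the vector
 * already loaded from the destination. A minimal sketch of such an
 * instantiation (the names below are illustrative, not necessarily the ones
 * used by the surrounding build):
 *
 *   // plain "put": keep the filtered result as-is
 *   #define OP_U8_ALTIVEC(d, s, dst)              d = s
 *   #define PREFIX_h264_chroma_mc8_altivec        put_h264_chroma_mc8_altivec
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec  put_h264_qpel16_h_lowpass_altivec
 *   // ... remaining PREFIX_* names ...
 *   #include "h264_template_altivec.c"
 *   #undef OP_U8_ALTIVEC
 *   // ... #undef the PREFIX_* names ...
 *
 *   // "avg": average the filtered result with the existing destination
 *   #define OP_U8_ALTIVEC(d, s, dst)              d = vec_avg(s, dst)
 *   // ... avg_* PREFIX_* names ...
 *   #include "h264_template_altivec.c"
 */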