/* c0a4eb7a6023111292bce367a85468d2c0cda6ff
   [libav.git] / libavcodec / ppc / h264_template_altivec.c */
1 /*
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
/* When DEBUG_ALIGNMENT is defined, ASSERT_ALIGNED(ptr) verifies that ptr is
 * 16-byte aligned, which the aligned vec_ld/vec_st below require; otherwise
 * it expands to an empty statement.
 * Fix: the original asserted (ptr & 0xF), i.e. it FAILED precisely when the
 * pointer was correctly aligned and passed when it was not.  The condition
 * must be negated to assert alignment. */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
27
/* this code assumes that stride % 16 == 0 */
29
/* One output row of the 8-pixel-wide bilinear chroma interpolation.
 * Zero-extends the two freshly loaded source rows (vsrc2uc/vsrc3uc) to
 * 16-bit lanes, accumulates the four bilinear taps vA..vD starting from
 * BIAS1, applies BIAS2 (noop, or add28 for the VC-1 no-round variant) and
 * shifts right by 6.  The 8 result bytes are packed and merged with the
 * untouched half of the destination vector through fperm, combined by
 * OP_U8_ALTIVEC (put or avg), and stored with an aligned vec_st.
 * Finally the row registers are rolled (rows 2/3 become rows 0/1 of the
 * next iteration) and src/dst advance by one stride.
 * NOTE: relies on many variables from the caller's scope (vA..vD, psum,
 * fperm, stride, ...), so it is only usable inside the MC functions below. */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;
54
/* Degenerate (1-D) variant of the core, used when x == 0 or y == 0 so two
 * of the four bilinear weights vanish: only vA and the combined tap vE
 * (vE = vB + vC, set up by the caller) contribute.  Rounding bias 32 is
 * folded in via v32ss before the >> 6.  The pack/merge/store path is
 * identical to CHROMA_MC8_ALTIVEC_CORE.  Also relies on caller-scope
 * variables. */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;
74
#define noop(a) a                   /* identity: no post-MAC bias (H.264 path) */
#define add28(a) vec_add(v28ss, a)  /* +28 bias, used by the VC-1 no-round MC */
77
/* H.264 chroma motion compensation for an 8-pixel-wide block, AltiVec.
 * Per pixel: dst = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6,
 * with bilinear weights A..D derived from the fractional position (x, y).
 * OP_U8_ALTIVEC (put or avg) is bound by the including file, as is PREFIX.
 * Assumes stride % 16 == 0; dst need not be aligned (handled via fperm). */
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    /* The four bilinear weights, kept in memory so they can be vec_ld'ed. */
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    /* Splat the low half-word of each 32-bit weight across an s16 vector
     * (odd half-word indices hold the low halves on big-endian). */
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); /* rounding bias 32 */
    const vec_u16 v6us = vec_splat_u16(6);
    /* A 9-byte source row spills into a second 16-byte block once
     * src % 16 > 7; at src % 16 == 15 the +1 row starts exactly at that
     * second block, so no permute is needed for it. */
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    /* fperm later merges the 8 computed bytes with the 8 destination bytes
     * that must stay untouched, depending on which half of the 16-byte
     * line dst starts in. */
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    /* Prime row 0 (offsets 0 and +1) of the sliding two-row window. */
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {  /* x != 0 && y != 0: full 4-tap bilinear filter */
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        /* One of x, y is zero: filter degenerates to two taps vA and vE. */
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;  /* vertical: next row becomes row 0 */
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    /* +15 is equivalent to +16 here: loadSecond implies
                     * src % 16 >= 8 and vec_ld truncates the address. */
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
208
/* this code assumes that stride % 16 == 0 */
/* VC-1 "no rounding" chroma MC, 8 pixels wide:
 * dst = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 28) >> 6.
 * Same structure as PREFIX_h264_chroma_mc8_altivec but always runs the full
 * 4-tap core (no x==0 / y==0 shortcuts) and biases with +28 after the MAC
 * chain (via add28) instead of +32 before it.  Assumes stride % 16 == 0. */
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    /* Bilinear weights for the four neighbouring pixels. */
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    /* Splat the low half-word of each 32-bit weight (big-endian layout). */
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    /* 32 - 4 = 28: the no-round bias applied by add28 after the MACs. */
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    /* Same source-alignment classification as the H.264 version. */
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    /* Select the merge pattern for the destination half-line (see above). */
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    /* Prime row 0 (offsets 0 and +1). */
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {


            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
291
292 #undef noop
293 #undef add28
294 #undef CHROMA_MC8_ALTIVEC_CORE
295
/* this code assumes stride % 16 == 0 */
/* Horizontal 6-tap H.264 luma half-pel filter over a 16x16 block:
 * per pixel, out = clip((s[-2] - 5*s[-1] + 20*s[0] + 20*s[1] - 5*s[2]
 *                        + s[3] + 16) >> 5).
 * The A/B register pairs hold the low/high 8 lanes of each 16-wide row.
 * Assumes dstStride % 16 == 0 (dst is stored with aligned vec_st). */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    /* Shift patterns for the six filter taps at offsets -2..+3. */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 20 */
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* rounding 16 */

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* Gather the six shifted source vectors src[-2..+3].  For aligns
         * 11..15 one tap starts exactly at srcR2 (no permute needed) and
         * the taps past it spill into a third 16-byte block (srcR3); the
         * default case fits entirely in the two loads above. */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* Zero-extend all six taps to 16-bit lanes (A = low 8, B = high 8). */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* Pair the symmetric taps: (P0+P1)*20, (M1+P2)*5, (M2+P3)*1. */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        /* Saturating pack clips the 16-bit results to u8. */
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
438
/* this code assumes stride % 16 == 0 */
/* Vertical 6-tap H.264 luma half-pel filter over a 16x16 block:
 * out = clip((s[-2*ss] - 5*s[-ss] + 20*s[0] + 20*s[ss] - 5*s[2*ss]
 *             + s[3*ss] + 16) >> 5), ss = srcStride.
 * Keeps a sliding window of six rows in registers (M2..P2 plus the P3 row
 * loaded each iteration) and rotates it after every output row, so each
 * source row is loaded exactly once.  Assumes dstStride % 16 == 0. */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 20 */
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* rounding 16 */

    /* Start two rows above the block to prime the filter window. */
    uint8_t *srcbis = src - (srcStride * 2);

    /* Each row needs two loads + a permute since src may be unaligned. */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    /* Zero-extend the five primed rows (A = low 8 lanes, B = high 8). */
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        /* Load the newest row (+3) of the window. */
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        /* Pair the symmetric taps: (P0+P1)*20, (M1+P2)*5, (M2+P3)*1. */
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* Rotate the window down one row for the next iteration. */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        /* Saturating pack clips the 16-bit results to u8. */
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
549
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
/* 2-D (horizontal then vertical) 6-tap H.264 luma filter, 16x16 block.
 * Pass 1: for 21 rows (16 + 5 for the vertical filter support), applies
 * the horizontal 6-tap filter WITHOUT rounding/shifting and stores the
 * 16-bit intermediates to tmp.  Pass 2: applies the 6-tap filter
 * vertically on those intermediates in 32-bit precision, adds 512 and
 * shifts right by 10, then clips and stores.
 * Assumes dstStride % 16 == 0 and tmp 16-byte aligned (aligned vec_st). */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    /* Shift patterns for the six horizontal taps at offsets -2..+3. */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 20 */
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));  /* rounding 512 */
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));   /* shift count 16 */

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    /* Re-interleaves the separately packed even/odd results into order. */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);  /* two rows of vertical context above the block */

    /* Pass 1: horizontal filter, unrounded 16-bit results into tmp. */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* Gather the six shifted source vectors; same alignment cases as
         * the horizontal-only filter (aligns 11..15 need a third load). */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* Zero-extend all six taps (A = low 8 lanes, B = high 8 lanes). */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* (P0+P1)*20 + (M2+P3) - 5*(M1+P2): no bias/shift in this pass. */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* Prime the five-row vertical window from the intermediate buffer. */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    /* Pass 2: vertical filter on the 16-bit intermediates, done in 32-bit
     * precision with separate even/odd lanes (vec_mule/vec_mulo). */
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* Rotate the window down one row. */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* Widening multiplies: even and odd s16 lanes to s32. */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* Sign-extend sum3 to 32 bits: arithmetic >>16 of the s32 view
         * extends the even s16 lanes; multiply by 1 extends the odd ones. */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        /* Add the rounding bias 512, combine, and shift right by 10. */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        /* Saturating packs clip to u8; mperm restores lane order from the
         * separate even/odd streams. */
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}