/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
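
/* Conceptually, the core macro above evaluates the H.264 chroma bilinear
 * interpolation
 *     dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1] + bias) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y, eight pixels at
 * a time; BIAS1 carries the rounding constant and BIAS2 lets the caller fold
 * in a second bias (see add28 below).  The _SIMPLE variant that follows is the
 * degenerate case where x == 0 or y == 0, so only two taps remain (vA plus a
 * combined vE = vB + vC). */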
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
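
/* noop keeps the plain H.264 rounding: the +32 arrives through BIAS1 and
 * nothing else is added before the >> 6.  add28 is used by the VC-1 "no_rnd"
 * variant below, which passes 0 as BIAS1 and then adds v28ss, so the
 * effective rounding constant becomes 28 instead of 32. */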
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride +  0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride +  0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld( 0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
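
/* A note on the loadSecond / reallyBadAlign handling used above (and in the
 * VC-1 variant below): vec_ld can only load from 16-byte aligned addresses,
 * so a misaligned 9-byte source row is assembled with vec_lvsl + vec_perm
 * from one or two aligned loads.  If src % 16 <= 7 the row and the extra
 * pixel at +1 fit in a single 16-byte block, so one load is enough; otherwise
 * a second block is loaded, and in the worst case (src % 16 == 15) the
 * shifted-by-one vector is simply that second block. */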
/* this code assumes that stride % 16 == 0 */
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                                 int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride +  0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#undef CHROMA_MC8_ALTIVEC_CORE
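
/* The qpel lowpass functions below apply the H.264 6-tap half-pel filter with
 * taps (1, -5, 20, 20, -5, 1).  Per output pixel this is roughly
 *     p   = (s[-2] + s[3]) - 5*(s[-1] + s[2]) + 20*(s[0] + s[1]);
 *     dst = clip_uint8((p + 16) >> 5);
 * the vector code evaluates the same expression for 16 pixels per row, split
 * into two halves of eight 16-bit lanes (the ...A and ...B variables). */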
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
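
/* The vertical filter below keeps six source rows live as 16-bit vectors
 * (srcM2ss.. srcP2ss plus the freshly loaded srcP3ss) and rotates them down
 * one position each iteration, so every source row is read from memory only
 * once. */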
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
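
/* The combined filter below works in two passes: the first loop runs the
 * horizontal 6-tap filter over 16+5 rows and stores the unrounded 16-bit
 * intermediate into tmp, and the second loop applies the vertical 6-tap
 * filter to tmp in 32-bit precision, rounding only once at the end with
 * (+512) >> 10, i.e. the two >> 5 normalizations folded together. */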
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src,
                                                  int dstStride, int tmpStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);