/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "types_altivec.h"

#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)

#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num

#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num

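/* h264_template_altivec.c is included twice: once with the PUT operator and
 * once with the AVG operator, so every chroma/lowpass helper above exists in
 * a put_ and an avg_ flavour.  The H264_MC macro below then builds the 16
 * quarter-pel motion compensation functions (mc00 .. mc33) for a given
 * operator and block size on top of those helpers. */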
#define H264_MC(OPNAME, SIZE, CODETYPE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\

/* this code assumes stride % 16 == 0 */
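/* Bilinear 8x8 chroma interpolation (no-rounding variant): each output pixel
 * is A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] with the weights taken
 * from ABCD, biased by 28 (v28ss) and shifted down by 6.  loadSecond and
 * reallyBadAlign handle source rows that straddle a 16-byte boundary. */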
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((x) * (8 - y)),
                         ((8 - x) * (y)),
                         ((x) * (y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {

            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
}

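/* put/avg_pixels16_l2: blend two 16-byte-wide sources row by row.  put_
 * stores avg(src1, src2); avg_ additionally averages with the pixels already
 * in dst.  Rows from src1 and src2 are assembled with vec_lvsl permutes
 * (src2 is read with a fixed 16-byte pitch), and the result is written back
 * through a vec_lvsr-based unaligned store. */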
static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                            const uint8_t * src2, int dst_stride,
                                            int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {

        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(a, b);

        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}

static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                            const uint8_t * src2, int dst_stride,
                                            int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {

        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}

/* Implemented but could be faster
#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 */

H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)

/****************************************************************************
 * IDCT transform:
 ****************************************************************************/

#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
    /* 1st stage */ \
    vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
    vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
    vz2 = vec_sra(vb1,vec_splat_u16(1)); \
    vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
    vz3 = vec_sra(vb3,vec_splat_u16(1)); \
    vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
    /* 2nd stage: output */ \
    va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
    va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
    va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
    va3 = vec_sub(vz0,vz3)  /* x[3] = temp[0] - temp[3] */

#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
    vdst_orig = vec_ld(0, dst); \
    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
    vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
    va = vec_add(va, vdst_ss); \
    va_u8 = vec_packsu(va, zero_s16v); \
    va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
    vec_ste(va_u32, element, (uint32_t*)dst);

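/* 4x4 IDCT + add: two passes of the 1D transform with a transpose in
 * between, a +32 bias and >>6 on the way out; the result is added to the
 * existing dst pixels and clipped to 0..255 by vec_packsu. */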
static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    vec_s16_t va0, va1, va2, va3;
    vec_s16_t vz0, vz1, vz2, vz3;
    vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8_t va_u8;
    vec_u32_t va_u32;
    vec_s16_t vdst_ss;
    const vec_u16_t v6us = vec_splat_u16(6);
    vec_u8_t vdst, vdst_orig;
    vec_u8_t vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32; /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}

#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
    /* a0 = SRC(0) + SRC(4); */ \
    vec_s16_t a0v = vec_add(s0, s4); \
    /* a2 = SRC(0) - SRC(4); */ \
    vec_s16_t a2v = vec_sub(s0, s4); \
    /* a4 = (SRC(2)>>1) - SRC(6); */ \
    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
    /* a6 = (SRC(6)>>1) + SRC(2); */ \
    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
    /* b0 = a0 + a6; */ \
    vec_s16_t b0v = vec_add(a0v, a6v); \
    /* b2 = a2 + a4; */ \
    vec_s16_t b2v = vec_add(a2v, a4v); \
    /* b4 = a2 - a4; */ \
    vec_s16_t b4v = vec_sub(a2v, a4v); \
    /* b6 = a0 - a6; */ \
    vec_s16_t b6v = vec_sub(a0v, a6v); \
    /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
    /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
    /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
    /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
    /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
    /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
    /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
    /* b1 = (a7>>2) + a1; */ \
    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
    /* b3 = a3 + (a5>>2); */ \
    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
    /* b5 = (a3>>2) - a5; */ \
    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
    /* b7 = a7 - (a1>>2); */ \
    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
    /* DST(0, b0 + b7); */ \
    d0 = vec_add(b0v, b7v); \
    /* DST(1, b2 + b5); */ \
    d1 = vec_add(b2v, b5v); \
    /* DST(2, b4 + b3); */ \
    d2 = vec_add(b4v, b3v); \
    /* DST(3, b6 + b1); */ \
    d3 = vec_add(b6v, b1v); \
    /* DST(4, b6 - b1); */ \
    d4 = vec_sub(b6v, b1v); \
    /* DST(5, b4 - b3); */ \
    d5 = vec_sub(b4v, b3v); \
    /* DST(6, b2 - b5); */ \
    d6 = vec_sub(b2v, b5v); \
    /* DST(7, b0 - b7); */ \
    d7 = vec_sub(b0v, b7v); \
}

#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
    /* unaligned load */ \
    vec_u8_t hv = vec_ld( 0, dest ); \
    vec_u8_t lv = vec_ld( 7, dest ); \
    vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
    vec_u8_t edgehv; \
    /* unaligned store */ \
    vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
    lv = vec_sel( lv, bodyv, edgelv ); \
    vec_st( lv, 7, dest ); \
    hv = vec_ld( 0, dest ); \
    edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
    hv = vec_sel( hv, bodyv, edgehv ); \
    vec_st( hv, 0, dest ); \
}

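/* 8x8 IDCT + add: same row/column structure as the 4x4 case above, using
 * IDCT8_1D_ALTIVEC for both passes and ALTIVEC_STORE_SUM_CLIP for the
 * unaligned add/clip/store of each output row. */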
void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);

    const vec_u16_t onev = vec_splat_u16(1);
    const vec_u16_t twov = vec_splat_u16(2);
    const vec_u16_t sixv = vec_splat_u16(6);

    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
                                        -1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t*)dct);
    s1 = vec_ld(0x10, (int16_t*)dct);
    s2 = vec_ld(0x20, (int16_t*)dct);
    s3 = vec_ld(0x30, (int16_t*)dct);
    s4 = vec_ld(0x40, (int16_t*)dct);
    s5 = vec_ld(0x50, (int16_t*)dct);
    s6 = vec_ld(0x60, (int16_t*)dct);
    s7 = vec_ld(0x70, (int16_t*)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}

#define transpose4x16(r0, r1, r2, r3) { \
    register vector unsigned char r4; \
    register vector unsigned char r5; \
    register vector unsigned char r6; \
    register vector unsigned char r7; \
    \
    r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
    r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
    r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
    r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
    \
    r0 = vec_mergeh(r4, r6); /*all set 0*/ \
    r1 = vec_mergel(r4, r6); /*all set 1*/ \
    r2 = vec_mergeh(r5, r7); /*all set 2*/ \
    r3 = vec_mergel(r5, r7); /*all set 3*/ \
}

static inline void write16x4(uint8_t *dst, int dst_stride,
                             register vector unsigned char r0, register vector unsigned char r1,
                             register vector unsigned char r2, register vector unsigned char r3) {
    DECLARE_ALIGNED_16(unsigned char, result[64]);
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    vec_st(r0,  0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int+   int_dst_stride) = *(src_int + 1);
    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
    *(dst_int+10*int_dst_stride) = *(src_int + 10);
    *(dst_int+11*int_dst_stride) = *(src_int + 11);
    *(dst_int+12*int_dst_stride) = *(src_int + 12);
    *(dst_int+13*int_dst_stride) = *(src_int + 13);
    *(dst_int+14*int_dst_stride) = *(src_int + 14);
    *(dst_int+15*int_dst_stride) = *(src_int + 15);
}

/** \brief performs a 6x16 transpose of data in src, and stores it to dst
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
    out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    register vector unsigned char r0  = unaligned_load(0,             src);\
    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
    \
    r8  = unaligned_load( 8*src_stride, src); \
    r9  = unaligned_load( 9*src_stride, src); \
    r10 = unaligned_load(10*src_stride, src); \
    r11 = unaligned_load(11*src_stride, src); \
    r12 = unaligned_load(12*src_stride, src); \
    r13 = unaligned_load(13*src_stride, src); \
    \
    /*Merge first pairs*/ \
    r0 = vec_mergeh(r0, r8);  /*0, 8*/ \
    r1 = vec_mergeh(r1, r9);  /*1, 9*/ \
    r2 = vec_mergeh(r2, r10); /*2,10*/ \
    r3 = vec_mergeh(r3, r11); /*3,11*/ \
    r4 = vec_mergeh(r4, r12); /*4,12*/ \
    r5 = vec_mergeh(r5, r13); /*5,13*/ \
    r6 = vec_mergeh(r6, r14); /*6,14*/ \
    r7 = vec_mergeh(r7, r15); /*7,15*/ \
    \
    /*Merge second pairs*/ \
    r8  = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
    r9  = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
    r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
    r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
    r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
    r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
    r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
    r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
    \
    /*Third merge*/ \
    r0 = vec_mergeh(r8,  r12); /*0,2,4,6,8,10,12,14 set 0*/ \
    r1 = vec_mergel(r8,  r12); /*0,2,4,6,8,10,12,14 set 1*/ \
    r2 = vec_mergeh(r9,  r13); /*0,2,4,6,8,10,12,14 set 2*/ \
    r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
    r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
    r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
    /* Don't need to compute 3 and 7*/ \
    \
    /*Final merge*/ \
    r8  = vec_mergeh(r0, r4); /*all set 0*/ \
    r9  = vec_mergel(r0, r4); /*all set 1*/ \
    r10 = vec_mergeh(r1, r5); /*all set 2*/ \
    r11 = vec_mergel(r1, r5); /*all set 3*/ \
    r12 = vec_mergeh(r2, r6); /*all set 4*/ \
    r13 = vec_mergel(r2, r6); /*all set 5*/ \
    /* Don't need to compute 14 and 15*/ \
    \
}

// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
                                                     register vector unsigned char y,
                                                     register vector unsigned char a) {

    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}

static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
                                                       register vector unsigned char p1,
                                                       register vector unsigned char q0,
                                                       register vector unsigned char q1,
                                                       register vector unsigned char alpha,
                                                       register vector unsigned char beta) {

    register vector unsigned char mask;
    register vector unsigned char tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}

// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0) {

    register vector unsigned char average = vec_avg(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char unclipped;
    register vector unsigned char ones;
    register vector unsigned char max;
    register vector unsigned char min;
    register vector unsigned char newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);      /*avg(p2, avg(p0, q0)) */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);          /*(p2^avg(p0, q0)) & 1 */
    unclipped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, unclipped);
    newp1 = vec_min(max, newp1);
    return newp1;
}

#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
    \
    const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
    \
    register vector unsigned char pq0bit = vec_xor(p0,q0); \
    register vector unsigned char q1minus; \
    register vector unsigned char p0minus; \
    register vector unsigned char stage1; \
    register vector unsigned char stage2; \
    register vector unsigned char vec160; \
    register vector unsigned char delta; \
    register vector unsigned char deltaneg; \
    \
    q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
    stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
    stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
    p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
    stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
    pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
    stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
    stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
    vec160 = vec_ld(0, &A0v); \
    deltaneg = vec_subs(vec160, stage2); /* -d */ \
    delta = vec_subs(stage2, vec160); /* d */ \
    deltaneg = vec_min(tc0masked, deltaneg); \
    delta = vec_min(tc0masked, delta); \
    p0 = vec_subs(p0, deltaneg); \
    q0 = vec_subs(q0, delta); \
    p0 = vec_adds(p0, delta); \
    q0 = vec_adds(q0, deltaneg); \
}

#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
    DECLARE_ALIGNED_16(unsigned char, temp[16]); \
    register vector unsigned char alphavec; \
    register vector unsigned char betavec; \
    register vector unsigned char mask; \
    register vector unsigned char p1mask; \
    register vector unsigned char q1mask; \
    register vector signed char tc0vec; \
    register vector unsigned char finaltc0; \
    register vector unsigned char tc0masked; \
    register vector unsigned char newp1; \
    register vector unsigned char newq1; \
    \
    temp[0] = alpha; \
    temp[1] = beta; \
    alphavec = vec_ld(0, temp); \
    betavec = vec_splat(alphavec, 0x1); \
    alphavec = vec_splat(alphavec, 0x0); \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \
    \
    *((int *)temp) = *((int *)tc0); \
    tc0vec = vec_ld(0, (signed char*)temp); \
    tc0vec = vec_mergeh(tc0vec, tc0vec); \
    tc0vec = vec_mergeh(tc0vec, tc0vec); \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \
    \
    p1mask = diff_lt_altivec(p2, p0, betavec); \
    p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \
    finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
    /*end if*/ \
    \
    q1mask = diff_lt_altivec(q2, q0, betavec); \
    q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \
    finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
    /*end if*/ \
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
    p1 = newp1; \
    q1 = newq1; \
}

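/* The AND of the four tc0 values is negative only when all of them are
 * negative, so the checks below skip the whole edge in that case;
 * individually negative tc0 values are masked off inside
 * h264_loop_filter_luma_altivec. */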
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}

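/* The horizontal filter works on transposed data: read a 16x6 strip with
 * readAndTranspose16x6, run the same filter macro as the vertical case,
 * transpose the four modified rows back and write them with write16x4. */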
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    register vector unsigned char line0, line1, line2, line3, line4, line5;
    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}

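/* Runtime init: the AltiVec versions are installed in the DSPContext only
 * when AltiVec support is compiled in (HAVE_ALTIVEC) and detected at
 * runtime (has_altivec()). */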
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
        c->h264_idct_add = ff_h264_idct_add_altivec;
        c->h264_idct8_add = ff_h264_idct8_add_altivec;
        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;

#define dspfunc(PFX, IDX, NUM) \
        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
        c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
        c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
        c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
        c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
        c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
        c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
        c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
        c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
        c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
        c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
        c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
        c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
        c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec

        dspfunc(put_h264_qpel, 0, 16);
        dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc

    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}