cosmetics
[libav.git] / libavcodec / ppc / h264_altivec.c
1 /*
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "dsputil.h"
22
23 #include "gcc_fixes.h"
24
25 #include "dsputil_altivec.h"
26 #include "types_altivec.h"
27
/*
 * Per-pixel store operations that get plugged into the shared template
 * through OP_U8_ALTIVEC:
 *   PUT_OP_U8_ALTIVEC  assigns s to d (the dst argument is ignored);
 *   AVG_OP_U8_ALTIVEC  assigns vec_avg(dst, s) to d.
 */
28 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
29 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
30
/*
 * First inclusion of h264_template_altivec.c: OP_U8_ALTIVEC is the "put"
 * operation and every PREFIX_* name expands to a put_* / altivec_put_*
 * identifier. Presumably the template defines the functions and profiling
 * counters named by these macros -- the template itself is not visible here.
 * All macros are #undef'd afterwards so the file can be included again.
 */
31 #define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
32 #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
33 #define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
34 #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
35 #define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
36 #define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
37 #define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
38 #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
39 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
40 #include "h264_template_altivec.c"
41 #undef OP_U8_ALTIVEC
42 #undef PREFIX_h264_chroma_mc8_altivec
43 #undef PREFIX_h264_chroma_mc8_num
44 #undef PREFIX_h264_qpel16_h_lowpass_altivec
45 #undef PREFIX_h264_qpel16_h_lowpass_num
46 #undef PREFIX_h264_qpel16_v_lowpass_altivec
47 #undef PREFIX_h264_qpel16_v_lowpass_num
48 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
49 #undef PREFIX_h264_qpel16_hv_lowpass_num
50
/*
 * Second inclusion of the same template, this time with the "avg"
 * operation and avg_* / altivec_avg_* names.
 */
51 #define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
52 #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
53 #define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
54 #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
55 #define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
56 #define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
57 #define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
58 #define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
59 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
60 #include "h264_template_altivec.c"
61 #undef OP_U8_ALTIVEC
62 #undef PREFIX_h264_chroma_mc8_altivec
63 #undef PREFIX_h264_chroma_mc8_num
64 #undef PREFIX_h264_qpel16_h_lowpass_altivec
65 #undef PREFIX_h264_qpel16_h_lowpass_num
66 #undef PREFIX_h264_qpel16_v_lowpass_altivec
67 #undef PREFIX_h264_qpel16_v_lowpass_num
68 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
69 #undef PREFIX_h264_qpel16_hv_lowpass_num
70
/*
 * H264_MC(OPNAME, SIZE, CODETYPE) generates sixteen static functions named
 *     OPNAME h264_qpel SIZE _mcXY_ CODETYPE (dst, src, stride)
 * for X, Y in 0..3.
 *
 * As can be read from the bodies below, a non-zero first digit involves the
 * horizontal lowpass helper and a non-zero second digit the vertical one:
 *   mc00            plain OPNAME pixels SIZE copy/average;
 *   mc20 / mc02     the h / v lowpass result written directly with OPNAME;
 *   mc22            the hv lowpass result written directly with OPNAME;
 *   all others      two intermediate planes (or src itself) combined by
 *                   OPNAME pixels SIZE _l2_ CODETYPE.
 *
 * Intermediate planes are always produced with the put_ lowpass functions
 * into 16-byte-aligned SIZE*SIZE buffers with a row pitch of SIZE; only the
 * final write to dst uses OPNAME. The hv variants additionally need an
 * int16_t scratch buffer of SIZE*(SIZE+8) elements.
 *
 * The expansion requires OPNAME pixels SIZE _ CODETYPE,
 * OPNAME pixels SIZE _l2_ CODETYPE and the put_/OPNAME h/v/hv lowpass
 * functions for the given SIZE and CODETYPE to be visible already.
 */
71 #define H264_MC(OPNAME, SIZE, CODETYPE) \
72 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
73 OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
74 }\
75 \
76 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
77 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
78 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
79 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
80 }\
81 \
82 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
83 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
84 }\
85 \
86 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
87 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
88 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
89 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
90 }\
91 \
92 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
93 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
94 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
95 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
96 }\
97 \
98 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
99 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
100 }\
101 \
102 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
103 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
104 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
105 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
106 }\
107 \
108 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
109 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
110 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
111 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
112 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
113 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
114 }\
115 \
116 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
117 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
118 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
119 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
120 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
121 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
122 }\
123 \
124 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
125 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
126 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
127 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
128 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
129 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
130 }\
131 \
132 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
133 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
134 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
135 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
136 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
137 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
138 }\
139 \
140 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
141 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
142 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
143 }\
144 \
145 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
146 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
147 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
148 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
149 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
150 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
151 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
152 }\
153 \
154 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
155 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
156 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
157 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
158 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
159 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
160 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
161 }\
162 \
163 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
164 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
165 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
166 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
167 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
168 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
169 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
170 }\
171 \
172 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
173 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
174 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
175 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
176 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
177 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
178 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
179 }\
180
181 /* this code assume that stride % 16 == 0 */
182 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
183 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
184 {((8 - x) * (8 - y)),
185 ((x) * (8 - y)),
186 ((8 - x) * (y)),
187 ((x) * (y))};
188 register int i;
189 vector unsigned char fperm;
190 const vector signed int vABCD = vec_ld(0, ABCD);
191 const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
192 const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
193 const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
194 const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
195 const vector signed int vzero = vec_splat_s32(0);
196 const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
197 const vector unsigned short v6us = vec_splat_u16(6);
198 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
199 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
200
201 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
202 vector unsigned char vsrc0uc, vsrc1uc;
203 vector signed short vsrc0ssH, vsrc1ssH;
204 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
205 vector signed short vsrc2ssH, vsrc3ssH, psum;
206 vector unsigned char vdst, ppsum, fsum;
207
208 if (((unsigned long)dst) % 16 == 0) {
209 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
210 0x14, 0x15, 0x16, 0x17,
211 0x08, 0x09, 0x0A, 0x0B,
212 0x0C, 0x0D, 0x0E, 0x0F);
213 } else {
214 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
215 0x04, 0x05, 0x06, 0x07,
216 0x18, 0x19, 0x1A, 0x1B,
217 0x1C, 0x1D, 0x1E, 0x1F);
218 }
219
220 vsrcAuc = vec_ld(0, src);
221
222 if (loadSecond)
223 vsrcBuc = vec_ld(16, src);
224 vsrcperm0 = vec_lvsl(0, src);
225 vsrcperm1 = vec_lvsl(1, src);
226
227 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
228 if (reallyBadAlign)
229 vsrc1uc = vsrcBuc;
230 else
231 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
232
233 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
234 (vector unsigned char)vsrc0uc);
235 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
236 (vector unsigned char)vsrc1uc);
237
238 if (!loadSecond) {// -> !reallyBadAlign
239 for (i = 0 ; i < h ; i++) {
240
241
242 vsrcCuc = vec_ld(stride + 0, src);
243
244 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
245 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
246
247 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
248 (vector unsigned char)vsrc2uc);
249 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
250 (vector unsigned char)vsrc3uc);
251
252 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
253 psum = vec_mladd(vB, vsrc1ssH, psum);
254 psum = vec_mladd(vC, vsrc2ssH, psum);
255 psum = vec_mladd(vD, vsrc3ssH, psum);
256 psum = vec_add(v28ss, psum);
257 psum = vec_sra(psum, v6us);
258
259 vdst = vec_ld(0, dst);
260 ppsum = (vector unsigned char)vec_packsu(psum, psum);
261 fsum = vec_perm(vdst, ppsum, fperm);
262
263 vec_st(fsum, 0, dst);
264
265 vsrc0ssH = vsrc2ssH;
266 vsrc1ssH = vsrc3ssH;
267
268 dst += stride;
269 src += stride;
270 }
271 } else {
272 vector unsigned char vsrcDuc;
273 for (i = 0 ; i < h ; i++) {
274 vsrcCuc = vec_ld(stride + 0, src);
275 vsrcDuc = vec_ld(stride + 16, src);
276
277 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
278 if (reallyBadAlign)
279 vsrc3uc = vsrcDuc;
280 else
281 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
282
283 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
284 (vector unsigned char)vsrc2uc);
285 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
286 (vector unsigned char)vsrc3uc);
287
288 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
289 psum = vec_mladd(vB, vsrc1ssH, psum);
290 psum = vec_mladd(vC, vsrc2ssH, psum);
291 psum = vec_mladd(vD, vsrc3ssH, psum);
292 psum = vec_add(v28ss, psum);
293 psum = vec_sr(psum, v6us);
294
295 vdst = vec_ld(0, dst);
296 ppsum = (vector unsigned char)vec_pack(psum, psum);
297 fsum = vec_perm(vdst, ppsum, fperm);
298
299 vec_st(fsum, 0, dst);
300
301 vsrc0ssH = vsrc2ssH;
302 vsrc1ssH = vsrc3ssH;
303
304 dst += stride;
305 src += stride;
306 }
307 }
308 }
309
310 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
311 const uint8_t * src2, int dst_stride,
312 int src_stride1, int h)
313 {
314 int i;
315 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
316
317 mask_ = vec_lvsl(0, src2);
318
319 for (i = 0; i < h; i++) {
320
321 tmp1 = vec_ld(i * src_stride1, src1);
322 mask = vec_lvsl(i * src_stride1, src1);
323 tmp2 = vec_ld(i * src_stride1 + 15, src1);
324
325 a = vec_perm(tmp1, tmp2, mask);
326
327 tmp1 = vec_ld(i * 16, src2);
328 tmp2 = vec_ld(i * 16 + 15, src2);
329
330 b = vec_perm(tmp1, tmp2, mask_);
331
332 tmp1 = vec_ld(0, dst);
333 mask = vec_lvsl(0, dst);
334 tmp2 = vec_ld(15, dst);
335
336 d = vec_avg(a, b);
337
338 edges = vec_perm(tmp2, tmp1, mask);
339
340 align = vec_lvsr(0, dst);
341
342 tmp2 = vec_perm(d, edges, align);
343 tmp1 = vec_perm(edges, d, align);
344
345 vec_st(tmp2, 15, dst);
346 vec_st(tmp1, 0 , dst);
347
348 dst += dst_stride;
349 }
350 }
351
352 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
353 const uint8_t * src2, int dst_stride,
354 int src_stride1, int h)
355 {
356 int i;
357 vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
358
359 mask_ = vec_lvsl(0, src2);
360
361 for (i = 0; i < h; i++) {
362
363 tmp1 = vec_ld(i * src_stride1, src1);
364 mask = vec_lvsl(i * src_stride1, src1);
365 tmp2 = vec_ld(i * src_stride1 + 15, src1);
366
367 a = vec_perm(tmp1, tmp2, mask);
368
369 tmp1 = vec_ld(i * 16, src2);
370 tmp2 = vec_ld(i * 16 + 15, src2);
371
372 b = vec_perm(tmp1, tmp2, mask_);
373
374 tmp1 = vec_ld(0, dst);
375 mask = vec_lvsl(0, dst);
376 tmp2 = vec_ld(15, dst);
377
378 d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
379
380 edges = vec_perm(tmp2, tmp1, mask);
381
382 align = vec_lvsr(0, dst);
383
384 tmp2 = vec_perm(d, edges, align);
385 tmp1 = vec_perm(edges, d, align);
386
387 vec_st(tmp2, 15, dst);
388 vec_st(tmp1, 0 , dst);
389
390 dst += dst_stride;
391 }
392 }
393
394 /* Implemented but could be faster
395 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
396 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
397 */
398
/* Expand H264_MC twice to define the static put_ and avg_
 * h264_qpel16_mcXY_altivec functions (X, Y in 0..3). */
399 H264_MC(put_, 16, altivec)
400 H264_MC(avg_, 16, altivec)
401
402
403 /****************************************************************************
404 * IDCT transform:
405 ****************************************************************************/
406
/*
 * One 1-D pass of the 4-point transform butterfly.
 * Inputs vb0..vb3 are read only; outputs are assigned to va0..va3.
 * Uses vz0..vz3 as temporaries, which must be declared by the caller.
 * The expansion is a statement sequence without a trailing semicolon.
 */
407 #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
408 /* 1st stage */ \
409 vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
410 vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
411 vz2 = vec_sra(vb1,vec_splat_u16(1)); \
412 vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
413 vz3 = vec_sra(vb3,vec_splat_u16(1)); \
414 vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
415 /* 2nd stage: output */ \
416 va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
417 va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
418 va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
419 va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */
420
/*
 * Transpose the 4x4 matrix formed by the first four elements of a0..a3.
 * On exit the first four elements of b0..b3 hold columns 0..3 of the
 * input; the remaining elements of b0..b3 are filled with copies of a0's
 * elements and carry no additional information.
 * a0..a3 are used as scratch and are overwritten.
 */
421 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
422 b0 = vec_mergeh( a0, a0 ); \
423 b1 = vec_mergeh( a1, a0 ); \
424 b2 = vec_mergeh( a2, a0 ); \
425 b3 = vec_mergeh( a3, a0 ); \
426 a0 = vec_mergeh( b0, b2 ); \
427 a1 = vec_mergel( b0, b2 ); \
428 a2 = vec_mergeh( b1, b3 ); \
429 a3 = vec_mergel( b1, b3 ); \
430 b0 = vec_mergeh( a0, a2 ); \
431 b1 = vec_mergel( a0, a2 ); \
432 b2 = vec_mergeh( a1, a3 ); \
433 b3 = vec_mergel( a1, a3 )
434
/*
 * Add the 16-bit residuals in va to the pixels at dst and store 4 bytes back.
 *
 * Steps: load the aligned line containing dst, shift the pixels at dst to
 * the front with vdst_mask, zero-extend them to 16 bits, add va, pack with
 * unsigned saturation, splat the first 32-bit word and store that single
 * word with vec_ste. va itself is overwritten with the sum.
 *
 * Relies on these names from the enclosing scope: dst, vdst_mask, element,
 * vdst_orig, vdst, vdst_ss, va_u8, va_u32, zero_u8v and zero_s16v.
 * Note that the expansion already ends with a semicolon.
 */
435 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
436 vdst_orig = vec_ld(0, dst); \
437 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
438 vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
439 va = vec_add(va, vdst_ss); \
440 va_u8 = vec_packsu(va, zero_s16v); \
441 va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
442 vec_ste(va_u32, element, (uint32_t*)dst);
443
/**
 * 4x4 inverse transform of block, with the result added to the 4x4 pixel
 * area at dst.
 *
 * @param dst    top-left destination pixel; 4 bytes are stored per row
 *               through a uint32_t pointer, so dst is presumably 4-byte
 *               aligned -- confirm against callers
 * @param block  16 coefficients, read with two aligned vector loads;
 *               block[0] is modified in place (32 is added for rounding)
 * @param stride distance in bytes between destination rows
 *
 * NOTE(review): vdst_mask and element are derived once from the initial dst
 * and reused for all four rows, which looks like it assumes
 * stride % 16 == 0 -- confirm.
 */
444 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
445 {
446 vec_s16_t va0, va1, va2, va3;
447 vec_s16_t vz0, vz1, vz2, vz3;
448 vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
449 vec_u8_t va_u8;
450 vec_u32_t va_u32;
451 vec_s16_t vdst_ss;
452 const vec_u16_t v6us = vec_splat_u16(6);
453 vec_u8_t vdst, vdst_orig;
454 vec_u8_t vdst_mask = vec_lvsl(0, dst);
/* index of the 32-bit word inside the aligned line that dst points at */
455 int element = ((unsigned long)dst & 0xf) >> 2;
456 LOAD_ZERO;
457
458 block[0] += 32; /* add 32 as a DC-level for rounding */
459
/* each 16-byte load holds two rows of four coefficients; vec_sld by 8
 * bytes rotates the second row of each pair to the front */
460 vtmp0 = vec_ld(0,block);
461 vtmp1 = vec_sld(vtmp0, vtmp0, 8);
462 vtmp2 = vec_ld(16,block);
463 vtmp3 = vec_sld(vtmp2, vtmp2, 8);
464
/* 1-D pass, transpose, second 1-D pass */
465 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
466 VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
467 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
468
/* final scaling: arithmetic shift right by 6 */
469 va0 = vec_sra(va0,v6us);
470 va1 = vec_sra(va1,v6us);
471 va2 = vec_sra(va2,v6us);
472 va3 = vec_sra(va3,v6us);
473
/* add each result row to the destination pixels, one row at a time */
474 VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
475 dst += stride;
476 VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
477 dst += stride;
478 VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
479 dst += stride;
480 VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
481 }
482
/*
 * One 1-D pass of the 8-point inverse transform on eight vec_s16_t inputs
 * s0..s7, assigning the results to d0..d7. The scalar formula implemented
 * by each vector statement is given in the comment above it.
 *
 * The body is a braced block with its own temporaries. It relies on the
 * shift-count vectors onev and twov from the enclosing scope.
 */
483 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
484 /* a0 = SRC(0) + SRC(4); */ \
485 vec_s16_t a0v = vec_add(s0, s4); \
486 /* a2 = SRC(0) - SRC(4); */ \
487 vec_s16_t a2v = vec_sub(s0, s4); \
488 /* a4 = (SRC(2)>>1) - SRC(6); */ \
489 vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
490 /* a6 = (SRC(6)>>1) + SRC(2); */ \
491 vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
492 /* b0 = a0 + a6; */ \
493 vec_s16_t b0v = vec_add(a0v, a6v); \
494 /* b2 = a2 + a4; */ \
495 vec_s16_t b2v = vec_add(a2v, a4v); \
496 /* b4 = a2 - a4; */ \
497 vec_s16_t b4v = vec_sub(a2v, a4v); \
498 /* b6 = a0 - a6; */ \
499 vec_s16_t b6v = vec_sub(a0v, a6v); \
500 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
501 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
502 vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
503 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
504 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
505 vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
506 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
507 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
508 vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
509 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
510 vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
511 /* b1 = (a7>>2) + a1; */ \
512 vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
513 /* b3 = a3 + (a5>>2); */ \
514 vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
515 /* b5 = (a3>>2) - a5; */ \
516 vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
517 /* b7 = a7 - (a1>>2); */ \
518 vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
519 /* DST(0, b0 + b7); */ \
520 d0 = vec_add(b0v, b7v); \
521 /* DST(1, b2 + b5); */ \
522 d1 = vec_add(b2v, b5v); \
523 /* DST(2, b4 + b3); */ \
524 d2 = vec_add(b4v, b3v); \
525 /* DST(3, b6 + b1); */ \
526 d3 = vec_add(b6v, b1v); \
527 /* DST(4, b6 - b1); */ \
528 d4 = vec_sub(b6v, b1v); \
529 /* DST(5, b4 - b3); */ \
530 d5 = vec_sub(b4v, b3v); \
531 /* DST(6, b2 - b5); */ \
532 d6 = vec_sub(b2v, b5v); \
533 /* DST(7, b0 - b7); */ \
534 d7 = vec_sub(b0v, b7v); \
535 }
536
/*
 * Add one row of transform output to 8 destination pixels and store it.
 *
 * idctv is shifted right arithmetically by 6, added with signed saturation
 * to the zero-extended pixels at dest, and packed to bytes with unsigned
 * saturation. The 8 result bytes are then merged into the two aligned
 * lines covering dest using vec_sel, with masks derived from sel through
 * perm_stv, so that only the 8 target bytes change.
 *
 * dest      address of the 8 pixels (may be unaligned)
 * idctv     vec_s16_t row of results before the final >> 6
 * perm_ldv  permute vector for the unaligned load of dest
 * perm_stv  permute vector used to position the result for the store
 * sel       byte mask selecting the 8 packed result bytes
 *
 * Relies on sixv, zero_u8v and zero_s16v from the enclosing scope.
 */
537 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
538 /* unaligned load */ \
539 vec_u8_t hv = vec_ld( 0, dest ); \
540 vec_u8_t lv = vec_ld( 7, dest ); \
541 vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
542 vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
543 vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
544 vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
545 vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
546 vec_u8_t edgehv; \
547 /* unaligned store */ \
548 vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
549 vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
550 lv = vec_sel( lv, bodyv, edgelv ); \
551 vec_st( lv, 7, dest ); \
552 hv = vec_ld( 0, dest ); \
553 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
554 hv = vec_sel( hv, bodyv, edgehv ); \
555 vec_st( hv, 0, dest ); \
556 }
557
/**
 * 8x8 inverse transform of dct, with the result added to the 8x8 pixel
 * area at dst.
 *
 * @param dst    top-left destination pixel, may be unaligned
 * @param dct    64 coefficients, read as eight aligned 16-byte rows;
 *               dct[0] is modified in place (32 is added for rounding)
 * @param stride distance in bytes between destination rows
 *
 * NOTE(review): perm_ldv and perm_stv are derived once from dst and reused
 * for all eight rows, which looks like it assumes stride % 16 == 0 --
 * confirm.
 */
558 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
559 vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
560 vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
561 vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
562
563 vec_u8_t perm_ldv = vec_lvsl(0, dst);
564 vec_u8_t perm_stv = vec_lvsr(8, dst);
565
/* shift counts referenced by name inside IDCT8_1D_ALTIVEC (onev, twov)
 * and ALTIVEC_STORE_SUM_CLIP (sixv) */
566 const vec_u16_t onev = vec_splat_u16(1);
567 const vec_u16_t twov = vec_splat_u16(2);
568 const vec_u16_t sixv = vec_splat_u16(6);
569
/* selects the last 8 bytes of a vector, where the packed result lives */
570 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
571 -1,-1,-1,-1,-1,-1,-1,-1);
572 LOAD_ZERO;
573
574 dct[0] += 32; // rounding for the >>6 at the end
575
576 s0 = vec_ld(0x00, (int16_t*)dct);
577 s1 = vec_ld(0x10, (int16_t*)dct);
578 s2 = vec_ld(0x20, (int16_t*)dct);
579 s3 = vec_ld(0x30, (int16_t*)dct);
580 s4 = vec_ld(0x40, (int16_t*)dct);
581 s5 = vec_ld(0x50, (int16_t*)dct);
582 s6 = vec_ld(0x60, (int16_t*)dct);
583 s7 = vec_ld(0x70, (int16_t*)dct);
584
/* 1-D pass, in-place transpose (TRANSPOSE8 is defined elsewhere),
 * second 1-D pass */
585 IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
586 d0, d1, d2, d3, d4, d5, d6, d7);
587
588 TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );
589
590 IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
591 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
592
/* scale, add to the destination and store, one row at a time */
593 ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
594 ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
595 ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
596 ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
597 ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
598 ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
599 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
600 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
601 }
602
/*
 * In-place transpose of four 16-byte rows r0..r3 into sixteen 4-byte groups.
 * Afterwards group i is (old r0[i], old r1[i], old r2[i], old r3[i]);
 * r0 holds groups 0..3, r1 groups 4..7, r2 groups 8..11 and r3 groups
 * 12..15. The temporaries r4..r7 are local to the macro's block.
 */
603 #define transpose4x16(r0, r1, r2, r3) { \
604 register vector unsigned char r4; \
605 register vector unsigned char r5; \
606 register vector unsigned char r6; \
607 register vector unsigned char r7; \
608 \
609 r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
610 r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
611 r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
612 r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
613 \
614 r0 = vec_mergeh(r4, r6); /*all set 0*/ \
615 r1 = vec_mergel(r4, r6); /*all set 1*/ \
616 r2 = vec_mergeh(r5, r7); /*all set 2*/ \
617 r3 = vec_mergel(r5, r7); /*all set 3*/ \
618 }
619
620 static inline void write16x4(uint8_t *dst, int dst_stride,
621 register vector unsigned char r0, register vector unsigned char r1,
622 register vector unsigned char r2, register vector unsigned char r3) {
623 DECLARE_ALIGNED_16(unsigned char, result[64]);
624 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
625 int int_dst_stride = dst_stride/4;
626
627 vec_st(r0, 0, result);
628 vec_st(r1, 16, result);
629 vec_st(r2, 32, result);
630 vec_st(r3, 48, result);
631 /* FIXME: there has to be a better way!!!! */
632 *dst_int = *src_int;
633 *(dst_int+ int_dst_stride) = *(src_int + 1);
634 *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
635 *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
636 *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
637 *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
638 *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
639 *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
640 *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
641 *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
642 *(dst_int+10*int_dst_stride) = *(src_int + 10);
643 *(dst_int+11*int_dst_stride) = *(src_int + 11);
644 *(dst_int+12*int_dst_stride) = *(src_int + 12);
645 *(dst_int+13*int_dst_stride) = *(src_int + 13);
646 *(dst_int+14*int_dst_stride) = *(src_int + 14);
647 *(dst_int+15*int_dst_stride) = *(src_int + 15);
648 }
649
/** \brief Reads 16 rows from src and transposes their first 6 columns.

    Sixteen rows, src_stride bytes apart, are fetched with unaligned_load().
    On exit r8..r13 each hold one column: r8 contains byte 0 of rows 0..15,
    r9 byte 1, and so on up to r13 with byte 5. Only the first 8 bytes of
    each row take part in the merges. The row registers r0..r7, r14 and r15
    are local to the macro's block.

    \todo FIXME: see if we can spare some vec_lvsl() calls by factoring them
    out of unaligned_load() */
653 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
654 register vector unsigned char r0 = unaligned_load(0, src);\
655 register vector unsigned char r1 = unaligned_load( src_stride, src);\
656 register vector unsigned char r2 = unaligned_load(2* src_stride, src);\
657 register vector unsigned char r3 = unaligned_load(3* src_stride, src);\
658 register vector unsigned char r4 = unaligned_load(4* src_stride, src);\
659 register vector unsigned char r5 = unaligned_load(5* src_stride, src);\
660 register vector unsigned char r6 = unaligned_load(6* src_stride, src);\
661 register vector unsigned char r7 = unaligned_load(7* src_stride, src);\
662 register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
663 register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
664 \
665 r8 = unaligned_load( 8*src_stride, src); \
666 r9 = unaligned_load( 9*src_stride, src); \
667 r10 = unaligned_load(10*src_stride, src); \
668 r11 = unaligned_load(11*src_stride, src); \
669 r12 = unaligned_load(12*src_stride, src); \
670 r13 = unaligned_load(13*src_stride, src); \
671 \
672 /*Merge first pairs*/ \
673 r0 = vec_mergeh(r0, r8); /*0, 8*/ \
674 r1 = vec_mergeh(r1, r9); /*1, 9*/ \
675 r2 = vec_mergeh(r2, r10); /*2,10*/ \
676 r3 = vec_mergeh(r3, r11); /*3,11*/ \
677 r4 = vec_mergeh(r4, r12); /*4,12*/ \
678 r5 = vec_mergeh(r5, r13); /*5,13*/ \
679 r6 = vec_mergeh(r6, r14); /*6,14*/ \
680 r7 = vec_mergeh(r7, r15); /*7,15*/ \
681 \
682 /*Merge second pairs*/ \
683 r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
684 r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
685 r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
686 r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
687 r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
688 r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
689 r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
690 r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
691 \
692 /*Third merge*/ \
693 r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
694 r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
695 r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
696 r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
697 r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
698 r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
699 /* Don't need to compute 3 and 7*/ \
700 \
701 /*Final merge*/ \
702 r8 = vec_mergeh(r0, r4); /*all set 0*/ \
703 r9 = vec_mergel(r0, r4); /*all set 1*/ \
704 r10 = vec_mergeh(r1, r5); /*all set 2*/ \
705 r11 = vec_mergel(r1, r5); /*all set 3*/ \
706 r12 = vec_mergeh(r2, r6); /*all set 4*/ \
707 r13 = vec_mergel(r2, r6); /*all set 5*/ \
708 /* Don't need to compute 14 and 15*/ \
709 \
710 }
711
712 // out: o = |x-y| < a
713 static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
714 register vector unsigned char y,
715 register vector unsigned char a) {
716
717 register vector unsigned char diff = vec_subs(x, y);
718 register vector unsigned char diffneg = vec_subs(y, x);
719 register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
720 o = (vector unsigned char)vec_cmplt(o, a);
721 return o;
722 }
723
724 static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
725 register vector unsigned char p1,
726 register vector unsigned char q0,
727 register vector unsigned char q1,
728 register vector unsigned char alpha,
729 register vector unsigned char beta) {
730
731 register vector unsigned char mask;
732 register vector unsigned char tempmask;
733
734 mask = diff_lt_altivec(p0, q0, alpha);
735 tempmask = diff_lt_altivec(p1, p0, beta);
736 mask = vec_and(mask, tempmask);
737 tempmask = diff_lt_altivec(q1, q0, beta);
738 mask = vec_and(mask, tempmask);
739
740 return mask;
741 }
742
/*
 * Updates p1 in place:
 *     p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
 *
 * All arguments are vectors of unsigned bytes; only p1 is written.
 * vec_avg rounds up, so the low bit of (p2 ^ avg(p0, q0)) is subtracted
 * from avg(p2, avg(p0, q0)) to obtain the truncating outer shift. The clip
 * bounds are formed with saturating add/subtract.
 * The temporaries are local to the macro's block.
 */
744 #define h264_deblock_q1(p0, p1, p2, q0, tc0) { \
745 \
746 register vector unsigned char average = vec_avg(p0, q0); \
747 register vector unsigned char temp; \
748 register vector unsigned char uncliped; \
749 register vector unsigned char ones; \
750 register vector unsigned char max; \
751 register vector unsigned char min; \
752 \
753 temp = vec_xor(average, p2); \
754 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ \
755 ones = vec_splat_u8(1); \
756 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ \
757 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */\
758 max = vec_adds(p1, tc0); \
759 min = vec_subs(p1, tc0); \
760 p1 = vec_max(min, uncliped); \
761 p1 = vec_min(max, p1); \
762 }
763
/*
 * Updates p0 and q0 in place by a delta limited to tc0masked.
 *
 * The signed delta is computed entirely in unsigned bytes with a bias of
 * 160 (A0v = 10 << 4); the running value of each step is given in the
 * comments on the right. The positive part (delta) and the negative part
 * (deltaneg) are extracted with saturating subtractions against 160, each
 * is limited to tc0masked with vec_min, and then
 *     p0 = p0 - deltaneg + delta
 *     q0 = q0 - delta + deltaneg
 * using saturating arithmetic. p1 and q1 are only read.
 * The temporaries are local to the macro's block.
 */
764 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
765 \
766 const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
767 \
768 register vector unsigned char pq0bit = vec_xor(p0,q0); \
769 register vector unsigned char temp; \
770 register vector unsigned char q1minus; \
771 register vector unsigned char p0minus; \
772 register vector unsigned char stage1; \
773 register vector unsigned char stage2; \
774 register vector unsigned char vec160; \
775 register vector unsigned char delta; \
776 register vector unsigned char deltaneg; \
777 \
778 temp = (vector unsigned char)vec_cmpeq(p0, p0); \
779 q1minus = vec_xor(temp, q1); /* 255 - q1 */ \
780 stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
781 stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
782 p0minus = vec_xor(temp, p0); /* 255 - p0 */ \
783 stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
784 pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
785 stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
786 stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
787 vec160 = vec_ld(0, &A0v); \
788 deltaneg = vec_subs(vec160, stage2); /* -d */ \
789 delta = vec_subs(stage2, vec160); /* d */ \
790 deltaneg = vec_min(tc0masked, deltaneg); \
791 delta = vec_min(tc0masked, delta); \
792 p0 = vec_subs(p0, deltaneg); \
793 q0 = vec_subs(q0, delta); \
794 p0 = vec_adds(p0, delta); \
795 q0 = vec_adds(q0, deltaneg); \
796 }
797
/*
 * h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0)
 *
 * Runs the luma deblocking filter on 16 edge positions at once.
 *
 *   p2, p1, p0  "vector unsigned char" lvalues holding the three samples on
 *               one side of the edge (p0 is adjacent to the edge)
 *   q0, q1, q2  the same for the other side (q0 is adjacent to the edge)
 *   alpha, beta thresholds; each is stored into one byte and splatted
 *   tc0         pointer to 4 signed bytes; each one is replicated to 4
 *               consecutive lanes.  A negative value disables filtering for
 *               its 4 lanes.
 *
 * On exit p1, p0, q0 and q1 hold the filtered samples; p2 and q2 are only
 * read.  Arguments are expanded several times and must have no side effects.
 *
 * Notes on the implementation:
 *  - tc0 is kept as a *signed* vector for the "tc0[i] >= 0" test.  An
 *    unsigned "> 255" compare can never be true and would switch the whole
 *    filter off.
 *  - p0/q0 must be computed from the unfiltered p1/q1, so the filtered
 *    neighbours are built in newp1/newq1 and only stored back at the end.
 *  - The four tc0 bytes are copied one by one; tc0 is a byte pointer with no
 *    alignment guarantee, so it is not read through an int pointer.
 *  - The masks are assumed to be 0xFF/0x00 per lane (NOTE(review): relies on
 *    h264_deblock_mask() and diff_lt_altivec(), defined earlier in the file).
 *    Subtracting a 0xFF lane therefore adds 1 modulo 256 ("tc++").
 */
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) do {         \
    DECLARE_ALIGNED_16(unsigned char, temp[16]);                                             \
    register vector unsigned char alphavec;                                                  \
    register vector unsigned char betavec;                                                   \
    register vector unsigned char mask;                                                      \
    register vector unsigned char p1mask;                                                    \
    register vector unsigned char q1mask;                                                    \
    register vector signed   char tc0vec;                                                    \
    register vector unsigned char tc0bits;                                                   \
    register vector unsigned char finaltc0;                                                  \
    register vector unsigned char tc0masked;                                                 \
    register vector unsigned char newp1;                                                     \
    register vector unsigned char newq1;                                                     \
                                                                                             \
    temp[0] = alpha;                                                                         \
    temp[1] = beta;                                                                          \
    alphavec = vec_ld(0, temp);                                                              \
    betavec  = vec_splat(alphavec, 0x1);                                                     \
    alphavec = vec_splat(alphavec, 0x0);                                                     \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */           \
                                                                                             \
    temp[0] = (tc0)[0];                                                                      \
    temp[1] = (tc0)[1];                                                                      \
    temp[2] = (tc0)[2];                                                                      \
    temp[3] = (tc0)[3];                                                                      \
    tc0vec = vec_ld(0, (signed char *)temp);                                                 \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
    tc0vec = vec_mergeh(tc0vec, tc0vec);          /* each tc0[i] now fills 4 lanes */        \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */          \
    tc0bits  = (vector unsigned char)tc0vec;                                                 \
    finaltc0 = vec_and(tc0bits, mask);                         /* tc = tc0[i] */             \
                                                                                             \
    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
    p1mask = vec_and(p1mask, mask);                            /* if( |p2 - p0| < beta) */   \
    tc0masked = vec_and(p1mask, tc0bits);                                                    \
    finaltc0 = vec_sub(finaltc0, p1mask);                      /* tc++ */                    \
    newp1 = p1;                                                                              \
    h264_deblock_q1(p0, newp1, p2, q0, tc0masked);                                           \
    /*end if*/                                                                               \
                                                                                             \
    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
    q1mask = vec_and(q1mask, mask);                            /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, tc0bits);                                                    \
    finaltc0 = vec_sub(finaltc0, q1mask);                      /* tc++ */                    \
    newq1 = q1;                                                                              \
    h264_deblock_q1(p0, newq1, q2, q0, tc0masked);                                           \
    /*end if*/                                                                               \
                                                                                             \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);  /* uses the unfiltered p1/q1 */           \
    p1 = newp1;                                                                              \
    q1 = newq1;                                                                              \
} while (0)
839
840 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
841
842 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
843 register vector unsigned char p2 = vec_ld(-3*stride, pix);
844 register vector unsigned char p1 = vec_ld(-2*stride, pix);
845 register vector unsigned char p0 = vec_ld(-1*stride, pix);
846 register vector unsigned char q0 = vec_ld(0, pix);
847 register vector unsigned char q1 = vec_ld(stride, pix);
848 register vector unsigned char q2 = vec_ld(2*stride, pix);
849 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
850 vec_st(p1, -2*stride, pix);
851 vec_st(p0, -1*stride, pix);
852 vec_st(q0, 0, pix);
853 vec_st(q1, stride, pix);
854 }
855 }
856
857 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
858
859 register vector unsigned char line0, line1, line2, line3, line4, line5;
860 if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
861 return;
862 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
863 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
864 transpose4x16(line1, line2, line3, line4);
865 write16x4(pix-2, stride, line1, line2, line3, line4);
866 }
867
868 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
869
870 #ifdef HAVE_ALTIVEC
871 if (has_altivec()) {
872 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
873 c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
874 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
875 c->h264_idct_add = ff_h264_idct_add_altivec;
876 c->h264_idct8_add = ff_h264_idct8_add_altivec;
877 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
878 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
879
880 #define dspfunc(PFX, IDX, NUM) \
881 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
882 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
883 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
884 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
885 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
886 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
887 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
888 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
889 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
890 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
891 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
892 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
893 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
894 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
895 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
896 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
897
898 dspfunc(put_h264_qpel, 0, 16);
899 dspfunc(avg_h264_qpel, 0, 16);
900 #undef dspfunc
901
902 } else
903 #endif /* HAVE_ALTIVEC */
904 {
905 // Non-AltiVec PPC optimisations
906
907 // ... pending ...
908 }
909 }