/*
  AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>

  based on code Copyright (C) 2001-2003 by Michael Niedermayer (michaelni@gmx.at)

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

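/* Vector-literal syntax differs between compilers: Apple's GCC on Darwin
   expects parenthesized AltiVec literals while FSF GCC uses braces.
   AVV() hides that difference. */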
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    This code makes no assumptions about src or stride.
    One could avoid recomputing the perm vector by assuming
    (stride % 16) == 0, but unfortunately that is not always true.
  */
  register int y;
  short __attribute__ ((aligned(16))) data[8];
  int numEq;
  uint8_t *src2 = src;
  vector signed short v_dcOffset;
  vector signed short v2QP;
  vector unsigned short v4QP;
  vector unsigned short v_dcThreshold;
  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
  const vector signed int zero = vec_splat_s32(0);
  const vector signed short mask = vec_splat_s16(1);
  vector signed int v_numEq = vec_splat_s32(0);

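  /* Pack the four per-block constants (DC offset, DC threshold, 2*QP, 4*QP)
     into one aligned array so a single vec_ld plus vec_splat turns each of
     them into a full vector. */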
  data[0] = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1;
  data[1] = data[0] * 2 + 1;
  data[2] = c->QP * 2;
  data[3] = c->QP * 4;
  vector signed short v_data = vec_ld(0, data);
  v_dcOffset = vec_splat(v_data, 0);
  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
  v2QP = vec_splat(v_data, 2);
  v4QP = (vector unsigned short)vec_splat(v_data, 3);

  src2 += stride * 4;

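  /* LOAD_LINE() performs an unaligned load of one row: vec_lvsl builds the
     alignment permute, the row may straddle two 16-byte vectors (hence the
     optional second load), and the 8 leftmost bytes are zero-extended to
     signed 16-bit with vec_mergeh. */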
#define LOAD_LINE(i)                                                    \
  register int j##i = i * stride;                                       \
  vector unsigned char perm##i = vec_lvsl(j##i, src2);                  \
  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);           \
  vector unsigned char v_srcA2##i;                                      \
  if (two_vectors)                                                      \
    v_srcA2##i = vec_ld(j##i + 16, src2);                               \
  const vector unsigned char v_srcA##i =                                \
    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                          \
  vector signed short v_srcAss##i =                                     \
    (vector signed short)vec_mergeh((vector signed char)zero,           \
                                    (vector signed char)v_srcA##i)

  LOAD_LINE(0);
  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
#undef LOAD_LINE

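  /* For each pair of adjacent rows, count (per column) how many differences
     fall inside the DC threshold; vec_sum4s accumulates those 0/1 flags into
     v_numEq. The unsigned compare of diff + dcOffset against dcThreshold is
     an absolute-value test: it holds exactly when |diff| <= dcOffset. */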
#define ITER(i, j)                                                      \
  const vector signed short v_diff##i =                                 \
    vec_sub(v_srcAss##i, v_srcAss##j);                                  \
  const vector signed short v_sum##i =                                  \
    vec_add(v_diff##i, v_dcOffset);                                     \
  const vector signed short v_comp##i =                                 \
    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,     \
                                   v_dcThreshold);                      \
  const vector signed short v_part##i = vec_and(mask, v_comp##i);       \
  v_numEq = vec_sum4s(v_part##i, v_numEq);

  ITER(0, 1);
  ITER(1, 2);
  ITER(2, 3);
  ITER(3, 4);
  ITER(4, 5);
  ITER(5, 6);
  ITER(6, 7);
#undef ITER

  v_numEq = vec_sums(v_numEq, zero);

  v_numEq = vec_splat(v_numEq, 3);
  vec_ste(v_numEq, 0, &numEq);

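  /* If enough neighbouring rows were near-equal, run a second test on a
     handful of cross-block differences: adding 2*QP and comparing (unsigned)
     against 4*QP accepts the block (return 1) only when those differences
     stay within +/-2*QP; otherwise return 0. Blocks that are not flat enough
     return 2 instead. */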
  if (numEq > c->ppMode.flatnessThreshold)
    {
      const vector unsigned char mmoP1 = (const vector unsigned char)
        AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
            0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
      const vector unsigned char mmoP2 = (const vector unsigned char)
        AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
            0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
      const vector unsigned char mmoP = (const vector unsigned char)
        vec_lvsl(8, (unsigned char*)0);

      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);

      if (vec_any_gt(mmoSum, v4QP))
        return 0;
      else
        return 1;
    }
  else return 2;
}

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
  /*
    This code makes no assumptions about src or stride.
    One could avoid recomputing the perm vector by assuming
    (stride % 16) == 0, but unfortunately that is not always true.
    Quite a lot of loads/stores could also be removed by assuming
    proper alignment of src & stride :-(
  */
  uint8_t *src2 = src;
  const vector signed int zero = vec_splat_s32(0);
  short __attribute__ ((aligned(16))) qp[8];
  qp[0] = c->QP;
  vector signed short vqp = vec_ld(0, qp);
  vqp = vec_splat(vqp, 0);

#define LOAD_LINE(i)                                                    \
  const vector unsigned char perml##i =                                 \
    vec_lvsl(i * stride, src2);                                         \
  const vector unsigned char vbA##i =                                   \
    vec_ld(i * stride, src2);                                           \
  const vector unsigned char vbB##i =                                   \
    vec_ld(i * stride + 16, src2);                                      \
  const vector unsigned char vbT##i =                                   \
    vec_perm(vbA##i, vbB##i, perml##i);                                 \
  const vector signed short vb##i =                                     \
    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
                                    (vector unsigned char)vbT##i)

  src2 += stride * 3;

  LOAD_LINE(0);
  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
  LOAD_LINE(8);
  LOAD_LINE(9);
#undef LOAD_LINE

  const vector unsigned short v_1 = vec_splat_u16(1);
  const vector unsigned short v_2 = vec_splat_u16(2);
  const vector unsigned short v_4 = vec_splat_u16(4);
  const vector signed short v_8 = vec_splat_s16(8);

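  /* v_first/v_last clamp the filter at the block borders: row 0 (resp. row 9)
     is used only when it is within QP of its neighbour, otherwise the
     neighbour itself is replicated. */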
  const vector signed short v_first = vec_sel(vb1, vb0,
                                              vec_cmplt(vec_abs(vec_sub(vb0, vb1)),
                                                        vqp));
  const vector signed short v_last = vec_sel(vb8, vb9,
                                             vec_cmplt(vec_abs(vec_sub(vb8, vb9)),
                                                       vqp));

  const vector signed short v_sums0 = vec_add(v_first, vb1);
  const vector signed short v_sums1 = vec_add(vb1, vb2);
  const vector signed short v_sums2 = vec_add(vb2, vb3);
  const vector signed short v_sums3 = vec_add(vb3, vb4);
  const vector signed short v_sums4 = vec_add(vb4, vb5);
  const vector signed short v_sums5 = vec_add(vb5, vb6);
  const vector signed short v_sums6 = vec_add(vb6, vb7);
  const vector signed short v_sums7 = vec_add(vb7, vb8);
  const vector signed short v_sums8 = vec_add(vb8, v_last);

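  /* Each output row vr1..vr8 is a weighted average of its neighbourhood with
     weights summing to 16: the v_1/v_2 shifts multiply pair sums by 2/4, and
     the +8 term rounds before the final shift right by 4. */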
  const vector signed short vr1 = vec_sra(vec_add(vec_add(vec_sl(v_sums0, v_2),
                                                          vec_sl(vec_add(v_first, v_sums2), v_1)),
                                                  vec_add(v_sums4, v_8)),
                                          v_4);
  const vector signed short vr2 = vec_sra(vec_add(vec_add(vec_sl(vb2, v_2),
                                                          v_sums5),
                                                  vec_add(v_8,
                                                          vec_sl(vec_add(v_first,
                                                                         vec_add(v_sums0, v_sums3)),
                                                                 v_1))),
                                          v_4);
  const vector signed short vr3 = vec_sra(vec_add(vec_add(vec_sl(vb3, v_2),
                                                          v_sums6),
                                                  vec_add(v_8,
                                                          vec_sl(vec_add(v_first,
                                                                         vec_add(v_sums1, v_sums4)),
                                                                 v_1))),
                                          v_4);
  const vector signed short vr4 = vec_sra(vec_add(vec_add(vec_sl(vb4, v_2),
                                                          v_sums7),
                                                  vec_add(v_8,
                                                          vec_add(v_sums0,
                                                                  vec_sl(vec_add(v_sums2, v_sums5),
                                                                         v_1)))),
                                          v_4);
  const vector signed short vr5 = vec_sra(vec_add(vec_add(vec_sl(vb5, v_2),
                                                          v_sums8),
                                                  vec_add(v_8,
                                                          vec_add(v_sums1,
                                                                  vec_sl(vec_add(v_sums3, v_sums6),
                                                                         v_1)))),
                                          v_4);
  const vector signed short vr6 = vec_sra(vec_add(vec_add(vec_sl(vb6, v_2),
                                                          v_sums2),
                                                  vec_add(v_8,
                                                          vec_sl(vec_add(v_last,
                                                                         vec_add(v_sums7, v_sums4)),
                                                                 v_1))),
                                          v_4);
  const vector signed short vr7 = vec_sra(vec_add(vec_add(vec_sl(vec_add(v_last, vb7), v_2),
                                                          vec_sl(vec_add(vb8, v_sums5), v_1)),
                                                  vec_add(v_8, v_sums3)),
                                          v_4);
  const vector signed short vr8 = vec_sra(vec_add(vec_add(vec_sl(v_sums8, v_2),
                                                          vec_sl(vec_add(v_last, v_sums6), v_1)),
                                                  vec_add(v_sums4, v_8)),
                                          v_4);

  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
                                                              -1, -1, -1, -1, -1, -1, -1, -1);
  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                                                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

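  /* PACK_AND_STORE() saturates each 16-bit result row back to unsigned bytes,
     re-attaches the untouched right half of the original row (permHH), and
     performs an unaligned store by blending with the two vectors that were
     originally loaded, using a vec_lvsr-derived select mask. */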
#define PACK_AND_STORE(i)                                               \
  const vector unsigned char perms##i =                                 \
    vec_lvsr(i * stride, src2);                                         \
  const vector unsigned char vf##i =                                    \
    vec_packsu(vr##i, (vector signed short)zero);                       \
  const vector unsigned char vg##i =                                    \
    vec_perm(vf##i, vbT##i, permHH);                                    \
  const vector unsigned char mask##i =                                  \
    vec_perm((vector unsigned char)zero, neg1, perms##i);               \
  const vector unsigned char vg2##i =                                   \
    vec_perm(vg##i, vg##i, perms##i);                                   \
  const vector unsigned char svA##i =                                   \
    vec_sel(vbA##i, vg2##i, mask##i);                                   \
  const vector unsigned char svB##i =                                   \
    vec_sel(vg2##i, vbB##i, mask##i);                                   \
  vec_st(svA##i, i * stride, src2);                                     \
  vec_st(svB##i, i * stride + 16, src2)

  PACK_AND_STORE(1);
  PACK_AND_STORE(2);
  PACK_AND_STORE(3);
  PACK_AND_STORE(4);
  PACK_AND_STORE(5);
  PACK_AND_STORE(6);
  PACK_AND_STORE(7);
  PACK_AND_STORE(8);

#undef PACK_AND_STORE
}

static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    This code makes no assumptions about src or stride.
    One could avoid recomputing the perm vector by assuming
    (stride % 16) == 0, but unfortunately that is not always true.
    Quite a lot of loads/stores could also be removed by assuming
    proper alignment of src & stride :-(
  */
  uint8_t *src2 = src;
  const vector signed int zero = vec_splat_s32(0);
  short __attribute__ ((aligned(16))) qp[8];
  qp[0] = 8*c->QP;
  vector signed short vqp = vec_ld(0, qp);
  vqp = vec_splat(vqp, 0);

#define LOAD_LINE(i)                                                    \
  const vector unsigned char perm##i =                                  \
    vec_lvsl(i * stride, src2);                                         \
  const vector unsigned char vbA##i =                                   \
    vec_ld(i * stride, src2);                                           \
  const vector unsigned char vbB##i =                                   \
    vec_ld(i * stride + 16, src2);                                      \
  const vector unsigned char vbT##i =                                   \
    vec_perm(vbA##i, vbB##i, perm##i);                                  \
  const vector signed short vb##i =                                     \
    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
                                    (vector unsigned char)vbT##i)

  src2 += stride * 3;

  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
  LOAD_LINE(8);
#undef LOAD_LINE

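  /* Conditional deblocking filter: estimate the blocking step from the
     "middle" energy around the boundary rows 4/5, limit the correction by
     the neighbouring ("left"/"right") energies and by half the local
     gradient, and apply it only when |mE| stays below 8*QP. */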
  const vector signed short v_1 = vec_splat_s16(1);
  const vector signed short v_2 = vec_splat_s16(2);
  const vector signed short v_5 = vec_splat_s16(5);
  const vector signed short v_32 = vec_sl(v_1,
                                          (vector unsigned short)v_5);
  /* middle energy */
  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
  const vector signed short absmE = vec_abs(mE);
  /* left & right energy */
  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
  /* d */
  const vector signed short ddiff = vec_sub(absmE,
                                            vec_min(vec_abs(lE),
                                                    vec_abs(rE)));
  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
  const vector signed short minusd = vec_sub((vector signed short)zero, d);
  const vector signed short finald = vec_sel(minusd,
                                             d,
                                             vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                       (vector signed short)zero));
  /* q */
  const vector signed short qtimes2 = vec_sub(vb4, vb5);
  /* for an arithmetic shift right to behave like a division by 2,
     we need to add one to all negative integers */
  const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                v_1,
                                                vec_cmplt(qtimes2, (vector signed short)zero));
  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
  /* clamp */
  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
  const vector signed short dclamp_N = vec_max(dclamp_N1, q);

  const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                    dclamp_P,
                                                    vec_cmpgt(q, (vector signed short)zero));
  const vector signed short dornotd = vec_sel((vector signed short)zero,
                                              dclampedfinal,
                                              vec_cmplt(absmE, vqp));
  /* add/subtract to l4 and l5 */
  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
  const vector signed short vb5plusd = vec_add(vb5, dornotd);
  /* finally, the stores */
  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
                                                              -1, -1, -1, -1, -1, -1, -1, -1);

  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                                                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

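  /* Same unaligned read-modify-write store scheme as PACK_AND_STORE() above,
     applied only to the two modified rows. */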
#define STORE(i)                                                        \
  const vector unsigned char perms##i =                                 \
    vec_lvsr(i * stride, src2);                                         \
  const vector unsigned char vg##i =                                    \
    vec_perm(st##i, vbT##i, permHH);                                    \
  const vector unsigned char mask##i =                                  \
    vec_perm((vector unsigned char)zero, neg1, perms##i);               \
  const vector unsigned char vg2##i =                                   \
    vec_perm(vg##i, vg##i, perms##i);                                   \
  const vector unsigned char svA##i =                                   \
    vec_sel(vbA##i, vg2##i, mask##i);                                   \
  const vector unsigned char svB##i =                                   \
    vec_sel(vg2##i, vbB##i, mask##i);                                   \
  vec_st(svA##i, i * stride, src2);                                     \
  vec_st(svB##i, i * stride + 16, src2)

  STORE(4);
  STORE(5);
}

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    This code makes no assumptions about src or stride.
    One could avoid recomputing the perm vector by assuming
    (stride % 16) == 0, but unfortunately that is not always true.
    Quite a lot of loads/stores could also be removed by assuming
    proper alignment of src & stride :-(
  */
  uint8_t *srcCopy = src;
  uint8_t __attribute__((aligned(16))) dt[16];
  const vector unsigned char vuint8_1 = vec_splat_u8(1);
  const vector signed int zero = vec_splat_s32(0);
  vector unsigned char v_dt;
  dt[0] = deringThreshold;
  v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i)                                                    \
  const vector unsigned char perm##i =                                  \
    vec_lvsl(i * stride, srcCopy);                                      \
  vector unsigned char sA##i = vec_ld(i * stride, srcCopy);             \
  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);        \
  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

  LOAD_LINE(0);
  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
  LOAD_LINE(8);
  LOAD_LINE(9);
#undef LOAD_LINE

  vector unsigned char v_avg;
  {
    const vector unsigned char trunc_perm = (vector unsigned char)
      AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
          0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
    const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
    const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
    const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
    const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);

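    /* EXTRACT(min) / EXTRACT(max) reduce the 8x8 inner block (bytes 1..8 of
       rows 1..8, gathered by trunc_perm above) to a single minimum/maximum
       replicated across the whole vector, by repeatedly combining halves
       with vec_mergeh/vec_mergel. */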
#define EXTRACT(op) do {                                                \
    const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
    const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
    const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
    const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
    const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
    const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \
    const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \
    const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \
    const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \
    const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
    const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
    const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
    v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)

    vector unsigned char v_min;
    vector unsigned char v_max;
    EXTRACT(min);
    EXTRACT(max);
#undef EXTRACT

    if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
      return;

    v_avg = vec_avg(v_min, v_max);
  }

  signed int __attribute__((aligned(16))) S[8];
  {
    const vector unsigned short mask1 = (vector unsigned short)
      AVV(0x0001, 0x0002, 0x0004, 0x0008,
          0x0010, 0x0020, 0x0040, 0x0080);
    const vector unsigned short mask2 = (vector unsigned short)
      AVV(0x0100, 0x0200, 0x0000, 0x0000,
          0x0000, 0x0000, 0x0000, 0x0000);

    const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
    const vector unsigned int vuint32_1 = vec_splat_u32(1);

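    /* COMPARE(i) builds, for row i, a 10-bit mask (one bit per pixel, via
       mask1/mask2) of the pixels that lie above the min/max average; because
       the bits are disjoint, the vec_sum4s/vec_sums pair simply collects
       them into a single scalar in the last element of sum##i. */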
#define COMPARE(i)                                                      \
    vector signed int sum##i;                                           \
    do {                                                                \
      const vector unsigned char cmp##i =                               \
        (vector unsigned char)vec_cmpgt(src##i, v_avg);                 \
      const vector unsigned short cmpHi##i =                            \
        (vector unsigned short)vec_mergeh(cmp##i, cmp##i);              \
      const vector unsigned short cmpLi##i =                            \
        (vector unsigned short)vec_mergel(cmp##i, cmp##i);              \
      const vector signed short cmpHf##i =                              \
        (vector signed short)vec_and(cmpHi##i, mask1);                  \
      const vector signed short cmpLf##i =                              \
        (vector signed short)vec_and(cmpLi##i, mask2);                  \
      const vector signed int sump##i = vec_sum4s(cmpHf##i, zero);      \
      const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i);   \
      sum##i = vec_sums(sumq##i, zero); } while (0)

    COMPARE(0);
    COMPARE(1);
    COMPARE(2);
    COMPARE(3);
    COMPARE(4);
    COMPARE(5);
    COMPARE(6);
    COMPARE(7);
    COMPARE(8);
    COMPARE(9);
#undef COMPARE

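    /* Fold the ten row masks into the S[] eligibility words. Roughly: a bit
       survives only when the pixel, its horizontal neighbours (the <<1 and
       >>1 copies) and the matching pixels of the two following rows all lie
       on the same side of the average; the complementary below-average masks
       travel in the upper 16 bits of each word. */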
    vector signed int sumA2;
    vector signed int sumB2;
    {
      const vector signed int sump02 = vec_mergel(sum0, sum2);
      const vector signed int sump13 = vec_mergel(sum1, sum3);
      const vector signed int sumA = vec_mergel(sump02, sump13);

      const vector signed int sump46 = vec_mergel(sum4, sum6);
      const vector signed int sump57 = vec_mergel(sum5, sum7);
      const vector signed int sumB = vec_mergel(sump46, sump57);

      const vector signed int sump8A = vec_mergel(sum8, zero);
      const vector signed int sump9B = vec_mergel(sum9, zero);
      const vector signed int sumC = vec_mergel(sump8A, sump9B);

      const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
      const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
      const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
      const vector signed int t2A = vec_or(sumA, tA);
      const vector signed int t2B = vec_or(sumB, tB);
      const vector signed int t2C = vec_or(sumC, tC);
      const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
                                            vec_sl(t2A, vuint32_1));
      const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
                                            vec_sl(t2B, vuint32_1));
      const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
                                            vec_sl(t2C, vuint32_1));
      const vector signed int yA = vec_and(t2A, t3A);
      const vector signed int yB = vec_and(t2B, t3B);
      const vector signed int yC = vec_and(t2C, t3C);

      const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
      const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
      const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
      const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
      const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
      const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
      const vector signed int sumAp = vec_and(yA,
                                              vec_and(sumAd4, sumAd8));
      const vector signed int sumBp = vec_and(yB,
                                              vec_and(sumBd4, sumBd8));
      sumA2 = vec_or(sumAp,
                     vec_sra(sumAp,
                             vuint32_16));
      sumB2 = vec_or(sumBp,
                     vec_sra(sumBp,
                             vuint32_16));
    }
    vec_st(sumA2, 0, S);
    vec_st(sumB2, 16, S);
  }


  /* I'm not sure the following is actually faster
     than straight, unvectorized C code :-( */

  int __attribute__((aligned(16))) tQP2[4];
  tQP2[0] = c->QP/2 + 1;
  vector signed int vQP2 = vec_ld(0, tQP2);
  vQP2 = vec_splat(vQP2, 0);
  const vector unsigned char vuint8_2 = vec_splat_u8(2);
  const vector signed int vsint32_8 = vec_splat_s32(8);
  const vector unsigned int vuint32_4 = vec_splat_u32(4);

  const vector unsigned char permA1 = (vector unsigned char)
    AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
        0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
  const vector unsigned char permA2 = (vector unsigned char)
    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
        0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
  const vector unsigned char permA1inc = (vector unsigned char)
    AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const vector unsigned char permA2inc = (vector unsigned char)
    AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const vector unsigned char magic = (vector unsigned char)
    AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const vector unsigned char extractPerm = (vector unsigned char)
    AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
        0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
  const vector unsigned char extractPermInc = (vector unsigned char)
    AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
  const vector unsigned char identity = vec_lvsl(0, (unsigned char *)0);
  const vector unsigned char tenRight = (vector unsigned char)
    AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const vector unsigned char eightLeft = (vector unsigned char)
    AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);

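  /* For each pixel flagged in S[], F2() computes a 3x3 neighbourhood average
     with weights 1-2-1 / 2-4-2 / 1-2-1 (the "magic" vector), rounds (+8) and
     divides by 16, clamps the result to within QP/2+1 of the original value,
     and merges it back into the row. The permA1/permA2, extractPerm and
     tenRight vectors are stepped after every call so the same macro walks
     across the eight interior columns. */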
#define F_INIT(i)                                                       \
  vector unsigned char tenRightM##i = tenRight;                         \
  vector unsigned char permA1M##i = permA1;                             \
  vector unsigned char permA2M##i = permA2;                             \
  vector unsigned char extractPermM##i = extractPerm

#define F2(i, j, k, l)                                                  \
  if (S[i] & (1 << (l+1))) {                                            \
    const vector unsigned char a_##j##_A##l =                           \
      vec_perm(src##i, src##j, permA1M##i);                             \
    const vector unsigned char a_##j##_B##l =                           \
      vec_perm(a_##j##_A##l, src##k, permA2M##i);                       \
    const vector signed int a_##j##_sump##l =                           \
      (vector signed int)vec_msum(a_##j##_B##l, magic,                  \
                                  (vector unsigned int)zero);           \
    vector signed int F_##j##_##l =                                     \
      vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4);          \
    F_##j##_##l = vec_splat(F_##j##_##l, 3);                            \
    const vector signed int p_##j##_##l =                               \
      (vector signed int)vec_perm(src##j,                               \
                                  (vector unsigned char)zero,           \
                                  extractPermM##i);                     \
    const vector signed int sum_##j##_##l = vec_add(p_##j##_##l, vQP2); \
    const vector signed int diff_##j##_##l = vec_sub(p_##j##_##l, vQP2); \
    vector signed int newpm_##j##_##l;                                  \
    if (vec_all_lt(sum_##j##_##l, F_##j##_##l))                         \
      newpm_##j##_##l = sum_##j##_##l;                                  \
    else if (vec_all_gt(diff_##j##_##l, F_##j##_##l))                   \
      newpm_##j##_##l = diff_##j##_##l;                                 \
    else newpm_##j##_##l = F_##j##_##l;                                 \
    const vector unsigned char newpm2_##j##_##l =                       \
      vec_splat((vector unsigned char)newpm_##j##_##l, 15);             \
    const vector unsigned char mask##j##l = vec_add(identity,           \
                                                    tenRightM##i);      \
    src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l);            \
  }                                                                     \
  permA1M##i = vec_add(permA1M##i, permA1inc);                          \
  permA2M##i = vec_add(permA2M##i, permA2inc);                          \
  tenRightM##i = vec_sro(tenRightM##i, eightLeft);                      \
  extractPermM##i = vec_add(extractPermM##i, extractPermInc)

#define ITER(i, j, k)                                                   \
  F_INIT(i);                                                            \
  F2(i, j, k, 0);                                                       \
  F2(i, j, k, 1);                                                       \
  F2(i, j, k, 2);                                                       \
  F2(i, j, k, 3);                                                       \
  F2(i, j, k, 4);                                                       \
  F2(i, j, k, 5);                                                       \
  F2(i, j, k, 6);                                                       \
  F2(i, j, k, 7)

  ITER(0, 1, 2);
  ITER(1, 2, 3);
  ITER(2, 3, 4);
  ITER(3, 4, 5);
  ITER(4, 5, 6);
  ITER(5, 6, 7);
  ITER(6, 7, 8);
  ITER(7, 8, 9);

  const vector signed char neg1 = vec_splat_s8(-1);

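  /* STORE_LINE() writes the (possibly modified) rows back with the same
     unaligned read-modify-write scheme used above: rotate the data into
     store alignment with vec_lvsr, then select between the new bytes and
     the two originally loaded vectors. */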
#define STORE_LINE(i)                                                   \
  const vector unsigned char permST##i =                                \
    vec_lvsr(i * stride, srcCopy);                                      \
  const vector unsigned char maskST##i =                                \
    vec_perm((vector unsigned char)zero,                                \
             (vector unsigned char)neg1, permST##i);                    \
  src##i = vec_perm(src##i, src##i, permST##i);                         \
  sA##i = vec_sel(sA##i, src##i, maskST##i);                            \
  sB##i = vec_sel(src##i, sB##i, maskST##i);                            \
  vec_st(sA##i, i * stride, srcCopy);                                   \
  vec_st(sB##i, i * stride + 16, srcCopy)

  STORE_LINE(1);
  STORE_LINE(2);
  STORE_LINE(3);
  STORE_LINE(4);
  STORE_LINE(5);
  STORE_LINE(6);
  STORE_LINE(7);
  STORE_LINE(8);

#undef STORE_LINE
#undef ITER
#undef F2
}

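/* No AltiVec implementation of the horizontal filters yet: fall back to the
   plain C versions. */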
#define horizClassify_altivec(a...) horizClassify_C(a)
#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)